<font size="5">OECD Exchange Rates into Trino pipeline</font>

<font size="3">https://data.oecd.org/conversion/exchange-rates.htm</font>

Load Environment Variables

In [None]:
from pcaf_env import *
import csv

# Base name of the Trino table for this dataset; later cells append "_source" to it.
ingest_table = "sf_oecd_exch_rates"
# Table that receives the JSON-encoded dbt model metadata in the final cell.
models_table = "pcaf_dbt_models"

using connect string: trino://MichaelTiemannOSC@trino-secure-odh-trino.apps.odh-cl2.apps.os-climate.org:443/osc_datacommons_dev
create schema if not exists osc_datacommons_dev.mdt_sandbox


In [None]:
# Connection sanity check: list every schema visible in the ingest catalog.
# If this fails or prints nothing useful, the Trino connection is not set up correctly.
schema_read = engine.execute(f'show schemas in {ingest_catalog}')
schema_rows = schema_read.fetchall()
for row in schema_rows:
    print(row)

('aicoe_osc_demo_results',)
('default',)
('demo_dv',)
('dera',)
('essd',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('rmi',)
('sandbox',)
('wri_gppd',)


In [6]:
# Name of the OECD "DP_LIVE" extract; defined once so the S3 key and the local
# copy can never drift apart.
oecd_csv_name = 'DP_LIVE_19072022170858805.csv'
oecd_csv_path = f'/tmp/{oecd_csv_name}'

# Download the raw CSV from the S3 landing bucket to local scratch space.
oecd_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'], f'PCAF-sovereign-footprint/OECD/{oecd_csv_name}')
oecd_file.download_file(oecd_csv_path)

# Source column -> ingest column. The key order is also the column order of `df`.
# FIXME: why do we collect INDICATOR, which is always 'EXCH' for all 3963 entries?
oecd_columns = {
    'INDICATOR': 'attribute',
    'LOCATION': 'country_iso_code',
    'TIME': 'validity_date',
    'Value': 'value',
}

# Read only the needed columns, force their order, rename them, and switch to
# pandas nullable dtypes (string / Int64 / Float64).
df = (
    pd.read_csv(oecd_csv_path, usecols=list(oecd_columns))[list(oecd_columns)]
    .rename(columns=oecd_columns)
    .convert_dtypes()
)
df.info(verbose=True)
display(df)

# Spot check: show the rows for a single country as the cell's result.
df[df['country_iso_code'] == 'DEU']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3963 entries, 0 to 3962
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   attribute         3963 non-null   string 
 1   country_iso_code  3963 non-null   string 
 2   validity_date     3963 non-null   Int64  
 3   value             3963 non-null   Float64
dtypes: Float64(1), Int64(1), string(2)
memory usage: 131.7 KB


Unnamed: 0,attribute,country_iso_code,validity_date,value
0,EXCH,AUS,1950,0.892857
1,EXCH,AUS,1951,0.892857
2,EXCH,AUS,1952,0.892857
3,EXCH,AUS,1953,0.892857
4,EXCH,AUS,1954,0.892857
...,...,...,...,...
3958,EXCH,SEN,2017,580.65675
3959,EXCH,SEN,2018,555.446458
3960,EXCH,SEN,2019,585.911013
3961,EXCH,SEN,2020,575.586005


Unnamed: 0,attribute,country_iso_code,validity_date,value
536,EXCH,DEU,1950,2.144861
537,EXCH,DEU,1951,2.144861
538,EXCH,DEU,1952,2.144861
539,EXCH,DEU,1953,2.147426
540,EXCH,DEU,1954,2.147426
...,...,...,...,...
603,EXCH,DEU,2017,0.885206
604,EXCH,DEU,2018,0.846773
605,EXCH,DEU,2019,0.893276
606,EXCH,DEU,2020,0.875506


In [6]:
# Directory holding the dbt models for this project.
models_dir = "/opt/app-root/src/PCAF-sovereign-footprint/dbt/pcaf_transform/models"

# Create the directory on first run; an already-existing directory is fine.
# Only the leaf is created, so a missing parent still raises.
try:
    os.mkdir(models_dir, mode=0o755)
except FileExistsError:
    pass

# Remove Jupyter's checkpoint folder from the models directory (best effort).
shutil.rmtree(f"{models_dir}/.ipynb_checkpoints", ignore_errors=True)

In [7]:
# Dataset-level metadata; passed to create_trino_table_and_dbt_metadata in a later cell.
custom_meta_content = {
    "data provider": "OECD",
    "title": "Average Foreign Currency Exchange Rates vs USD",
    "author": "OECD",
    "contact": "stan.contact@oecd.org",
    # Stored JSON-encoded; the text deliberately keeps its leading newline.
    "description": json.dumps("\nAverage annual exchange rate converting subject currency to USD."),
    "release_date": "2022-01-01 00:00:00",
    # How should we describe our transformative step here?
}

def description_is(s):
    """Return a one-entry field-metadata dict whose 'Description' is `s`, JSON-encoded."""
    return {'Description': json.dumps(s)}


# Per-column metadata, keyed by dataframe column name.
custom_meta_fields = {
    'country_iso_code': description_is("ISO-3166 Country Code (alpha_3)"),
    'attribute': description_is("Foreign Currency Exchange Rate"),
    'value': description_is("Units per 1.0 USD"),
    'validity_date': description_is("Year of measurement"),
}

# Columns that carry a year. This dataset names its year column 'validity_date',
# so a check against 'year' alone could never fire here and the 'annual' tag was
# silently dropped; 'year' is kept for parity with datasets that use that name.
annual_fields = ('year', 'validity_date')

# Tag each column from keywords in its description (first match wins), falling
# back to the 'annual' tag for year columns.
for f, field_meta in custom_meta_fields.items():
    field_description = field_meta['Description']
    if 'ISO' in field_description:
        field_meta['tags'] = ['ISO']
    elif 'IPCC' in field_description:
        field_meta['tags'] = ['IPCC']
    elif 'WDI' in field_description:
        field_meta['tags'] = ['WDI']
    elif f in annual_fields:
        field_meta['tags'] = ['annual']

In [7]:
# Hand the dataframe and both metadata dicts to the project helper.
# NOTE(review): judging by the logged output, the third argument lists the
# partition columns of the created table -- confirm against the helper.
create_trino_table_and_dbt_metadata(
    ingest_table,
    df,
    ['country_iso_code'],
    custom_meta_content,
    custom_meta_fields,
    verbose=True,
)

drop table if exists mdt_sandbox.sf_oecd_exch_rates_source
enforcing dataframe partition column order

verifying existence of table osc_datacommons_dev.mdt_sandbox.sf_oecd_exch_rates_source
create table if not exists osc_datacommons_dev.mdt_sandbox.sf_oecd_exch_rates_source (
    attribute varchar,
    validity_date bigint,
    value double,
    country_iso_code varchar
) with (
    format = 'parquet',
    partitioning = array['country_iso_code']
)

staging dataframe parquet to s3 osc-datacommons-s3-bucket-dev02
/tmp/ingest_temp_566d37a3/country_iso_code=AUS/d9a99362ee3c43c9859503560dd41921-0.parquet  -->  trino/ingest/ingest_temp_566d37a3/country_iso_code=AUS/d9a99362ee3c43c9859503560dd41921-0.parquet
/tmp/ingest_temp_566d37a3/country_iso_code=AUT/d9a99362ee3c43c9859503560dd41921-0.parquet  -->  trino/ingest/ingest_temp_566d37a3/country_iso_code=AUT/d9a99362ee3c43c9859503560dd41921-0.parquet
/tmp/ingest_temp_566d37a3/country_iso_code=BEL/d9a99362ee3c43c9859503560dd41921-0.parquet  -->

In [8]:
# Read back the 2020 rows from the freshly loaded "_source" table as a check.
sql = (
    f"\nselect * from {ingest_catalog}.{ingest_schema}.{ingest_table}_source\n"
    "where validity_date=2020"
)
pd.read_sql(sql, engine)

Unnamed: 0,attribute,validity_date,value,country_iso_code
0,EXCH,2020,0.875506,LUX
1,EXCH,2020,0.875506,LVA
2,EXCH,2020,575.586005,CMR
3,EXCH,2020,106.774582,JPN
4,EXCH,2020,1.379742,SGP
...,...,...,...,...
59,EXCH,2020,3694.854072,COL
60,EXCH,2020,3.109017,GEO
61,EXCH,2020,5.155179,BRA
62,EXCH,2020,72.104908,RUS


In [16]:
# Serialise the dbt model metadata and store it as one row in the models table.
# The JSON goes inside a single-quoted SQL string literal, so any apostrophe in
# it (e.g. in a free-text description) must be doubled or the statement breaks.
models_json = json.dumps(dbt_dict['models']).replace("'", "''")
osc._do_sql(f"insert into {ingest_schema}.{models_table} values('{models_json}')", engine, verbose=True)

insert into mdt_sandbox.pcaf_dbt_models values('{"sf_wdi_gdp": {"description": "\"\\nThis indicator provides per capita values for gross domestic product (GDP) expressed in current international dollars converted by purchasing power parity (PPP) conversion factor. \\n\\nGDP is the sum of gross value added by all resident producers in the country plus any product taxes and minus any subsidies not included in the value of the products. conversion factor is a spatial price deflator and currency converter that controls for price level differences between countries. Total population is a mid-year population based on the de facto definition of population, which counts all residents regardless of legal status or citizenship.\"", "columns": {"rec_source": {"description": "\"API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv and API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv\""}, "data_provider": {"description": "\"WDI\"", "tags": ["WDI"]}, "country_iso_code": {"description": "\"ISO-3166 Country Code (alpha_3)\"", 

[(1,)]