<font size="5">Ingest WDI - "GDP" and "GDP, PPP" data into Trino pipeline</font>

Load Environment Variables

In [1]:
from pcaf_env import *
import shutil

import ParseXLS as parser

# Base name of the ingested table; later cells address it as `{ingest_table}_source`.
ingest_table = "sf_wdi_gdp"
# Table that receives the serialized dbt model metadata in the final cell.
models_table = "pcaf_dbt_models"

using connect string: trino://MichaelTiemannOSC@trino-secure-odh-trino.apps.odh-cl2.apps.os-climate.org:443/osc_datacommons_dev
create schema if not exists osc_datacommons_dev.mdt_sandbox


In [2]:
# Sanity check for the Trino connection: list every schema visible in the
# ingest catalog, one row per line.
show_schemas_sql = f'show schemas in {ingest_catalog}'
schema_read = engine.execute(show_schemas_sql)
for schema_row in schema_read.fetchall():
    print(schema_row)

('aicoe_osc_demo_results',)
('default',)
('demo_dv',)
('dera',)
('essd',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('rmi',)
('sandbox',)
('wri_gppd',)


Load GDP files (updated sporadically from https://data.worldbank.org/indicator/NY.GDP.MKTP.CD and https://data.worldbank.org/indicator/NY.GDP.MKTP.PP.CD)

In [3]:
# Stage both World Bank CSV exports from the S3 landing bucket into /tmp and
# run each one through the configured parser.
# NOTE(review): the parser log lists the /tmp copies in its file_list, so the
# .ini files presumably point at them - confirm against WDI_GDP.ini / WDI_GDP_PPP.ini.

## GDP
gdp_file = s3_source.Object(
    os.environ['S3_LANDING_BUCKET'],
    'PCAF-sovereign-footprint/WDI/API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv',
)
gdp_file.download_file('/tmp/API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv')
df = parser.process('WDI_GDP.ini', 'WDI_GDP.csv')

## GDP PPP
gdp_ppp_file = s3_source.Object(
    os.environ['S3_LANDING_BUCKET'],
    'PCAF-sovereign-footprint/WDI/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv',
)
gdp_ppp_file.download_file('/tmp/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv')
df2 = parser.process('WDI_GDP_PPP.ini', 'WDI_GDP_PPP.csv')

# Stack the two frames, force the year column to a 32-bit integer, then let
# pandas pick its best-fitting dtypes for all columns.
df = (
    pd.concat([df, df2])
    .astype({'validity_date': 'int32'})
    .convert_dtypes()
)
df.info(verbose=True)

# Keep only the columns that go into the table, in this order, and drop rows
# that carry no value.
output_columns = [
    'rec_source',
    'data_provider',
    'country_iso_code',
    'country_name',
    'validity_date',
    'attribute',
    'value',
    'value_units',
]
df = df[output_columns].dropna(subset=['value'])


WDI_GDP.ini
file_list:
['/tmp/API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv']
/tmp/API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv
2
csv
/tmp/API_NY.GDP.MKTP.CD_DS2_en_csv_v
<configparser.ConfigParser object at 0x7fa0699e9be0>
                    Country Name Country Code     Indicator Name   
0                          Aruba          ABW  GDP (current US$)  \
1    Africa Eastern and Southern          AFE  GDP (current US$)   
2                    Afghanistan          AFG  GDP (current US$)   
3     Africa Western and Central          AFW  GDP (current US$)   
4                         Angola          AGO  GDP (current US$)   
..                           ...          ...                ...   
261                       Kosovo          XKX  GDP (current US$)   
262                  Yemen, Rep.          YEM  GDP (current US$)   
263                 South Africa          ZAF  GDP (current US$)   
264                       Zambia          ZMB  GDP (current US$)   
265                     Zimbabwe          ZWE

In [4]:
#%run -i TransposeXLS.py --config WDI.ini --output=WDI.csv 

In [6]:
# Make sure the dbt models directory exists and holds no stale Jupyter
# checkpoint folder.
models_dir = "/opt/app-root/src/PCAF-sovereign-footprint/dbt/pcaf_transform/models"

# exist_ok=True replaces the try/except FileExistsError dance and also creates
# any missing parent directories instead of failing on them.
os.makedirs(models_dir, mode=0o755, exist_ok=True)

# Best-effort cleanup: ignore_errors=True keeps this silent when the
# checkpoint folder does not exist.
shutil.rmtree(os.path.join(models_dir, ".ipynb_checkpoints"), ignore_errors=True)

In [7]:
# Dataset-level metadata handed to create_trino_table_and_dbt_metadata().
# The description is JSON-encoded before it is stored.
dataset_description = json.dumps("""
This indicator provides per capita values for gross domestic product (GDP) expressed in current international dollars converted by purchasing power parity (PPP) conversion factor. 

GDP is the sum of gross value added by all resident producers in the country plus any product taxes and minus any subsidies not included in the value of the products. conversion factor is a spatial price deflator and currency converter that controls for price level differences between countries. Total population is a mid-year population based on the de facto definition of population, which counts all residents regardless of legal status or citizenship.""")

# NOTE(review): df[['data_provider']] is a one-column DataFrame, not a string -
# confirm that the metadata consumer expects this.
# NOTE(review): the title and description speak of "per capita" values, while
# the verification query further down shows the attributes 'GDP (current US$)'
# and 'GDP, PPP (current international $)' - confirm the wording is intended.
custom_meta_content = {
    'data provider': df[['data_provider']],
    'title': 'World Bank GDP per capita, PPP (current international $)',
    'author': 'International Comparison Program, World Bank | World Development Indicators database, World Bank | Eurostat-OECD PPP Programme.',
    'contact': 'data@worldbank.org',
    'description': dataset_description,
    'release_date': '2022-01-01 00:00:00',
    # How should we describe our transformative step here?
}

def description_is(s):
    """Return a field-metadata dict whose 'Description' entry is ``s`` JSON-encoded."""
    encoded = json.dumps(s)
    return {'Description': encoded}

# Per-column metadata: one JSON-encoded description per dataframe column,
# built with description_is().
custom_meta_fields = {}
custom_meta_fields['rec_source'] = description_is("API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv and API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv")
custom_meta_fields['data_provider'] = description_is("WDI")
custom_meta_fields['country_iso_code'] = description_is("ISO-3166 Country Code (alpha_3)")
custom_meta_fields['country_name'] = description_is("ISO-3166 Country Name")
custom_meta_fields['attribute'] = description_is("GDP or GDP PPP")
custom_meta_fields['value'] = description_is("value of GDP or GDP PPP")
custom_meta_fields['value_units'] = description_is("USD")
custom_meta_fields['validity_date'] = description_is("Year of measurement")

# Derive a tag for each column. The first matching rule wins: a keyword found
# in the description takes precedence over the column-name rule.
for f, field_meta in custom_meta_fields.items():
    description = field_meta['Description']
    if 'ISO' in description:
        field_meta['tags'] = ['ISO']
    elif 'IPCC' in description:
        field_meta['tags'] = ['IPCC']
    elif 'WDI' in description:
        field_meta['tags'] = ['WDI']
    elif f in ('year', 'validity_date'):
        # The year column of this table is named 'validity_date'; checking for
        # 'year' alone never matched, so the 'annual' tag was never applied.
        field_meta['tags'] = ['annual']

In [8]:
# Hand the combined dataframe, the partition column list (['validity_date'])
# and both metadata dicts to the project helper.
# NOTE(review): judging by the helper's name and its verbose log, it creates
# the `{ingest_table}_source` Trino table and the dbt metadata - confirm
# against its definition, which is not part of this notebook.
create_trino_table_and_dbt_metadata(ingest_table, df, ['validity_date'], custom_meta_content, custom_meta_fields, verbose=True)

drop table if exists mdt_sandbox.sf_wdi_gdp_source
enforcing dataframe partition column order

verifying existence of table osc_datacommons_dev.mdt_sandbox.sf_wdi_gdp_source
create table if not exists osc_datacommons_dev.mdt_sandbox.sf_wdi_gdp_source (
    rec_source varchar,
    data_provider varchar,
    country_iso_code varchar,
    country_name varchar,
    attribute varchar,
    value double,
    value_units varchar,
    validity_date integer
) with (
    format = 'parquet',
    partitioning = array['validity_date']
)

staging dataframe parquet to s3 osc-datacommons-s3-bucket-dev02
/tmp/ingest_temp_a5d3d73f/validity_date=1960/de29c59213424aeda59787da7ce2d75f-0.parquet  -->  trino/ingest/ingest_temp_a5d3d73f/validity_date=1960/de29c59213424aeda59787da7ce2d75f-0.parquet
/tmp/ingest_temp_a5d3d73f/validity_date=1961/de29c59213424aeda59787da7ce2d75f-0.parquet  -->  trino/ingest/ingest_temp_a5d3d73f/validity_date=1961/de29c59213424aeda59787da7ce2d75f-0.parquet
/tmp/ingest_temp_a5d3d73f/

In [9]:
import pandas as pd
# Spot-check the ingested table: read back every row for one country (BHS),
# newest year first, and display the result as the cell output.
sql = f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}_source
where country_iso_code='BHS' order by validity_date desc"""
pd.read_sql(sql=sql, con=engine)


Unnamed: 0,rec_source,data_provider,country_iso_code,country_name,attribute,value,value_units,validity_date
0,API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv,WDI,BHS,"Bahamas, The",GDP (current US$),1.120860e+10,USD,2021
1,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,BHS,"Bahamas, The","GDP, PPP (current international $)",1.353788e+10,USD,2021
2,API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv,WDI,BHS,"Bahamas, The",GDP (current US$),9.699500e+09,USD,2020
3,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,BHS,"Bahamas, The","GDP, PPP (current international $)",1.142966e+10,USD,2020
4,API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv,WDI,BHS,"Bahamas, The",GDP (current US$),1.319280e+10,USD,2019
...,...,...,...,...,...,...,...,...
89,API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv,WDI,BHS,"Bahamas, The",GDP (current US$),2.666667e+08,USD,1964
90,API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv,WDI,BHS,"Bahamas, The",GDP (current US$),2.377451e+08,USD,1963
91,API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv,WDI,BHS,"Bahamas, The",GDP (current US$),2.122549e+08,USD,1962
92,API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv,WDI,BHS,"Bahamas, The",GDP (current US$),1.900980e+08,USD,1961


In [16]:
# Store the dbt model metadata as one JSON string in the models table.
# The JSON is embedded in a single-quoted SQL literal, so every single quote
# inside it is doubled; otherwise an apostrophe in any description would end
# the literal early and break (or alter) the statement.
# NOTE(review): `dbt_dict` and `osc` are not defined in this notebook -
# presumably they come from pcaf_env or the table-creation helper; confirm.
# NOTE(review): this looks like a plain insert, so re-running the cell would
# presumably add another row - confirm whether that is intended.
models_json = json.dumps(dbt_dict['models']).replace("'", "''")
osc._do_sql(f"insert into {ingest_schema}.{models_table} values('{models_json}')", engine, verbose=True)

insert into mdt_sandbox.pcaf_dbt_models values('{"sf_wdi_gdp": {"description": "\"\\nThis indicator provides per capita values for gross domestic product (GDP) expressed in current international dollars converted by purchasing power parity (PPP) conversion factor. \\n\\nGDP is the sum of gross value added by all resident producers in the country plus any product taxes and minus any subsidies not included in the value of the products. conversion factor is a spatial price deflator and currency converter that controls for price level differences between countries. Total population is a mid-year population based on the de facto definition of population, which counts all residents regardless of legal status or citizenship.\"", "columns": {"rec_source": {"description": "\"API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv and API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv\""}, "data_provider": {"description": "\"WDI\"", "tags": ["WDI"]}, "country_iso_code": {"description": "\"ISO-3166 Country Code (alpha_3)\"", 

[(1,)]