<font size="5">Ingest WDI - "Population" data into Trino pipeline</font>

In [1]:
from pcaf_env import *
import shutil
import openpyxl
import ParseXLS as parser

pcaf_table_prefix = ''
ingest_table = 'sf_wdi_population'
models_table = 'pcaf_dbt_models'

using connect string: trino://MichaelTiemannOSC@trino-secure-odh-trino.apps.odh-cl2.apps.os-climate.org:443/osc_datacommons_dev
create schema if not exists osc_datacommons_dev.mdt_sandbox


Load Environment Variables

In [2]:
# Show available schemas to ensure trino connection is set correctly
schema_read = engine.execute(f'show schemas in {ingest_catalog}')
for row in schema_read.fetchall():
    print(row)

('aicoe_osc_demo_results',)
('default',)
('demo_dv',)
('dera',)
('essd',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('rmi',)
('sandbox',)
('wri_gppd',)


Load GDP file (updated sporadically from https://data.worldbank.org/indicator/NY.GDP.PCAP.PP.CD)

In [3]:
## GDP
pop_data_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'PCAF-sovereign-footprint/WDI/Population_Data.csv')
pop_data_file.download_file(f'/tmp/Population_Data.csv')

df = parser.process('WDI_population.ini','WDI_population.csv') 

# year values are delivered twice . remove the unnecessary part

df[['validity_date', 'column_to_drop']] = df['validity_date'].str.split(' ', expand=True)

df.drop('column_to_drop', axis=1, inplace=True)


## GDP PPP

#df=df.astype({'validity_date': 'int32'})
df= df.convert_dtypes()
df.info(verbose=True)
df= df[['rec_source','data_provider','country_iso_code','country_name','validity_date','attribute','value','value_units']].dropna(subset=['value'])
#df = df.convert_dtypes()
df.info(verbose=True)

#df


WDI_population.ini
file_list:
['/tmp/Population_Data.csv']
/tmp/Population_Data.csv
2
csv
/tmp/Population_Dat
<configparser.ConfigParser object at 0x7fd5b7765ac0>
           Series Name  Series Code   
0    Population, total  SP.POP.TOTL  \
1    Population, total  SP.POP.TOTL   
2    Population, total  SP.POP.TOTL   
3    Population, total  SP.POP.TOTL   
4    Population, total  SP.POP.TOTL   
..                 ...          ...   
261  Population, total  SP.POP.TOTL   
262  Population, total  SP.POP.TOTL   
263  Population, total  SP.POP.TOTL   
264  Population, total  SP.POP.TOTL   
265  Population, total  SP.POP.TOTL   

                                   Country Name Country Code 1990 [YR1990]   
0                                   Afghanistan          AFG      12412311  \
1                                       Albania          ALB       3286542   
2                                       Algeria          DZA      25758872   
3                                American Samoa         

In [4]:
#%run -i TransposeXLS.py --config WDI.ini --output=WDI.csv 

In [6]:
try:
    os.mkdir("/opt/app-root/src/PCAF-sovereign-footprint/dbt/pcaf_transform/models", mode=0o755)
except FileExistsError:
    pass
shutil.rmtree("/opt/app-root/src/PCAF-sovereign-footprint/dbt/pcaf_transform/models/.ipynb_checkpoints", ignore_errors=True)

In [13]:
custom_meta_content = {
    'data provider': df[['data_provider']],
    'title': 'World Bank GDP per capita, Population',
    'author': 'International Comparison Program, World Bank | World Development Indicators database, World Bank | Eurostat-OECD PPP Programme.',
    'contact': 'data@worldbank.org',
    'description': json.dumps("""
Total population is a mid-year population based on the de facto definition of population, which counts all residents regardless of legal status or citizenship."""),
    'release_date': '2022-01-01 00:00:00',
    # How should we describe our transformative step here?
}

def description_is (s):
    return { 'Description': json.dumps(s)}

custom_meta_fields = {}
custom_meta_fields['rec_source'] = description_is("Population_Data.csv")
custom_meta_fields['data_provider'] = description_is("WDI")
custom_meta_fields['country_iso_code'] = description_is("ISO-3166 Country Code (alpha_3)")
custom_meta_fields['country_name'] = description_is("ISO-3166 Country Name")
custom_meta_fields['attribute'] = description_is("Population, total")
custom_meta_fields['value'] = description_is("Population")
custom_meta_fields['value_units'] = description_is("People, whether citizens or not")
custom_meta_fields['validity_date'] = description_is("Year of measurement")

for f in custom_meta_fields:
    if 'ISO' in custom_meta_fields[f]['Description']:
        custom_meta_fields[f]['tags'] = ['ISO']
    elif 'IPCC' in custom_meta_fields[f]['Description']:
        custom_meta_fields[f]['tags'] = ['IPCC']
    elif 'WDI' in custom_meta_fields[f]['Description']:
        custom_meta_fields[f]['tags'] = ['WDI']
    elif f == 'year':
        custom_meta_fields[f]['tags'] = ['annual']

rec_source
data_provider
country_iso_code
country_name
attribute
value
value_units
validity_date


In [8]:
create_trino_table_and_dbt_metadata(ingest_table, df, ['validity_date'], custom_meta_content, custom_meta_fields, verbose=True)

drop table if exists mdt_sandbox.sf_wdi_population_source
enforcing dataframe partition column order

verifying existence of table osc_datacommons_dev.mdt_sandbox.sf_wdi_population_source
create table if not exists osc_datacommons_dev.mdt_sandbox.sf_wdi_population_source (
    rec_source varchar,
    data_provider varchar,
    country_iso_code varchar,
    country_name varchar,
    attribute varchar,
    value varchar,
    value_units varchar,
    validity_date varchar
) with (
    format = 'parquet',
    partitioning = array['validity_date']
)

staging dataframe parquet to s3 osc-datacommons-s3-bucket-dev02
/tmp/ingest_temp_e560117a/validity_date=1990/bafc673a9a23494e8539a9196bc99265-0.parquet  -->  trino/ingest/ingest_temp_e560117a/validity_date=1990/bafc673a9a23494e8539a9196bc99265-0.parquet
/tmp/ingest_temp_e560117a/validity_date=2000/bafc673a9a23494e8539a9196bc99265-0.parquet  -->  trino/ingest/ingest_temp_e560117a/validity_date=2000/bafc673a9a23494e8539a9196bc99265-0.parquet
/tmp

In [9]:
print(f"ingest_table = {ingest_table}")

sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}_source
where country_iso_code='BHS' order by validity_date desc"""
pd.read_sql(sql, engine)

ingest_table = sf_wdi_population


Unnamed: 0,rec_source,data_provider,country_iso_code,country_name,attribute,value,value_units,validity_date
0,Population_Data.csv,WDI,BHS,"Bahamas, The","Population, total",396914,,2021
1,Population_Data.csv,WDI,BHS,"Bahamas, The","Population, total",393248,,2020
2,Population_Data.csv,WDI,BHS,"Bahamas, The","Population, total",389486,,2019
3,Population_Data.csv,WDI,BHS,"Bahamas, The","Population, total",385635,,2018
4,Population_Data.csv,WDI,BHS,"Bahamas, The","Population, total",381749,,2017
5,Population_Data.csv,WDI,BHS,"Bahamas, The","Population, total",377923,,2016
6,Population_Data.csv,WDI,BHS,"Bahamas, The","Population, total",374200,,2015
7,Population_Data.csv,WDI,BHS,"Bahamas, The","Population, total",370625,,2014
8,Population_Data.csv,WDI,BHS,"Bahamas, The","Population, total",367162,,2013
9,Population_Data.csv,WDI,BHS,"Bahamas, The","Population, total",363581,,2012


In [16]:
osc._do_sql(f"insert into {ingest_schema}.{models_table} values(\'{json.dumps(dbt_dict['models'])}\')", engine, verbose=True)

insert into mdt_sandbox.pcaf_dbt_models values('{"sf_wdi_population": {"description": "\"\\nTotal population is a mid-year population based on the de facto definition of population, which counts all residents regardless of legal status or citizenship.\"", "columns": {"rec_source": {"description": "\"Population_Data.csv\""}, "data_provider": {"description": "\"WDI\"", "tags": ["WDI"]}, "country_iso_code": {"description": "\"ISO-3166 Country Code (alpha_3)\"", "tags": ["ISO"]}, "country_name": {"description": "\"ISO-3166 Country Name\"", "tags": ["ISO"]}, "attribute": {"description": "\"Population, total\""}, "value": {"description": "\"Population\""}, "value_units": {"description": "\"People, whether citizens or not\""}, "validity_date": {"description": "\"Year of measurement\""}}}}')
[(1,)]


[(1,)]