In [None]:
# Ingest WDI - "GDP per capita" data into Trino pipeline

In [None]:
Run these in a notebook cell if you need to install onto your nb env

In [2]:
# 'capture' magic prevents long outputs from spamming your notebook
#%%capture pipoutput

# For loading predefined environment variables from files
# Typically used to load sensitive access credentials
%pip install python-dotenv

# Standard python package for interacting with S3 buckets
%pip install boto3

# Interacting with Trino and using Trino with sqlalchemy
%pip install trino sqlalchemy sqlalchemy-trino

# Pandas and parquet file i/o
%pip install pandas pyarrow fastparquet

# OS-Climate utilities to make data ingest easier
%pip install osc-ingest-tools

You should consider upgrading via the '/opt/app-root/bin/python3.8 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/opt/app-root/bin/python3.8 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Collecting sqlalchemy-trino
  Downloading sqlalchemy_trino-0.4.1-py3-none-any.whl (14 kB)
Installing collected packages: sqlalchemy-trino
Successfully installed sqlalchemy-trino-0.4.1
You should consider upgrading via the '/opt/app-root/bin/python3.8 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Collecting fastparquet
  Downloading fastparquet-0.8.0-cp38-cp38-manylinux2010_x86_64.whl (1.6 MB)
     |████████████████████████████████| 1.6 MB 32.5 MB/s            
Collecting cramjam>=2.3.0
  Downloading cramjam-2.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.6 MB)
    

In [6]:
from dotenv import dotenv_values, load_dotenv
import osc_ingest_trino as osc
import os
import pathlib

In [5]:
Load Environment Variables

SyntaxError: invalid syntax (315529106.py, line 1)

In [7]:
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path,override=True)

In [8]:
# use a catalog that is configured for iceberg
ingest_catalog = 'osc_datacommons_iceberg_dev'
ingest_schema = 'pcaf_sovereign_footprint'
ingest_table = 'pcaf_sovereign_wdi'

In [9]:
import trino
from sqlalchemy.engine import create_engine

env_var_prefix = 'TRINO'

sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ[f'{env_var_prefix}_USER'],
    host = os.environ[f'{env_var_prefix}_HOST'],
    port = os.environ[f'{env_var_prefix}_PORT']
)
sqlargs = {
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    'catalog': 'osc_datacommons_iceberg_dev'
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

trino_bucket = osc.attach_s3_bucket("S3_DEV")

In [10]:
import boto3

s3_source = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
source_bucket = s3_source.Bucket(os.environ['S3_LANDING_BUCKET'])

In [None]:
Open a Trino connection using JWT for authentication

In [25]:
# make sure schema exists, or table creation below will fail in weird ways
sql = f"""
create schema if not exists {ingest_catalog}.{ingest_schema}
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(True,)]


In [10]:
# Show available schemas to ensure trino connection is set correctly
schema_read = engine.execute(f'show schemas in {ingest_catalog}')
for row in schema_read.fetchall():
    print(row)

('aicoe_osc_demo',)
('company_data',)
('default',)
('defaultschema1',)
('demo',)
('eje_test_iceberg',)
('epa_frs',)
('epa_ghgrp',)
('epacems',)
('epacems_y95_al',)
('essd',)
('ghgrp_demo',)
('gleif',)
('gleif_mdt',)
('iceberg_demo',)
('information_schema',)
('ingest_schema',)
('iso3166',)
('itr_mdt',)
('metastore',)
('metastore_iceberg',)
('osc_corp_data',)
('pcaf_sovereign_footprint',)
('physical_risk_project',)
('pudl',)
('rmi_20210929',)
('rmi_20211120',)
('rmi_20220119',)
('rmi_utility_transition_hub',)
('sec_dera',)
('sfi_geoasset',)
('team1',)
('team2',)
('testaccessschema1',)
('testdb',)
('urgentem',)
('us_census',)
('wri',)
('wri_demo',)
('wri_dev',)
('wri_gppd',)
('wri_gppd_md',)
('wri_new',)
('wri_test',)


In [None]:
Load GDP file (updated sporadically from https://data.worldbank.org/indicator/NY.GDP.PCAP.PP.CD)

In [20]:
import pandas as pd

ticker_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'PCAF-sovereign-footprint/WDI/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv')
ticker_file.download_file(f'/tmp/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv')
#ticker_df = pd.read_csv(f'/tmp/API_NY.GDP.PCAP.PP.CD_DS2_en_csv_v2.csv',sep=",",skiprows=4)
#ticker_df
%run TransposeXLS.py --config WDI.ini --output=WDI.csv 
df = pd.read_csv('WDI.csv')
#df = df[df['country_name'] == 'Germany']
#df_germany = df_germany[['data_provider','country_iso_code','validity_date','attribute','value']]
df= df[['rec_source','data_provider','country_iso_code','country_name','validity_date','attribute','value']].dropna(subset=['value'])
df = df.convert_dtypes()
print(df.info(verbose=True))
df
#df


WDI.ini
file_list:
['/tmp/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv']
/tmp/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv
2
csv
/tmp/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v
<configparser.ConfigParser object at 0x7fd007734f70>
['0', '4']
['0', '4']
eval_components
['Indicator Name']
['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', 'Unnamed: 65']
Indicator Name
265
265
['country_iso_code', 'country_name', 'attribute', 'year', 'value']
['country_iso_code', 'country_name', 'attribute', 'value', 'rec_sourc

Unnamed: 0,rec_source,data_provider,country_iso_code,country_name,validity_date,attribute,value
7950,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,ABW,Aruba,1990,"GDP, PPP (current international $)",1447708861.20673
7951,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,AFE,Africa Eastern and Southern,1990,"GDP, PPP (current international $)",565349520935.696045
7953,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,AFW,Africa Western and Central,1990,"GDP, PPP (current international $)",354456408577.700012
7954,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,AGO,Angola,1990,"GDP, PPP (current international $)",38853486198.221001
7955,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,ALB,Albania,1990,"GDP, PPP (current international $)",8374478544.59225
...,...,...,...,...,...,...,...
16159,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,WSM,Samoa,2020,"GDP, PPP (current international $)",1342987496.78682
16160,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,XKX,Kosovo,2020,"GDP, PPP (current international $)",20217375007.3176
16162,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,ZAF,South Africa,2020,"GDP, PPP (current international $)",792398142071.171021
16163,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,ZMB,Zambia,2020,"GDP, PPP (current international $)",63564551973.879303


In [36]:
import osc_ingest_trino as osc
columnschema = osc.create_table_schema_pairs(df) 

sql = f"""
drop table if exists {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
print(sql)
qres = engine.execute(sql)
print(qres.fetchall())



drop table if exists osc_datacommons_iceberg_dev.pcaf_sovereign_footprint.pcaf_sovereign_values

[(True,)]


In [73]:
tabledef = f"""
create table if not exists {ingest_catalog}.{ingest_schema}.{ingest_table}(
{columnschema}
) with (
    format = 'ORC',
    partitioning = array['validity_date']
)
"""
print(tabledef)
qres = engine.execute(tabledef)
print(qres.fetchall())


create table if not exists osc_datacommons_iceberg_dev.pcaf_sovereign_footprint.pcaf_sovereign_wdi(
    rec_source varchar,
    data_provider varchar,
    country_iso_code varchar,
    country_name varchar,
    validity_date varchar,
    attribute varchar,
    value double
) with (
    format = 'ORC',
    partitioning = array['validity_date']
)

[(True,)]


In [15]:
# Delete all data from our db, so we start with empty table
sql=f"""
delete from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(None,)]


In [16]:
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
pd.read_sql(sql, engine)


Unnamed: 0,rec_source,data_provider,country_iso_code,country_name,validity_date,attribute,value


In [17]:
print(ingest_catalog)
#df=df.drop(df[df.country_name=="cote d'ivoire"].index)
df.to_sql(ingest_table,
           con=engine,
           schema=ingest_schema,
           if_exists='append',
           index=False,
           method=osc.TrinoBatchInsert(batch_size = 1000, verbose = True))

osc_datacommons_iceberg_dev
inserting 1000 records
constructed fully qualified table name as: "pcaf_sovereign_footprint.pcaf_sovereign_wdi"
batch insert result: [(1000,)]
inserting 1000 records
constructed fully qualified table name as: "pcaf_sovereign_footprint.pcaf_sovereign_wdi"
batch insert result: [(1000,)]
inserting 1000 records
constructed fully qualified table name as: "pcaf_sovereign_footprint.pcaf_sovereign_wdi"
batch insert result: [(1000,)]
inserting 1000 records
constructed fully qualified table name as: "pcaf_sovereign_footprint.pcaf_sovereign_wdi"
batch insert result: [(1000,)]
inserting 1000 records
constructed fully qualified table name as: "pcaf_sovereign_footprint.pcaf_sovereign_wdi"
batch insert result: [(1000,)]
inserting 1000 records
constructed fully qualified table name as: "pcaf_sovereign_footprint.pcaf_sovereign_wdi"
batch insert result: [(1000,)]
inserting 1000 records
constructed fully qualified table name as: "pcaf_sovereign_footprint.pcaf_sovereign_wdi"
ba

In [18]:
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}" + "where validity_date='2020'"""
pd.read_sql(sql, engine)


Unnamed: 0,rec_source,data_provider,country_iso_code,country_name,validity_date,attribute,value
0,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,AFE,Africa Eastern and Southern,2020,"GDP, PPP (current international $)",2.495345e+12
1,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,AFG,Afghanistan,2020,"GDP, PPP (current international $)",8.091834e+10
2,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,AFW,Africa Western and Central,2020,"GDP, PPP (current international $)",1.946297e+12
3,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,AGO,Angola,2020,"GDP, PPP (current international $)",2.118373e+11
4,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,ALB,Albania,2020,"GDP, PPP (current international $)",3.813832e+10
...,...,...,...,...,...,...,...
232,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,WSM,Samoa,2020,"GDP, PPP (current international $)",1.342987e+09
233,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,XKX,Kosovo,2020,"GDP, PPP (current international $)",2.021738e+10
234,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,ZAF,South Africa,2020,"GDP, PPP (current international $)",7.923981e+11
235,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,ZMB,Zambia,2020,"GDP, PPP (current international $)",6.356455e+10
