<font size="5">Ingest WDI - "GDP, PPP (current international $)" data into Trino pipeline</font>

In [2]:
# Install the packages this notebook depends on into the kernel's environment.
# The 'capture' magic prevents long outputs from spamming your notebook.
# To enable it, uncomment the line below and move it to the very first line of
# the cell (cell magics are only recognised as the first line of a cell).
#%%capture pipoutput

# For loading predefined environment variables from files
# Typically used to load sensitive access credentials
%pip install python-dotenv

# Standard python package for interacting with S3 buckets
%pip install boto3

# Interacting with Trino and using Trino with sqlalchemy
%pip install trino sqlalchemy sqlalchemy-trino

# Pandas and parquet file i/o
%pip install pandas pyarrow fastparquet

# OS-Climate utilities to make data ingest easier
# NOTE(review): none of the packages above are version-pinned, so a fresh
# environment may resolve different versions than the ones used for the
# recorded outputs.
%pip install osc-ingest-tools

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
from dotenv import dotenv_values, load_dotenv
import osc_ingest_trino as osc
import os
import pathlib

Load Environment Variables

In [4]:
# Locate credentials.env: an explicit CREDENTIAL_DOTENV_DIR wins, otherwise
# fall back to the current working directory (PWD), and finally to the
# default application root.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')

# Only load the file when it is actually present; values in the file take
# precedence over variables that are already set (override=True).
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [5]:
# Target location of the ingested table: catalog.schema.table.
# The catalog must be one that is configured for iceberg.
ingest_catalog = "osc_datacommons_dev"
ingest_schema = "sandbox"
ingest_table = "pcaf_wdi_gdp"

In [6]:
import trino
from sqlalchemy.engine import create_engine

# Prefix of the environment variables that hold the Trino connection settings:
# <prefix>_USER, <prefix>_HOST, <prefix>_PORT and <prefix>_PASSWD.
env_var_prefix = 'TRINO'

sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ[f'{env_var_prefix}_USER'],
    host = os.environ[f'{env_var_prefix}_HOST'],
    port = os.environ[f'{env_var_prefix}_PORT']
)
sqlargs = {
    # The <prefix>_PASSWD variable is passed to JWT authentication as the token.
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    # Reuse the configured ingest catalog instead of repeating its name here:
    # DataFrame.to_sql() later addresses the table as schema.table only, so the
    # engine's default catalog must be the same one the DDL statements target.
    'catalog': ingest_catalog
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

# NOTE(review): presumably attaches the bucket described by the S3_DEV_*
# environment variables - confirm against osc-ingest-tools.
trino_bucket = osc.attach_s3_bucket("S3_DEV")

  res = connection.execute(sql.text(query)).scalar()


In [7]:
import boto3

# Connection settings for the landing S3 service, all taken from the environment.
s3_landing_settings = dict(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
s3_source = boto3.resource(**s3_landing_settings)

# Handle on the landing bucket that holds the raw source files.
landing_bucket_name = os.environ['S3_LANDING_BUCKET']
source_bucket = s3_source.Bucket(landing_bucket_name)

Use the Trino connection opened above (authenticated with JWT) to make sure the target schema exists

In [8]:
# Table creation further down fails in confusing ways when the schema is
# missing, so create it up front if it is not there yet.
sql = f"\ncreate schema if not exists {ingest_catalog}.{ingest_schema}\n"
qres = engine.execute(sql)
schema_result = qres.fetchall()
print(schema_result)

[(True,)]


In [9]:
# Sanity check of the Trino connection: list every schema in the ingest catalog.
schema_read = engine.execute(f'show schemas in {ingest_catalog}')
schema_rows = schema_read.fetchall()
for row in schema_rows:
    print(row)

('default',)
('demo_dv',)
('iceberg_demo',)
('information_schema',)
('pcaf_sovereign_footprint',)
('sandbox',)


Load GDP file (updated sporadically from https://data.worldbank.org/indicator/NY.GDP.MKTP.PP.CD)

In [10]:
import pandas as pd

ticker_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'PCAF-sovereign-footprint/WDI/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv')
ticker_file.download_file(f'/tmp/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv')
#ticker_df = pd.read_csv(f'/tmp/API_NY.GDP.PCAP.PP.CD_DS2_en_csv_v2.csv',sep=",",skiprows=4)
#ticker_df
%run TransposeXLS.py --config WDI.ini --output=WDI.csv 
df = pd.read_csv('WDI.csv')
#df = df[df['country_name'] == 'Germany']
#df_germany = df_germany[['data_provider','country_iso_code','validity_date','attribute','value']]
df= df[['rec_source','data_provider','country_iso_code','country_name','validity_date','attribute','value','unit']].dropna(subset=['value'])
df = df.convert_dtypes()
print(df.info(verbose=True))
df
#df


WDI.ini
file_list:
['/tmp/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv']
/tmp/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv
2
csv
/tmp/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v
<configparser.ConfigParser object at 0x7fb46be55eb0>
['0', '4']
eval_components
['Indicator Name']
['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', 'Unnamed: 65']
Indicator Name
265
265
['country_iso_code', 'country_name', 'attribute', 'unit', 'year', 'value']
['country_iso_code', 'country_name', 'attribute', 'unit', 'value', 'rec_

Unnamed: 0,rec_source,data_provider,country_iso_code,country_name,validity_date,attribute,value,unit
7950,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,ABW,Aruba,1990,"GDP, PPP (current international $)",1447708861.20673,current US$
7951,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,AFE,Africa Eastern and Southern,1990,"GDP, PPP (current international $)",565349520935.696045,current US$
7953,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,AFW,Africa Western and Central,1990,"GDP, PPP (current international $)",354456408577.700012,current US$
7954,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,AGO,Angola,1990,"GDP, PPP (current international $)",38853486198.221001,current US$
7955,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,ALB,Albania,1990,"GDP, PPP (current international $)",8374478544.59225,current US$
...,...,...,...,...,...,...,...,...
16159,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,WSM,Samoa,2020,"GDP, PPP (current international $)",1342987496.78682,current US$
16160,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,XKX,Kosovo,2020,"GDP, PPP (current international $)",20217375007.3176,current US$
16162,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,ZAF,South Africa,2020,"GDP, PPP (current international $)",792398142071.171021,current US$
16163,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,ZMB,Zambia,2020,"GDP, PPP (current international $)",63564551973.879303,current US$


In [11]:
# Derive the "column type" list for the CREATE TABLE statement from the
# dataframe's columns (osc is already imported in the notebook's import cell).
columnschema = osc.create_table_schema_pairs(df)

# Drop any previous version of the table so it is rebuilt from scratch below.
sql = f"""
drop table if exists {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
print(sql)
qres = engine.execute(sql)
print(qres.fetchall())



drop table if exists osc_datacommons_dev.sandbox.pcaf_wdi_gdp

[(True,)]


In [12]:
# Fully qualified name of the table to (re)create.
target_table = f"{ingest_catalog}.{ingest_schema}.{ingest_table}"

# Table definition: columns come from the dataframe-derived schema, data is
# stored as ORC and partitioned by validity_date.
tabledef = f"""
create table if not exists {target_table}(
{columnschema}
) with (
    format = 'ORC',
    partitioning = array['validity_date']
)
"""
print(tabledef)

qres = engine.execute(tabledef)
create_result = qres.fetchall()
print(create_result)


create table if not exists osc_datacommons_dev.sandbox.pcaf_wdi_gdp(
    rec_source varchar,
    data_provider varchar,
    country_iso_code varchar,
    country_name varchar,
    validity_date varchar,
    attribute varchar,
    value double,
    unit varchar
) with (
    format = 'ORC',
    partitioning = array['validity_date']
)

[(True,)]


In [13]:
# Start from an empty table: remove every existing row before inserting.
target_table = f"{ingest_catalog}.{ingest_schema}.{ingest_table}"
sql = f"""
delete from {target_table}
"""
qres = engine.execute(sql)
delete_result = qres.fetchall()
print(delete_result)

[(None,)]


In [14]:
# Read the whole table back to confirm its current contents before inserting.
target_table = f"{ingest_catalog}.{ingest_schema}.{ingest_table}"
sql = f"""
select * from {target_table}
"""
pd.read_sql(sql, engine)


Unnamed: 0,rec_source,data_provider,country_iso_code,country_name,validity_date,attribute,value,unit


In [15]:
print(ingest_catalog)

# NOTE(review): disabled row filter kept from the original notebook -
# presumably a workaround for names containing an apostrophe; confirm whether
# it is still needed.
#df=df.drop(df[df.country_name=="cote d'ivoire"].index)

# Insert in batches of 1000 rows, logging each batch as it is sent.
batch_insert = osc.TrinoBatchInsert(batch_size = 1000, verbose = True)

# The table is addressed as schema.table; the catalog comes from the engine.
df.to_sql(
    ingest_table,
    con=engine,
    schema=ingest_schema,
    if_exists='append',
    index=False,
    method=batch_insert,
)

osc_datacommons_dev
constructed fully qualified table name as: "sandbox.pcaf_wdi_gdp"
inserting 1000 records
  ('API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv', 'WDI', 'ABW', 'Aruba', '1990', 'GDP, PPP (current international $)', 1447708861.20673, 'current US$')
  ('API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv', 'WDI', 'AFE', 'Africa Eastern and Southern', '1990', 'GDP, PPP (current international $)', 565349520935.696, 'current US$')
  ('API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv', 'WDI', 'AFW', 'Africa Western and Central', '1990', 'GDP, PPP (current international $)', 354456408577.7, 'current US$')
  ...
  ('API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv', 'WDI', 'ROU', 'Romania', '1994', 'GDP, PPP (current international $)', 113487054061.437, 'current US$')
batch insert result: [(1000,)]
inserting 1000 records
  ('API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv', 'WDI', 'RUS', 'Russian Federation', '1994', 'GDP, PPP (current international $)', 851031811041.719, 'current US$')
  ('API_NY.GDP.MKTP.PP.CD_DS2_en_cs

In [16]:
# Spot-check the ingest: fetch every row for a single year.
# validity_date is a varchar column in the table definition, so it is
# compared against a string literal.
sql = f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
where validity_date = '2007'
"""
pd.read_sql(sql, engine)


Unnamed: 0,rec_source,data_provider,country_iso_code,country_name,validity_date,attribute,value,unit
0,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,NZL,New Zealand,2007,"GDP, PPP (current international $)",1.240522e+11,current US$
1,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,OED,OECD members,2007,"GDP, PPP (current international $)",4.205440e+13,current US$
2,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,OMN,Oman,2007,"GDP, PPP (current international $)",1.264527e+11,current US$
3,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,OSS,Other small states,2007,"GDP, PPP (current international $)",4.567513e+11,current US$
4,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,PAK,Pakistan,2007,"GDP, PPP (current international $)",6.321089e+11,current US$
...,...,...,...,...,...,...,...,...
234,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,LCN,Latin America & Caribbean,2007,"GDP, PPP (current international $)",6.889557e+12,current US$
235,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,LDC,Least developed countries: UN classification,2007,"GDP, PPP (current international $)",1.464202e+12,current US$
236,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,LIC,Low income,2007,"GDP, PPP (current international $)",6.661773e+11,current US$
237,API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv,WDI,LKA,Sri Lanka,2007,"GDP, PPP (current international $)",1.348335e+11,current US$
