In [None]:
# Ingest country ISO mapping data into the Trino pipeline

In [None]:
Run the following cell if you need to install the required packages into your notebook environment.

In [37]:
# Installs every package this notebook relies on into the kernel's environment.
# No versions are pinned, so each run installs whatever pip currently resolves.
#
# The 'capture' cell magic below keeps long pip output from spamming the
# notebook. It is currently disabled; to use it, uncomment it and make sure it
# is the very first line of the cell.
#%%capture pipoutput

# python-dotenv: loads predefined environment variables from files,
# typically used for sensitive access credentials
%pip install python-dotenv

# boto3: standard Python package for interacting with S3 buckets
%pip install boto3

# trino + sqlalchemy + sqlalchemy-trino: talking to Trino, directly and
# through SQLAlchemy
%pip install trino sqlalchemy sqlalchemy-trino

# pandas plus pyarrow / fastparquet for dataframe and parquet file I/O
%pip install pandas pyarrow fastparquet

# osc-ingest-tools: OS-Climate utilities that make data ingest easier
%pip install osc-ingest-tools

You should consider upgrading via the '/opt/app-root/bin/python3.8 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/opt/app-root/bin/python3.8 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/opt/app-root/bin/python3.8 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/opt/app-root/bin/python3.8 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/opt/app-root/bin/python3.8 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [38]:
from dotenv import dotenv_values, load_dotenv
import osc_ingest_trino as osc
import os
import pathlib

In [None]:
Load Environment Variables

In [39]:
# Locate the directory that holds credentials.env: an explicit
# CREDENTIAL_DOTENV_DIR wins, then the shell's PWD, then a hard-coded default.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')

# A missing file is skipped silently. When the file is present, its values
# replace any variables already set in the environment (override=True).
credentials_file_found = os.path.exists(dotenv_path)
if credentials_file_found:
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [40]:
# Destination of the ingest, addressed as <catalog>.<schema>.<table>.
# The catalog has to be one that is configured for Iceberg.
ingest_catalog = "osc_datacommons_iceberg_dev"
ingest_schema = "pcaf_sovereign_footprint"
ingest_table = "pcaf_sovereign_map_country_iso"

In [41]:
import trino
from sqlalchemy.engine import create_engine

# Prefix of the environment variables that carry the Trino connection settings:
# <prefix>_USER, <prefix>_HOST, <prefix>_PORT and <prefix>_PASSWD.
# A missing variable raises KeyError here rather than failing later.
env_var_prefix = 'TRINO'

# SQLAlchemy URL for the Trino dialect. Only user/host/port go into the URL;
# authentication is supplied separately through connect_args.
sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ[f'{env_var_prefix}_USER'],
    host = os.environ[f'{env_var_prefix}_HOST'],
    port = os.environ[f'{env_var_prefix}_PORT']
)
sqlargs = {
    # The value of <prefix>_PASSWD is handed to JWTAuthentication as the token.
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    # Session catalog. Reuse ingest_catalog instead of repeating the literal:
    # the to_sql() insert only passes a schema, so it lands in this catalog,
    # while the DDL statements are qualified with ingest_catalog. Keeping a
    # single source of truth prevents the two from drifting apart.
    'catalog': ingest_catalog
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

# NOTE(review): presumably attaches the S3 bucket described by the S3_DEV_*
# environment variables - confirm against the osc-ingest-tools documentation.
trino_bucket = osc.attach_s3_bucket("S3_DEV")

In [42]:
import boto3

# Handle on the S3 service that hosts the landing bucket; the endpoint and both
# credentials are read from S3_LANDING_* environment variables.
s3_source = boto3.resource(
    "s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)

# Bucket the source files are read from
landing_bucket_name = os.environ['S3_LANDING_BUCKET']
source_bucket = s3_source.Bucket(landing_bucket_name)

In [None]:
Open a Trino connection using JWT for authentication

In [25]:
# Table creation further down fails in confusing ways when the schema is
# missing, so create it up front; "if not exists" makes a re-run harmless.
sql = f"\ncreate schema if not exists {ingest_catalog}.{ingest_schema}\n"
qres = engine.execute(sql)
create_schema_result = qres.fetchall()
print(create_schema_result)

[(True,)]


In [10]:
# Sanity check of the Trino connection: print every schema in the ingest catalog
show_schemas_sql = f'show schemas in {ingest_catalog}'
schema_read = engine.execute(show_schemas_sql)
for schema_row in schema_read.fetchall():
    print(schema_row)

('aicoe_osc_demo',)
('company_data',)
('default',)
('defaultschema1',)
('demo',)
('eje_test_iceberg',)
('epa_frs',)
('epa_ghgrp',)
('epacems',)
('epacems_y95_al',)
('essd',)
('ghgrp_demo',)
('gleif',)
('gleif_mdt',)
('iceberg_demo',)
('information_schema',)
('ingest_schema',)
('iso3166',)
('itr_mdt',)
('metastore',)
('metastore_iceberg',)
('osc_corp_data',)
('pcaf_sovereign_footprint',)
('physical_risk_project',)
('pudl',)
('rmi_20210929',)
('rmi_20211120',)
('rmi_20220119',)
('rmi_utility_transition_hub',)
('sec_dera',)
('sfi_geoasset',)
('team1',)
('team2',)
('testaccessschema1',)
('testdb',)
('urgentem',)
('us_census',)
('wri',)
('wri_demo',)
('wri_dev',)
('wri_gppd',)
('wri_gppd_md',)
('wri_new',)
('wri_test',)


In [None]:
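Load the country ISO mapping file (`PCAF_map_iso_country.csv`) from the S3 landing bucket. (The earlier description here, "GDP file (updated sporadically from https://data.worldbank.org/indicator/NY.GDP.PCAP.PP.CD)", appears to have been carried over from another notebook and does not describe this file.)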

In [49]:
import pandas as pd

# Copy the country/ISO mapping CSV from the landing bucket to a local file.
ticker_file = s3_source.Object(
    os.environ['S3_LANDING_BUCKET'],
    'PCAF-sovereign-footprint/GENERAL/PCAF_map_iso_country.csv',
)
local_csv_path = '/tmp/PCAF_map_iso_country.csv'
ticker_file.download_file(local_csv_path)

# The file is semicolon-separated and read as Latin-1; convert_dtypes() then
# lets pandas pick its best-fitting (nullable) dtype for every column.
df = pd.read_csv(local_csv_path, sep=";", encoding='latin-1').convert_dtypes()

# Disabled experiment: dropping the rows for ISO codes CI, KP and LA.
#df = df.drop(df[(df.ISO_CODE2=='CI')].index)
#df = df.drop(df[(df.ISO_CODE2=='KP')].index)
#df = df.drop(df[(df.ISO_CODE2=='LA')].index)

df


Unnamed: 0,ISO_CODE2,ISO_CODE3,COUNTRY_NAME
0,AD,AND,Andorra
1,AE,ARE,United Arab Emirates
2,AE,ARE,United Arab Em
3,AF,AFG,Afghanistan
4,AG,ATG,Antigua and Barbuda
...,...,...,...
242,WS,WSM,Samoa
243,YE,YEM,Yemen
244,ZA,ZAF,South Africa
245,ZM,ZMB,Zambia


In [22]:
import osc_ingest_trino as osc

# Column definitions for the CREATE TABLE statement, derived from the dataframe
columnschema = osc.create_table_schema_pairs(df)

# Remove any earlier version of the target table so it can be rebuilt
sql = f"\ndrop table if exists {ingest_catalog}.{ingest_schema}.{ingest_table}\n"
print(sql)
qres = engine.execute(sql)
drop_result = qres.fetchall()
print(drop_result)



drop table if exists osc_datacommons_iceberg_dev.pcaf_sovereign_footprint.pcaf_sovereign_map_country_iso

[(True,)]


In [44]:
# Destination table, fully qualified as catalog.schema.table
target_table = f"{ingest_catalog}.{ingest_schema}.{ingest_table}"

# ORC-formatted table partitioned on the iso_code2 column; the column list is
# the schema text previously derived from the dataframe.
tabledef = f"""
create table if not exists {target_table}(
{columnschema}
) with (
    format = 'ORC',
    partitioning = array['iso_code2']
)
"""
print(tabledef)
qres = engine.execute(tabledef)
create_table_result = qres.fetchall()
print(create_table_result)


create table if not exists osc_datacommons_iceberg_dev.pcaf_sovereign_footprint.pcaf_sovereign_map_country_iso(
    ISO_CODE2 varchar,
    ISO_CODE3 varchar,
    COUNTRY_NAME varchar
) with (
    format = 'ORC',
    partitioning = array['iso_code2']
)

[(True,)]


In [72]:
# Remove every row from the target table so the load starts from an empty table
sql = f"\ndelete from {ingest_catalog}.{ingest_schema}.{ingest_table}\n"
qres = engine.execute(sql)
delete_result = qres.fetchall()
print(delete_result)

TrinoUserError: TrinoUserError(type=USER_ERROR, name=TABLE_NOT_FOUND, message="line 2:1: Table 'osc_datacommons_iceberg_dev.pcaf_sovereign_footprint.pcaf_sovereign_wdi' does not exist", query_id=20220321_153522_00458_s473c)

In [46]:
# Show what the target table currently contains
sql = f"\nselect * from {ingest_catalog}.{ingest_schema}.{ingest_table}\n"
pd.read_sql(sql, engine)


Unnamed: 0,iso_code2,iso_code3,country_name
0,CK,COK,Cook Islands
1,ER,ERI,Eritrea
2,GT,GTM,Guatemala
3,AG,ATG,Antigua and Barbuda
4,LC,LCA,Saint Lucia
...,...,...,...
147,KI,KIR,Kiribati
148,MK,MKD,Macedonia
149,CG,COG,Congo Rep
150,CU,CUB,Cuba


In [47]:
print(ingest_catalog)

# Disabled experiment: dropping the row whose country name is "cote d'ivoire".
#df=df.drop(df[df.country_name=="cote d'ivoire"].index)

# Insert one dataframe row per statement (batch_size = 1) and log each batch.
# NOTE(review): the recorded output of this cell ends in a COLUMN_NOT_FOUND
# error for a value containing an apostrophe; presumably the batch-insert
# helper did not quote such values - confirm with the installed
# osc-ingest-tools version before relying on a complete load.
batch_insert = osc.TrinoBatchInsert(batch_size = 1, verbose = True)

# Append to the existing table; only the schema is passed, so the catalog is
# the one the engine's session was opened with.
df.to_sql(
    ingest_table,
    con=engine,
    schema=ingest_schema,
    if_exists='append',
    index=False,
    method=batch_insert,
)

osc_datacommons_iceberg_dev
inserting 1 records
constructed fully qualified table name as: "pcaf_sovereign_footprint.pcaf_sovereign_map_country_iso"
batch insert result: [(1,)]
inserting 1 records
constructed fully qualified table name as: "pcaf_sovereign_footprint.pcaf_sovereign_map_country_iso"
batch insert result: [(1,)]
inserting 1 records
constructed fully qualified table name as: "pcaf_sovereign_footprint.pcaf_sovereign_map_country_iso"
batch insert result: [(1,)]
inserting 1 records
constructed fully qualified table name as: "pcaf_sovereign_footprint.pcaf_sovereign_map_country_iso"
batch insert result: [(1,)]
inserting 1 records
constructed fully qualified table name as: "pcaf_sovereign_footprint.pcaf_sovereign_map_country_iso"
batch insert result: [(1,)]
inserting 1 records
constructed fully qualified table name as: "pcaf_sovereign_footprint.pcaf_sovereign_map_country_iso"
batch insert result: [(1,)]
inserting 1 records
constructed fully qualified table name as: "pcaf_sovereign

TrinoUserError: TrinoUserError(type=USER_ERROR, name=COLUMN_NOT_FOUND, message="line 2:15: Column 'cote d'ivoire' cannot be resolved", query_id=20220322_142239_00158_iechj)

In [48]:
# Read the target table back to check what the insert actually loaded
sql = f"\nselect * from {ingest_catalog}.{ingest_schema}.{ingest_table}"
pd.read_sql(sql, engine)


Unnamed: 0,iso_code2,iso_code3,country_name
0,LY,LBY,Libya
1,LI,LIE,Liechtenstein
2,IN,IND,India
3,GW,GNB,Guinea-Bissau
4,HN,HND,Honduras
...,...,...,...
191,AO,AGO,Angola
192,GA,GAB,Gabon
193,CG,COG,"Congo, Rep."
194,DK,DNK,Denmark
