<font size="5">OECD CO2 emissions into Trino pipeline</font>

In [1]:
# 'capture' magic prevents long outputs from spamming your notebook
#%%capture pipoutput

# For loading predefined environment variables from files
# Typically used to load sensitive access credentials
%pip install python-dotenv

# Standard python package for interacting with S3 buckets
%pip install boto3

# Interacting with Trino and using Trino with sqlalchemy
%pip install trino sqlalchemy sqlalchemy-trino

# Pandas and parquet file i/o
%pip install pandas pyarrow fastparquet

# OS-Climate utilities to make data ingest easier
%pip install osc-ingest-tools
%pip install country_converter --upgrade
%pip install pint-pandas
%pip install openscm-units
%pip install pint

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import dotenv_values, load_dotenv
import osc_ingest_trino as osc
import os
import pathlib

Load Environment Variables

In [3]:
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path,override=True)

In [4]:
# use a catalog that is configured for iceberg
ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'pcaf_sovereign_footprint'
ingest_table = 'oecd_co2_emissions'

In [5]:
import trino
from sqlalchemy.engine import create_engine

env_var_prefix = 'TRINO'

sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ[f'{env_var_prefix}_USER'],
    host = os.environ[f'{env_var_prefix}_HOST'],
    port = os.environ[f'{env_var_prefix}_PORT']
)
sqlargs = {
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    'catalog': 'osc_datacommons_dev'
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

trino_bucket = osc.attach_s3_bucket("S3_DEV")

  res = connection.execute(sql.text(query))


In [6]:
import boto3

s3_source = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
source_bucket = s3_source.Bucket(os.environ['S3_LANDING_BUCKET'])

Open a Trino connection using JWT for authentication

In [7]:
# make sure schema exists, or table creation below will fail in weird ways
sql = f"""
create schema if not exists {ingest_catalog}.{ingest_schema}
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(True,)]


In [8]:
# Show available schemas to ensure trino connection is set correctly
schema_read = engine.execute(f'show schemas in {ingest_catalog}')
for row in schema_read.fetchall():
    print(row)

('default',)
('demo_dv',)
('iceberg_demo',)
('information_schema',)
('pcaf_sovereign_footprint',)
('sandbox',)


Load CO2 emissions file (updated from https://stats.oecd.org/Index.aspx?DataSetCode=IO_GHG_2021)

In [9]:
import pandas as pd
import ParseXLS as parser

ticker_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'PCAF-sovereign-footprint/OECD/OECD_FD_C02.xlsx')
ticker_file.download_file(f'/tmp/OECD_FD_C02.xlsx')
#ticker_df = pd.read_csv(f'/tmp/API_NY.GDP.PCAP.PP.CD_DS2_en_csv_v2.csv',sep=",",skiprows=4)
#ticker_df

df = parser.process('OECD.ini','OECD.csv') 
df=df.astype({'validity_date': 'int32'}).astype({'country_iso_code': 'str'})
df= df.convert_dtypes()
df[['col1','country_name']] = df['country_name'].str.split(':', expand=True)
df.drop('col1', axis=1, inplace=True)
df= df[['rec_source','data_provider','country_iso_code','country_name','validity_date','attribute','value','value_units']].dropna(subset=['value'])
#df = df.convert_dtypes()
#print(df.info(verbose=True))



Country not found in regex
WLD: World not found in regex
OECD: OECD member countries not found in regex
NONOECD: Non-OECD economies and aggregates not found in regex
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China


OECD.ini
file_list:
['/tmp/OECD_FD_C02.xlsx']
/tmp/OECD_FD_C02.xlsx
2
xls
/tmp/OECD_FD_C0
<configparser.ConfigParser object at 0x7ff9f02c5760>
['0', '6']
86


More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular expression match for HKG: Hong Kong, China
More then one regular exp

86
list columns
['country_iso_code', 'country_name', 'attribute', 'units', 'year', 'value']
['country_iso_code', 'country_name', 'attribute', 'value_units', 'value', 'rec_source', 'data_provider', 'validity_date']


In [10]:
#%run -i TransposeXLS.py --config WDI.ini --output=WDI.csv 

In [11]:
import osc_ingest_trino as osc
columnschema = osc.create_table_schema_pairs(df) 

sql = f"""
drop table if exists {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
print(sql)
qres = engine.execute(sql)
print(qres.fetchall())



drop table if exists osc_datacommons_dev.pcaf_sovereign_footprint.oecd_co2_emissions

[(True,)]


In [12]:
print(ingest_catalog)
#df=df.drop(df[df.country_name=="cote d'ivoire"].index)
df.to_sql(ingest_table,
           con=engine,
           schema=ingest_schema,
           if_exists='append',
           index=False,
           method=osc.TrinoBatchInsert(batch_size = 1000, verbose = True))

osc_datacommons_dev
constructed fully qualified table name as: "pcaf_sovereign_footprint.oecd_co2_emissions"
inserting 1000 records
  ('OECD_FD_C02.xlsx', 'OECD', 'AUS', ' Australia', 1995, 'FD_CO2\: CO2 emissions embodied in domestic final demand, by source country and industry', 278.026, 'Tonnes, Millions')
  ('OECD_FD_C02.xlsx', 'OECD', 'AUT', ' Austria', 1995, 'FD_CO2\: CO2 emissions embodied in domestic final demand, by source country and industry', 85.139, 'Tonnes, Millions')
  ('OECD_FD_C02.xlsx', 'OECD', 'BEL', ' Belgium', 1995, 'FD_CO2\: CO2 emissions embodied in domestic final demand, by source country and industry', 129.903, 'Tonnes, Millions')
  ...
  ('OECD_FD_C02.xlsx', 'OECD', 'EST', ' Estonia', 2010, 'FD_CO2\: CO2 emissions embodied in domestic final demand, by source country and industry', 14.14, 'Tonnes, Millions')
batch insert result: [(1000,)]
inserting 584 records
  ('OECD_FD_C02.xlsx', 'OECD', 'FIN', ' Finland', 2010, 'FD_CO2\: CO2 emissions embodied in domestic f

In [13]:
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}" + "where validity_date=2007"""
pd.read_sql(sql, engine)


Unnamed: 0,rec_source,data_provider,country_iso_code,country_name,validity_date,attribute,value,value_units
0,OECD_FD_C02.xlsx,OECD,AUS,Australia,2007,FD_CO2: CO2 emissions embodied in domestic fin...,419.981,"Tonnes, Millions"
1,OECD_FD_C02.xlsx,OECD,AUT,Austria,2007,FD_CO2: CO2 emissions embodied in domestic fin...,98.034,"Tonnes, Millions"
2,OECD_FD_C02.xlsx,OECD,BEL,Belgium,2007,FD_CO2: CO2 emissions embodied in domestic fin...,138.776,"Tonnes, Millions"
3,OECD_FD_C02.xlsx,OECD,CAN,Canada,2007,FD_CO2: CO2 emissions embodied in domestic fin...,581.650,"Tonnes, Millions"
4,OECD_FD_C02.xlsx,OECD,CHL,Chile,2007,FD_CO2: CO2 emissions embodied in domestic fin...,69.470,"Tonnes, Millions"
...,...,...,...,...,...,...,...,...
61,OECD_FD_C02.xlsx,OECD,ZAF,South Africa,2007,FD_CO2: CO2 emissions embodied in domestic fin...,287.801,"Tonnes, Millions"
62,OECD_FD_C02.xlsx,OECD,TWN,Chinese Taipei,2007,FD_CO2: CO2 emissions embodied in domestic fin...,194.809,"Tonnes, Millions"
63,OECD_FD_C02.xlsx,OECD,THA,Thailand,2007,FD_CO2: CO2 emissions embodied in domestic fin...,194.272,"Tonnes, Millions"
64,OECD_FD_C02.xlsx,OECD,TUN,Tunisia,2007,FD_CO2: CO2 emissions embodied in domestic fin...,22.176,"Tonnes, Millions"
