<font size="5">OECD CO2 emissions into Trino pipeline</font>

In [37]:
# The 'capture' cell magic keeps long installer output out of the notebook.
# It is left commented out here; to enable it, it has to be the very first
# line of the cell.
#%%capture pipoutput

# NOTE(review): none of the installs below pin a version, so a fresh
# environment may resolve different releases than the ones this notebook was
# developed against - consider pinning for reproducibility.

# For loading predefined environment variables from files
# Typically used to load sensitive access credentials
%pip install python-dotenv

# Standard python package for interacting with S3 buckets
%pip install boto3

# Interacting with Trino and using Trino with sqlalchemy
%pip install trino sqlalchemy sqlalchemy-trino

# Pandas and parquet file i/o
%pip install pandas pyarrow fastparquet

# OS-Climate utilities to make data ingest easier
%pip install osc-ingest-tools
# Country lookup and unit-handling packages.
# NOTE(review): of these, only pycountry is visibly imported in this part of
# the notebook - confirm the others are still needed.
%pip install country_converter --upgrade
%pip install pint-pandas
%pip install openscm-units
%pip install pint
%pip install pycountry


^C
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting pycountry
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m288.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.t

Load Environment Variables

In [38]:
import os
import pathlib

import osc_ingest_trino as osc
from dotenv import dotenv_values, load_dotenv

# Locate credentials.env: an explicit CREDENTIAL_DOTENV_DIR wins, then the
# shell's PWD, and finally the default application root.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'

# override=True: values from the file replace variables that are already set.
if dotenv_path.exists():
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [39]:
# use a catalog that is configured for iceberg
# Target of the ingest; later cells combine these three names into the fully
# qualified table name {ingest_catalog}.{ingest_schema}.{ingest_table}.
ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'pcaf_sovereign_footprint'
ingest_table = 'sf_oecd_imgr_fco2'

In [40]:
import trino
from sqlalchemy.engine import create_engine

# Prefix of the environment variables that hold the Trino connection settings
# (<prefix>_USER, <prefix>_HOST, <prefix>_PORT and <prefix>_PASSWD).
env_var_prefix = 'TRINO'

sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ[f'{env_var_prefix}_USER'],
    host = os.environ[f'{env_var_prefix}_HOST'],
    port = os.environ[f'{env_var_prefix}_PORT']
)
sqlargs = {
    # The value of <prefix>_PASSWD is passed to Trino as the JWT token.
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    # Reuse the catalog chosen in the configuration cell instead of repeating
    # the literal, so the connection and the ingest target cannot drift apart.
    'catalog': ingest_catalog
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

# Attach the S3 bucket described by the S3_DEV_* environment variables.
# NOTE(review): presumably this is the bucket backing the Trino catalog - confirm.
trino_bucket = osc.attach_s3_bucket("S3_DEV")

  res = connection.execute(sql.text(query))


In [41]:
import boto3

# Handle on the landing-zone object store; endpoint and credentials are read
# from the S3_LANDING_* environment variables.
s3_source = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)

# Bucket that holds the raw source files.
landing_bucket_name = os.environ['S3_LANDING_BUCKET']
source_bucket = s3_source.Bucket(landing_bucket_name)

Make sure the target schema exists (using the Trino connection opened above with JWT authentication)

In [34]:
from sqlalchemy import text

# make sure schema exists, or table creation below will fail in weird ways
sql = f"""
create schema if not exists {ingest_catalog}.{ingest_schema}
"""
# Engine.execute() is deprecated in SQLAlchemy 1.4 and removed in 2.0, so the
# statement is run on an explicit, short-lived connection that is closed as
# soon as the result has been read.
with engine.connect() as conn:
    qres = conn.execute(text(sql))
    print(qres.fetchall())

[(True,)]


In [35]:
from sqlalchemy import text

# Show available schemas to ensure trino connection is set correctly
# (Engine.execute() is gone in SQLAlchemy 2.0, hence the explicit connection.)
with engine.connect() as conn:
    schema_read = conn.execute(text(f'show schemas in {ingest_catalog}'))
    for row in schema_read.fetchall():
        print(row)

('default',)
('demo_dv',)
('iceberg_demo',)
('information_schema',)
('pcaf_sovereign_footprint',)
('sandbox',)


Load CO2 emissions file (updated from https://stats.oecd.org/Index.aspx?DataSetCode=IO_GHG_2021)

In [42]:
import pandas as pd
import csv
import ParseXLS as parser
# NOTE(review): csv and ParseXLS are not used by the code in this cell.


# Download the OECD extract from the landing bucket to local scratch space.
oecd_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'PCAF-sovereign-footprint/OECD/IMGR_FCO2.csv')
oecd_csv_path = '/tmp/IMGR_FCO2.csv'
oecd_file.download_file(oecd_csv_path)

# Columns kept from the raw file, in the order they are read.
source_columns = ['Indicator','COU','Country','PAR','IND','Industry','TIME','Unit','PowerCode','Value']

# Raw OECD column name -> column name used in the ingested table.
# (Deliberately not called `dict`: that would shadow the builtin for every
# cell that runs afterwards.)
column_renames = {'Indicator': 'attribute',
                  'COU': 'country_iso_code',
                  'Country': 'country_name',
                  'PAR': 'partner_iso_code',
                  'IND': 'industry_code',
                  'Industry' : 'industry_name',
                  'TIME'   : 'validity_date',
                  'Value' : 'value' }

# Built as one chain from the file on disk, so re-running the cell always
# produces the same frame instead of mutating a previous result in place.
df = (
    pd.read_csv(oecd_csv_path)
    .loc[:, source_columns]
    .convert_dtypes()
    .rename(columns=column_renames)
    # Combine magnitude and unit into one label, e.g. 'Millions Tonnes'.
    .assign(value_units=lambda frame: frame['PowerCode'] + ' ' + frame['Unit'])
    .drop(columns=['PowerCode', 'Unit'])
)

df
         


        






Unnamed: 0,attribute,country_iso_code,country_name,partner_iso_code,industry_code,industry_name,validity_date,value,value_units
0,Foreign CO2 emissions embodied in gross imports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1995,1518.407,Millions Tonnes
1,Foreign CO2 emissions embodied in gross imports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1996,1447.641,Millions Tonnes
2,Foreign CO2 emissions embodied in gross imports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1997,1434.138,Millions Tonnes
3,Foreign CO2 emissions embodied in gross imports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1998,1620.643,Millions Tonnes
4,Foreign CO2 emissions embodied in gross imports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1999,1797.149,Millions Tonnes
...,...,...,...,...,...,...,...,...,...
334651,Foreign CO2 emissions embodied in gross imports,ZOTH,Other regions,MMR,D35,"Electricity, gas, steam and air conditioning s...",2014,0.0,Millions Tonnes
334652,Foreign CO2 emissions embodied in gross imports,ZOTH,Other regions,MMR,D35,"Electricity, gas, steam and air conditioning s...",2015,0.0,Millions Tonnes
334653,Foreign CO2 emissions embodied in gross imports,ZOTH,Other regions,MMR,D35,"Electricity, gas, steam and air conditioning s...",2016,0.0,Millions Tonnes
334654,Foreign CO2 emissions embodied in gross imports,ZOTH,Other regions,MMR,D35,"Electricity, gas, steam and air conditioning s...",2017,0.0,Millions Tonnes


In [45]:
import pycountry

# Three-letter (alpha_3) code of every country known to pycountry, as a Series
# named after the matching column of the emissions frame.
# Series.rename() with a mapping relabels index entries, not the Series name,
# so the name is set directly; the public `alpha_3` attribute is used instead
# of reaching into the objects' private `_fields` dict.
df_country = pd.Series(
    [country.alpha_3 for country in pycountry.countries],
    name='country_iso_code',
)
df_country
                                           
                                                      


0      ABW
1      AFG
2      AGO
3      AIA
4      ALA
      ... 
244    WSM
245    YEM
246    ZAF
247    ZMB
248    ZWE
Name: alpha_3, Length: 249, dtype: object

In [10]:
import osc_ingest_trino as osc
from sqlalchemy import text

# Column name/type pairs derived from the frame by the OS-Climate helper.
columnschema = osc.create_table_schema_pairs(df)

# Start from a clean slate: remove any previous version of the target table.
sql = f"""
drop table if exists {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
print(sql)
# Engine.execute() is deprecated in SQLAlchemy 1.4 and removed in 2.0, so the
# statement is run on an explicit, short-lived connection.
with engine.connect() as conn:
    qres = conn.execute(text(sql))
    print(qres.fetchall())


drop table if exists osc_datacommons_dev.pcaf_sovereign_footprint.sf_oecd_imgr_fco2

[(True,)]


In [11]:
# Print column names, dtypes and non-null counts of the frame before ingesting it.
df.info(verbose=True)
        
        

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334656 entries, 0 to 334655
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   attribute         334656 non-null  string 
 1   country_iso_code  334656 non-null  string 
 2   partner_iso_code  334656 non-null  string 
 3   industry_code     334656 non-null  string 
 4   industry_name     334656 non-null  string 
 5   validity_date     334656 non-null  Int64  
 6   value             334656 non-null  Float64
 7   value_units       334656 non-null  string 
dtypes: Float64(1), Int64(1), string(6)
memory usage: 21.1 MB


In [12]:
# Append the frame to the target table in batches of 5000 rows through the
# OS-Climate insert helper; verbose=True makes it print a sample of each batch.
batch_insert = osc.TrinoBatchInsert(batch_size=5000, verbose=True)
df.to_sql(
    name=ingest_table,
    con=engine,
    schema=ingest_schema,
    if_exists='append',
    index=False,
    method=batch_insert,
)

constructed fully qualified table name as: "pcaf_sovereign_footprint.sf_oecd_imgr_fco2"
inserting 5000 records
  ('Foreign CO2 emissions embodied in gross imports', 'OECD', 'WLD', 'DTOTAL', 'TOTAL', 1995, 1518.407, 'Millions Tonnes')
  ('Foreign CO2 emissions embodied in gross imports', 'OECD', 'WLD', 'DTOTAL', 'TOTAL', 1996, 1447.641, 'Millions Tonnes')
  ('Foreign CO2 emissions embodied in gross imports', 'OECD', 'WLD', 'DTOTAL', 'TOTAL', 1997, 1434.138, 'Millions Tonnes')
  ...
  ('Foreign CO2 emissions embodied in gross imports', 'AUS', 'NONOECD', 'DTOTAL', 'TOTAL', 2002, 44.56, 'Millions Tonnes')
batch insert result: [(5000,)]
inserting 5000 records
  ('Foreign CO2 emissions embodied in gross imports', 'AUS', 'NONOECD', 'DTOTAL', 'TOTAL', 2003, 56.854, 'Millions Tonnes')
  ('Foreign CO2 emissions embodied in gross imports', 'AUS', 'NONOECD', 'DTOTAL', 'TOTAL', 2004, 69.185, 'Millions Tonnes')
  ('Foreign CO2 emissions embodied in gross imports', 'AUS', 'NONOECD', 'DTOTAL', 'TOTAL'

In [13]:
# Spot-check the ingest by reading back a single year.
# The WHERE clause is part of the same string literal; the previous version
# embedded a literal `" + "` in the SQL text, which made the query malformed.
sql = f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
where validity_date = 2007
"""
pd.read_sql(sql, engine)

Unnamed: 0,attribute,country_iso_code,partner_iso_code,industry_code,industry_name,validity_date,value,value_units
0,Foreign CO2 emissions embodied in gross imports,FIN,NLD,DTOTAL,TOTAL,2007,0.841,Millions Tonnes
1,Foreign CO2 emissions embodied in gross imports,FIN,NZL,DTOTAL,TOTAL,2007,0.015,Millions Tonnes
2,Foreign CO2 emissions embodied in gross imports,FIN,NOR,DTOTAL,TOTAL,2007,0.871,Millions Tonnes
3,Foreign CO2 emissions embodied in gross imports,FIN,POL,DTOTAL,TOTAL,2007,0.971,Millions Tonnes
4,Foreign CO2 emissions embodied in gross imports,FIN,PRT,DTOTAL,TOTAL,2007,0.174,Millions Tonnes
...,...,...,...,...,...,...,...,...
13939,Foreign CO2 emissions embodied in gross imports,ITA,KAZ,D35,"Electricity, gas, steam and air conditioning s...",2007,0.005,Millions Tonnes
13940,Foreign CO2 emissions embodied in gross imports,ITA,LAO,D35,"Electricity, gas, steam and air conditioning s...",2007,0.000,Millions Tonnes
13941,Foreign CO2 emissions embodied in gross imports,ITA,MYS,D35,"Electricity, gas, steam and air conditioning s...",2007,0.005,Millions Tonnes
13942,Foreign CO2 emissions embodied in gross imports,ITA,MLT,D35,"Electricity, gas, steam and air conditioning s...",2007,0.000,Millions Tonnes
