# OECD CO2 emissions into Trino pipeline

Load Environment Variables

In [2]:
from dotenv import dotenv_values, load_dotenv
import osc_ingest_trino as osc
import os
import pathlib
# Locate credentials.env: prefer CREDENTIAL_DOTENV_DIR, then PWD, then the
# default application root. The file is optional; when present its values
# override variables already set in the process environment.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [3]:
# Fully qualified ingest target: catalog.schema.table.
# The catalog must be one that is configured for iceberg.
ingest_catalog, ingest_schema, ingest_table = (
    'osc_datacommons_dev',
    'pcaf_sovereign_footprint',
    'sf_oecd_imgr_fco2',
)

In [4]:
import trino
from sqlalchemy.engine import create_engine

# Prefix of the environment variables that hold the Trino connection details
# (<prefix>_USER, <prefix>_HOST, <prefix>_PORT, <prefix>_PASSWD).
env_var_prefix = 'TRINO'

# SQLAlchemy URL for the Trino dialect; the JWT token is passed separately
# through connect_args rather than embedded in the URL.
sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ[f'{env_var_prefix}_USER'],
    host = os.environ[f'{env_var_prefix}_HOST'],
    port = os.environ[f'{env_var_prefix}_PORT']
)
sqlargs = {
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    # Use the configured ingest catalog as the connection default. df.to_sql
    # later passes only a schema, so the table is written to whatever catalog
    # the connection defaults to; a separately hard-coded name here could
    # silently diverge from the catalog used by the explicit DDL statements.
    'catalog': ingest_catalog
}
engine = create_engine(sqlstring, connect_args = sqlargs)
# Opening a connection up front surfaces connection/auth problems here.
# NOTE(review): the cells shown in this notebook issue statements through
# `engine`, not through `connection` — confirm whether it is still needed.
connection = engine.connect()

# NOTE(review): presumably attaches the S3 bucket described by the S3_DEV_*
# environment variables — verify against osc_ingest_trino.
trino_bucket = osc.attach_s3_bucket("S3_DEV")

In [5]:
import boto3

# Connection settings for the landing-zone S3 store, all taken from the
# environment (a missing variable raises KeyError immediately).
landing_s3_settings = {
    "service_name": "s3",
    "endpoint_url": os.environ['S3_LANDING_ENDPOINT'],
    "aws_access_key_id": os.environ['S3_LANDING_ACCESS_KEY'],
    "aws_secret_access_key": os.environ['S3_LANDING_SECRET_KEY'],
}
s3_source = boto3.resource(**landing_s3_settings)

# Handle on the bucket that holds the raw source files.
source_bucket = s3_source.Bucket(os.environ['S3_LANDING_BUCKET'])

Open a Trino connection using JWT for authentication

In [6]:
# Create the target schema up front (no-op when it already exists);
# without it, the table creation further down fails in confusing ways.
sql = f"\ncreate schema if not exists {ingest_catalog}.{ingest_schema}\n"
qres = engine.execute(sql)

In [6]:
# Connectivity check: list every schema in the ingest catalog, one row per line.
schema_read = engine.execute(f'show schemas in {ingest_catalog}')
schema_rows = schema_read.fetchall()
for schema_row in schema_rows:
    print(schema_row)

('aicoe_osc_demo_results',)
('default',)
('demo_dv',)
('dera',)
('essd',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('rmi',)
('sandbox',)
('wri_gppd',)


In [8]:
# Fetch the OECD extract from the landing bucket into the current working
# directory. NOTE(review): the loading cell below downloads its own copy to
# /tmp and reads that one, so this local copy looks unused — confirm.
landing_bucket_name = os.environ['S3_LANDING_BUCKET']
oecd_file = s3_source.Object(landing_bucket_name, 'PCAF-sovereign-footprint/OECD/IMGR_FCO2.csv')
oecd_file.download_file('IMGR_FCO2.csv')

Load CO2 emissions file (updated from https://stats.oecd.org/Index.aspx?DataSetCode=IO_GHG_2021)

In [7]:
import pandas as pd
import csv
import ParseXLS as parser


# Download the OECD extract from the landing bucket to a local scratch file.
oecd_csv_path = '/tmp/IMGR_FCO2.csv'
oecd_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'], 'PCAF-sovereign-footprint/OECD/IMGR_FCO2.csv')
oecd_file.download_file(oecd_csv_path)

# Load the CSV, keep only the columns that are ingested, and switch to
# pandas' nullable dtypes (string / Int64 / Float64).
df = pd.read_csv(oecd_csv_path)
df = df[['Indicator', 'COU', 'Country', 'PAR', 'IND', 'Industry', 'TIME', 'Value']]
df = df.convert_dtypes()

# Source column -> ingest column. Deliberately not named `dict`: that would
# shadow the builtin for every later cell in the kernel session.
column_renames = {
    'Indicator': 'attribute',
    'COU': 'country_iso_code',
    'Country': 'country_name',
    'PAR': 'partner_iso_code',
    'IND': 'industry_code',
    'Industry': 'industry_name',
    'TIME': 'validity_date',
    'Value': 'value',
}
df = df.rename(columns=column_renames)

# Tag every row with a constant unit label.
# NOTE(review): assumes the OECD values are megatonnes of CO2 — confirm
# against the dataset metadata.
df['value_units'] = 'Mt CO2'

# Spot check: first 100 rows reported for the United Kingdom.
df[df['country_iso_code'] == 'GBR'].head(100)
         


        






Unnamed: 0,attribute,country_iso_code,country_name,partner_iso_code,industry_code,industry_name,validity_date,value,value_units
149184,Foreign CO2 emissions embodied in gross imports,GBR,United Kingdom,WLD,DTOTAL,TOTAL,1995,219.299,Mt CO2
149185,Foreign CO2 emissions embodied in gross imports,GBR,United Kingdom,WLD,DTOTAL,TOTAL,1996,230.305,Mt CO2
149186,Foreign CO2 emissions embodied in gross imports,GBR,United Kingdom,WLD,DTOTAL,TOTAL,1997,246.581,Mt CO2
149187,Foreign CO2 emissions embodied in gross imports,GBR,United Kingdom,WLD,DTOTAL,TOTAL,1998,271.334,Mt CO2
149188,Foreign CO2 emissions embodied in gross imports,GBR,United Kingdom,WLD,DTOTAL,TOTAL,1999,286.115,Mt CO2
...,...,...,...,...,...,...,...,...,...
149279,Foreign CO2 emissions embodied in gross imports,GBR,United Kingdom,AUT,DTOTAL,TOTAL,2018,1.194,Mt CO2
149280,Foreign CO2 emissions embodied in gross imports,GBR,United Kingdom,BEL,DTOTAL,TOTAL,1995,5.629,Mt CO2
149281,Foreign CO2 emissions embodied in gross imports,GBR,United Kingdom,BEL,DTOTAL,TOTAL,1996,5.985,Mt CO2
149282,Foreign CO2 emissions embodied in gross imports,GBR,United Kingdom,BEL,DTOTAL,TOTAL,1997,7.032,Mt CO2


In [8]:
import pycountry
# Alpha-3 code of every country known to pycountry, as a Series named
# 'country_iso_code' so that it lines up with the column of the same name
# in the emissions frame.
# The name is set directly: Series.rename() with a dict relabels *index*
# entries rather than the Series name, which is why the earlier
# .rename({'alpha_3': 'country_iso_code'}) left the name as 'alpha_3'.
df_country = pd.Series(
    [country.alpha_3 for country in pycountry.countries],
    name='country_iso_code',
)
df_country
                                           
                                                      


0      ABW
1      AFG
2      AGO
3      AIA
4      ALA
      ... 
244    WSM
245    YEM
246    ZAF
247    ZMB
248    ZWE
Name: alpha_3, Length: 249, dtype: object

In [13]:
import pycountry
# Exploration aid: print the repr of every pycountry record to see which
# fields (alpha_2, alpha_3, name, numeric, ...) are available.
all_countries = pycountry.countries
for country in all_countries:
    print(country)

Country(alpha_2='AW', alpha_3='ABW', flag='🇦🇼', name='Aruba', numeric='533')
Country(alpha_2='AF', alpha_3='AFG', flag='🇦🇫', name='Afghanistan', numeric='004', official_name='Islamic Republic of Afghanistan')
Country(alpha_2='AO', alpha_3='AGO', flag='🇦🇴', name='Angola', numeric='024', official_name='Republic of Angola')
Country(alpha_2='AI', alpha_3='AIA', flag='🇦🇮', name='Anguilla', numeric='660')
Country(alpha_2='AX', alpha_3='ALA', flag='🇦🇽', name='Åland Islands', numeric='248')
Country(alpha_2='AL', alpha_3='ALB', flag='🇦🇱', name='Albania', numeric='008', official_name='Republic of Albania')
Country(alpha_2='AD', alpha_3='AND', flag='🇦🇩', name='Andorra', numeric='020', official_name='Principality of Andorra')
Country(alpha_2='AE', alpha_3='ARE', flag='🇦🇪', name='United Arab Emirates', numeric='784')
Country(alpha_2='AR', alpha_3='ARG', flag='🇦🇷', name='Argentina', numeric='032', official_name='Argentine Republic')
Country(alpha_2='AM', alpha_3='ARM', flag='🇦🇲', name='Armenia', num

In [9]:
import osc_ingest_trino as osc
# Re-apply nullable dtypes and derive the table schema description from the frame.
# NOTE(review): create_table_schema_pairs presumably yields column-name /
# SQL-type pairs — verify against osc_ingest_trino.
df = df.convert_dtypes()
columnschema = osc.create_table_schema_pairs(df)

# Start from a clean slate: remove any previous copy of the target table.
sql = f"\ndrop table if exists {ingest_catalog}.{ingest_schema}.{ingest_table}\n"
print(sql)
qres = engine.execute(sql)


drop table if exists osc_datacommons_dev.pcaf_sovereign_footprint.sf_oecd_imgr_fco2



In [10]:
# Pre-ingest check: print each column's name, non-null count and dtype,
# plus the frame's row count and memory usage.
df.info(verbose=True)
        
        

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334656 entries, 0 to 334655
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   attribute         334656 non-null  string 
 1   country_iso_code  334656 non-null  string 
 2   country_name      334656 non-null  string 
 3   partner_iso_code  334656 non-null  string 
 4   industry_code     334656 non-null  string 
 5   industry_name     334656 non-null  string 
 6   validity_date     334656 non-null  Int64  
 7   value             334656 non-null  Float64
 8   value_units       334656 non-null  string 
dtypes: Float64(1), Int64(1), string(7)
memory usage: 23.6 MB


In [11]:
# Insert the frame in batches of 5000 rows; verbose mode echoes sample
# records and the result of each batch.
batch_insert = osc.TrinoBatchInsert(batch_size=5000, verbose=True)

# Only the schema is given here, so the catalog is the engine's default.
# index=False keeps the pandas RangeIndex out of the table.
df.to_sql(
    ingest_table,
    con=engine,
    schema=ingest_schema,
    if_exists='append',
    index=False,
    method=batch_insert,
)

constructed fully qualified table name as: "pcaf_sovereign_footprint.sf_oecd_imgr_fco2"
inserting 5000 records
  ('Foreign CO2 emissions embodied in gross imports', 'OECD', 'OECD member countries', 'WLD', 'DTOTAL', 'TOTAL', 1995, 1518.407, 'Mt CO2')
  ('Foreign CO2 emissions embodied in gross imports', 'OECD', 'OECD member countries', 'WLD', 'DTOTAL', 'TOTAL', 1996, 1447.641, 'Mt CO2')
  ('Foreign CO2 emissions embodied in gross imports', 'OECD', 'OECD member countries', 'WLD', 'DTOTAL', 'TOTAL', 1997, 1434.138, 'Mt CO2')
  ...
  ('Foreign CO2 emissions embodied in gross imports', 'AUS', 'Australia', 'NONOECD', 'DTOTAL', 'TOTAL', 2002, 44.56, 'Mt CO2')
batch insert result: [(5000,)]
inserting 5000 records
  ('Foreign CO2 emissions embodied in gross imports', 'AUS', 'Australia', 'NONOECD', 'DTOTAL', 'TOTAL', 2003, 56.854, 'Mt CO2')
  ('Foreign CO2 emissions embodied in gross imports', 'AUS', 'Australia', 'NONOECD', 'DTOTAL', 'TOTAL', 2004, 69.185, 'Mt CO2')
  ('Foreign CO2 emissions emb

In [20]:
# Read-back check: pull one slice of the freshly ingested table
# (year 2010, partner ARG, industry D35) and display it in full.
# The stray `" + "` left over from an abandoned string concatenation has
# been removed; Trino parsed it as a quoted table alias, so the query only
# worked by accident.
sql = f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
where validity_date = 2010 and partner_iso_code = 'ARG' and industry_code = 'D35'
"""
df_exp = pd.read_sql(sql, engine)

# Raise the display limit so every returned row is rendered.
# Note: this changes the pandas option for the rest of the session.
pd.set_option('display.max_rows', df_exp.shape[0] + 1)
df_exp

Unnamed: 0,attribute,country_iso_code,country_name,partner_iso_code,industry_code,industry_name,validity_date,value,value_units
0,Foreign CO2 emissions embodied in gross imports,MMR,Myanmar,ARG,D35,"Electricity, gas, steam and air conditioning s...",2010,0.0,Mt CO2
1,Foreign CO2 emissions embodied in gross imports,PHL,Philippines,ARG,D35,"Electricity, gas, steam and air conditioning s...",2010,0.0,Mt CO2
2,Foreign CO2 emissions embodied in gross imports,ROU,Romania,ARG,D35,"Electricity, gas, steam and air conditioning s...",2010,0.0,Mt CO2
3,Foreign CO2 emissions embodied in gross imports,GBR,United Kingdom,ARG,D35,"Electricity, gas, steam and air conditioning s...",2010,0.0,Mt CO2
4,Foreign CO2 emissions embodied in gross imports,LAO,Lao PDR,ARG,D35,"Electricity, gas, steam and air conditioning s...",2010,0.0,Mt CO2
5,Foreign CO2 emissions embodied in gross imports,EU13,EU28 excluding EU15,ARG,D35,"Electricity, gas, steam and air conditioning s...",2010,0.0,Mt CO2
6,Foreign CO2 emissions embodied in gross imports,EA19,Euro area (19 countries),ARG,D35,"Electricity, gas, steam and air conditioning s...",2010,0.0,Mt CO2
7,Foreign CO2 emissions embodied in gross imports,ROW,Rest of the World,ARG,D35,"Electricity, gas, steam and air conditioning s...",2010,0.039,Mt CO2
8,Foreign CO2 emissions embodied in gross imports,G20,Group of Twenty,ARG,D35,"Electricity, gas, steam and air conditioning s...",2010,0.0,Mt CO2
9,Foreign CO2 emissions embodied in gross imports,IND,India,ARG,D35,"Electricity, gas, steam and air conditioning s...",2010,0.0,Mt CO2
