<font size="5">Pipeline for ingesting OECD CO2 emissions data into Trino</font>

Source dataset: https://stats.oecd.org/Index.aspx?DataSetCode=IO_GHG_2019


In [1]:
from dotenv import dotenv_values, load_dotenv
import osc_ingest_trino as osc
import os
import pathlib
# Directory holding credentials.env: an explicit CREDENTIAL_DOTENV_DIR wins,
# then the directory named by PWD, then the hard-coded fallback.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')

# Only load when the file is present; values from the file replace any
# variables that are already set in the environment (override=True).
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [2]:
# Destination of the ingested data: catalog, schema and table name.
# The catalog has to be one that is configured for Iceberg.
ingest_catalog = "osc_datacommons_dev"
ingest_schema = "pcaf_sovereign_footprint"
ingest_table = "sf_oecd_exgr_dco2"

In [3]:
import trino
from sqlalchemy.engine import create_engine

# Connection settings are read from environment variables named
# <prefix>_USER, <prefix>_HOST, <prefix>_PORT and <prefix>_PASSWD.
env_var_prefix = 'TRINO'

# SQLAlchemy URL for Trino; the credential is passed separately in sqlargs.
sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ[f'{env_var_prefix}_USER'],
    host = os.environ[f'{env_var_prefix}_HOST'],
    port = os.environ[f'{env_var_prefix}_PORT']
)
sqlargs = {
    # The value of <prefix>_PASSWD is handed to Trino as a JWT token.
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    # Use the configured ingest catalog rather than a second hard-coded copy
    # of its name, so statements that only name a schema (DataFrame.to_sql)
    # resolve against the same catalog as the fully qualified SQL.
    'catalog': ingest_catalog,
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

# NOTE(review): presumably attaches the bucket described by the S3_DEV_*
# environment variables -- confirm against osc_ingest_trino.
trino_bucket = osc.attach_s3_bucket("S3_DEV")

In [4]:
import boto3

# Connection settings for the landing-zone object store; endpoint and keys
# all come from S3_LANDING_* environment variables.
landing_s3_kwargs = {
    "service_name": "s3",
    "endpoint_url": os.environ['S3_LANDING_ENDPOINT'],
    "aws_access_key_id": os.environ['S3_LANDING_ACCESS_KEY'],
    "aws_secret_access_key": os.environ['S3_LANDING_SECRET_KEY'],
}
s3_source = boto3.resource(**landing_s3_kwargs)

# Bucket the source files are read from.
landing_bucket_name = os.environ['S3_LANDING_BUCKET']
source_bucket = s3_source.Bucket(landing_bucket_name)

Open a Trino connection using JWT for authentication

In [5]:
# The schema has to exist before a table is created in it; otherwise the
# table creation further down fails in confusing ways.
schema_fqn = f"{ingest_catalog}.{ingest_schema}"
sql = f"""
create schema if not exists {schema_fqn}
"""
qres = engine.execute(sql)
create_schema_rows = qres.fetchall()
print(create_schema_rows)

[(True,)]


In [6]:
# Connection sanity check: list every schema visible in the ingest catalog.
show_schemas_sql = f'show schemas in {ingest_catalog}'
schema_read = engine.execute(show_schemas_sql)
schema_rows = schema_read.fetchall()
for row in schema_rows:
    print(row)

('default',)
('demo_dv',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('sandbox',)


# Load CO2 emissions file (updated from https://stats.oecd.org/Index.aspx)

In [7]:
import pandas as pd
import csv
import ParseXLS as parser


# Download the OECD extract from the landing bucket to a local scratch file.
local_csv_path = '/tmp/EXGR_DCO2.csv'
oecd_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'PCAF-sovereign-footprint/OECD/EXGR_DCO2.csv')
oecd_file.download_file(local_csv_path)

# Source column -> ingest column. Deliberately NOT named `dict`: binding that
# name would shadow the builtin for every later cell run in this kernel.
column_renames = {
    'Indicator': 'attribute',
    'COU': 'country_iso_code',
    'Country': 'country_name',
    'PAR': 'partner_iso_code',
    'IND': 'industry_code',
    'Industry': 'industry_name',
    'TIME': 'validity_date',
    'Value': 'value',
}

# Keep only the mapped columns (in mapping order), let pandas choose nullable
# extension dtypes, switch to the ingest column names, and tag every row with
# its unit. Each step returns a new frame, so re-running the cell is safe.
# NOTE(review): the recorded insert output shows the 'Indicator' text with a
# trailing space; confirm whether it should be stripped before ingestion.
df = (
    pd.read_csv(local_csv_path)[list(column_renames)]
    .convert_dtypes()
    .rename(columns=column_renames)
    .assign(value_units='Mt CO2')
)
df


Unnamed: 0,attribute,country_iso_code,country_name,partner_iso_code,industry_code,industry_name,validity_date,value,value_units
0,Domestic CO2 emissions embodied in gross exports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1995,530.637,Mt CO2
1,Domestic CO2 emissions embodied in gross exports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1996,580.962,Mt CO2
2,Domestic CO2 emissions embodied in gross exports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1997,638.161,Mt CO2
3,Domestic CO2 emissions embodied in gross exports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1998,605.956,Mt CO2
4,Domestic CO2 emissions embodied in gross exports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1999,576.039,Mt CO2
...,...,...,...,...,...,...,...,...,...
334651,Domestic CO2 emissions embodied in gross exports,ZOTH,Other regions,MMR,D35,"Electricity, gas, steam and air conditioning s...",2014,0.027,Mt CO2
334652,Domestic CO2 emissions embodied in gross exports,ZOTH,Other regions,MMR,D35,"Electricity, gas, steam and air conditioning s...",2015,0.073,Mt CO2
334653,Domestic CO2 emissions embodied in gross exports,ZOTH,Other regions,MMR,D35,"Electricity, gas, steam and air conditioning s...",2016,0.051,Mt CO2
334654,Domestic CO2 emissions embodied in gross exports,ZOTH,Other regions,MMR,D35,"Electricity, gas, steam and air conditioning s...",2017,0.214,Mt CO2


In [8]:
import osc_ingest_trino as osc
# Statement that removes any previous version of the target table.
table_fqn = f"{ingest_catalog}.{ingest_schema}.{ingest_table}"
sql = f"""
drop table if exists {table_fqn}
"""

# Re-run dtype inference on the full frame, then build the schema pairs from
# it. NOTE(review): `columnschema` is not referenced by any cell visible in
# this notebook.
df = df.convert_dtypes()
columnschema = osc.create_table_schema_pairs(df)

print(sql)
qres = engine.execute(sql)
drop_result_rows = qres.fetchall()
print(drop_result_rows)


drop table if exists osc_datacommons_dev.pcaf_sovereign_footprint.sf_oecd_exgr_dco2

[(True,)]


In [9]:
# Column names, non-null counts and dtypes of the frame about to be ingested.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334656 entries, 0 to 334655
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   attribute         334656 non-null  string 
 1   country_iso_code  334656 non-null  string 
 2   country_name      334656 non-null  string 
 3   partner_iso_code  334656 non-null  string 
 4   industry_code     334656 non-null  string 
 5   industry_name     334656 non-null  string 
 6   validity_date     334656 non-null  Int64  
 7   value             334656 non-null  Float64
 8   value_units       334656 non-null  string 
dtypes: Float64(1), Int64(1), string(7)
memory usage: 23.6 MB


In [10]:
# Number of records handed to the batch-insert helper per batch.
insert_batch_size = 5000
batch_insert = osc.TrinoBatchInsert(batch_size=insert_batch_size, verbose=True)

# Append the frame to <ingest_schema>.<ingest_table>; the DataFrame index is
# not written.
df.to_sql(
    name=ingest_table,
    con=engine,
    schema=ingest_schema,
    if_exists='append',
    index=False,
    method=batch_insert,
)

constructed fully qualified table name as: "pcaf_sovereign_footprint.sf_oecd_exgr_dco2"
inserting 5000 records
  ('Domestic CO2 emissions embodied in gross exports ', 'OECD', 'OECD member countries', 'WLD', 'DTOTAL', 'TOTAL', 1995, 530.637, 'Mt CO2')
  ('Domestic CO2 emissions embodied in gross exports ', 'OECD', 'OECD member countries', 'WLD', 'DTOTAL', 'TOTAL', 1996, 580.962, 'Mt CO2')
  ('Domestic CO2 emissions embodied in gross exports ', 'OECD', 'OECD member countries', 'WLD', 'DTOTAL', 'TOTAL', 1997, 638.161, 'Mt CO2')
  ...
  ('Domestic CO2 emissions embodied in gross exports ', 'AUS', 'Australia', 'NONOECD', 'DTOTAL', 'TOTAL', 2002, 32.168, 'Mt CO2')
batch insert result: [(5000,)]
inserting 5000 records
  ('Domestic CO2 emissions embodied in gross exports ', 'AUS', 'Australia', 'NONOECD', 'DTOTAL', 'TOTAL', 2003, 33.225, 'Mt CO2')
  ('Domestic CO2 emissions embodied in gross exports ', 'AUS', 'Australia', 'NONOECD', 'DTOTAL', 'TOTAL', 2004, 34.556, 'Mt CO2')
  ('Domestic CO2 em

KeyboardInterrupt: 

In [None]:
# Spot check: read back the ingested rows whose partner code is 'CAN'.
table_fqn = f"{ingest_catalog}.{ingest_schema}.{ingest_table}"
sql = f"""
select * from {table_fqn} where partner_iso_code='CAN'"""
# NOTE(review): this rebinds `df`, which up to here held the frame loaded for
# ingestion; re-running the insert cell afterwards would use this result.
df = pd.read_sql(sql, con=engine)
df
# Optional local export of the query result:
#df.to_csv("exp.csv",index=False,encoding='utf-8')