<font size="5">OECD CO2 emissions into Trino pipeline</font>

In [1]:
import pandas as pd
import csv
import ParseXLS as parser
import pycountry

Importing pcaf_env sets up all the global variables we need for Trino, S3, etc.

In [2]:
# NOTE(review): the wildcard import hides where names come from. Later cells use
# `os`, `shutil`, `json`, `osc`, `engine`, `s3_source`, `ingest_catalog`,
# `ingest_schema`, `dbt_dict` and `create_trino_table_and_dbt_metadata` without
# importing or defining them in this notebook, so they are presumably all
# provided by pcaf_env -- confirm, and consider importing them explicitly.
from pcaf_env import *

# Base name of the Trino table this notebook ingests into (later cells query it
# with a `_source` suffix appended).
ingest_table = 'sf_oecd_imgr_fco2'
# Name of the table that receives the serialized dbt model metadata in the final cell.
models_table = 'pcaf_dbt_models'

using connect string: trino://MichaelTiemannOSC@trino-secure-odh-trino.apps.odh-cl2.apps.os-climate.org:443/osc_datacommons_dev
create schema if not exists osc_datacommons_dev.mdt_sandbox


In [3]:
# Sanity check for the Trino connection: list every schema visible in the
# ingest catalog and print one result row per line.
show_schemas_sql = f'show schemas in {ingest_catalog}'
schema_read = osc._do_sql(show_schemas_sql, engine, verbose=False)
for row in schema_read:
    print(row)

('aicoe_osc_demo_results',)
('default',)
('demo_dv',)
('dera',)
('essd',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('rmi',)
('sandbox',)
('wri_gppd',)


Load the OECD CO2 emissions file `IMGR_FCO2.csv` from the S3 landing bucket (data updated from https://stats.oecd.org/Index.aspx?DataSetCode=IO_GHG_2021).

In [4]:
# Download the OECD IMGR_FCO2 extract from the S3 landing bucket to local disk.
# (An earlier version parsed the data with
#  parser.process('OECD_IMGR_FCO2.ini', 'OECD.csv'); the CSV is now read directly.)
local_csv = '/tmp/IMGR_FCO2.csv'
oecd_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'], 'PCAF-sovereign-footprint/OECD/IMGR_FCO2.csv')
oecd_file.download_file(local_csv)

# Mapping from the OECD column headers to the column names used in the Trino table.
# The keys double as the list (and order) of columns kept from the raw file.
# Deliberately NOT named `dict`: that would shadow the builtin for every later cell.
column_renames = {
    'Indicator': 'attribute',
    'COU': 'country_iso_code',
    'Country': 'country_name',
    'PAR': 'partner_iso_code',
    'IND': 'industry_code',
    'Industry': 'industry_name',
    'TIME': 'validity_date',
    'Value': 'value',
}

# Keep only the mapped columns, switch to pandas' nullable dtypes, then rename.
# rename() returns a new frame, so no in-place mutation is needed.
df = (
    pd.read_csv(local_csv)[list(column_renames)]
    .convert_dtypes()
    .rename(columns=column_renames)
)

# Every value in this dataset is reported in megatonnes of CO2.
df['value_units'] = 'Mt CO2'

df

Unnamed: 0,attribute,country_iso_code,country_name,partner_iso_code,industry_code,industry_name,validity_date,value,value_units
0,Foreign CO2 emissions embodied in gross imports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1995,1518.407,Mt CO2
1,Foreign CO2 emissions embodied in gross imports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1996,1447.641,Mt CO2
2,Foreign CO2 emissions embodied in gross imports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1997,1434.138,Mt CO2
3,Foreign CO2 emissions embodied in gross imports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1998,1620.643,Mt CO2
4,Foreign CO2 emissions embodied in gross imports,OECD,OECD member countries,WLD,DTOTAL,TOTAL,1999,1797.149,Mt CO2
...,...,...,...,...,...,...,...,...,...
334651,Foreign CO2 emissions embodied in gross imports,ZOTH,Other regions,MMR,D35,"Electricity, gas, steam and air conditioning s...",2014,0.0,Mt CO2
334652,Foreign CO2 emissions embodied in gross imports,ZOTH,Other regions,MMR,D35,"Electricity, gas, steam and air conditioning s...",2015,0.0,Mt CO2
334653,Foreign CO2 emissions embodied in gross imports,ZOTH,Other regions,MMR,D35,"Electricity, gas, steam and air conditioning s...",2016,0.0,Mt CO2
334654,Foreign CO2 emissions embodied in gross imports,ZOTH,Other regions,MMR,D35,"Electricity, gas, steam and air conditioning s...",2017,0.0,Mt CO2


In [5]:
# Rest of World (ROW) is deliberately not added here.
# One row per pycountry entry, holding its ISO alpha-3 code as a pandas string column.
iso_alpha3_codes = [entry.alpha_3 for entry in pycountry.countries]
df_country = pd.DataFrame({'country_iso_code': iso_alpha3_codes}, dtype='string')
df_country

0      ABW
1      AFG
2      AGO
3      AIA
4      ALA
      ... 
244    WSM
245    YEM
246    ZAF
247    ZMB
248    ZWE
Name: alpha_3, Length: 249, dtype: object

In [6]:
# Re-run dtype inference: `value_units` was added after the earlier
# convert_dtypes() call in the load cell.
df = df.convert_dtypes()
# Print each column's non-null count and dtype as a quick schema check.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334656 entries, 0 to 334655
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   attribute         334656 non-null  string 
 1   country_iso_code  334656 non-null  string 
 2   country_name      334656 non-null  string 
 3   partner_iso_code  334656 non-null  string 
 4   industry_code     334656 non-null  string 
 5   industry_name     334656 non-null  string 
 6   validity_date     334656 non-null  Int64  
 7   value             334656 non-null  Float64
 8   value_units       334656 non-null  string 
dtypes: Float64(1), Int64(1), string(7)
memory usage: 23.6 MB


None


drop table if exists osc_datacommons_dev.mdt_sandbox.sf_oecd_imgr_fco2



In [6]:
# Make sure the dbt models directory exists. os.makedirs(..., exist_ok=True)
# replaces the mkdir/except-FileExistsError dance and, unlike os.mkdir, also
# creates any missing parent directories instead of raising FileNotFoundError.
dbt_models_dir = "/opt/app-root/src/PCAF-sovereign-footprint/dbt/pcaf_transform/models"
os.makedirs(dbt_models_dir, mode=0o755, exist_ok=True)

# Remove Jupyter's checkpoint directory from the models folder (presumably so
# checkpoint copies are not mistaken for dbt models -- confirm).
# ignore_errors=True makes this a no-op when the directory does not exist.
shutil.rmtree(os.path.join(dbt_models_dir, ".ipynb_checkpoints"), ignore_errors=True)

In [7]:
# Free-text description of the dataset; stored JSON-encoded in the metadata below.
dataset_description = """
Foreign CO2 emissions embodied in gross imports captures the foreign CO2 emissions embodied in gross imports of country/region c with origin in exporting industry i in exporting country/region partner p.
Note: Regions are treated as a unit, i.e. IMGR_FCO2 excludes intra-regional trade, as so, intra-region flows are considered as domestic."""

# Table-level metadata for the OECD dataset.
custom_meta_content = {
    'data provider': 'OECD',
    'title': 'Foreign CO2 emissions embodied in gross imports',
    'author': 'OECD',
    'contact': 'stan.contact@oecd.org',
    'description': json.dumps(dataset_description),
    'release_date': '2022-01-01 00:00:00',
    # How should we describe our transformative step here?
}

def description_is(s):
    """Wrap a description in the per-field metadata layout.

    Args:
        s: The description to record (any JSON-serializable value).

    Returns:
        A new dict with a single key, ``'Description'``, whose value is
        ``s`` encoded as a JSON string.
    """
    encoded = json.dumps(s)
    return {'Description': encoded}

# Human-readable description for each field, in the order the metadata is emitted.
# NOTE(review): the dataframe built in the load cell has no `data_provider`
# column -- confirm this entry is still wanted.
field_descriptions = {
    'data_provider': "OECD",
    'country_iso_code': "ISO-3166 Country Code (alpha_3)",
    'partner_iso_code': "ISO-3166 Country Code (alpha_3) of the trading partner",
    'industry_name': "OECD Industry or Aggregation Name",
    'industry_code': "OECD Industry or Aggregation Code",
    'country_name': "ISO-3166 Country Name",
    'attribute': "Foreign CO2 emissions embodied in gross imports",
    'value': "Mt CO2 emissions embodied in gross imports",
    'value_units': "Mt CO2",
    'validity_date': "Year of measurement",
}

# Column-level metadata: field name -> {'Description': <JSON-encoded text>}.
custom_meta_fields = {
    name: description_is(text) for name, text in field_descriptions.items()
}

# Attach at most one tag per field; the first matching rule wins.
for f, meta in custom_meta_fields.items():
    description = meta['Description']
    if 'ISO' in description:
        meta['tags'] = ['ISO']
    elif 'IPCC' in description:
        meta['tags'] = ['IPCC']
    elif 'WDI' in description:
        meta['tags'] = ['WDI']
    elif f in ('year', 'validity_date'):
        # This dataset's year column is `validity_date`; testing only for
        # 'year' meant the 'annual' tag could never be applied here.
        meta['tags'] = ['annual']

In [7]:
# Load `df` into the Trino ingest table, passing along the table-level and
# field-level metadata built above. The list argument names the partition
# column(s); the logged DDL shows partitioning on country_iso_code.
create_trino_table_and_dbt_metadata(
    ingest_table,
    df,
    ['country_iso_code'],
    custom_meta_content,
    custom_meta_fields,
    verbose=True,
)

drop table if exists mdt_sandbox.sf_oecd_imgr_fco2_source
enforcing dataframe partition column order

verifying existence of table osc_datacommons_dev.mdt_sandbox.sf_oecd_imgr_fco2_source
create table if not exists osc_datacommons_dev.mdt_sandbox.sf_oecd_imgr_fco2_source (
    attribute varchar,
    country_name varchar,
    partner_iso_code varchar,
    industry_code varchar,
    industry_name varchar,
    validity_date bigint,
    value double,
    value_units varchar,
    country_iso_code varchar
) with (
    format = 'parquet',
    partitioning = array['country_iso_code']
)

staging dataframe parquet to s3 osc-datacommons-s3-bucket-dev02
/tmp/ingest_temp_909b9445/country_iso_code=AUT/c095ddbf3fac40e2a353d08596e1d320-0.parquet  -->  trino/ingest/ingest_temp_909b9445/country_iso_code=AUT/c095ddbf3fac40e2a353d08596e1d320-0.parquet
/tmp/ingest_temp_909b9445/country_iso_code=OECD/c095ddbf3fac40e2a353d08596e1d320-0.parquet  -->  trino/ingest/ingest_temp_909b9445/country_iso_code=OECD/c09

In [8]:
# Spot-check the load: read back every row for a single year (2007) from the
# freshly written `_source` table.
source_table = f"{ingest_catalog}.{ingest_schema}.{ingest_table}_source"
sql = f"""
select * from {source_table}
where validity_date=2007"""
pd.read_sql(sql, engine)

Unnamed: 0,attribute,country_name,partner_iso_code,industry_code,industry_name,validity_date,value,value_units,country_iso_code
0,Foreign CO2 emissions embodied in gross imports,Chinese Taipei,WLD,DTOTAL,TOTAL,2007,158.452,Mt CO2,TWN
1,Foreign CO2 emissions embodied in gross imports,Chinese Taipei,OECD,DTOTAL,TOTAL,2007,62.820,Mt CO2,TWN
2,Foreign CO2 emissions embodied in gross imports,Chinese Taipei,AUS,DTOTAL,TOTAL,2007,3.925,Mt CO2,TWN
3,Foreign CO2 emissions embodied in gross imports,Chinese Taipei,AUT,DTOTAL,TOTAL,2007,0.263,Mt CO2,TWN
4,Foreign CO2 emissions embodied in gross imports,Chinese Taipei,BEL,DTOTAL,TOTAL,2007,0.237,Mt CO2,TWN
...,...,...,...,...,...,...,...,...,...
13939,Foreign CO2 emissions embodied in gross imports,Peru,ZEUR,D35,"Electricity, gas, steam and air conditioning s...",2007,0.017,Mt CO2,PER
13940,Foreign CO2 emissions embodied in gross imports,Peru,ZASI,D35,"Electricity, gas, steam and air conditioning s...",2007,0.001,Mt CO2,PER
13941,Foreign CO2 emissions embodied in gross imports,Peru,ZSCA,D35,"Electricity, gas, steam and air conditioning s...",2007,0.006,Mt CO2,PER
13942,Foreign CO2 emissions embodied in gross imports,Peru,ZOTH,D35,"Electricity, gas, steam and air conditioning s...",2007,0.000,Mt CO2,PER


In [16]:
# Serialize the dbt model metadata and store it as a single row in the models table.
# Single quotes are doubled (standard SQL escaping) so an apostrophe inside any
# description cannot terminate the string literal early and break the insert.
models_json = json.dumps(dbt_dict['models']).replace("'", "''")
osc._do_sql(f"insert into {ingest_schema}.{models_table} values('{models_json}')", engine, verbose=True)

insert into mdt_sandbox.pcaf_dbt_models values('{"sf_wdi_gdp": {"description": "\"\\nThis indicator provides per capita values for gross domestic product (GDP) expressed in current international dollars converted by purchasing power parity (PPP) conversion factor. \\n\\nGDP is the sum of gross value added by all resident producers in the country plus any product taxes and minus any subsidies not included in the value of the products. conversion factor is a spatial price deflator and currency converter that controls for price level differences between countries. Total population is a mid-year population based on the de facto definition of population, which counts all residents regardless of legal status or citizenship.\"", "columns": {"rec_source": {"description": "\"API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv and API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv\""}, "data_provider": {"description": "\"WDI\"", "tags": ["WDI"]}, "country_iso_code": {"description": "\"ISO-3166 Country Code (alpha_3)\"", 

[(1,)]