<font size="6">Calculate GHG Intensity per GDP, PPP </font>

In [30]:
from dotenv import dotenv_values, load_dotenv
import osc_ingest_trino as osc
import os
import pathlib

<font size="4">Load Environment Variables</font>

In [31]:
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path,override=True)

In [32]:
import trino
from sqlalchemy.engine import create_engine
from sqlalchemy import text

env_var_prefix = 'TRINO'

sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ[f'{env_var_prefix}_USER'],
    host = os.environ[f'{env_var_prefix}_HOST'],
    port = os.environ[f'{env_var_prefix}_PORT']
)
sqlargs = {
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    'catalog': 'osc_datacommons_dev'
}
engine = create_engine(sqlstring, connect_args = sqlargs)
#connection = engine.connect()

trino_bucket = osc.attach_s3_bucket("S3_DEV")

In [33]:
import boto3

s3_source = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
source_bucket = s3_source.Bucket(os.environ['S3_LANDING_BUCKET'])

Open a Trino connection using JWT for authentication

In [34]:
# Show available schemas to ensure trino connection is set correctly
ingest_catalog = 'osc_datacommons_dev'
with engine.connect() as conn:
    schema_read = conn.execute(text(f'show schemas in {ingest_catalog}'))
    for row in schema_read.fetchall():
        print(row)

('aicoe_osc_demo_results',)
('default',)
('demo_dv',)
('dera',)
('essd',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('rmi',)
('sandbox',)
('wri_gppd',)


In [35]:
# define source and destination tables
# LULUCF (Land Use, Land-Use Change and Forestry)

ingest_schema = 'pcaf_sovereign_footprint'
ingest_table = 'sf_unfccc_results'
src_table_1 = 'sf_unfccc_with_lulucf'
src_table_2 = 'sf_unfccc_without_lulucf'
src_table_3 = 'sf_wdi_gdp'
src_table_4 = 'sf_primap_emissions_without_LULUCF'
src_table_5 = 'sf_primap_emissions_with_LULUCF'

src_country_table = 'sf_unfccc_countries'

In [36]:
########################
essd_schema = 'mdt_sandbox'
essd_src_table = 'gwp100_data'
#essd_src_table = 'ghg_data' 

In [37]:
def requantify_df(df):
    units_col = None
    columns_reversed = reversed(df.columns)
    for col in columns_reversed:
        if col.endswith("_units"):
            if units_col:
                # We expect _units column to follow a non-units column
                raise ValueError
            units_col = col
            continue
        if units_col:
            if col + '_units' != units_col:
                raise ValueError
            if (df[units_col]==df[units_col][0]).all():
                # Make a PintArray
                new_col = PintArray(df[col], dtype=f"pint[{ureg(df[units_col][0]).u}]")
            else:
                # Make a pd.Series of Quantity in a way that does not throw UnitStrippedWarning
                new_col = pd.Series(data=df[col], name=col) * pd.Series(data=df[units_col].map(lambda x: ureg(x).u), name=col)
            df = df.drop(columns=units_col)
            df[col] = new_col
            units_col = None
    return df


In [38]:
from openscm_units import unit_registry
#PintType.ureg = unit_registry
ureg = unit_registry
Q_ = ureg.Quantity
ureg.define("CO2e=CO2=CO2eq=CO2_eq")
ureg.define("USD=[currency]=$")

In [39]:
from pint import UnitRegistry, set_application_registry
set_application_registry(ureg)

In [40]:
import pandas as pd
from functools import reduce
import pandas as pd
import pint
from pint import set_application_registry, Quantity
from pint_pandas import PintArray, PintType
from pint_pandas.pint_array import is_pint_type

In [41]:
### ESSD data is not needed at the moment (maybe in the future)

'''
sql=f"""
select iso as country_iso_code,year(year) as validity_date, co2/1000 as value,t.* from {ingest_catalog}.{essd_schema}.{essd_src_table} t """ 
df_essd = pd.read_sql(sql, engine)
#df1["units"] = "kt"
df_essd = df_essd.convert_dtypes()
df_essd[df_essd['country_iso_code'] == 'ZWE'][df_essd['validity_date'] ==2020]
rename_columns = {'value':'total'}
agg_columns = { 'value' : 'sum'}
columns_order = ['attribute','industry_code','country_iso_code','validity_date','total','total_units']
df_essd = df_essd.groupby(['country_iso_code','validity_date'],as_index=False).agg(agg_columns).rename(columns=rename_columns)
df_essd
'''



#df_essd.info(verbose=True)

'\nsql=f"""\nselect iso as country_iso_code,year(year) as validity_date, co2/1000 as value,t.* from {ingest_catalog}.{essd_schema}.{essd_src_table} t """ \ndf_essd = pd.read_sql(sql, engine)\n#df1["units"] = "kt"\ndf_essd = df_essd.convert_dtypes()\ndf_essd[df_essd[\'country_iso_code\'] == \'ZWE\'][df_essd[\'validity_date\'] ==2020]\nrename_columns = {\'value\':\'total\'}\nagg_columns = { \'value\' : \'sum\'}\ncolumns_order = [\'attribute\',\'industry_code\',\'country_iso_code\',\'validity_date\',\'total\',\'total_units\']\ndf_essd = df_essd.groupby([\'country_iso_code\',\'validity_date\'],as_index=False).agg(agg_columns).rename(columns=rename_columns)\ndf_essd\n'

In [42]:
sql=f"""
select country_iso_code,country_name,validity_date,attribute,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_1} """ 
df1 = pd.read_sql(sql, engine)
#df1["units"] = "kt"
#df1 = requantify_df(df1).convert_dtypes()
df1
df1.info(verbose=True)
sql=f"""
select t.country_iso_code,country_name,validity_date,attribute,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_2} t, {ingest_catalog}.{ingest_schema}.{src_country_table} c
where t.country_iso_code= c.country_iso_code and c.annex1_flag='Y'"""
df2 = pd.read_sql(sql, engine)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2298 entries, 0 to 2297
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country_iso_code  2298 non-null   object 
 1   country_name      2298 non-null   object 
 2   validity_date     2298 non-null   int64  
 3   attribute         2298 non-null   object 
 4   value             2298 non-null   float64
 5   value_units       2298 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 107.8+ KB


In [43]:
#df2 = requantify_df(df2).convert_dtypes()
#df2
# gdp
sql=f"""
select country_iso_code,validity_date,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_3} where attribute='GDP (current US$)' """
df3 = pd.read_sql(sql, engine)
df3
df3 = requantify_df(df3).convert_dtypes()

In [44]:
# gdp ppp
sql=f"""
select country_iso_code,validity_date,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_3} where attribute='GDP, PPP (current international $)'"""
df4 = pd.read_sql(sql, engine)
df4 = requantify_df(df4).convert_dtypes()
df4

Unnamed: 0,country_iso_code,validity_date,value
0,ABW,2018,4417064467.95204
1,AFE,2018,2420342050881.65
2,AFG,2018,77417896456.4508
3,AFW,2018,1842199923197.45
4,AGO,2018,220289370395.961
...,...,...,...
7723,WSM,2007,889451139.909009
7724,YEM,2007,80080965747.4347
7725,ZAF,2007,608598165269.567
7726,ZMB,2007,32096043406.3915


In [45]:
# PRIMAP data 

sql=f"""
select t.country_iso_code,validity_date,attribute,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_4} t
, {ingest_catalog}.{ingest_schema}.{src_country_table} c
where t.country_iso_code= c.country_iso_code 
and attribute='KYOTOGHG (AR6GWP100)' and validity_date >=1960  """ 
#and c.annex1_flag='N'

df_primap_without_lulucf = pd.read_sql(sql, engine)
#df_primap['value_units'] = 'kt CO2'
#df_primap = requantify_df(df_primap).convert_dtypes()
# convert to CO2 units to t
#df_primap['value'] = df_primap['value'].pint.to("t CO2e")
df_primap_without_lulucf

Unnamed: 0,country_iso_code,validity_date,attribute,value,value_units
0,MCO,1982,KYOTOGHG (AR6GWP100),115.0,CO2 * gigagram / a
1,MDA,1982,KYOTOGHG (AR6GWP100),43900.0,CO2 * gigagram / a
2,MDG,1982,KYOTOGHG (AR6GWP100),23500.0,CO2 * gigagram / a
3,MDV,1982,KYOTOGHG (AR6GWP100),56.8,CO2 * gigagram / a
4,MEX,1982,KYOTOGHG (AR6GWP100),447000.0,CO2 * gigagram / a
...,...,...,...,...,...
12154,LSO,1982,KYOTOGHG (AR6GWP100),1670.0,CO2 * gigagram / a
12155,LTU,1982,KYOTOGHG (AR6GWP100),51900.0,CO2 * gigagram / a
12156,LUX,1982,KYOTOGHG (AR6GWP100),11400.0,CO2 * gigagram / a
12157,LVA,1982,KYOTOGHG (AR6GWP100),28200.0,CO2 * gigagram / a


In [46]:
# PRIMAP data 

sql=f"""
select t.country_iso_code,validity_date,attribute,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_5} t
, {ingest_catalog}.{ingest_schema}.{src_country_table} c
where t.country_iso_code= c.country_iso_code
and attribute='KYOTOGHG (AR6GWP100)' and validity_date >=1960  """
#and c.annex1_flag='N'

df_primap_with_lulucf = pd.read_sql(sql, engine)
#df_primap['value_units'] = 'kt CO2'
#df_primap = requantify_df(df_primap).convert_dtypes()
# convert to CO2 units to t
#df_primap['value'] = df_primap['value'].pint.to("t CO2e")
df_primap_with_lulucf

Unnamed: 0,country_iso_code,validity_date,attribute,value,value_units
0,AFG,1960,KYOTOGHG (AR6GWP100),442.939520,CO2 * gigagram / a
1,AGO,1960,KYOTOGHG (AR6GWP100),72106.885000,CO2 * gigagram / a
2,ALB,1960,KYOTOGHG (AR6GWP100),1102.416600,CO2 * gigagram / a
3,AND,1960,KYOTOGHG (AR6GWP100),-4.556391,CO2 * gigagram / a
4,ARE,1960,KYOTOGHG (AR6GWP100),0.000000,CO2 * gigagram / a
...,...,...,...,...,...
12154,WSM,2022,KYOTOGHG (AR6GWP100),-864.270000,CO2 * gigagram / a
12155,YEM,2022,KYOTOGHG (AR6GWP100),-1747.233000,CO2 * gigagram / a
12156,ZAF,2022,KYOTOGHG (AR6GWP100),-28612.451000,CO2 * gigagram / a
12157,ZMB,2022,KYOTOGHG (AR6GWP100),-34171.400000,CO2 * gigagram / a


In [47]:
# calc Scope1 with LULUCF
df_primap_with_lulucf = pd.merge(df_primap_with_lulucf,df_primap_without_lulucf[['country_iso_code','validity_date','value']],on=['country_iso_code','validity_date'],how='outer')
df_primap_with_lulucf['value'] = df_primap_with_lulucf['value_x'] + df_primap_with_lulucf['value_y']
df_primap_with_lulucf.drop(columns=['value_x','value_y'])
df_primap_with_lulucf = df_primap_with_lulucf[['country_iso_code','validity_date','attribute','value','value_units']]

In [48]:
#df2 = pd.merge(df2,df_essd,on=['country_iso_code','validity_date'],how='outer')  
#df2.rename(columns={"value": "ghg_total_without_lulucf", "total": "ghg_total_without_lulucf_essd"},inplace=True)
df2 = pd.merge(df2,df_primap_without_lulucf,on=['country_iso_code','validity_date'],how='outer')  
df2.rename(columns={"value_units_x":"value_units","value_x": "ghg_total_without_lulucf_unfccc","value_y": "ghg_total_without_lulucf_primap"},inplace=True)
df2=df2.convert_dtypes()
df2.info(verbose=True)
df_primap_without_lulucf.info(verbose=True)
df2

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12159 entries, 0 to 12158
Data columns (total 9 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country_iso_code                 12159 non-null  string 
 1   country_name                     1375 non-null   string 
 2   validity_date                    12159 non-null  Int64  
 3   attribute_x                      1375 non-null   string 
 4   ghg_total_without_lulucf_unfccc  1375 non-null   Float64
 5   value_units                      1375 non-null   string 
 6   attribute_y                      12159 non-null  string 
 7   ghg_total_without_lulucf_primap  12159 non-null  Float64
 8   value_units_y                    12159 non-null  string 
dtypes: Float64(2), Int64(1), string(6)
memory usage: 890.7 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12159 entries, 0 to 12158
Data columns (total 5 columns):
 #   Column            Non-Null Count  

Unnamed: 0,country_iso_code,country_name,validity_date,attribute_x,ghg_total_without_lulucf_unfccc,value_units,attribute_y,ghg_total_without_lulucf_primap,value_units_y
0,DEU,Germany,2013,"Time Series - GHG total without LULUCF, in kt ...",933505.372995,kt CO2e,KYOTOGHG (AR6GWP100),936000.0,CO2 * gigagram / a
1,DEU,Germany,2014,"Time Series - GHG total without LULUCF, in kt ...",893394.420024,kt CO2e,KYOTOGHG (AR6GWP100),896000.0,CO2 * gigagram / a
2,DEU,Germany,2015,"Time Series - GHG total without LULUCF, in kt ...",896657.872363,kt CO2e,KYOTOGHG (AR6GWP100),899000.0,CO2 * gigagram / a
3,DEU,Germany,2016,"Time Series - GHG total without LULUCF, in kt ...",898559.814641,kt CO2e,KYOTOGHG (AR6GWP100),901000.0,CO2 * gigagram / a
4,DEU,Germany,2017,"Time Series - GHG total without LULUCF, in kt ...",881582.778919,kt CO2e,KYOTOGHG (AR6GWP100),884000.0,CO2 * gigagram / a
...,...,...,...,...,...,...,...,...,...
12154,LSO,,1982,,,,KYOTOGHG (AR6GWP100),1670.0,CO2 * gigagram / a
12155,LTU,,1982,,,,KYOTOGHG (AR6GWP100),51900.0,CO2 * gigagram / a
12156,LUX,,1982,,,,KYOTOGHG (AR6GWP100),11400.0,CO2 * gigagram / a
12157,LVA,,1982,,,,KYOTOGHG (AR6GWP100),28200.0,CO2 * gigagram / a


In [49]:
#df2 = pd.merge(df2,df_essd,on=['country_iso_code','validity_date'],how='outer')  
#df2.rename(columns={"value": "ghg_total_without_lulucf", "total": "ghg_total_without_lulucf_essd"},inplace=True)
df_primap_with_lulucf.info(verbose=True)
df1.info(verbose=True)
df1 = pd.merge(df1,df_primap_with_lulucf,on=['country_iso_code','validity_date'],how='outer')  
df1.rename(columns={"value_units_x":"value_units","value_x": "ghg_total_with_lulucf_unfccc","value_y": "ghg_total_with_lulucf_primap"},inplace=True)
df1=df1.convert_dtypes()
df1.info(verbose=True)
df1

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12159 entries, 0 to 12158
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country_iso_code  12159 non-null  object 
 1   validity_date     12159 non-null  int64  
 2   attribute         12159 non-null  object 
 3   value             12159 non-null  float64
 4   value_units       12159 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 475.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2298 entries, 0 to 2297
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country_iso_code  2298 non-null   object 
 1   country_name      2298 non-null   object 
 2   validity_date     2298 non-null   int64  
 3   attribute         2298 non-null   object 
 4   value             2298 non-null   float64
 5   value_units       2298 non-null   object 
dtypes: float64(1

Unnamed: 0,country_iso_code,country_name,validity_date,attribute_x,ghg_total_with_lulucf_unfccc,value_units,attribute_y,ghg_total_with_lulucf_primap,value_units_y
0,GNB,Guinea-Bissau,2006,"Time Series - GHG total with LULUCF, in kt CO₂...",7607.222009,kt CO2e,KYOTOGHG (AR6GWP100),3456.23106,CO2 * gigagram / a
1,GNB,Guinea-Bissau,2010,"Time Series - GHG total with LULUCF, in kt CO₂...",2346.986,kt CO2e,KYOTOGHG (AR6GWP100),3666.23106,CO2 * gigagram / a
2,LBN,Lebanon,2011,"Time Series - GHG total with LULUCF, in kt CO₂...",21284.4607,kt CO2e,KYOTOGHG (AR6GWP100),23699.2846,CO2 * gigagram / a
3,LBN,Lebanon,2012,"Time Series - GHG total with LULUCF, in kt CO₂...",23187.847,kt CO2e,KYOTOGHG (AR6GWP100),25663.2411,CO2 * gigagram / a
4,LBN,Lebanon,2013,"Time Series - GHG total with LULUCF, in kt CO₂...",22765.91,kt CO2e,KYOTOGHG (AR6GWP100),25030.9152,CO2 * gigagram / a
...,...,...,...,...,...,...,...,...,...
12155,WSM,,2022,,,,KYOTOGHG (AR6GWP100),-316.27,CO2 * gigagram / a
12156,YEM,,2022,,,,KYOTOGHG (AR6GWP100),44052.767,CO2 * gigagram / a
12157,ZAF,,2022,,,,KYOTOGHG (AR6GWP100),506387.549,CO2 * gigagram / a
12158,ZMB,,2022,,,,KYOTOGHG (AR6GWP100),3928.6,CO2 * gigagram / a


In [50]:
import numpy as np
#df2['scope1_excl_source'] = np.where(df2.ghg_total_without_lulucf_unfccc.notnull(),
#         'UNFCCC',
#         np.where(df2.ghg_total_without_lulucf_primap.notnull(),
#                  'PRIMAP','UNFCCC'))
df2['scope1_excl_source'] = 'PRIMAP'
#df2['scope1_source'] = df2['ghg_total_without_lulucf'].apply(lambda x: 'UNFCCC' if x.isnull() == True else 'PRIMAP')
#df2["value"] = df2["ghg_total_without_lulucf_unfccc"].fillna(df2["ghg_total_without_lulucf_primap"])
df2["value"] = df2["ghg_total_without_lulucf_primap"]
df2.drop(columns=['attribute_x','attribute_y',"value_units_y","ghg_total_without_lulucf_unfccc","ghg_total_without_lulucf_primap"],inplace=True)

columns_order = ['country_iso_code','country_name','validity_date','value','value_units','scope1_excl_source']
df2 = df2.reindex(columns=columns_order).convert_dtypes()
df2.info(verbose=True)
##df2[df2["scope1_excl_source"] =='PRIMAP']
#df2 = requantify_df(df2)

#df2

#df2.sort_values(by=['country_iso_code','validity_date'], ascending=False).to_csv("tmp_scope1_waterfall.csv")
df2[df2["country_iso_code"] == 'CAN']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12159 entries, 0 to 12158
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country_iso_code    12159 non-null  string 
 1   country_name        1375 non-null   string 
 2   validity_date       12159 non-null  Int64  
 3   value               12159 non-null  Float64
 4   value_units         1375 non-null   string 
 5   scope1_excl_source  12159 non-null  string 
dtypes: Float64(1), Int64(1), string(4)
memory usage: 593.8 KB


Unnamed: 0,country_iso_code,country_name,validity_date,value,value_units,scope1_excl_source
786,CAN,Canada,1990,599000.0,kt CO2e,PRIMAP
787,CAN,Canada,1991,593000.0,kt CO2e,PRIMAP
788,CAN,Canada,1992,610000.0,kt CO2e,PRIMAP
789,CAN,Canada,1993,613000.0,kt CO2e,PRIMAP
790,CAN,Canada,1994,633000.0,kt CO2e,PRIMAP
...,...,...,...,...,...,...
11309,CAN,,1978,526000.0,,PRIMAP
11502,CAN,,1979,555000.0,,PRIMAP
11695,CAN,,1980,554000.0,,PRIMAP
11888,CAN,,1981,543000.0,,PRIMAP


In [51]:
import numpy as np
#df1['scope1_incl_source'] = np.where(df1.ghg_total_with_lulucf_unfccc.notnull(),
#         'UNFCCC',
#         np.where(df1.ghg_total_with_lulucf_primap.notnull(),
#                  'PRIMAP','UNFCCC'))
df1['scope1_incl_source'] = 'PRIMAP'
#df2['scope1_source'] = df2['ghg_total_without_lulucf'].apply(lambda x: 'UNFCCC' if x.isnull() == True else 'PRIMAP')
#df1["value"] = df1["ghg_total_with_lulucf_unfccc"].fillna(df1["ghg_total_with_lulucf_primap"])
df1["value"] = df1["ghg_total_with_lulucf_primap"]
df1.drop(columns=['attribute_x','attribute_y',"ghg_total_with_lulucf_unfccc","ghg_total_with_lulucf_primap","value_units_y"],inplace=True)

columns_order = ['country_iso_code','country_name','validity_date','value','value_units','scope1_incl_source']
df1 = df1.reindex(columns=columns_order).convert_dtypes()
df1.info(verbose=True)
##df2[df2["scope1_excl_source"] =='PRIMAP']
#df2 = requantify_df(df2)

#df2

#df2.sort_values(by=['country_iso_code','validity_date'], ascending=False).to_csv("tmp_scope1_waterfall.csv")
df1
#df1[df1["country_iso_code"] == 'CAN']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12160 entries, 0 to 12159
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country_iso_code    12160 non-null  string 
 1   country_name        2298 non-null   string 
 2   validity_date       12160 non-null  Int64  
 3   value               12159 non-null  Float64
 4   value_units         2298 non-null   string 
 5   scope1_incl_source  12160 non-null  string 
dtypes: Float64(1), Int64(1), string(4)
memory usage: 593.9 KB


Unnamed: 0,country_iso_code,country_name,validity_date,value,value_units,scope1_incl_source
0,GNB,Guinea-Bissau,2006,3456.23106,kt CO2e,PRIMAP
1,GNB,Guinea-Bissau,2010,3666.23106,kt CO2e,PRIMAP
2,LBN,Lebanon,2011,23699.2846,kt CO2e,PRIMAP
3,LBN,Lebanon,2012,25663.2411,kt CO2e,PRIMAP
4,LBN,Lebanon,2013,25030.9152,kt CO2e,PRIMAP
...,...,...,...,...,...,...
12155,WSM,,2022,-316.27,,PRIMAP
12156,YEM,,2022,44052.767,,PRIMAP
12157,ZAF,,2022,506387.549,,PRIMAP
12158,ZMB,,2022,3928.6,,PRIMAP


Read the source tables into dataframes  

<font size="4">Merge the source dataframes in a single dataframe and calculate ghg intensity values</font>

In [52]:
df2 = df2.assign(value=np.where(df2.value.isnull(), 0, df2.value))
df2 = requantify_df(df2)
df2



Unnamed: 0,country_iso_code,country_name,validity_date,value,scope1_excl_source
0,DEU,Germany,2013,936000.0,PRIMAP
1,DEU,Germany,2014,896000.0,PRIMAP
2,DEU,Germany,2015,899000.0,PRIMAP
3,DEU,Germany,2016,901000.0,PRIMAP
4,DEU,Germany,2017,884000.0,PRIMAP
...,...,...,...,...,...
12154,LSO,,1982,1670.0,PRIMAP
12155,LTU,,1982,51900.0,PRIMAP
12156,LUX,,1982,11400.0,PRIMAP
12157,LVA,,1982,28200.0,PRIMAP


In [53]:
df1 = df1.assign(value=np.where(df1.value.isnull(), 0, df1.value))
df1 = requantify_df(df1)
df1



Unnamed: 0,country_iso_code,country_name,validity_date,value,scope1_incl_source
0,GNB,Guinea-Bissau,2006,3456.23106,PRIMAP
1,GNB,Guinea-Bissau,2010,3666.23106,PRIMAP
2,LBN,Lebanon,2011,23699.2846,PRIMAP
3,LBN,Lebanon,2012,25663.2411,PRIMAP
4,LBN,Lebanon,2013,25030.9152,PRIMAP
...,...,...,...,...,...
12155,WSM,,2022,-316.27,PRIMAP
12156,YEM,,2022,44052.767,PRIMAP
12157,ZAF,,2022,506387.549,PRIMAP
12158,ZMB,,2022,3928.5999999999985,PRIMAP


In [54]:

df_result = pd.merge(df1,df2,on=['country_iso_code','validity_date'],how='outer')  
df_result=df_result.convert_dtypes()
df_result.info(verbose=True)
#df_result.drop(columns=['country_name_y','attribute'],inplace=True)
df_result.drop(columns=['country_name_y'],inplace=True)
df_result.rename(columns={"country_name_x":"country_name","value_x": "ghg_total_with_lulucf","value_y": "ghg_total_without_lulucf"},inplace=True)

###

df_result = pd.merge(df_result,df3,on=['country_iso_code','validity_date'],how="outer")  
df_result.rename(columns={"value":"gdp"},inplace=True)
###
df_result = pd.merge(df_result,df4,on=['country_iso_code','validity_date'],how="outer")  
df_result.rename(columns={"value":"gdp_ppp"},inplace=True)



df_result["ghg_intensity_with_lulucf_per_gdp"] = df_result["ghg_total_with_lulucf"]/df_result["gdp_ppp"]*1000000
df_result["ghg_intensity_without_lulucf_per_gdp"] = df_result["ghg_total_without_lulucf"]/df_result["gdp_ppp"]*1000000
df_result.info(verbose=True)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12160 entries, 0 to 12159
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   country_iso_code    12160 non-null  string         
 1   country_name_x      2298 non-null   string         
 2   validity_date       12160 non-null  Int64          
 3   value_x             12160 non-null  pint[CO2e * kt]
 4   scope1_incl_source  12160 non-null  string         
 5   country_name_y      1375 non-null   string         
 6   value_y             12159 non-null  pint[CO2e * kt]
 7   scope1_excl_source  12159 non-null  string         
dtypes: Int64(1), pint[CO2e * kt](2), string(5)
memory usage: 772.0 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15805 entries, 0 to 15804
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype                
---  ------                                --------------  --

In [55]:
# If DF_COL contains Pint quantities (because it is a PintArray or an array of Pint Quantities),
# return a two-column dataframe of magnitudes and units.
# If DF_COL contains no Pint quanities, return it unchanged.

def dequantify_column(df_col: pd.Series):
    if type(df_col.values)==PintArray:
        return pd.DataFrame({df_col.name: df_col.values.quantity.m,
                             df_col.name + "_units": str(df_col.values.dtype.units)},
                            index=df_col.index)
    elif df_col.size==0:
        return df_col
    elif isinstance(df_col.iloc[0], Quantity):
        values = df_col.map(lambda x: (x.m, x.u))
        return pd.DataFrame({df_col.name: df_col.map(lambda x: x.m),
                             df_col.name + "_units": df_col.map(lambda x: str(x.u))},
                            index=df_col.index)
    else:
        return df_col

# Rewrite dataframe DF so that columns containing Pint quantities are represented by a column for the Magnitude and column for the Units.
# The magnitude column retains the original column name and the units column is renamed with a _units suffix.
def dequantify_df(df):
    return pd.concat([dequantify_column(df[col]) for col in df.columns], axis=1)

In [56]:
df1 = dequantify_df(df1)
df2 = dequantify_df(df2)
df3 = dequantify_df(df3)
df_result = dequantify_df(df_result)
df_result=df_result.convert_dtypes()
df_result.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15805 entries, 0 to 15804
Data columns (total 17 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   country_iso_code                            15805 non-null  string 
 1   country_name                                2298 non-null   string 
 2   validity_date                               15805 non-null  Int64  
 3   ghg_total_with_lulucf                       12160 non-null  Float64
 4   ghg_total_with_lulucf_units                 15805 non-null  string 
 5   scope1_incl_source                          12160 non-null  string 
 6   ghg_total_without_lulucf                    12159 non-null  Float64
 7   ghg_total_without_lulucf_units              15805 non-null  string 
 8   scope1_excl_source                          12159 non-null  string 
 9   gdp                                         13365 non-null  Float64
 10  gdp_units 

In [57]:

# make sure schema exists, or table creation below will fail in weird ways
sql = f"""
CREATE SCHEMA if not exists {ingest_catalog}.{ingest_schema}
 AUTHORIZATION USER mersin35
 WITH (
     location = 's3a://osc-datacommons-s3-bucket-dev02/data/pcaf_covereign.db'
 )
"""
print(sql)
with engine.connect() as conn:
    qres = conn.execute(text(sql))
    #print(qres.fetchall())


CREATE SCHEMA if not exists osc_datacommons_dev.pcaf_sovereign_footprint
 AUTHORIZATION USER mersin35
 WITH (
     location = 's3a://osc-datacommons-s3-bucket-dev02/data/pcaf_covereign.db'
 )



Run these in a notebook cell if you need to install onto your nb env

<font size="5">Save the results in Trino</font>


In [58]:
import osc_ingest_trino as osc
#df_result['validity_date']=pd.to_datetime(df_result['validity_date'], unit='D')
#df_result=df_result.convert_dtypes()
#df_result.info()
columnschema = osc.create_table_schema_pairs(df_result) 
sql = f"""
drop table if exists {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
print(sql)
with engine.connect() as conn:
    qres = conn.execute(text(sql))
    #print(qres.fetchall())




drop table if exists osc_datacommons_dev.pcaf_sovereign_footprint.sf_unfccc_results



In [59]:
tabledef = f"""
create table if not exists {ingest_catalog}.{ingest_schema}.{ingest_table}(
{columnschema}
) with (
    format = 'ORC',
    partitioning = array['country_iso_code']
)
"""
print(tabledef)
with engine.connect() as conn:
    qres = conn.execute(text(tabledef))
    #print(qres.fetchall())


create table if not exists osc_datacommons_dev.pcaf_sovereign_footprint.sf_unfccc_results(
    country_iso_code varchar,
    country_name varchar,
    validity_date bigint,
    ghg_total_with_lulucf double,
    ghg_total_with_lulucf_units varchar,
    scope1_incl_source varchar,
    ghg_total_without_lulucf double,
    ghg_total_without_lulucf_units varchar,
    scope1_excl_source varchar,
    gdp double,
    gdp_units varchar,
    gdp_ppp double,
    gdp_ppp_units varchar,
    ghg_intensity_with_lulucf_per_gdp double,
    ghg_intensity_with_lulucf_per_gdp_units varchar,
    ghg_intensity_without_lulucf_per_gdp double,
    ghg_intensity_without_lulucf_per_gdp_units varchar
) with (
    format = 'ORC',
    partitioning = array['country_iso_code']
)



In [60]:
# Delete all data from our db, so we start with empty table
sql=f"""
delete from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
with engine.connect() as conn:
    qres = conn.execute(text(sql))
    #print(qres.fetchall())

In [61]:
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
pd.read_sql(sql, engine)


Unnamed: 0,country_iso_code,country_name,validity_date,ghg_total_with_lulucf,ghg_total_with_lulucf_units,scope1_incl_source,ghg_total_without_lulucf,ghg_total_without_lulucf_units,scope1_excl_source,gdp,gdp_units,gdp_ppp,gdp_ppp_units,ghg_intensity_with_lulucf_per_gdp,ghg_intensity_with_lulucf_per_gdp_units,ghg_intensity_without_lulucf_per_gdp,ghg_intensity_without_lulucf_per_gdp_units


In [62]:
print(ingest_catalog)
#df=df.drop(df[df.country_name=="cote d'ivoire"].index)
df_result.to_sql(ingest_table,
           con=engine,
           schema=ingest_schema,
           if_exists='append',
           index=False,
           method=osc.TrinoBatchInsert(batch_size = 1000, verbose = True))

osc_datacommons_dev
constructed fully qualified table name as: "pcaf_sovereign_footprint.sf_unfccc_results"
inserting 1000 records
  ('GNB', 'Guinea-Bissau', 2006, 3456.23106, 'CO2e * kt', 'PRIMAP', 2560.0, 'CO2e * kt', 'PRIMAP', 592365659.824007, 'USD', 1659614919.76232, 'USD', 2.082550005331948, 'CO2e * kt / USD', 1.5425265038992466, 'CO2e * kt / USD')
  ('GNB', 'Guinea-Bissau', 2010, 3666.23106, 'CO2e * kt', 'PRIMAP', 2770.0, 'CO2e * kt', 'PRIMAP', 849878489.113364, 'USD', 2038960051.21636, 'USD', 1.7980887157710015, 'CO2e * kt / USD', 1.3585356899697625, 'CO2e * kt / USD')
  ('LBN', 'Lebanon', 2011, 23699.2846, 'CO2e * kt', 'PRIMAP', 26900.0, 'CO2e * kt', 'PRIMAP', 39927125961.194, 'USD', 76310623815.5428, 'USD', 0.3105633713240983, 'CO2e * kt / USD', 0.35250661906554953, 'CO2e * kt / USD')
  ...
  ('CAN', 'Canada', 2017, 708967.821, 'CO2e * kt', 'PRIMAP', 726000.0, 'CO2e * kt', 'PRIMAP', 1649265644244.09, 'USD', 1765762548007.76, 'USD', 0.40150801805140807, 'CO2e * kt / USD', 0.41

In [63]:
#df_result.to_csv(ingest_table+".csv")
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table} where country_iso_code='TWN' and validity_date > 2015"""
pd.read_sql(sql, engine)


Unnamed: 0,country_iso_code,country_name,validity_date,ghg_total_with_lulucf,ghg_total_with_lulucf_units,scope1_incl_source,ghg_total_without_lulucf,ghg_total_without_lulucf_units,scope1_excl_source,gdp,gdp_units,gdp_ppp,gdp_ppp_units,ghg_intensity_with_lulucf_per_gdp,ghg_intensity_with_lulucf_per_gdp_units,ghg_intensity_without_lulucf_per_gdp,ghg_intensity_without_lulucf_per_gdp_units
0,TWN,,2022,274058.25,CO2e * kt,PRIMAP,296000.0,CO2e * kt,PRIMAP,,USD,,USD,,CO2e * kt / USD,,CO2e * kt / USD
1,TWN,,2016,278074.0,CO2e * kt,PRIMAP,300000.0,CO2e * kt,PRIMAP,,USD,,USD,,CO2e * kt / USD,,CO2e * kt / USD
2,TWN,,2017,283039.0,CO2e * kt,PRIMAP,305000.0,CO2e * kt,PRIMAP,,USD,,USD,,CO2e * kt / USD,,CO2e * kt / USD
3,TWN,,2018,281016.0,CO2e * kt,PRIMAP,303000.0,CO2e * kt,PRIMAP,,USD,,USD,,CO2e * kt / USD,,CO2e * kt / USD
4,TWN,,2019,269083.0,CO2e * kt,PRIMAP,291000.0,CO2e * kt,PRIMAP,,USD,,USD,,CO2e * kt / USD,,CO2e * kt / USD
5,TWN,,2020,267095.0,CO2e * kt,PRIMAP,289000.0,CO2e * kt,PRIMAP,,USD,,USD,,CO2e * kt / USD,,CO2e * kt / USD
6,TWN,,2021,282058.25,CO2e * kt,PRIMAP,304000.0,CO2e * kt,PRIMAP,,USD,,USD,,CO2e * kt / USD,,CO2e * kt / USD
