<font size="6">Calculate GHG Intensity per GDP, PPP </font>

In [1]:
from dotenv import dotenv_values, load_dotenv
import osc_ingest_trino as osc
import os
import pathlib

<font size="4">Load Environment Variables</font>

In [2]:
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path,override=True)

In [3]:
import trino
from sqlalchemy.engine import create_engine

env_var_prefix = 'TRINO'

sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ[f'{env_var_prefix}_USER'],
    host = os.environ[f'{env_var_prefix}_HOST'],
    port = os.environ[f'{env_var_prefix}_PORT']
)
sqlargs = {
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    'catalog': 'osc_datacommons_dev'
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

trino_bucket = osc.attach_s3_bucket("S3_DEV")

In [4]:
import boto3

s3_source = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
source_bucket = s3_source.Bucket(os.environ['S3_LANDING_BUCKET'])

Open a Trino connection using JWT for authentication

In [5]:
# Show available schemas to ensure trino connection is set correctly
ingest_catalog = 'osc_datacommons_dev'
schema_read = engine.execute(f'show schemas in {ingest_catalog}')
for row in schema_read.fetchall():
    print(row)

('aicoe_osc_demo_results',)
('default',)
('demo_dv',)
('dera',)
('essd',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('rmi',)
('sandbox',)
('wri_gppd',)


In [6]:
# define source and destination tables
# LULUCF (Land Use, Land-Use Change and Forestry)

ingest_schema = 'pcaf_sovereign_footprint'
ingest_table = 'sf_unfccc_results'
src_table_1 = 'sf_unfccc_with_lulucf'
src_table_2 = 'sf_unfccc_without_lulucf'
src_table_3 = 'sf_wdi_gdp'
src_table_4 = 'sf_primap_emissions_without_LULUCF'
src_table_5 = 'sf_primap_emissions_with_LULUCF'

src_country_table = 'sf_unfccc_countries'

In [7]:
########################
essd_schema = 'mdt_sandbox'
essd_src_table = 'gwp100_data'
#essd_src_table = 'ghg_data' 

In [8]:
def requantify_df(df):
    units_col = None
    columns_reversed = reversed(df.columns)
    for col in columns_reversed:
        if col.endswith("_units"):
            if units_col:
                # We expect _units column to follow a non-units column
                raise ValueError
            units_col = col
            continue
        if units_col:
            if col + '_units' != units_col:
                raise ValueError
            if (df[units_col]==df[units_col][0]).all():
                # Make a PintArray
                new_col = PintArray(df[col], dtype=f"pint[{ureg(df[units_col][0]).u}]")
            else:
                # Make a pd.Series of Quantity in a way that does not throw UnitStrippedWarning
                new_col = pd.Series(data=df[col], name=col) * pd.Series(data=df[units_col].map(lambda x: ureg(x).u), name=col)
            df = df.drop(columns=units_col)
            df[col] = new_col
            units_col = None
    return df


In [9]:
from openscm_units import unit_registry
#PintType.ureg = unit_registry
ureg = unit_registry
Q_ = ureg.Quantity
ureg.define("CO2e=CO2=CO2eq=CO2_eq")
ureg.define("USD=[currency]=$")

In [10]:
from pint import UnitRegistry, set_application_registry
set_application_registry(ureg)

In [11]:
import pandas as pd
from functools import reduce
import pandas as pd
import pint
from pint import set_application_registry, Quantity
from pint_pandas import PintArray, PintType
from pint_pandas.pint_array import is_pint_type

In [12]:
### ESSD data is not needed at the moment (maybe in the future)

'''
sql=f"""
select iso as country_iso_code,year(year) as validity_date, co2/1000 as value,t.* from {ingest_catalog}.{essd_schema}.{essd_src_table} t """ 
df_essd = pd.read_sql(sql, engine)
#df1["units"] = "kt"
df_essd = df_essd.convert_dtypes()
df_essd[df_essd['country_iso_code'] == 'ZWE'][df_essd['validity_date'] ==2020]
rename_columns = {'value':'total'}
agg_columns = { 'value' : 'sum'}
columns_order = ['attribute','industry_code','country_iso_code','validity_date','total','total_units']
df_essd = df_essd.groupby(['country_iso_code','validity_date'],as_index=False).agg(agg_columns).rename(columns=rename_columns)
df_essd
'''



#df_essd.info(verbose=True)

'\nsql=f"""\nselect iso as country_iso_code,year(year) as validity_date, co2/1000 as value,t.* from {ingest_catalog}.{essd_schema}.{essd_src_table} t """ \ndf_essd = pd.read_sql(sql, engine)\n#df1["units"] = "kt"\ndf_essd = df_essd.convert_dtypes()\ndf_essd[df_essd[\'country_iso_code\'] == \'ZWE\'][df_essd[\'validity_date\'] ==2020]\nrename_columns = {\'value\':\'total\'}\nagg_columns = { \'value\' : \'sum\'}\ncolumns_order = [\'attribute\',\'industry_code\',\'country_iso_code\',\'validity_date\',\'total\',\'total_units\']\ndf_essd = df_essd.groupby([\'country_iso_code\',\'validity_date\'],as_index=False).agg(agg_columns).rename(columns=rename_columns)\ndf_essd\n'

In [13]:
sql=f"""
select country_iso_code,country_name,validity_date,attribute,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_1} """ 
df1 = pd.read_sql(sql, engine)
#df1["units"] = "kt"
#df1 = requantify_df(df1).convert_dtypes()
df1
df1.info(verbose=True)
sql=f"""
select t.country_iso_code,country_name,validity_date,attribute,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_2} t, {ingest_catalog}.{ingest_schema}.{src_country_table} c
where t.country_iso_code= c.country_iso_code and c.annex1_flag='Y'"""
df2 = pd.read_sql(sql, engine)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2256 entries, 0 to 2255
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country_iso_code  2256 non-null   object 
 1   country_name      2256 non-null   object 
 2   validity_date     2256 non-null   int64  
 3   attribute         2256 non-null   object 
 4   value             2256 non-null   float64
 5   value_units       2256 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 105.9+ KB


In [14]:
#df2 = requantify_df(df2).convert_dtypes()
#df2
# gdp
sql=f"""
select country_iso_code,validity_date,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_3} where attribute='GDP (current US$)' """
df3 = pd.read_sql(sql, engine)
df3
df3 = requantify_df(df3).convert_dtypes()

In [15]:
# gdp ppp
sql=f"""
select country_iso_code,validity_date,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_3} where attribute='GDP, PPP (current international $)'"""
df4 = pd.read_sql(sql, engine)
df4 = requantify_df(df4).convert_dtypes()
df4

Unnamed: 0,country_iso_code,validity_date,value
0,HIC,2015,54750384812661.8
1,HKG,2015,411294477052.779
2,HND,2015,43955139193.462
3,HPC,2015,1695791359063.76
4,HRV,2015,97949323815.94
...,...,...,...
7467,WSM,2016,1230684174.87219
7468,XKX,2016,17789238052.0725
7469,ZAF,2016,772768693550.772
7470,ZMB,2016,55712209199.1273


In [16]:
# PRIMAP data 

sql=f"""
select t.country_iso_code,validity_date,attribute,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_4} t
, {ingest_catalog}.{ingest_schema}.{src_country_table} c
where t.country_iso_code= c.country_iso_code and c.annex1_flag='N'
and attribute='KYOTOGHG (AR4GWP100)' and validity_date >=1960  """ 
df_primap_without_lulucf = pd.read_sql(sql, engine)
#df_primap['value_units'] = 'kt CO2'
#df_primap = requantify_df(df_primap).convert_dtypes()
# convert to CO2 units to t
#df_primap['value'] = df_primap['value'].pint.to("t CO2e")
df_primap_without_lulucf

Unnamed: 0,country_iso_code,validity_date,attribute,value,value_units
0,MDA,1982,KYOTOGHG (AR4GWP100),43800.0,CO2 * gigagram / a
1,MDG,1982,KYOTOGHG (AR4GWP100),21300.0,CO2 * gigagram / a
2,MDV,1982,KYOTOGHG (AR4GWP100),54.8,CO2 * gigagram / a
3,MEX,1982,KYOTOGHG (AR4GWP100),438000.0,CO2 * gigagram / a
4,MHL,1982,KYOTOGHG (AR4GWP100),74.7,CO2 * gigagram / a
...,...,...,...,...,...
9295,LBR,1982,KYOTOGHG (AR4GWP100),1680.0,CO2 * gigagram / a
9296,LCA,1982,KYOTOGHG (AR4GWP100),214.0,CO2 * gigagram / a
9297,LKA,1982,KYOTOGHG (AR4GWP100),16100.0,CO2 * gigagram / a
9298,LSO,1982,KYOTOGHG (AR4GWP100),2410.0,CO2 * gigagram / a


In [25]:
# PRIMAP data 

sql=f"""
select t.country_iso_code,validity_date,attribute,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_5} t
, {ingest_catalog}.{ingest_schema}.{src_country_table} c
where t.country_iso_code= c.country_iso_code and c.annex1_flag='N'
and attribute='KYOTOGHG (AR4GWP100)' and validity_date >=1960  """ 
df_primap_with_lulucf = pd.read_sql(sql, engine)
#df_primap['value_units'] = 'kt CO2'
#df_primap = requantify_df(df_primap).convert_dtypes()
# convert to CO2 units to t
#df_primap['value'] = df_primap['value'].pint.to("t CO2e")
df_primap_with_lulucf

Unnamed: 0,country_iso_code,validity_date,attribute,value,value_units
0,ERI,2002,KYOTOGHG (AR4GWP100),457.698500,CO2 * gigagram / a
1,ETH,2002,KYOTOGHG (AR4GWP100),109288.290000,CO2 * gigagram / a
2,FJI,2002,KYOTOGHG (AR4GWP100),-2545.406000,CO2 * gigagram / a
3,FSM,2002,KYOTOGHG (AR4GWP100),-29.573100,CO2 * gigagram / a
4,GAB,2002,KYOTOGHG (AR4GWP100),3164.882900,CO2 * gigagram / a
...,...,...,...,...,...
9295,LBR,1982,KYOTOGHG (AR4GWP100),7831.837300,CO2 * gigagram / a
9296,LCA,1982,KYOTOGHG (AR4GWP100),0.000000,CO2 * gigagram / a
9297,LKA,1982,KYOTOGHG (AR4GWP100),18542.540000,CO2 * gigagram / a
9298,LSO,1982,KYOTOGHG (AR4GWP100),11.838761,CO2 * gigagram / a


In [26]:
# calc Scope1 with LULUCF
df_primap_with_lulucf = pd.merge(df_primap_with_lulucf,df_primap_without_lulucf[['country_iso_code','validity_date','value']],on=['country_iso_code','validity_date'],how='outer')
df_primap_with_lulucf['value'] = df_primap_with_lulucf['value_x'] + df_primap_with_lulucf['value_y']
df_primap_with_lulucf.drop(columns=['value_x','value_y'])
df_primap_with_lulucf = df_primap_with_lulucf[['country_iso_code','validity_date','attribute','value','value_units']]

In [27]:
#df2 = pd.merge(df2,df_essd,on=['country_iso_code','validity_date'],how='outer')  
#df2.rename(columns={"value": "ghg_total_without_lulucf", "total": "ghg_total_without_lulucf_essd"},inplace=True)
df2 = pd.merge(df2,df_primap_without_lulucf,on=['country_iso_code','validity_date'],how='outer')  
df2.rename(columns={"value_units_x":"value_units","value_x": "ghg_total_without_lulucf_unfccc","value_y": "ghg_total_without_lulucf_primap"},inplace=True)
df2=df2.convert_dtypes()
df2.info(verbose=True)
df_primap_without_lulucf.info(verbose=True)
df2

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10633 entries, 0 to 10632
Data columns (total 12 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country_iso_code                 10633 non-null  string 
 1   country_name                     1333 non-null   string 
 2   validity_date                    10633 non-null  Int64  
 3   attribute_x                      1333 non-null   string 
 4   ghg_total_without_lulucf_unfccc  1333 non-null   Float64
 5   value_units                      1333 non-null   string 
 6   attribute_y                      9300 non-null   string 
 7   ghg_total_without_lulucf_primap  9300 non-null   Float64
 8   value_units_y                    9300 non-null   string 
 9   attribute                        9300 non-null   string 
 10  value                            9300 non-null   Float64
 11  value_units_y                    9300 non-null   string 
dtypes: Float64(3), Int

Unnamed: 0,country_iso_code,country_name,validity_date,attribute_x,ghg_total_without_lulucf_unfccc,value_units,attribute_y,ghg_total_without_lulucf_primap,value_units_y,attribute,value,value_units_y.1
0,UKR,Ukraine,2013,"Time Series - GHG total without LULUCF, in kt ...",408987.882387,kt CO2e,,,,,,
1,UKR,Ukraine,2014,"Time Series - GHG total without LULUCF, in kt ...",362562.343583,kt CO2e,,,,,,
2,UKR,Ukraine,2015,"Time Series - GHG total without LULUCF, in kt ...",319107.601157,kt CO2e,,,,,,
3,UKR,Ukraine,2016,"Time Series - GHG total without LULUCF, in kt ...",337413.39412,kt CO2e,,,,,,
4,UKR,Ukraine,2017,"Time Series - GHG total without LULUCF, in kt ...",322998.685177,kt CO2e,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
10628,LBR,,1982,,,,KYOTOGHG (AR4GWP100),1680.0,CO2 * gigagram / a,KYOTOGHG (AR4GWP100),1680.0,CO2 * gigagram / a
10629,LCA,,1982,,,,KYOTOGHG (AR4GWP100),214.0,CO2 * gigagram / a,KYOTOGHG (AR4GWP100),214.0,CO2 * gigagram / a
10630,LKA,,1982,,,,KYOTOGHG (AR4GWP100),16100.0,CO2 * gigagram / a,KYOTOGHG (AR4GWP100),16100.0,CO2 * gigagram / a
10631,LSO,,1982,,,,KYOTOGHG (AR4GWP100),2410.0,CO2 * gigagram / a,KYOTOGHG (AR4GWP100),2410.0,CO2 * gigagram / a


In [28]:
#df2 = pd.merge(df2,df_essd,on=['country_iso_code','validity_date'],how='outer')  
#df2.rename(columns={"value": "ghg_total_without_lulucf", "total": "ghg_total_without_lulucf_essd"},inplace=True)
df_primap_with_lulucf.info(verbose=True)
df1.info(verbose=True)
df1 = pd.merge(df1,df_primap_with_lulucf,on=['country_iso_code','validity_date'],how='outer')  
df1.rename(columns={"value_units_x":"value_units","value_x": "ghg_total_with_lulucf_unfccc","value_y": "ghg_total_with_lulucf_primap"},inplace=True)
df1=df1.convert_dtypes()
df1.info(verbose=True)
df1

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9300 entries, 0 to 9299
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country_iso_code  9300 non-null   object 
 1   validity_date     9300 non-null   int64  
 2   attribute         9300 non-null   object 
 3   value             9300 non-null   float64
 4   value_units       9300 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 363.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2256 entries, 0 to 2255
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country_iso_code  2256 non-null   object 
 1   country_name      2256 non-null   object 
 2   validity_date     2256 non-null   int64  
 3   attribute         2256 non-null   object 
 4   value             2256 non-null   float64
 5   value_units       2256 non-null   object 
dtypes: float64(1),

Unnamed: 0,country_iso_code,country_name,validity_date,attribute_x,ghg_total_with_lulucf_unfccc,value_units,attribute_y,ghg_total_with_lulucf_primap,value_units_y
0,BRA,Brazil,1990,"Time Series - GHG total with LULUCF, in kt CO₂...",1346971.12,kt CO2e,KYOTOGHG (AR4GWP100),1380668.06,CO2 * gigagram / a
1,BRA,Brazil,1991,"Time Series - GHG total with LULUCF, in kt CO₂...",1232547.09,kt CO2e,KYOTOGHG (AR4GWP100),1272269.38,CO2 * gigagram / a
2,BRA,Brazil,1992,"Time Series - GHG total with LULUCF, in kt CO₂...",1384444.15,kt CO2e,KYOTOGHG (AR4GWP100),1419851.04,CO2 * gigagram / a
3,BRA,Brazil,1993,"Time Series - GHG total with LULUCF, in kt CO₂...",1458270.7,kt CO2e,KYOTOGHG (AR4GWP100),1493950.96,CO2 * gigagram / a
4,BRA,Brazil,1994,"Time Series - GHG total with LULUCF, in kt CO₂...",1476109.9,kt CO2e,KYOTOGHG (AR4GWP100),1514181.7,CO2 * gigagram / a
...,...,...,...,...,...,...,...,...,...
10629,LBR,,1982,,,,KYOTOGHG (AR4GWP100),9511.8373,CO2 * gigagram / a
10630,LCA,,1982,,,,KYOTOGHG (AR4GWP100),214.0,CO2 * gigagram / a
10631,LKA,,1982,,,,KYOTOGHG (AR4GWP100),34642.54,CO2 * gigagram / a
10632,LSO,,1982,,,,KYOTOGHG (AR4GWP100),2421.838761,CO2 * gigagram / a


In [29]:
import numpy as np
df2['scope1_excl_source'] = np.where(df2.ghg_total_without_lulucf_unfccc.notnull(),
         'UNFCCC',
         np.where(df2.ghg_total_without_lulucf_primap.notnull(),
                  'PRIMAP','UNFCCC'))
#df2['scope1_source'] = df2['ghg_total_without_lulucf'].apply(lambda x: 'UNFCCC' if x.isnull() == True else 'PRIMAP')
df2["value"] = df2["ghg_total_without_lulucf_unfccc"].fillna(df2["ghg_total_without_lulucf_primap"])
df2.drop(columns=['attribute_x','attribute_y',"value_units_y","ghg_total_without_lulucf_unfccc","ghg_total_without_lulucf_primap"],inplace=True)

columns_order = ['country_iso_code','country_name','validity_date','value','value_units','scope1_excl_source']
df2 = df2.reindex(columns=columns_order).convert_dtypes()
df2.info(verbose=True)
##df2[df2["scope1_excl_source"] =='PRIMAP']
#df2 = requantify_df(df2)

#df2

#df2.sort_values(by=['country_iso_code','validity_date'], ascending=False).to_csv("tmp_scope1_waterfall.csv")
df2[df2["country_iso_code"] == 'CAN']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10633 entries, 0 to 10632
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country_iso_code    10633 non-null  string 
 1   country_name        1333 non-null   string 
 2   validity_date       10633 non-null  Int64  
 3   value               10633 non-null  Float64
 4   value_units         1333 non-null   string 
 5   scope1_excl_source  10633 non-null  string 
dtypes: Float64(1), Int64(1), string(4)
memory usage: 519.3 KB


Unnamed: 0,country_iso_code,country_name,validity_date,value,value_units,scope1_excl_source
544,CAN,Canada,1990,594722.24342,kt CO2e,UNFCCC
545,CAN,Canada,1991,587905.793885,kt CO2e,UNFCCC
546,CAN,Canada,1992,605290.24882,kt CO2e,UNFCCC
547,CAN,Canada,1993,607681.076914,kt CO2e,UNFCCC
548,CAN,Canada,1994,628145.869194,kt CO2e,UNFCCC
549,CAN,Canada,1995,645455.296776,kt CO2e,UNFCCC
550,CAN,Canada,1996,666701.91023,kt CO2e,UNFCCC
551,CAN,Canada,1997,682298.998433,kt CO2e,UNFCCC
552,CAN,Canada,1998,688613.931025,kt CO2e,UNFCCC
553,CAN,Canada,1999,701758.711729,kt CO2e,UNFCCC


In [30]:
import numpy as np
df1['scope1_incl_source'] = np.where(df1.ghg_total_with_lulucf_unfccc.notnull(),
         'UNFCCC',
         np.where(df1.ghg_total_with_lulucf_primap.notnull(),
                  'PRIMAP','UNFCCC'))
#df2['scope1_source'] = df2['ghg_total_without_lulucf'].apply(lambda x: 'UNFCCC' if x.isnull() == True else 'PRIMAP')
df1["value"] = df1["ghg_total_with_lulucf_unfccc"].fillna(df1["ghg_total_with_lulucf_primap"])
df1.drop(columns=['attribute_x','attribute_y',"ghg_total_with_lulucf_unfccc","ghg_total_with_lulucf_primap","value_units_y"],inplace=True)

columns_order = ['country_iso_code','country_name','validity_date','value','value_units','scope1_incl_source']
df1 = df1.reindex(columns=columns_order).convert_dtypes()
df1.info(verbose=True)
##df2[df2["scope1_excl_source"] =='PRIMAP']
#df2 = requantify_df(df2)

#df2

#df2.sort_values(by=['country_iso_code','validity_date'], ascending=False).to_csv("tmp_scope1_waterfall.csv")
df1
#df1[df1["country_iso_code"] == 'CAN']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10634 entries, 0 to 10633
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country_iso_code    10634 non-null  string 
 1   country_name        2256 non-null   string 
 2   validity_date       10634 non-null  Int64  
 3   value               10634 non-null  Float64
 4   value_units         2256 non-null   string 
 5   scope1_incl_source  10634 non-null  string 
dtypes: Float64(1), Int64(1), string(4)
memory usage: 519.4 KB


Unnamed: 0,country_iso_code,country_name,validity_date,value,value_units,scope1_incl_source
0,BRA,Brazil,1990,1346971.12,kt CO2e,UNFCCC
1,BRA,Brazil,1991,1232547.09,kt CO2e,UNFCCC
2,BRA,Brazil,1992,1384444.15,kt CO2e,UNFCCC
3,BRA,Brazil,1993,1458270.7,kt CO2e,UNFCCC
4,BRA,Brazil,1994,1476109.9,kt CO2e,UNFCCC
...,...,...,...,...,...,...
10629,LBR,,1982,9511.8373,,PRIMAP
10630,LCA,,1982,214.0,,PRIMAP
10631,LKA,,1982,34642.54,,PRIMAP
10632,LSO,,1982,2421.838761,,PRIMAP


Read the source tables into dataframes  

<font size="4">Merge the source dataframes in a single dataframe and calculate ghg intensity values</font>

In [31]:
df2 = df2.assign(value=np.where(df2.value.isnull(), 0, df2.value))
df2 = requantify_df(df2)
df2



Unnamed: 0,country_iso_code,country_name,validity_date,value,scope1_excl_source
0,UKR,Ukraine,2013,408987.8823865379,UNFCCC
1,UKR,Ukraine,2014,362562.3435831411,UNFCCC
2,UKR,Ukraine,2015,319107.6011569923,UNFCCC
3,UKR,Ukraine,2016,337413.3941197296,UNFCCC
4,UKR,Ukraine,2017,322998.68517685524,UNFCCC
...,...,...,...,...,...
10628,LBR,,1982,1680.0,PRIMAP
10629,LCA,,1982,214.0,PRIMAP
10630,LKA,,1982,16100.0,PRIMAP
10631,LSO,,1982,2410.0,PRIMAP


In [32]:
df1 = df1.assign(value=np.where(df1.value.isnull(), 0, df1.value))
df1 = requantify_df(df1)
df1



Unnamed: 0,country_iso_code,country_name,validity_date,value,scope1_incl_source
0,BRA,Brazil,1990,1346971.12,UNFCCC
1,BRA,Brazil,1991,1232547.09,UNFCCC
2,BRA,Brazil,1992,1384444.15,UNFCCC
3,BRA,Brazil,1993,1458270.7,UNFCCC
4,BRA,Brazil,1994,1476109.9,UNFCCC
...,...,...,...,...,...
10629,LBR,,1982,9511.8373,PRIMAP
10630,LCA,,1982,214.0,PRIMAP
10631,LKA,,1982,34642.54,PRIMAP
10632,LSO,,1982,2421.838761,PRIMAP


In [33]:

df_result = pd.merge(df1,df2,on=['country_iso_code','validity_date'],how='outer')  
df_result=df_result.convert_dtypes()
df_result.info(verbose=True)
#df_result.drop(columns=['country_name_y','attribute'],inplace=True)
df_result.drop(columns=['country_name_y'],inplace=True)
df_result.rename(columns={"country_name_x":"country_name","value_x": "ghg_total_with_lulucf","value_y": "ghg_total_without_lulucf"},inplace=True)

###

df_result = pd.merge(df_result,df3,on=['country_iso_code','validity_date'],how="outer")  
df_result.rename(columns={"value":"gdp"},inplace=True)
###
df_result = pd.merge(df_result,df4,on=['country_iso_code','validity_date'],how="outer")  
df_result.rename(columns={"value":"gdp_ppp"},inplace=True)



df_result["ghg_intensity_with_lulucf_per_gdp"] = df_result["ghg_total_with_lulucf"]/df_result["gdp_ppp"]*1000000
df_result["ghg_intensity_without_lulucf_per_gdp"] = df_result["ghg_total_without_lulucf"]/df_result["gdp_ppp"]*1000000
df_result.info(verbose=True)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10634 entries, 0 to 10633
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   country_iso_code    10634 non-null  string         
 1   country_name_x      2256 non-null   string         
 2   validity_date       10634 non-null  Int64          
 3   value_x             10634 non-null  pint[CO2e * kt]
 4   scope1_incl_source  10634 non-null  string         
 5   country_name_y      1333 non-null   string         
 6   value_y             10633 non-null  pint[CO2e * kt]
 7   scope1_excl_source  10633 non-null  string         
dtypes: Int64(1), pint[CO2e * kt](2), string(5)
memory usage: 675.1 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15025 entries, 0 to 15024
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype                
---  ------                                --------------  --

In [34]:
# If DF_COL contains Pint quantities (because it is a PintArray or an array of Pint Quantities),
# return a two-column dataframe of magnitudes and units.
# If DF_COL contains no Pint quanities, return it unchanged.

def dequantify_column(df_col: pd.Series):
    if type(df_col.values)==PintArray:
        return pd.DataFrame({df_col.name: df_col.values.quantity.m,
                             df_col.name + "_units": str(df_col.values.dtype.units)},
                            index=df_col.index)
    elif df_col.size==0:
        return df_col
    elif isinstance(df_col.iloc[0], Quantity):
        values = df_col.map(lambda x: (x.m, x.u))
        return pd.DataFrame({df_col.name: df_col.map(lambda x: x.m),
                             df_col.name + "_units": df_col.map(lambda x: str(x.u))},
                            index=df_col.index)
    else:
        return df_col

# Rewrite dataframe DF so that columns containing Pint quantities are represented by a column for the Magnitude and column for the Units.
# The magnitude column retains the original column name and the units column is renamed with a _units suffix.
def dequantify_df(df):
    return pd.concat([dequantify_column(df[col]) for col in df.columns], axis=1)

In [35]:
df1 = dequantify_df(df1)
df2 = dequantify_df(df2)
df3 = dequantify_df(df3)
df_result = dequantify_df(df_result)
df_result=df_result.convert_dtypes()
df_result.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15025 entries, 0 to 15024
Data columns (total 17 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   country_iso_code                            15025 non-null  string 
 1   country_name                                2256 non-null   string 
 2   validity_date                               15025 non-null  Int64  
 3   ghg_total_with_lulucf                       10634 non-null  Float64
 4   ghg_total_with_lulucf_units                 15025 non-null  string 
 5   scope1_incl_source                          10634 non-null  string 
 6   ghg_total_without_lulucf                    10633 non-null  Float64
 7   ghg_total_without_lulucf_units              15025 non-null  string 
 8   scope1_excl_source                          10633 non-null  string 
 9   gdp                                         13118 non-null  Float64
 10  gdp_units 

In [36]:

# make sure schema exists, or table creation below will fail in weird ways
sql = f"""
CREATE SCHEMA if not exists {ingest_catalog}.{ingest_schema}
 AUTHORIZATION USER mersin35
 WITH (
     location = 's3a://osc-datacommons-s3-bucket-dev02/data/pcaf_covereign.db'
 )
"""
print(sql)
qres = engine.execute(sql)
#print(qres.fetchall())


CREATE SCHEMA if not exists osc_datacommons_dev.pcaf_sovereign_footprint
 AUTHORIZATION USER mersin35
 WITH (
     location = 's3a://osc-datacommons-s3-bucket-dev02/data/pcaf_covereign.db'
 )



Run these in a notebook cell if you need to install onto your nb env

<font size="5">Save the results in Trino</font>


In [37]:
import osc_ingest_trino as osc
#df_result['validity_date']=pd.to_datetime(df_result['validity_date'], unit='D')
#df_result=df_result.convert_dtypes()
#df_result.info()
columnschema = osc.create_table_schema_pairs(df_result) 
sql = f"""
drop table if exists {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
print(sql)
qres = engine.execute(sql)
#print(qres.fetchall())




drop table if exists osc_datacommons_dev.pcaf_sovereign_footprint.sf_unfccc_results



In [38]:
tabledef = f"""
create table if not exists {ingest_catalog}.{ingest_schema}.{ingest_table}(
{columnschema}
) with (
    format = 'ORC',
    partitioning = array['country_iso_code']
)
"""
print(tabledef)
qres = engine.execute(tabledef)
#print(qres.fetchall())


create table if not exists osc_datacommons_dev.pcaf_sovereign_footprint.sf_unfccc_results(
    country_iso_code varchar,
    country_name varchar,
    validity_date bigint,
    ghg_total_with_lulucf double,
    ghg_total_with_lulucf_units varchar,
    scope1_incl_source varchar,
    ghg_total_without_lulucf double,
    ghg_total_without_lulucf_units varchar,
    scope1_excl_source varchar,
    gdp double,
    gdp_units varchar,
    gdp_ppp double,
    gdp_ppp_units varchar,
    ghg_intensity_with_lulucf_per_gdp double,
    ghg_intensity_with_lulucf_per_gdp_units varchar,
    ghg_intensity_without_lulucf_per_gdp double,
    ghg_intensity_without_lulucf_per_gdp_units varchar
) with (
    format = 'ORC',
    partitioning = array['country_iso_code']
)



In [39]:
# Delete all data from our db, so we start with empty table
sql=f"""
delete from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
qres = engine.execute(sql)
#print(qres.fetchall())

In [40]:
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
pd.read_sql(sql, engine)


Unnamed: 0,country_iso_code,country_name,validity_date,ghg_total_with_lulucf,ghg_total_with_lulucf_units,scope1_incl_source,ghg_total_without_lulucf,ghg_total_without_lulucf_units,scope1_excl_source,gdp,gdp_units,gdp_ppp,gdp_ppp_units,ghg_intensity_with_lulucf_per_gdp,ghg_intensity_with_lulucf_per_gdp_units,ghg_intensity_without_lulucf_per_gdp,ghg_intensity_without_lulucf_per_gdp_units


In [41]:
print(ingest_catalog)
#df=df.drop(df[df.country_name=="cote d'ivoire"].index)
df_result.to_sql(ingest_table,
           con=engine,
           schema=ingest_schema,
           if_exists='append',
           index=False,
           method=osc.TrinoBatchInsert(batch_size = 1000, verbose = True))

osc_datacommons_dev
constructed fully qualified table name as: "pcaf_sovereign_footprint.sf_unfccc_results"
inserting 1000 records
  ('BRA', 'Brazil', 1990, 1346971.12, 'CO2e * kt', 'UNFCCC', 609000.0, 'CO2e * kt', 'PRIMAP', 390725626002.866, 'USD', 997366926074.853, 'USD', 1.3505271578445233, 'CO2e * kt / USD', 0.6106077754119291, 'CO2e * kt / USD')
  ('BRA', 'Brazil', 1991, 1232547.09, 'CO2e * kt', 'UNFCCC', 634000.0, 'CO2e * kt', 'PRIMAP', 342609227209.645, 'USD', 1041738357708.39, 'USD', 1.1831637770460426, 'CO2e * kt / USD', 0.6085981142085135, 'CO2e * kt / USD')
  ('BRA', 'Brazil', 1992, 1384444.15, 'CO2e * kt', 'UNFCCC', 643000.0, 'CO2e * kt', 'PRIMAP', 328187944300.908, 'USD', 1059681669176.96, 'USD', 1.306471736059451, 'CO2e * kt / USD', 0.6067859987607497, 'CO2e * kt / USD')
  ...
  ('CPV', 'Cabo Verde', 1995, 222.56934425, 'CO2e * kt', 'UNFCCC', 343.0, 'CO2e * kt', 'PRIMAP', 487148993.533109, 'USD', 674607712.39943, 'USD', 0.3299241027920808, 'CO2e * kt / USD', 0.50844363871

In [42]:
#df_result.to_csv(ingest_table+".csv")
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table} where country_iso_code='TWN' and validity_date > 2015"""
pd.read_sql(sql, engine)


Unnamed: 0,country_iso_code,country_name,validity_date,ghg_total_with_lulucf,ghg_total_with_lulucf_units,scope1_incl_source,ghg_total_without_lulucf,ghg_total_without_lulucf_units,scope1_excl_source,gdp,gdp_units,gdp_ppp,gdp_ppp_units,ghg_intensity_with_lulucf_per_gdp,ghg_intensity_with_lulucf_per_gdp_units,ghg_intensity_without_lulucf_per_gdp,ghg_intensity_without_lulucf_per_gdp_units
0,TWN,,2016,276074.0,CO2e * kt,PRIMAP,298000.0,CO2e * kt,PRIMAP,,USD,,USD,,CO2e * kt / USD,,CO2e * kt / USD
1,TWN,,2017,282039.0,CO2e * kt,PRIMAP,304000.0,CO2e * kt,PRIMAP,,USD,,USD,,CO2e * kt / USD,,CO2e * kt / USD
2,TWN,,2018,280016.0,CO2e * kt,PRIMAP,302000.0,CO2e * kt,PRIMAP,,USD,,USD,,CO2e * kt / USD,,CO2e * kt / USD
3,TWN,,2019,271083.0,CO2e * kt,PRIMAP,293000.0,CO2e * kt,PRIMAP,,USD,,USD,,CO2e * kt / USD,,CO2e * kt / USD
4,TWN,,2020,269095.096,CO2e * kt,PRIMAP,291000.0,CO2e * kt,PRIMAP,,USD,,USD,,CO2e * kt / USD,,CO2e * kt / USD
5,TWN,,2021,283058.35,CO2e * kt,PRIMAP,305000.0,CO2e * kt,PRIMAP,,USD,,USD,,CO2e * kt / USD,,CO2e * kt / USD
