<font size="6">Calculate GHG Intensity per GDP, PPP </font>

<font size="4">Load Environment Variables</font>

In [1]:
from pcaf_env import *
from functools import reduce
import matplotlib.pyplot as plot

using connect string: trino://MichaelTiemannOSC@trino-secure-odh-trino.apps.odh-cl2.apps.os-climate.org:443/osc_datacommons_dev
create schema if not exists osc_datacommons_dev.mdt_sandbox


In [2]:
# Show available schemas to ensure trino connection is set correctly
schema_read = engine.execute(f'show schemas in {ingest_catalog}')
for row in schema_read.fetchall():
    print(row)

('aicoe_osc_demo_results',)
('default',)
('demo_dv',)
('dera',)
('essd',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('rmi',)
('sandbox',)
('wri_gppd',)


In [3]:
# define source and destination tables
# LULUCF (Land Use, Land-Use Change and Forestry)

ingest_table = 'sf_unfccc_results'
src_table_1 = 'sf_unfccc_with_lulucf'
src_table_2 = 'sf_unfccc_without_lulucf'
src_table_3 = 'sf_wdi_gdp'
models_table = 'pcaf_dbt_models'

In [5]:
sql=f"""
select country_iso_code,country_name,validity_date,attribute,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_1}_source """ 
df1 = pd.read_sql(sql, engine)
#df1["units"] = "kt"
df1 = requantify_df(df1).convert_dtypes()
df1
df1.info(verbose=True)
sql=f"""
select country_iso_code,country_name,validity_date,attribute,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_2}_source """
df2 = pd.read_sql(sql, engine)
df2 = requantify_df(df2).convert_dtypes()
#df2
sql=f"""
select country_iso_code,validity_date,value,value_units from {ingest_catalog}.{ingest_schema}.{src_table_3}_source """
df3 = pd.read_sql(sql, engine)
df3 = requantify_df(df3).convert_dtypes()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2256 entries, 0 to 2255
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype           
---  ------            --------------  -----           
 0   country_iso_code  2256 non-null   string          
 1   country_name      2256 non-null   string          
 2   validity_date     2256 non-null   Int64           
 3   attribute         2256 non-null   string          
 4   value             2256 non-null   pint[CO2eq * kt]
dtypes: Int64(1), pint[CO2eq * kt](1), string(3)
memory usage: 90.5 KB


Read the source tables into dataframes  

<font size="4">Merge the source dataframes in a single dataframe and calculate ghg intensity values</font>

In [6]:
df_result = pd.merge(df1,df2,on=['country_iso_code','validity_date'])  
df_result=df_result.convert_dtypes()
df_result.drop(columns=['country_name_y','attribute_x','attribute_y'],inplace=True)
df_result.rename(columns={"country_name_x":"country_name", "value_x":"ghg_total_with_lulucf", "value_y":"ghg_total_without_lulucf"},inplace=True)

###

df_result = pd.merge(df_result,df3,on=['country_iso_code','validity_date'],how="inner")  
df_result.rename(columns={"value":"gdp"},inplace=True)
df_result["ghg_intensity_with_lulucf_per_gdp"] = df_result["ghg_total_with_lulucf"]/df_result["gdp"]*1000000
df_result["ghg_intensity_without_lulucf_per_gdp"] = df_result["ghg_total_without_lulucf"]/df_result["gdp"]*1000000
df_result.info(verbose=True)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4275 entries, 0 to 4274
Data columns (total 8 columns):
 #   Column                                Non-Null Count  Dtype                 
---  ------                                --------------  -----                 
 0   country_iso_code                      4275 non-null   string                
 1   country_name                          4275 non-null   string                
 2   validity_date                         4275 non-null   Int64                 
 3   ghg_total_with_lulucf                 4275 non-null   pint[CO2eq * kt]      
 4   ghg_total_without_lulucf              4275 non-null   pint[CO2eq * kt]      
 5   gdp                                   4275 non-null   pint[USD]             
 6   ghg_intensity_with_lulucf_per_gdp     4275 non-null   pint[CO2eq * kt / USD]
 7   ghg_intensity_without_lulucf_per_gdp  4275 non-null   pint[CO2eq * kt / USD]
dtypes: Int64(1), pint[CO2eq * kt / USD](2), pint[CO2eq * kt](2), pint[USD

In [7]:
df1 = dequantify_df(df1)
df2 = dequantify_df(df2)
df3 = dequantify_df(df3)
df_result = dequantify_df(df_result)
df_result=df_result.convert_dtypes()
df_result.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4275 entries, 0 to 4274
Data columns (total 13 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   country_iso_code                            4275 non-null   string 
 1   country_name                                4275 non-null   string 
 2   validity_date                               4275 non-null   Int64  
 3   ghg_total_with_lulucf                       4275 non-null   Float64
 4   ghg_total_with_lulucf_units                 4275 non-null   string 
 5   ghg_total_without_lulucf                    4275 non-null   Float64
 6   ghg_total_without_lulucf_units              4275 non-null   string 
 7   gdp                                         4275 non-null   Float64
 8   gdp_units                                   4275 non-null   string 
 9   ghg_intensity_with_lulucf_per_gdp           4275 non-null   Float64
 10  ghg_intensit

<font size="5">Save the results in Trino</font>


In [10]:
create_trino_table_and_dbt_metadata(ingest_table, df_result, ['country_iso_code'], custom_meta_content='', custom_meta_fields='', verbose=True)

drop table if exists mdt_sandbox.sf_unfccc_results_source
enforcing dataframe partition column order

verifying existence of table osc_datacommons_dev.mdt_sandbox.sf_unfccc_results_source
create table if not exists osc_datacommons_dev.mdt_sandbox.sf_unfccc_results_source (
    country_name varchar,
    validity_date bigint,
    ghg_total_with_lulucf double,
    ghg_total_with_lulucf_units varchar,
    ghg_total_without_lulucf double,
    ghg_total_without_lulucf_units varchar,
    gdp double,
    gdp_units varchar,
    ghg_intensity_with_lulucf_per_gdp double,
    ghg_intensity_with_lulucf_per_gdp_units varchar,
    ghg_intensity_without_lulucf_per_gdp double,
    ghg_intensity_without_lulucf_per_gdp_units varchar,
    country_iso_code varchar
) with (
    format = 'parquet',
    partitioning = array['country_iso_code']
)

staging dataframe parquet to s3 osc-datacommons-s3-bucket-dev02
/tmp/ingest_temp_a86e1deb/country_iso_code=UZB/21d7e6f0abf74035b8f779d5b892e5be-0.parquet  -->  trino

In [11]:
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}_source"""
pd.read_sql(sql, engine)


Unnamed: 0,country_name,validity_date,ghg_total_with_lulucf,ghg_total_with_lulucf_units,ghg_total_without_lulucf,ghg_total_without_lulucf_units,gdp,gdp_units,ghg_intensity_with_lulucf_per_gdp,ghg_intensity_with_lulucf_per_gdp_units,ghg_intensity_without_lulucf_per_gdp,ghg_intensity_without_lulucf_per_gdp_units,country_iso_code
0,Congo,1994,-6.848599e+04,CO2eq * kt,1374.69100,CO2eq * kt,1.769365e+09,USD,-38.706526,CO2eq * kt / USD,0.776940,CO2eq * kt / USD,COG
1,Congo,1994,-6.848599e+04,CO2eq * kt,1374.69100,CO2eq * kt,8.892706e+09,USD,-7.701366,CO2eq * kt / USD,0.154586,CO2eq * kt / USD,COG
2,Congo,2000,-8.000142e+04,CO2eq * kt,2065.05200,CO2eq * kt,3.227928e+09,USD,-24.784142,CO2eq * kt / USD,0.639745,CO2eq * kt / USD,COG
3,Congo,2000,-8.000142e+04,CO2eq * kt,2065.05200,CO2eq * kt,1.155590e+10,USD,-6.922995,CO2eq * kt / USD,0.178701,CO2eq * kt / USD,COG
4,Guyana,1990,-5.810327e+04,CO2eq * kt,2553.23000,CO2eq * kt,3.965823e+08,USD,-146.510007,CO2eq * kt / USD,6.438084,CO2eq * kt / USD,GUY
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4270,Central African Republic,2008,-1.736143e+06,CO2eq * kt,9782.46293,CO2eq * kt,3.635577e+09,USD,-477.542439,CO2eq * kt / USD,2.690759,CO2eq * kt / USD,CAF
4271,Central African Republic,2009,-1.740723e+06,CO2eq * kt,5201.62710,CO2eq * kt,2.067382e+09,USD,-841.994201,CO2eq * kt / USD,2.516046,CO2eq * kt / USD,CAF
4272,Central African Republic,2009,-1.740723e+06,CO2eq * kt,5201.62710,CO2eq * kt,3.973077e+09,USD,-438.129738,CO2eq * kt / USD,1.309219,CO2eq * kt / USD,CAF
4273,Central African Republic,2010,-1.740700e+06,CO2eq * kt,5225.25470,CO2eq * kt,2.142591e+09,USD,-812.427309,CO2eq * kt / USD,2.438755,CO2eq * kt / USD,CAF


In [16]:
osc._do_sql(f"insert into {ingest_schema}.{models_table} values(\'{json.dumps(dbt_dict['models'])}\')", engine, verbose=True)

insert into mdt_sandbox.pcaf_dbt_models values('{"sf_wdi_gdp": {"description": "\"\\nThis indicator provides per capita values for gross domestic product (GDP) expressed in current international dollars converted by purchasing power parity (PPP) conversion factor. \\n\\nGDP is the sum of gross value added by all resident producers in the country plus any product taxes and minus any subsidies not included in the value of the products. conversion factor is a spatial price deflator and currency converter that controls for price level differences between countries. Total population is a mid-year population based on the de facto definition of population, which counts all residents regardless of legal status or citizenship.\"", "columns": {"rec_source": {"description": "\"API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv and API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv\""}, "data_provider": {"description": "\"WDI\"", "tags": ["WDI"]}, "country_iso_code": {"description": "\"ISO-3166 Country Code (alpha_3)\"", 

[(1,)]