<font size="5">Ingest Time Series - GHG total without LULUCF, in kt CO₂ equivalent into Trino pipeline</font>


In [1]:
# 'capture' magic prevents long outputs from spamming your notebook
#%%capture pipoutput

# For loading predefined environment variables from files
# Typically used to load sensitive access credentials
%pip install python-dotenv

# Standard python package for interacting with S3 buckets
%pip install boto3

# Interacting with Trino and using Trino with sqlalchemy
%pip install trino sqlalchemy sqlalchemy-trino

# Pandas and parquet file i/o
%pip install pandas pyarrow fastparquet

# OS-Climate utilities to make data ingest easier
%pip install osc-ingest-tools
%pip install openpyxl
%pip install country_converter
%pip install pint_pandas
%pip install openscm_units

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import dotenv_values, load_dotenv
import osc_ingest_trino as osc
import os
import pathlib

Load Environment Variables

In [3]:
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path,override=True)

In [4]:
# use a catalog that is configured for iceberg
ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'pcaf_sovereign_footprint'
ingest_table = 'sf_unfccc_without_lulucf'

In [5]:
import trino
from sqlalchemy.engine import create_engine

env_var_prefix = 'TRINO'

sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ[f'{env_var_prefix}_USER'],
    host = os.environ[f'{env_var_prefix}_HOST'],
    port = os.environ[f'{env_var_prefix}_PORT']
)
sqlargs = {
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    'catalog': 'osc_datacommons_dev'
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

trino_bucket = osc.attach_s3_bucket("S3_DEV")

In [6]:
import boto3

s3_source = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
source_bucket = s3_source.Bucket(os.environ['S3_LANDING_BUCKET'])

Open a Trino connection using JWT for authentication

In [7]:
# make sure schema exists, or table creation below will fail in weird ways
sql = f"""
create schema if not exists {ingest_catalog}.{ingest_schema}
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(True,)]


In [8]:
# Show available schemas to ensure trino connection is set correctly
schema_read = engine.execute(f'show schemas in {ingest_catalog}')
for row in schema_read.fetchall():
    print(row)

('default',)
('demo_dv',)
('iceberg_demo',)
('information_schema',)
('pcaf_sovereign_footprint',)
('sandbox',)


In [9]:
pip install country_converter --upgrade

Note: you may need to restart the kernel to use updated packages.


Load PCAF_UNFCC_WITHOUT_LULUCF

In [10]:
import pandas as pd
import ParseXLS as parser

ticker_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'PCAF-sovereign-footprint/UNFCC/Time_Series_GHG_total_without_LULUCF_in_kt_CO₂_equivalent.xlsx')
ticker_file.download_file(f'/tmp/Time_Series_GHG_total_without_LULUCF_in_kt_CO₂_equivalent.xlsx')

df = parser.process('UNFCCC_without_LULUCF.ini','UNFCCC_without_LULUCF.csv') 

ticker_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'PCAF-sovereign-footprint/UNFCC/Annual_Net_emissions_removals_in_Gg_CO₂_equivalent_excl_LULUCF.csv')
ticker_file.download_file(f'/tmp/Annual_Net_emissions_removals_in_Gg_CO₂_equivalent_excl_LULUCF.csv')

df2 = parser.process('UNFCCC_without_LULUCF_other.ini','UNFCCC_without_LULUCF_other.csv') 

# combine both dataframes

df =pd.concat([df,df2])

#pd.to_numeric(df["validity_date"],errors='raise')
df=df.astype({'validity_date': 'int32'})
df=df.astype({'country_iso_code': 'string'})
df=df.astype({'value': 'float'})

df= df.convert_dtypes()
df.info(verbose=True)
df= df[['rec_source','data_provider','country_iso_code','country_name','validity_date','attribute','value','value_units']].dropna(subset=['value'])






European Union (Convention) not found in regex
European Union (KP) not found in regex
nan not found in ISO3
Source: UNFCCC GHG Data Interface not found in regex
Note 1: The reporting and review requirements for GHG inventories are different for Annex I and non-Annex I Parties. The definition format of data for emissions/removals from the forestry sector is different for Annex I and non-Annex I Parties. not found in regex


UNFCCC_without_LULUCF.ini
file_list:
['/tmp/Time_Series_GHG_total_without_LULUCF_in_kt_CO₂_equivalent.xlsx']
/tmp/Time_Series_GHG_total_without_LULUCF_in_kt_CO₂_equivalent.xlsx
2
xls
/tmp/Time_Series_GHG_total_without_LULUCF_in_kt_CO₂_equivalen
<configparser.ConfigParser object at 0x7f20a81fb9d0>
['0', '2']
eval_components
['Time Series - GHG total without LULUCF, in kt CO₂ equivalent']
['Party', 'Base year', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', 'Last Inventory Year (2019)', 'Change from base year to latest reported year']
Time Series - GHG total without LULUCF, in kt CO₂ equivalent
52


Note 2: Base year data in the data interface relate to the base year under the Climate Change Convention (UNFCCC).  The base year under the Convention is defined slightly different than the base year under the Kyoto Protocol.  An exception is made for European Union (KP) whereby the base year under the Kyoto Protocol is displayed. not found in regex
Note 3: – means "No data available" not found in regex
Note 4: Data displayed on the data interface are "as received" from Parties. The publication of Party submissions on this website does not imply the expression of any opinion whatsoever on the part of the UNFCCC or the Secretariat of the United Nations concerning the legal status of any country, territory, city or area or of its authorities, or concerning the delimitation of its frontiers or boundaries as may be referred to in any of the submissions. not found in regex
Report produced on Friday, 18 March 2022 14:21:36 CET not found in regex
Party \ Unit not found in regex


52
list columns
['country_iso_code', 'country_name', 'attribute', 'units', 'year', 'value']
['country_iso_code', 'country_name', 'attribute', 'value_units', 'value', 'rec_source', 'data_provider', 'validity_date']
UNFCCC_without_LULUCF_other.ini
file_list:
['/tmp/Annual_Net_emissions_removals_in_Gg_CO₂_equivalent_excl_LULUCF.csv']
/tmp/Annual_Net_emissions_removals_in_Gg_CO₂_equivalent_excl_LULUCF.csv
2
csv
/tmp/Annual_Net_emissions_removals_in_Gg_CO₂_equivalent_excl_LULUC
<configparser.ConfigParser object at 0x7f208b1c2a60>
['0', '3']
eval_components
['Time Series - GHG total without LULUCF, in kt CO₂ equivalent']
['Year', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', 'Last Inventory Year (2019)']
Time Series - GHG total without LULUCF, in kt CO₂ equivalent
156


Note 1: The reporting and review requirements for GHG inventories are different for Annex I and non-Annex I Parties. The definition format of data for emissions/removals from the forestry sector is different for Annex I and non-Annex I Parties. not found in regex
Note 2: Base year data in the data interface relate to the base year under the Climate Change Convention (UNFCCC).  The base year under the Convention is defined slightly different than the base year under the Kyoto Protocol.  An exception is made for European Union (KP) whereby the base year under the Kyoto Protocol is displayed. not found in regex
More then one regular expression match for Note 3: Some non-Annex I Parties submitted their GHG inventory data using  the format of the 2006 IPCC Guidelines in reporting GHG emissions/removals.  For this reason, these data could not be included in the data interface.  However, the data are available in the national communications (Andorra, Angola,  Antigua and Barbuda, Armenia, Aze

156
list columns
['country_iso_code', 'country_name', 'attribute', 'units', 'year', 'value']
['country_iso_code', 'country_name', 'attribute', 'value_units', 'value', 'rec_source', 'data_provider', 'validity_date']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5760 entries, 0 to 4675
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   rec_source        5760 non-null   string 
 1   data_provider     5760 non-null   string 
 2   country_name      5760 non-null   string 
 3   country_iso_code  5760 non-null   string 
 4   validity_date     5760 non-null   Int32  
 5   attribute         5760 non-null   string 
 6   value             2212 non-null   Float64
 7   value_units       5760 non-null   string 
dtypes: Float64(1), Int32(1), string(6)
memory usage: 393.8 KB


In [11]:
import osc_ingest_trino as osc
columnschema = osc.create_table_schema_pairs(df) 

sql = f"""
drop table if exists {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
print(sql)
qres = engine.execute(sql)
print(qres.fetchall())



drop table if exists osc_datacommons_dev.pcaf_sovereign_footprint.sf_unfccc_without_lulucf

[(True,)]


In [12]:
tabledef = f"""
create table if not exists {ingest_catalog}.{ingest_schema}.{ingest_table}(
{columnschema}
) with (
    format = 'ORC',
    partitioning = array['country_iso_code']
)
"""
print(tabledef)
qres = engine.execute(tabledef)
print(qres.fetchall())


create table if not exists osc_datacommons_dev.pcaf_sovereign_footprint.sf_unfccc_without_lulucf(
    rec_source varchar,
    data_provider varchar,
    country_iso_code varchar,
    country_name varchar,
    validity_date integer,
    attribute varchar,
    value double,
    value_units varchar
) with (
    format = 'ORC',
    partitioning = array['country_iso_code']
)

[(True,)]


In [13]:
# Delete all data from our db, so we start with empty table
sql=f"""
delete from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(None,)]


In [14]:
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
pd.read_sql(sql, engine)


Unnamed: 0,rec_source,data_provider,country_iso_code,country_name,validity_date,attribute,value,value_units


In [15]:
print(ingest_catalog)
#df=df.drop(df[df.country_name=="cote d'ivoire"].index)
df.to_sql(ingest_table,
           con=engine,
           schema=ingest_schema,
           if_exists='append',
           index=False,
           method=osc.TrinoBatchInsert(batch_size = 1000, verbose = True))

osc_datacommons_dev
constructed fully qualified table name as: "pcaf_sovereign_footprint.sf_unfccc_without_lulucf"
inserting 1000 records
  ('Time_Series_GHG_total_without_LULUCF_in_kt_CO₂_equivalent.xlsx', 'UNFCCC', 'AUS', 'Australia', 1990, 'Time Series - GHG total without LULUCF, in kt CO₂ equivalent', 423672.2245458623, 'kt CO2e')
  ('Time_Series_GHG_total_without_LULUCF_in_kt_CO₂_equivalent.xlsx', 'UNFCCC', 'AUT', 'Austria', 1990, 'Time Series - GHG total without LULUCF, in kt CO₂ equivalent', 78420.48531347305, 'kt CO2e')
  ('Time_Series_GHG_total_without_LULUCF_in_kt_CO₂_equivalent.xlsx', 'UNFCCC', 'BLR', 'Belarus', 1990, 'Time Series - GHG total without LULUCF, in kt CO₂ equivalent', 139151.98563558093, 'kt CO2e')
  ...
  ('Time_Series_GHG_total_without_LULUCF_in_kt_CO₂_equivalent.xlsx', 'UNFCCC', 'EST', 'Estonia', 2013, 'Time Series - GHG total without LULUCF, in kt CO₂ equivalent', 22019.83909772197, 'kt CO2e')
batch insert result: [(1000,)]
inserting 1000 records
  ('Time_Se

In [16]:
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}"""
pd.read_sql(sql, engine)


Unnamed: 0,rec_source,data_provider,country_iso_code,country_name,validity_date,attribute,value,value_units
0,Annual_Net_emissions_removals_in_Gg_CO₂_equiva...,UNFCCC,UZB,Uzbekistan,1990,"Time Series - GHG total without LULUCF, in kt ...",180315.320000,kt CO2e
1,Annual_Net_emissions_removals_in_Gg_CO₂_equiva...,UNFCCC,UZB,Uzbekistan,1991,"Time Series - GHG total without LULUCF, in kt ...",183155.740000,kt CO2e
2,Annual_Net_emissions_removals_in_Gg_CO₂_equiva...,UNFCCC,UZB,Uzbekistan,1992,"Time Series - GHG total without LULUCF, in kt ...",176299.410000,kt CO2e
3,Annual_Net_emissions_removals_in_Gg_CO₂_equiva...,UNFCCC,UZB,Uzbekistan,1993,"Time Series - GHG total without LULUCF, in kt ...",203048.700000,kt CO2e
4,Annual_Net_emissions_removals_in_Gg_CO₂_equiva...,UNFCCC,UZB,Uzbekistan,1994,"Time Series - GHG total without LULUCF, in kt ...",183614.000000,kt CO2e
...,...,...,...,...,...,...,...,...
2207,Time_Series_GHG_total_without_LULUCF_in_kt_CO₂...,UNFCCC,ROU,Romania,2015,"Time Series - GHG total without LULUCF, in kt ...",113193.867137,kt CO2e
2208,Time_Series_GHG_total_without_LULUCF_in_kt_CO₂...,UNFCCC,ROU,Romania,2016,"Time Series - GHG total without LULUCF, in kt ...",110762.205144,kt CO2e
2209,Time_Series_GHG_total_without_LULUCF_in_kt_CO₂...,UNFCCC,ROU,Romania,2017,"Time Series - GHG total without LULUCF, in kt ...",114245.644446,kt CO2e
2210,Time_Series_GHG_total_without_LULUCF_in_kt_CO₂...,UNFCCC,ROU,Romania,2018,"Time Series - GHG total without LULUCF, in kt ...",115090.958527,kt CO2e
