<font size="5">Ingest Time Series - GHG total with LULUCF, in kt CO₂ equivalent into Trino pipeline</font>


In [1]:
from dotenv import dotenv_values, load_dotenv
import osc_ingest_trino as osc
import os
import pathlib

Load Environment Variables

In [2]:
# Locate credentials.env: prefer CREDENTIAL_DOTENV_DIR, then the PWD
# environment variable, and finally the default application root.
default_dotenv_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', default_dotenv_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')

# Load the file only when it is present. override=True is passed so that
# values from the file take precedence over variables already set.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [3]:
# Target location in Trino. The catalog must be one configured for Iceberg.
ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'pcaf_sovereign_footprint'

# Destination tables: one for the emissions time series, one for the
# country list.
ingest_table = 'sf_unfccc_with_lulucf'
ingest_country_table = 'sf_unfccc_countries'

In [4]:
import trino
from sqlalchemy.engine import create_engine

# Prefix of the environment variables that hold the Trino connection
# settings (<prefix>_USER, <prefix>_HOST, <prefix>_PORT, <prefix>_PASSWD).
env_var_prefix = 'TRINO'

sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ[f'{env_var_prefix}_USER'],
    host = os.environ[f'{env_var_prefix}_HOST'],
    port = os.environ[f'{env_var_prefix}_PORT']
)
sqlargs = {
    # The value of <prefix>_PASSWD is handed to JWTAuthentication as the token.
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    # Reuse the configured ingest catalog instead of repeating the literal,
    # so the connection default and the SQL statements cannot drift apart.
    'catalog': ingest_catalog
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

# Attach the S3 bucket described by the S3_DEV_* settings.
# NOTE(review): exact variables read by attach_s3_bucket are not visible here — confirm.
trino_bucket = osc.attach_s3_bucket("S3_DEV")

In [5]:
import boto3

# S3 resource for the landing area; endpoint and credentials are read from
# the S3_LANDING_* environment variables (a missing variable raises KeyError).
s3_source = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
# Handle on the landing bucket named by S3_LANDING_BUCKET.
source_bucket = s3_source.Bucket(os.environ['S3_LANDING_BUCKET'])

Verify the Trino connection (opened above using JWT authentication) by listing the available schemas

In [6]:
# Sanity check: list the schemas in the ingest catalog to confirm that the
# Trino connection works. Each row is printed as a one-element tuple.
schema_read = engine.execute(f'show schemas in {ingest_catalog}')
for row in schema_read.fetchall():
    print(row)

('aicoe_osc_demo_results',)
('default',)
('demo_dv',)
('dera',)
('essd',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('rmi',)
('sandbox',)
('wri_gppd',)


In [7]:
pip install country_converter --upgrade


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Load PCAF_UNFCCC_WITH_LULUCF

In [8]:
import pandas as pd
import ParseXLS as parser

# Download the time series workbook from the landing bucket to /tmp.
# NOTE(review): the parser presumably finds the file via UNFCCC_with_LULUCF.ini — confirm the path there.
ticker_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'PCAF-sovereign-footprint/UNFCCC/Time_Series_GHG_total_with_LULUCF_in_kt_CO₂_equivalent.xlsx')
ticker_file.download_file('/tmp/Time_Series_GHG_total_with_LULUCF_in_kt_CO₂_equivalent.xlsx')

df1 = parser.process('UNFCCC_with_LULUCF.ini','UNFCCC_with_LULUCF.csv')

# Same procedure for the annual net emissions/removals CSV.
ticker_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'PCAF-sovereign-footprint/UNFCCC/Annual_Net_emissions_removals_in_Gg_CO₂_equivalent_incl_LULUCF.csv')
ticker_file.download_file('/tmp/Annual_Net_emissions_removals_in_Gg_CO₂_equivalent_incl_LULUCF.csv')

df2 = parser.process('UNFCCC_with_LULUCF_other.ini','UNFCCC_with_LULUCF_other.csv')

# Combine both frames. ignore_index gives every row a unique label, so a
# later index-based selection or drop cannot match rows from both sources.
df = pd.concat([df1, df2], ignore_index=True)

# Set the explicit column dtypes in one pass.
df = df.astype({
    'validity_date': 'int32',
    'country_iso_code': 'string',
    'value': 'float',
})

# Let pandas pick nullable dtypes for the remaining columns.
df = df.convert_dtypes()
df.info(verbose=True)

# Keep only the columns that are ingested and drop rows without a value.
df = df[['rec_source','data_provider','country_iso_code','country_name','validity_date','attribute','value','value_units']].dropna(subset=['value'])
#df


UNFCCC_with_LULUCF.ini
file_list:
['/tmp/Time_Series_GHG_total_with_LULUCF_in_kt_CO₂_equivalent.xlsx']
/tmp/Time_Series_GHG_total_with_LULUCF_in_kt_CO₂_equivalent.xlsx
2
xls
/tmp/Time_Series_GHG_total_with_LULUCF_in_kt_CO₂_equivalen
<configparser.ConfigParser object at 0x7f48a0c1ce20>
['0', '2']
UNFCCC_with_LULUCF_other.ini
file_list:
['/tmp/Annual_Net_emissions_removals_in_Gg_CO₂_equivalent_incl_LULUCF.csv']
/tmp/Annual_Net_emissions_removals_in_Gg_CO₂_equivalent_incl_LULUCF.csv
2
csv
/tmp/Annual_Net_emissions_removals_in_Gg_CO₂_equivalent_incl_LULUC
<configparser.ConfigParser object at 0x7f48a0c09a00>


NameError: name 'XXXXXXX' is not defined

In [None]:
# One row per distinct country code found in df1, flagged with
# annex1_flag = "Y"; both columns are stored with the pandas string dtype.
df_annex1 = (
    df1[["country_iso_code"]]
    .drop_duplicates()
    .astype({'country_iso_code': 'string'})
    .assign(annex1_flag="Y")
    .astype({'annex1_flag': 'string'})
)
df_annex1.info(verbose=True)
df_annex1

In [None]:
# One row per distinct country code found in df2, flagged with
# annex1_flag = "N".
df_non_annex1 = (
    df2[["country_iso_code"]]
    .drop_duplicates()
    .assign(annex1_flag="N")
)
df_non_annex1

In [None]:
# Countries added by hand (described as "missing" in this notebook); all of
# them get annex1_flag "N".
missing_countries = {
    'country_iso_code': ['TWN', 'AND', 'HKG', 'BMU'],
    'annex1_flag': ['N', 'N', 'N', 'N'],
}
df_country_missing = pd.DataFrame(missing_countries)

# Combine all three frames first and normalise dtypes afterwards, so the
# hand-added rows share the string dtypes of the parsed rows instead of
# turning the columns back into object dtype. ignore_index avoids repeated
# index labels in the combined frame.
df_country = pd.concat(
    [df_annex1, df_non_annex1, df_country_missing],
    ignore_index=True,
)
df_country = (
    df_country
    .astype({'country_iso_code': 'string', 'annex1_flag': 'string'})
    .convert_dtypes()
)
df_country

In [None]:
import osc_ingest_trino as osc
# Drop any previous version of the country table so that it is rebuilt from
# scratch. No column schema is computed here: this cell does not use one.
sql = f"""
drop table if exists {ingest_catalog}.{ingest_schema}.{ingest_country_table}
"""
print(sql)
qres = engine.execute(sql)

In [None]:
print(ingest_catalog)
# Append the country list to the country table, inserting in batches of
# 1000 rows; the DataFrame index is not written.
df_country.to_sql(
    name=ingest_country_table,
    con=engine,
    schema=ingest_schema,
    if_exists='append',
    index=False,
    method=osc.TrinoBatchInsert(batch_size=1000, verbose=True),
)

In [None]:
import osc_ingest_trino as osc
# Derive the column definitions for the emissions table from the DataFrame;
# the result is interpolated into the CREATE TABLE statement.
columnschema = osc.create_table_schema_pairs(df) 

# Drop any previous version of the emissions table so it can be recreated.
sql = f"""
drop table if exists {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
print(sql)
qres = engine.execute(sql)

In [None]:
# Create the emissions table (if it does not exist yet) using the column
# definitions in `columnschema`, stored as ORC and partitioned by
# country_iso_code.
tabledef = f"""
create table if not exists {ingest_catalog}.{ingest_schema}.{ingest_table}(
{columnschema}
) with (
    format = 'ORC',
    partitioning = array['country_iso_code']
)
"""
# Print the statement so the generated DDL can be inspected in the output.
print(tabledef)
qres = engine.execute(tabledef)
#print(qres.fetchall())

In [None]:
# Remove all existing rows from the emissions table so that the load starts
# from an empty table.
sql=f"""
delete from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
qres = engine.execute(sql)
# Print whatever the DELETE statement returns.
print(qres.fetchall())

In [None]:
# Read back the full contents of the emissions table for inspection.
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
pd.read_sql(sql, engine)


In [None]:
print(ingest_catalog)
# Append the emissions rows to the target table, inserting in batches of
# 1000 rows; the DataFrame index is not written.
df.to_sql(
    name=ingest_table,
    con=engine,
    schema=ingest_schema,
    if_exists='append',
    index=False,
    method=osc.TrinoBatchInsert(batch_size=1000, verbose=True),
)

In [None]:
# Spot check of the ingested data: rows for country code CAN with
# validity_date 2020.
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table} where country_iso_code='CAN' and validity_date=2020"""
pd.read_sql(sql, engine)


In [None]:
# Read the country table back from Trino and export it to a spreadsheet.
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_country_table}  """
# Use a dedicated name: rebinding `df` here would replace the emissions frame
# that the ingestion cells read, so re-running them would load country rows.
df_country_export = pd.read_sql(sql, engine)
# The `encoding` keyword of to_excel is deprecated (pandas 1.5) and removed
# in pandas 2.0, so it is not passed.
# NOTE(review): writing the legacy .xls format needs the xlwt engine, which
# newer pandas versions no longer support — confirm whether .xlsx is acceptable.
df_country_export.to_excel("pcaf_countries.xls", index=False)