### The PRIMAP-hist national historical emissions time series (1750-2019) v2.3.1

<font size="3">https://zenodo.org/record/5494497#.YujsrTfP2Un</font>

Load Environment Variables

In [None]:
from dotenv import dotenv_values, load_dotenv
import osc_ingest_trino as osc
import os
import pathlib
# Work out where credentials.env lives: CREDENTIAL_DOTENV_DIR wins, then PWD,
# and finally the hard-coded app-root source directory.
fallback_dir = os.environ.get("PWD", "/opt/app-root/src")
dotenv_dir = os.environ.get("CREDENTIAL_DOTENV_DIR", fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir) / "credentials.env"

# A missing file is not an error; the environment is then used as-is.
# override=True lets values from the file replace already-set variables.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [None]:
# Fully-qualified destination of the ingested data: catalog.schema.table.
# The catalog has to be one that is configured for Iceberg.
ingest_catalog = "osc_datacommons_dev"
ingest_schema = "pcaf_sovereign_footprint"
ingest_table = "sf_primap_hist_emissions"

In [None]:
# Staging location handed to the Hive-based ingest further down.
# The 'S3_OSCCL2' bucket must be configured with credentials for the Hive
# ingestion bucket.
hive_catalog = "osc_datacommons_hive_ingest"
hive_schema = "ingest"
hive_bucket = osc.attach_s3_bucket("S3_OSCCL2")

In [None]:
import trino
from sqlalchemy.engine import create_engine

# Build a SQLAlchemy engine for Trino from the <prefix>_USER / _HOST / _PORT /
# _PASSWD environment variables; a missing variable raises KeyError.
env_var_prefix = 'TRINO'

trino_user = os.environ[f'{env_var_prefix}_USER']
trino_host = os.environ[f'{env_var_prefix}_HOST']
trino_port = os.environ[f'{env_var_prefix}_PORT']
sqlstring = f'trino://{trino_user}@{trino_host}:{trino_port}/'

sqlargs = {
    # <prefix>_PASSWD holds the JWT token used for authentication.
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    # Use the configured ingest catalog as the connection default instead of a
    # second hard-coded copy of its name: statements that only pass a schema
    # (e.g. DataFrame.to_sql) resolve their catalog from the connection, so
    # the two must not drift apart.
    'catalog': ingest_catalog,
}
engine = create_engine(sqlstring, connect_args=sqlargs)
connection = engine.connect()

# NOTE(review): same bucket label as hive_bucket ('S3_OSCCL2'); kept because
# other cells may still refer to trino_bucket.
trino_bucket = osc.attach_s3_bucket("S3_OSCCL2")

In [None]:
import boto3

# Connection settings for the landing bucket that holds the raw source files,
# all taken from S3_LANDING_* environment variables.
landing_s3_settings = {
    "service_name": "s3",
    "endpoint_url": os.environ["S3_LANDING_ENDPOINT"],
    "aws_access_key_id": os.environ["S3_LANDING_ACCESS_KEY"],
    "aws_secret_access_key": os.environ["S3_LANDING_SECRET_KEY"],
}
s3_source = boto3.resource(**landing_s3_settings)

# Handle on the landing bucket itself.
landing_bucket_name = os.environ["S3_LANDING_BUCKET"]
source_bucket = s3_source.Bucket(landing_bucket_name)

Using the Trino connection opened above (JWT authentication), make sure the target schema exists and list the available schemas

In [None]:
# Create the target schema up front; without it the table creation later on
# fails in confusing ways.
sql = f"\ncreate schema if not exists {ingest_catalog}.{ingest_schema}\n"
qres = engine.execute(sql)
# For debugging, the statement result can be inspected with qres.fetchall().

In [None]:
# Sanity check of the Trino connection: list every schema visible in the
# ingest catalog, one result row per line.
schema_read = engine.execute(f"show schemas in {ingest_catalog}")
schema_rows = schema_read.fetchall()
for row in schema_rows:
    print(row)

In [None]:
import pandas as pd
import csv
import ParseXLS as parser


# Fetch the PRIMAP-hist CSV from the landing bucket into /tmp.
primap_key = 'PCAF-sovereign-footprint/PRIMAP/Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_2021.csv'
primap_local_path = '/tmp/Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_2021.csv'
primap_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'], primap_key)
primap_file.download_file(primap_local_path)

# Parse the data into a DataFrame via the project's ParseXLS helper.
# NOTE(review): the parser is given 'PRIMAP.csv', not the /tmp path downloaded
# above -- presumably PRIMAP.ini points at the real input; confirm.
df = parser.process('PRIMAP.ini', 'PRIMAP.csv')

# Pin the key columns to explicit dtypes, then let pandas pick the best
# nullable dtype for the remaining columns.
df = df.astype({
    'validity_date': 'int32',
    'country_iso_code': 'string',
    'attribute': 'string',
})
df = df.convert_dtypes()
df.info(verbose=True)

# A datetime variant of validity_date (pd.to_datetime(..., format='%Y')) was
# tried here before and is currently disabled.



In [None]:
# Derive the column definitions from the DataFrame dtypes, mapping pandas
# datetimes to timestamp(6).
pandas_to_sql_types = {'datetime64[ns]': 'timestamp(6)'}
columnschema = osc.create_table_schema_pairs(df, typemap=pandas_to_sql_types)
print(columnschema)

qualified_table = f"{ingest_catalog}.{ingest_schema}.{ingest_table}"

# The CREATE TABLE statement is only printed for reference in this cell; it is
# not executed here.
tabledef = f"""
create table if not exists {qualified_table}(
{columnschema}
) with (
    format = 'ORC',
    partitioning = ARRAY['country_iso_code']
)
"""
print(tabledef)

# Drop any previous version of the table so the ingest starts clean.
sql = f"\ndrop table if exists {qualified_table}\n"
print(sql)
qres = engine.execute(sql)




In [None]:
# Print the full per-column summary of df as a last check before ingesting it.
df.info(verbose=True)
        
        

In [None]:
# Bulk-load df into ingest_catalog.ingest_schema.ingest_table, partitioned by
# country. Judging by its name and arguments the helper stages the data
# through the Hive bucket/catalog/schema -- see osc_ingest_trino for details.
# overwrite=True presumably replaces any existing table contents (confirm).
osc.fast_pandas_ingest_via_hive(
    df,
    engine,
    ingest_catalog,
    ingest_schema,
    ingest_table,
    hive_bucket,
    hive_catalog,
    hive_schema,
    partition_columns=['country_iso_code'],
    overwrite=True,
    verbose=True,
)

In [None]:
# Append df to the target table through SQLAlchemy, using the batched INSERT
# helper from osc_ingest_trino with one row per batch.
# NOTE(review): if the fast Hive-based ingest of the same df has already run,
# this append presumably writes every row a second time -- confirm whether
# this cell is meant as an alternative ingest path rather than a follow-up.
batch_insert = osc.TrinoBatchInsert(batch_size=1, verbose=True)
df.to_sql(
    ingest_table,
    con=engine,
    schema=ingest_schema,
    if_exists='append',
    index=False,
    method=batch_insert,
)

In [None]:
import pandas as pd
# Spot-check the ingested data: all rows for Germany (DEU) in 2007.
sql = (
    "\n"
    f"select *  from {ingest_catalog}.{ingest_schema}.{ingest_table}"
    " where validity_date=2007 and country_iso_code='DEU'"
)
pd.read_sql(sql, engine)