### The PRIMAP-hist national historical emissions time series (1750-2019) v2.3.1

<font size="3">https://zenodo.org/record/5494497#.YujsrTfP2Un</font>

In [1]:
# 'capture' magic prevents long outputs from spamming your notebook
#%%capture pipoutput

# For loading predefined environment variables from files
# Typically used to load sensitive access credentials
%pip install python-dotenv

# Standard python package for interacting with S3 buckets
%pip install boto3

# Interacting with Trino and using Trino with sqlalchemy
%pip install trino sqlalchemy sqlalchemy-trino

# Pandas and parquet file i/o
%pip install pandas pyarrow fastparquet

# OS-Climate utilities to make data ingest easier
%pip install --upgrade osc-ingest-tools
#%pip install osc-ingest-tools

%pip install osc_ingest_trino
%pip install country_converter --upgrade

%pip install pint-pandas
%pip install openscm-units
%pip install pint
%pip install pycountry
%pip install openpyxl



Collecting python-dotenv
  Downloading python_dotenv-0.20.0-py3-none-any.whl (17 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.20.0
Note: you may need to restart the kernel to use updated packages.
Collecting boto3
  Downloading boto3-1.24.55-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.5/132.5 kB[0m [31m92.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.6/79.6 kB[0m [31m312.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting botocore<1.28.0,>=1.27.55
  Downloading botocore-1.27.55-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m353.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jmespath, 

Load Environment Variables

In [2]:
from dotenv import dotenv_values, load_dotenv
import osc_ingest_trino as osc
import os
import pathlib
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path,override=True)

In [3]:
# use a catalog that is configured for iceberg
ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'pcaf_sovereign_footprint'
ingest_table = 'sf_primap_hist_emissions'

In [4]:
import trino
from sqlalchemy.engine import create_engine

env_var_prefix = 'TRINO'

sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ[f'{env_var_prefix}_USER'],
    host = os.environ[f'{env_var_prefix}_HOST'],
    port = os.environ[f'{env_var_prefix}_PORT']
)
sqlargs = {
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    'catalog': 'osc_datacommons_dev'
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

trino_bucket = osc.attach_s3_bucket("S3_OSCCL2")

In [5]:
import boto3

s3_source = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
source_bucket = s3_source.Bucket(os.environ['S3_LANDING_BUCKET'])

Open a Trino connection using JWT for authentication

In [6]:
# make sure schema exists, or table creation below will fail in weird ways
sql = f"""
create schema if not exists {ingest_catalog}.{ingest_schema}
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(True,)]


In [7]:
# Show available schemas to ensure trino connection is set correctly
schema_read = engine.execute(f'show schemas in {ingest_catalog}')
for row in schema_read.fetchall():
    print(row)

('default',)
('demo_dv',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('sandbox',)


In [8]:
import pandas as pd
import csv
import ParseXLS as parser


primap_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'PCAF-sovereign-footprint/PRIMAP/Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_2021.csv')
primap_file.download_file(f'/tmp/Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_2021.csv')
#df = parser.process('OECD_IMGR_FCO2.ini','OECD.csv') 

df = parser.process('PRIMAP.ini','PRIMAP.csv') 
df=df.astype({'validity_date': 'int32'})
df=df.astype({'country_iso_code': 'string'})
df=df.astype({'attribute': 'string'})
df= df.convert_dtypes()
df.info(verbose=True)

df['validity_date_dt'] = pd.to_datetime(df.validity_date, format='%Y')
df = df.convert_dtypes()
df.validity_date_dt = df.validity_date_dt.astype('datetime64[ns]')



PRIMAP.ini
file_list:
['/tmp/Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_2021.csv']
/tmp/Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_2021.csv
2
csv
/tmp/Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_202
<configparser.ConfigParser object at 0x7f7056c6afd0>
                   source scenario (PRIMAP-hist) area (ISO3) entity  \
0      PRIMAP-hist_v2.3.1                 HISTCR         ABW    CH4   
1      PRIMAP-hist_v2.3.1                 HISTCR         ABW    CH4   
2      PRIMAP-hist_v2.3.1                 HISTCR         ABW    CH4   
3      PRIMAP-hist_v2.3.1                 HISTCR         ABW    CH4   
4      PRIMAP-hist_v2.3.1                 HISTCR         ABW    CH4   
...                   ...                    ...         ...    ...   
28275  PRIMAP-hist_v2.3.1                 HISTTP         ZWE    N2O   
28276  PRIMAP-hist_v2.3.1                 HISTTP         ZWE    N2O   
28277  PRIMAP-hist_v2.3.1                 HISTTP         ZWE    N2O   
28278  PRIMAP-hist_v2.3.1  

In [9]:
columnschema = osc.create_table_schema_pairs(df,typemap={'datetime64[ns]':'timestamp(6)'}) 
print(columnschema)

tabledef = f"""
create table if not exists {ingest_catalog}.{ingest_schema}.{ingest_table}(
{columnschema}
) with (
    format = 'ORC',
    partitioning = ARRAY['country_iso_code']
)
"""
print(tabledef)


sql = f"""
drop table if exists {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
print(sql)
qres = engine.execute(sql)
print(qres.fetchall())

table_create = engine.execute(tabledef)
print(table_create.fetchall())




    rec_source varchar,
    data_provider varchar,
    country_name bigint,
    country_iso_code varchar,
    validity_date integer,
    attribute varchar,
    value double,
    value_units varchar,
    validity_date_dt timestamp(6)

create table if not exists osc_datacommons_dev.pcaf_sovereign_footprint.sf_primap_hist_emissions(
    rec_source varchar,
    data_provider varchar,
    country_name bigint,
    country_iso_code varchar,
    validity_date integer,
    attribute varchar,
    value double,
    value_units varchar,
    validity_date_dt timestamp(6)
) with (
    format = 'ORC',
    partitioning = ARRAY['country_iso_code']
)


drop table if exists osc_datacommons_dev.pcaf_sovereign_footprint.sf_primap_hist_emissions

[(True,)]
[(True,)]


In [10]:
df.info(verbose=True)
        
        

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1023840 entries, 0 to 1023839
Data columns (total 9 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   rec_source        1023840 non-null  string        
 1   data_provider     1023840 non-null  string        
 2   country_name      0 non-null        Int64         
 3   country_iso_code  1023840 non-null  string        
 4   validity_date     1023840 non-null  Int32         
 5   attribute         1023840 non-null  string        
 6   value             1019882 non-null  Float64       
 7   value_units       1023840 non-null  string        
 8   validity_date_dt  1023840 non-null  datetime64[ns]
dtypes: Float64(1), Int32(1), Int64(1), datetime64[ns](1), string(5)
memory usage: 77.1 MB


In [11]:
df.to_sql(ingest_table,
           con=engine,
           schema=ingest_schema,
           if_exists='append',
           index=False,
           method=osc.TrinoBatchInsert(batch_size = 5000, verbose = True))

constructed fully qualified table name as: "pcaf_sovereign_footprint.sf_primap_hist_emissions"
inserting 5000 records
  ('Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_2021.csv', 'PRIMAP', NULL, 'ABW', 1750, 'CH4', 0.014, 'Gg CH4 / yr', TIMESTAMP '1750-01-01 00:00:00')
  ('Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_2021.csv', 'PRIMAP', NULL, 'ABW', 1750, 'CO2', 0.0, 'Gg CO2 / yr', TIMESTAMP '1750-01-01 00:00:00')
  ('Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_2021.csv', 'PRIMAP', NULL, 'ABW', 1750, 'KYOTOGHG (AR4GWP100)', 0.365, 'Gg CO2 / yr', TIMESTAMP '1750-01-01 00:00:00')
  ...
  ('Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_2021.csv', 'PRIMAP', NULL, 'MOZ', 1751, 'HFCS (AR4GWP100)', 0.0, 'Gg CO2 / yr', TIMESTAMP '1751-01-01 00:00:00')
batch insert result: [(5000,)]
inserting 5000 records
  ('Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep_2021.csv', 'PRIMAP', NULL, 'MOZ', 1751, 'HFCS (SARGWP100)', 0.0, 'Gg CO2 / yr', TIMESTAMP '1751-01-01 00:00:00')
  ('Guetschow-et-al-

In [12]:
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}" + "where validity_date=2019"""
pd.read_sql(sql, engine)

Unnamed: 0,rec_source,data_provider,country_name,country_iso_code,validity_date,attribute,value,value_units,validity_date_dt
0,Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep...,PRIMAP,,LBN,2019,CH4,109.000000,Gg CH4 / yr,2019-01-01 00:00:00.000
1,Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep...,PRIMAP,,LBN,2019,CO2,21000.000000,Gg CO2 / yr,2019-01-01 00:00:00.000
2,Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep...,PRIMAP,,LBN,2019,KYOTOGHG (AR4GWP100),25400.000000,Gg CO2 / yr,2019-01-01 00:00:00.000
3,Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep...,PRIMAP,,LBN,2019,KYOTOGHG (SARGWP100),25100.000000,Gg CO2 / yr,2019-01-01 00:00:00.000
4,Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep...,PRIMAP,,LBN,2019,N2O,5.780000,Gg N2O / yr,2019-01-01 00:00:00.000
...,...,...,...,...,...,...,...,...,...
3787,Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep...,PRIMAP,,LUX,2019,KYOTOGHG (SARGWP100),10700.000000,Gg CO2 / yr,2019-01-01 00:00:00.000
3788,Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep...,PRIMAP,,LUX,2019,N2O,1.190000,Gg N2O / yr,2019-01-01 00:00:00.000
3789,Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep...,PRIMAP,,LUX,2019,PFCS (AR4GWP100),2.050000,Gg CO2 / yr,2019-01-01 00:00:00.000
3790,Guetschow-et-al-2021-PRIMAP-hist_v2.3.1_20-Sep...,PRIMAP,,LUX,2019,PFCS (SARGWP100),1.630000,Gg CO2 / yr,2019-01-01 00:00:00.000
