### The PRIMAP-hist national historical emissions time series (1750-2019) v2.3.1

<font size="3">https://zenodo.org/record/5494497#.YujsrTfP2Un</font>

Load Environment Variables

In [1]:
from dotenv import dotenv_values, load_dotenv
import osc_ingest_trino as osc
import os
import pathlib
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path,override=True)

In [2]:
# use a catalog that is configured for iceberg
ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'pcaf_sovereign_footprint'
ingest_table = 'sf_primap_emissions_with_LULUCF'

In [3]:
# Bucket must be configured with credentials for the Hive ingestion bucket
#hive_bucket = osc.attach_s3_bucket('S3_OSCCL2')
hive_bucket = osc.attach_s3_bucket('S3_HIVE')
hive_catalog = 'osc_datacommons_hive_ingest'
hive_schema = 'ingest'

In [4]:
import trino
from sqlalchemy.engine import create_engine

env_var_prefix = 'TRINO'

sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ[f'{env_var_prefix}_USER'],
    host = os.environ[f'{env_var_prefix}_HOST'],
    port = os.environ[f'{env_var_prefix}_PORT']
)
sqlargs = {
    'auth': trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    'http_scheme': 'https',
    'catalog': 'osc_datacommons_dev'
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

trino_bucket = osc.attach_s3_bucket("S3_LANDING")

In [5]:
import boto3

s3_source = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
source_bucket = s3_source.Bucket(os.environ['S3_LANDING_BUCKET'])

Open a Trino connection using JWT for authentication

In [6]:
# make sure schema exists, or table creation below will fail in weird ways
sql = f"""
create schema if not exists {ingest_catalog}.{ingest_schema}
"""
qres = engine.execute(sql)
#print(qres.fetchall())

In [7]:
# Show available schemas to ensure trino connection is set correctly
schema_read = engine.execute(f'show schemas in {ingest_catalog}')
for row in schema_read.fetchall():
    print(row)

('aicoe_osc_demo_results',)
('default',)
('demo_dv',)
('dera',)
('essd',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('rmi',)
('sandbox',)
('wri_gppd',)


In [8]:
import pandas as pd
import csv
import ParseXLS as parser


primap_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'PCAF-sovereign-footprint/PRIMAP/Guetschow-et-al-2022-PRIMAP-hist_v2.4_no_rounding_11-Oct-2022.csv')
primap_file.download_file(f'/tmp/Guetschow-et-al-2022-PRIMAP-hist_v2.4_no_rounding_11-Oct-2022.csv')
#df = parser.process('OECD_IMGR_FCO2.ini','OECD.csv') 

df = parser.process('PRIMAP_with_LULUCF.ini','PRIMAP.csv') 
df=df.astype({'validity_date': 'int32'})
df=df.astype({'country_iso_code': 'string'})
df=df.astype({'attribute': 'string'})
df= df.convert_dtypes()
df.info(verbose=True)

#df['validity_date_dt'] = pd.to_datetime(df.validity_date, format='%Y')
#df = df.convert_dtypes()
#df.validity_date_dt = df.validity_date_dt.astype('datetime64[ns]')



PRIMAP_with_LULUCF.ini
file_list:
['/tmp/Guetschow-et-al-2022-PRIMAP-hist_v2.4_no_rounding_11-Oct-2022.csv']
/tmp/Guetschow-et-al-2022-PRIMAP-hist_v2.4_no_rounding_11-Oct-2022.csv
2
csv
/tmp/Guetschow-et-al-2022-PRIMAP-hist_v2.4_no_rounding_11-Oct-202
<configparser.ConfigParser object at 0x7f27a8110ee0>
                    source scenario (PRIMAP-hist) area (ISO3) entity   
0      PRIMAP-hist_v2.4_nr                 HISTCR         ABW    CH4  \
1      PRIMAP-hist_v2.4_nr                 HISTCR         ABW    CH4   
2      PRIMAP-hist_v2.4_nr                 HISTCR         ABW    CH4   
3      PRIMAP-hist_v2.4_nr                 HISTCR         ABW    CH4   
4      PRIMAP-hist_v2.4_nr                 HISTCR         ABW    CH4   
...                    ...                    ...         ...    ...   
36385  PRIMAP-hist_v2.4_nr                 HISTTP         ZWE    N2O   
36386  PRIMAP-hist_v2.4_nr                 HISTTP         ZWE    N2O   
36387  PRIMAP-hist_v2.4_nr                 HIST

In [9]:
sql = f"""
drop table if exists {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
print(sql)
qres = engine.execute(sql)


'''
columnschema = osc.create_table_schema_pairs(df,typemap={'datetime64[ns]':'timestamp(6)'}) 
print(columnschema)


tabledef = f"""
create table if not exists {ingest_catalog}.{ingest_schema}.{ingest_table}(
{columnschema}
) with (
    format = 'ORC',
    partitioning = ARRAY['country_iso_code']
)"""

print(tabledef)


#print(qres.fetchall())

table_create = engine.execute(tabledef)
#print(table_create.fetchall())
'''




drop table if exists osc_datacommons_dev.pcaf_sovereign_footprint.sf_primap_emissions_with_LULUCF



'\ncolumnschema = osc.create_table_schema_pairs(df,typemap={\'datetime64[ns]\':\'timestamp(6)\'}) \nprint(columnschema)\n\n\ntabledef = f"""\ncreate table if not exists {ingest_catalog}.{ingest_schema}.{ingest_table}(\n{columnschema}\n) with (\n    format = \'ORC\',\n    partitioning = ARRAY[\'country_iso_code\']\n)"""\n\nprint(tabledef)\n\n\n#print(qres.fetchall())\n\ntable_create = engine.execute(tabledef)\n#print(table_create.fetchall())\n'

In [10]:
df.info(verbose=True)
        
        

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58480 entries, 0 to 58479
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   rec_source        58480 non-null  string 
 1   data_provider     58480 non-null  string 
 2   country_name      0 non-null      Int64  
 3   country_iso_code  58480 non-null  string 
 4   validity_date     58480 non-null  Int32  
 5   attribute         58480 non-null  string 
 6   value             58480 non-null  Float64
 7   value_units       58480 non-null  string 
dtypes: Float64(1), Int32(1), Int64(1), string(5)
memory usage: 3.5 MB


In [11]:
'''
osc.fast_pandas_ingest_via_hive(
    df,
    engine,
    ingest_catalog, ingest_schema, ingest_table,
    hive_bucket, hive_catalog, hive_schema,
    partition_columns = ['country_iso_code'],
    overwrite = True,
    verbose = True
)
'''

"\nosc.fast_pandas_ingest_via_hive(\n    df,\n    engine,\n    ingest_catalog, ingest_schema, ingest_table,\n    hive_bucket, hive_catalog, hive_schema,\n    partition_columns = ['country_iso_code'],\n    overwrite = True,\n    verbose = True\n)\n"

In [12]:

df.to_sql(ingest_table,
           con=engine,
           schema=ingest_schema,
           if_exists='append',
           index=False,
           method=osc.TrinoBatchInsert(batch_size = 10000, verbose = True))


constructed fully qualified table name as: "pcaf_sovereign_footprint.sf_primap_emissions_with_LULUCF"
inserting 10000 records
  ('Guetschow-et-al-2022-PRIMAP-hist_v2.4_no_rounding_11-Oct-2022.csv', 'PRIMAP', NULL, 'ABW', 1750, 'KYOTOGHG (AR4GWP100)', 0.0, 'CO2 * gigagram / a')
  ('Guetschow-et-al-2022-PRIMAP-hist_v2.4_no_rounding_11-Oct-2022.csv', 'PRIMAP', NULL, 'AFG', 1750, 'KYOTOGHG (AR4GWP100)', 0.057188118, 'CO2 * gigagram / a')
  ('Guetschow-et-al-2022-PRIMAP-hist_v2.4_no_rounding_11-Oct-2022.csv', 'PRIMAP', NULL, 'AGO', 1750, 'KYOTOGHG (AR4GWP100)', -2.14051e-13, 'CO2 * gigagram / a')
  ...
  ('Guetschow-et-al-2022-PRIMAP-hist_v2.4_no_rounding_11-Oct-2022.csv', 'PRIMAP', NULL, 'LBY', 1796, 'KYOTOGHG (AR4GWP100)', 0.0, 'CO2 * gigagram / a')
batch insert result: [(10000,)]
inserting 10000 records
  ('Guetschow-et-al-2022-PRIMAP-hist_v2.4_no_rounding_11-Oct-2022.csv', 'PRIMAP', NULL, 'LCA', 1796, 'KYOTOGHG (AR4GWP100)', 0.0, 'CO2 * gigagram / a')
  ('Guetschow-et-al-2022-PRIMAP-his

  df.to_sql(ingest_table,


In [13]:
import pandas as pd
sql=f"""
select *  from {ingest_catalog}.{ingest_schema}.{ingest_table} where validity_date=2021 and country_iso_code='TWN'"""
pd.read_sql(sql, engine)

Unnamed: 0,rec_source,data_provider,country_name,country_iso_code,validity_date,attribute,value,value_units
0,Guetschow-et-al-2022-PRIMAP-hist_v2.4_no_round...,PRIMAP,,TWN,2021,KYOTOGHG (AR4GWP100),-21941.65,CO2 * gigagram / a


In [14]:
sql=f"""
select distinct country_iso_code  from {ingest_catalog}.{ingest_schema}.{ingest_table} where validity_date=2021"""
pd.read_sql(sql, engine)

Unnamed: 0,country_iso_code
0,ARE
1,ARG
2,BASIC
3,CAF
4,CAN
...,...
210,TKL
211,TLS
212,VAT
213,VCT
