Demonstrate writing and reading dataframes containing Pint data (and highlight limitations)

Next step: add uncertainties

In [1]:
import os
import pathlib
from dotenv import load_dotenv

# Optionally pull standard environment variables from a `credentials.env` file.
# The directory is CREDENTIAL_DOTENV_DIR if set, otherwise PWD, otherwise a
# built-in default.  A missing file is not an error: nothing is loaded and the
# variables are expected to be provided some other way.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')
if os.path.exists(dotenv_path):
    # override=True: values from the file win over ones already in the environment
    load_dotenv(dotenv_path=dotenv_path, override=True)

import osc_ingest_trino as osc

import numpy as np
import pandas as pd

import pint
import pint_pandas
import openscm_units

from pint import set_application_registry
from pint_pandas import PintArray, PintType
from openscm_units import unit_registry
# Use the openscm_units registry as the single unit registry: for this notebook
# (`ureg`), for pint_pandas dtypes (`PintType.ureg`) and as pint's application
# registry.
ureg = unit_registry
PintType.ureg = ureg
set_application_registry(ureg)

# Short aliases for building quantities and Pint arrays.
Q_, PA_ = ureg.Quantity, PintArray

# Extra unit definitions used by this notebook, in pint's definition syntax.
# NOTE(review): pint reads "a = b = c = d" as name = value = symbol = alias, so
# this presumably defines CO2e as equal to CO2 -- confirm against the pint docs.
ureg.define("CO2e = CO2 = CO2eq = CO2_eq")
# Declares Fe_ton as the reference unit of a new [produced_ton] dimension.
ureg.define("Fe_ton = [produced_ton]")

import ast

### Connect to Trino with sqlalchemy

In [2]:
import trino
from sqlalchemy.engine import create_engine

# Catalog and schema that every table in this notebook is written to / read from.
ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'sandbox'

# Connection URL assembled from the environment; a missing variable raises KeyError.
sqlstring = f"trino://{os.environ['TRINO_USER']}@{os.environ['TRINO_HOST']}:{os.environ['TRINO_PORT']}/"

# Driver-level arguments: JWT authentication over HTTPS, plus the default
# catalog and schema for the session.
sqlargs = dict(
    auth=trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD']),
    http_scheme='https',
    catalog=ingest_catalog,
    schema=ingest_schema,
)

engine = create_engine(sqlstring, connect_args=sqlargs)
connection = engine.connect()

Create a simple dataframe with Pint units (megatonnes of CO2, `Mt CO2`) and numeric data (the year expressed as a dimensionless number).

Note that dates can be neither *datetime* nor *string*, because neither type is understood by Pint.

In [3]:
# Two Pint-typed columns: emissions in Mt CO2 and the year as a dimensionless
# number.
co2_df = pd.DataFrame({
    'co2': pd.Series(
        [Q_(magnitude, 'Mt CO2') for magnitude in (1.0, 0.9, 0.8)],
        dtype='pint[Mt CO2]',
    ),
    'year': pd.Series(
        [2016.0, 2017.0, 2018.0],
        dtype='pint[dimensionless]',
    ),
})

Write the dataframe to a Trino table.

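The *pint.dequantify* method converts Pint columns to plain numeric columns whose labels are tuples of column name and Pint unit.  But after the round trip through Trino the unit strings come back in lower case (`co2 * megametric_ton` in the `describe` output below), and ***they are then no longer correctly recognized in the unit registry***.  Note that *dequantify* itself preserves case (the last cell shows `CO2 * megametric_ton`), so the downcasing most likely happens when the labels are stored as Trino column names.  This problem is not unique to CO2: M and m mean different things to Pint, as do G and g, and several other SI abbreviations.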

In [4]:
ingest_table = "pint_co2"

# Drop any previous copy of the table so that the 'append' below starts from
# an empty table.
engine.execute(f"drop table if exists {ingest_schema}.{ingest_table}")

# dequantify() strips the Pint dtypes, leaving plain numbers under
# (name, unit) column labels; the rows are then inserted in batches.
co2_df.pint.dequantify().to_sql(
    name=ingest_table,
    con=engine,
    schema=ingest_schema,
    if_exists='append',
    index=False,
    method=osc.TrinoBatchInsert(batch_size=100, verbose=True, optimize=True),
)

constructed fully qualified table name as: "sandbox.pint_co2"
inserting 3 records
  (1.0, 2016.0)
  (0.9, 2017.0)
  (0.8, 2018.0)
batch insert result: [(3,)]
optimizing table files
execute optimize: []


In [5]:
# Inspect the stored schema.  The table is addressed through ingest_schema /
# ingest_table (as in the write and read cells) rather than a hard-coded name,
# so the cell keeps pointing at the table that was actually written.
engine.execute(f"describe {ingest_schema}.{ingest_table}").fetchall()

[("('co2', 'co2 * megametric_ton')", 'double', '', ''),
 ("('year', 'dimensionless')", 'double', '', '')]

In [6]:
# Fetch the raw rows back.  The table is addressed through ingest_schema /
# ingest_table (as in the write and read cells) rather than a hard-coded name.
engine.execute(f"select * from {ingest_schema}.{ingest_table}").fetchall()

[(1.0, 2016.0), (0.9, 2017.0), (0.8, 2018.0)]

In [7]:
# Load the whole table back into a plain (unit-less) pandas dataframe.
new_df = pd.read_sql(
    sql=f"select * from {ingest_schema}.{ingest_table}",
    con=engine,
)

In [8]:
# Show the column labels as they come back from Trino: each one is the string
# form of a (name, unit) tuple.
new_df.columns

Index(['('co2', 'co2 * megametric_ton')', '('year', 'dimensionless')'], dtype='object')

To turn the dataframe back into usable Pint quantities, we have to parse the specially constructed column labels to recover both the original column name and its Pint datatype.  Is there an easier way?

In [9]:
def _requantify(flat_df, unit_fixups=(('co2', 'CO2'),)):
    """Rebuild a Pint-typed dataframe from one whose labels encode the units.

    Each column label of ``flat_df`` must be the string form of a
    ``(name, unit)`` tuple, e.g. ``"('co2', 'co2 * megametric_ton')"``.  The
    label is parsed with ``ast.literal_eval``, the ``unit_fixups`` text
    substitutions are applied to the unit string, and the column is cast to
    ``pint[<unit>]`` under the recovered name.

    Parameters
    ----------
    flat_df : pandas.DataFrame
        Numeric dataframe with stringified ``(name, unit)`` column labels.
    unit_fixups : iterable of (str, str)
        ``(found, replacement)`` pairs applied in order with ``str.replace``
        to every unit string.  The default restores the case of ``CO2``, which
        comes back as ``co2`` in the labels read from the database.

    Returns
    -------
    pandas.DataFrame
        One Pint-typed column per input column, keyed by the recovered name.

    Raises
    ------
    ValueError, SyntaxError
        From ``ast.literal_eval`` if a label is not a valid Python literal.
    """
    restored = {}
    # One pass over the columns: the label gives (name, unit), the series gives the data.
    for label, series in flat_df.items():
        name, unit = ast.literal_eval(label)
        for found, replacement in unit_fixups:
            unit = unit.replace(found, replacement)
        restored[name] = series.astype(f"pint[{unit}]")
    return pd.DataFrame(restored)


new_co2_df = _requantify(new_df)
new_co2_df.dtypes

co2     pint[CO2 * megametric_ton]
year           pint[dimensionless]
dtype: object

In [10]:
# Display the reconstructed, Pint-typed dataframe.
new_co2_df

  return np.array(qtys, dtype="object", copy=copy)
  return np.array(qtys, dtype="object", copy=copy)


Unnamed: 0,co2,year
0,1.0,2016.0
1,0.9,2017.0
2,0.8,2018.0


In [11]:

# Dequantify the reconstructed frame to show its units as a second header row.
new_co2_df.pint.dequantify()

Unnamed: 0_level_0,co2,year
unit,CO2 * megametric_ton,dimensionless
0,1.0,2016.0
1,0.9,2017.0
2,0.8,2018.0
