Demonstrate writing and reading DataFrames containing Pint data (and highlight limitations)

Next step: Try using uncertainties (expressed as Pint measurements)

In [1]:
import os
import pathlib
from dotenv import load_dotenv

# Optionally populate standard environment variables from a credentials.env file.
# The directory searched is CREDENTIAL_DOTENV_DIR if set, otherwise PWD, otherwise
# /opt/app-root/src.  A missing file is not an error: the variables may then be
# provided in some other way.
default_dotenv_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', default_dotenv_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')
if os.path.exists(dotenv_path):
    # override=True is passed so values from the file are applied over existing variables
    load_dotenv(dotenv_path=dotenv_path, override=True)

import osc_ingest_trino as osc

import numpy as np
import pandas as pd

import pint
import pint_pandas
import openscm_units

from pint import set_application_registry, Quantity
from pint_pandas import PintArray, PintType
from pint_pandas.pint_array import is_pint_type
from openscm_units import unit_registry
# Use the openscm_units registry both as the registry pint_pandas attaches to PintType
# and as Pint's application registry, so that the notebook works with a single registry.
PintType.ureg = unit_registry
ureg = unit_registry
set_application_registry(ureg)
# Short aliases for constructing Quantities and PintArrays.
Q_ = ureg.Quantity
PA_ = PintArray

# Extra unit definitions needed by the sample data below.
# NOTE(review): in Pint's definition syntax the fields after the first "=" are
# definition / symbol / aliases, so this presumably makes CO2e equivalent to CO2 with
# CO2eq and CO2_eq as alternative spellings -- confirm against the Pint docs.
ureg.define("CO2e = CO2 = CO2eq = CO2_eq")
# Square brackets introduce a new base dimension: Fe_ton and USD each get their own.
ureg.define("Fe_ton = [produced_ton]")
ureg.define("USD = [currency]")

import ast

### Connect to Trino with sqlalchemy

In [2]:
import trino
from sqlalchemy.engine import create_engine

# Catalog and schema used for every table this notebook reads or writes.
ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'sandbox'

# The connection URL is assembled from the TRINO_* environment variables
# (a missing variable raises KeyError).
sqlstring = f"trino://{os.environ['TRINO_USER']}@{os.environ['TRINO_HOST']}:{os.environ['TRINO_PORT']}/"
# Authenticate over HTTPS with the JWT stored in TRINO_PASSWD.
sqlargs = dict(
    auth=trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD']),
    http_scheme='https',
    catalog=ingest_catalog,
    schema=ingest_schema,
)
engine = create_engine(sqlstring, connect_args=sqlargs)
connection = engine.connect()

Create a simple DataFrame with Pint units (megametric tons of CO2) and numeric data (year expressed as a number).

Note that dates can be neither *datetime* nor *string*, because neither type is understood by Pint.

In [3]:
def _mixed_unit_series(name, magnitudes, unit_names):
    """Return a Series called NAME that pairs each magnitude with its own Pint unit.

    The Quantities are built by multiplying a Series of magnitudes by a Series of
    units, so the units may differ from row to row.
    """
    return pd.Series(data=magnitudes, name=name) * pd.Series(data=[ureg(u).u for u in unit_names], name=name)


# Columns whose rows all share one unit are stored as PintArrays.
# NOTE(review): 'comapny_isin' looks like a typo for 'company_isin'; it is kept as-is
# because the recorded outputs and the Trino table use this spelling.
uniform_unit_columns = {
    'company_name': ['PG&E Corp.', 'PNM Resources, Inc.', 'POSCO', 'PPL Corp.'],
    'company_lei': ['8YQ2GSDWYZXO2EDN3511', '5493003JOBJGLZSDDQ28', '988400E5HRVX81AYLM04', '9N3UAJSNOUXFKQLF3V18'],
    'comapny_isin': ['US69331C1080','US69349H1077','KR7005490008','US69351T1060'],
    '2019_revenue': PintArray([17129000000.0,1457603000.0,55955872344.0,7769000000.0],'USD'),
    '2016_ghg_s1': PintArray([2.216543993,6.337250786,81.309800,30.08848723],'Mt CO2'),
    '2017_ghg_s1': PintArray([2.251191566,6.488768702,75.633360,30.24837146],'Mt CO2'),
    '2018_ghg_s1': PintArray([2.451149772,5.217895758,77.391479,31.61146904],'Mt CO2'),
    '2019_ghg_s1': PintArray([2.451149772,np.nan,77.391479,np.nan],'Mt CO2'),
    # As of 20220430, adding the following entries creates the dataframe correctly, but throws UnitStrippedWarning
    # '2016_production': [Q_(32.993292,'TWh'),Q_(10.2316757,'TWh'),Q_(42199000.0,'Fe_ton'),Q_(34.61322117,'TWh')],
    # '2017_production': [Q_(34.490224,'TWh'),Q_(10.1709745,'TWh'),Q_(37207000.0,'Fe_ton'),Q_(33.53286848,'TWh')],
    # '2018_production': [Q_(32.28122,'TWh'),Q_(9.307788099,'TWh'),Q_(37735000.0,'Fe_ton'),Q_(35.57197004,'TWh')],
}

# The production columns mix units per row, so an equivalent result is built by keeping
# magnitudes and units separate and combining them via multiplication.
production_units = ['TWh', 'TWh', 'Fe_ton', 'TWh']
s_2016 = _mixed_unit_series('2016_production', [32.993292, 10.2316757, 42199000.0, 34.61322117], production_units)
s_2017 = _mixed_unit_series('2017_production', [34.490224, 10.1709745, 37207000.0, 33.53286848], production_units)
s_2018 = _mixed_unit_series('2018_production', [32.28122, 9.307788099, 37735000.0, 35.57197004], production_units)

sample_df = pd.concat([pd.DataFrame(uniform_unit_columns), s_2016, s_2017, s_2018], axis=1).convert_dtypes()

In [4]:
# Display the sample data ordered by company name, the same ordering used for the
# round-tripped frame displayed further down.
sample_df.sort_values(by='company_name')

  return np.array(qtys, dtype="object", copy=copy)
  return np.array(qtys, dtype="object", copy=copy)


Unnamed: 0,company_name,company_lei,comapny_isin,2019_revenue,2016_ghg_s1,2017_ghg_s1,2018_ghg_s1,2019_ghg_s1,2016_production,2017_production,2018_production
0,PG&E Corp.,8YQ2GSDWYZXO2EDN3511,US69331C1080,17129000000.0,2.216543993,2.251191566,2.451149772,2.451149772,32.993292 terawatt_hour,34.490224 terawatt_hour,32.28122 terawatt_hour
1,"PNM Resources, Inc.",5493003JOBJGLZSDDQ28,US69349H1077,1457603000.0,6.337250786,6.488768702,5.217895758,,10.2316757 terawatt_hour,10.1709745 terawatt_hour,9.307788099 terawatt_hour
2,POSCO,988400E5HRVX81AYLM04,KR7005490008,55955872344.0,81.3098,75.63336,77.391479,77.391479,42199000.0 Fe_ton,37207000.0 Fe_ton,37735000.0 Fe_ton
3,PPL Corp.,9N3UAJSNOUXFKQLF3V18,US69351T1060,7769000000.0,30.08848723,30.24837146,31.61146904,,34.61322117 terawatt_hour,33.53286848 terawatt_hour,35.57197004 terawatt_hour


In [5]:
# If DF_COL contains Pint quantities (because it is a PintArray or an array of Pint Quantities),
# return a two-column dataframe of magnitudes and units.
# If DF_COL contains no Pint quantities, return it unchanged.

def dequantify_column(df_col: pd.Series):
    """Split a column of Pint data into a magnitude column and a units column.

    Parameters
    ----------
    df_col : pd.Series
        A single DataFrame column.

    Returns
    -------
    pd.DataFrame or pd.Series
        * PintArray-backed column: a two-column frame holding the magnitudes under the
          original name and the (single) unit, as a string, under ``<name>_units``.
        * Object column containing at least one Pint ``Quantity``: the same two-column
          layout, with the unit taken per row.  Entries that are not Quantities
          (e.g. NaN) keep their value as the magnitude and get ``None`` as the unit.
        * Anything else (including an empty column): ``df_col`` itself, unchanged.
    """
    units_name = f"{df_col.name}_units"
    if isinstance(df_col.values, PintArray):
        # One unit for the whole column; the scalar string is broadcast over the index.
        return pd.DataFrame({df_col.name: df_col.values.quantity.m,
                             units_name: str(df_col.values.dtype.units)},
                            index=df_col.index)
    # Individual Quantity objects can only be stored in an object-dtype column.  Scan the
    # whole column rather than just the first entry, which may be a missing value.
    if df_col.dtype != object or not any(isinstance(x, Quantity) for x in df_col):
        return df_col
    return pd.DataFrame({df_col.name: df_col.map(lambda x: x.m if isinstance(x, Quantity) else x),
                         units_name: df_col.map(lambda x: str(x.u) if isinstance(x, Quantity) else None)},
                        index=df_col.index)

# Rewrite dataframe DF so that columns containing Pint quantities are represented by a column for the Magnitude and column for the Units.
# The magnitude column retains the original column name and the units column is renamed with a _units suffix.
def dequantify_df(df):
    """Return a copy of DF in which every Pint column is split into magnitude and units.

    Each column is passed through ``dequantify_column`` and the results are concatenated
    side by side, so a Pint column ``COL`` becomes ``COL`` (magnitudes) followed by
    ``COL_units``; other columns are carried over as they are.

    A frame with no columns is returned as a plain copy, because ``pd.concat`` raises
    when given nothing to concatenate.
    """
    n_cols = df.shape[1]
    if n_cols == 0:
        return df.copy()
    # Select columns by position: df[col] would return a DataFrame rather than a Series
    # when a column name is duplicated.
    return pd.concat([dequantify_column(df.iloc[:, i]) for i in range(n_cols)], axis=1)

In [6]:
# Preview the magnitude / units split of the sample data.
dequantify_df(sample_df)

Unnamed: 0,company_name,company_lei,comapny_isin,2019_revenue,2019_revenue_units,2016_ghg_s1,2016_ghg_s1_units,2017_ghg_s1,2017_ghg_s1_units,2018_ghg_s1,2018_ghg_s1_units,2019_ghg_s1,2019_ghg_s1_units,2016_production,2016_production_units,2017_production,2017_production_units,2018_production,2018_production_units
0,PG&E Corp.,8YQ2GSDWYZXO2EDN3511,US69331C1080,17129000000.0,USD,2.216544,CO2 * megametric_ton,2.251192,CO2 * megametric_ton,2.45115,CO2 * megametric_ton,2.45115,CO2 * megametric_ton,32.99329,terawatt_hour,34.49022,terawatt_hour,32.28122,terawatt_hour
1,"PNM Resources, Inc.",5493003JOBJGLZSDDQ28,US69349H1077,1457603000.0,USD,6.337251,CO2 * megametric_ton,6.488769,CO2 * megametric_ton,5.217896,CO2 * megametric_ton,,CO2 * megametric_ton,10.23168,terawatt_hour,10.17097,terawatt_hour,9.307788,terawatt_hour
2,POSCO,988400E5HRVX81AYLM04,KR7005490008,55955870000.0,USD,81.3098,CO2 * megametric_ton,75.63336,CO2 * megametric_ton,77.391479,CO2 * megametric_ton,77.391479,CO2 * megametric_ton,42199000.0,Fe_ton,37207000.0,Fe_ton,37735000.0,Fe_ton
3,PPL Corp.,9N3UAJSNOUXFKQLF3V18,US69351T1060,7769000000.0,USD,30.088487,CO2 * megametric_ton,30.248371,CO2 * megametric_ton,31.611469,CO2 * megametric_ton,,CO2 * megametric_ton,34.61322,terawatt_hour,33.53287,terawatt_hour,35.57197,terawatt_hour


Write the dataframe to a Trino table.

The *pint.dequantify* method converts PintArray columns to columns named as tuples of original column names and Pint types.  This leads to two problems:
1. If column values contain Quantities of different units, dequantify fails
2. Trino requires lower-case column names ***which are then no longer correctly recognized in the unit registry***.  This problem is not unique to CO2: M and m mean different things to Pint, as do G and g, and several other SI abbreviations.

To solve this we write our own dequantify routine, which explodes Quantity columns into magnitude (using original column name {COL}) and {COL}_units

In [7]:
# Name of the Trino table (inside ingest_schema) that receives the sample data.
ingest_table = "sample_itr"

# Start from a clean slate so that the 'append' below writes into a fresh table.
# NOTE(review): Engine.execute() is deprecated in SQLAlchemy 1.4 and removed in 2.0 --
# confirm the SQLAlchemy version in use before upgrading.
engine.execute(f"drop table if exists {ingest_schema}.{ingest_table}")
# Write the dequantified frame (magnitude + "_units" columns) without the DataFrame index,
# delegating the inserts to osc.TrinoBatchInsert.
dequantify_df(sample_df).to_sql(ingest_table, con=engine, schema=ingest_schema, if_exists='append', index=False,
                                method=osc.TrinoBatchInsert(batch_size = 100, verbose = True, optimize = True))

constructed fully qualified table name as: "sandbox.sample_itr"
inserting 4 records
  ('PG&E Corp.', '8YQ2GSDWYZXO2EDN3511', 'US69331C1080', 17129000000.0, 'USD', 2.216543993, 'CO2 * megametric_ton', 2.251191566, 'CO2 * megametric_ton', 2.451149772, 'CO2 * megametric_ton', 2.451149772, 'CO2 * megametric_ton', 32.993292, 'terawatt_hour', 34.490224, 'terawatt_hour', 32.28122, 'terawatt_hour')
  ('PNM Resources, Inc.', '5493003JOBJGLZSDDQ28', 'US69349H1077', 1457603000.0, 'USD', 6.337250786, 'CO2 * megametric_ton', 6.488768702, 'CO2 * megametric_ton', 5.217895758, 'CO2 * megametric_ton', NULL, 'CO2 * megametric_ton', 10.2316757, 'terawatt_hour', 10.1709745, 'terawatt_hour', 9.307788099, 'terawatt_hour')
  ('POSCO', '988400E5HRVX81AYLM04', 'KR7005490008', 55955872344.0, 'USD', 81.3098, 'CO2 * megametric_ton', 75.63336, 'CO2 * megametric_ton', 77.391479, 'CO2 * megametric_ton', 77.391479, 'CO2 * megametric_ton', 42199000.0, 'Fe_ton', 37207000.0, 'Fe_ton', 37735000.0, 'Fe_ton')
  ('PPL Corp.

In [8]:
# Inspect the column names and types of the table that was just written.
engine.execute(f"describe {ingest_schema}.{ingest_table}").fetchall()

[('company_name', 'varchar', '', ''),
 ('company_lei', 'varchar', '', ''),
 ('comapny_isin', 'varchar', '', ''),
 ('2019_revenue', 'double', '', ''),
 ('2019_revenue_units', 'varchar', '', ''),
 ('2016_ghg_s1', 'double', '', ''),
 ('2016_ghg_s1_units', 'varchar', '', ''),
 ('2017_ghg_s1', 'double', '', ''),
 ('2017_ghg_s1_units', 'varchar', '', ''),
 ('2018_ghg_s1', 'double', '', ''),
 ('2018_ghg_s1_units', 'varchar', '', ''),
 ('2019_ghg_s1', 'double', '', ''),
 ('2019_ghg_s1_units', 'varchar', '', ''),
 ('2016_production', 'double', '', ''),
 ('2016_production_units', 'varchar', '', ''),
 ('2017_production', 'double', '', ''),
 ('2017_production_units', 'varchar', '', ''),
 ('2018_production', 'double', '', ''),
 ('2018_production_units', 'varchar', '', '')]

In [9]:
# Fetch the raw rows back; magnitudes and units come back as separate values.
engine.execute(f"select * from {ingest_schema}.{ingest_table}").fetchall()

[('PPL Corp.', '9N3UAJSNOUXFKQLF3V18', 'US69351T1060', 7769000000.0, 'USD', 30.08848723, 'CO2 * megametric_ton', 30.24837146, 'CO2 * megametric_ton', 31.61146904, 'CO2 * megametric_ton', None, 'CO2 * megametric_ton', 34.61322117, 'terawatt_hour', 33.53286848, 'terawatt_hour', 35.57197004, 'terawatt_hour'),
 ('PNM Resources, Inc.', '5493003JOBJGLZSDDQ28', 'US69349H1077', 1457603000.0, 'USD', 6.337250786, 'CO2 * megametric_ton', 6.488768702, 'CO2 * megametric_ton', 5.217895758, 'CO2 * megametric_ton', None, 'CO2 * megametric_ton', 10.2316757, 'terawatt_hour', 10.1709745, 'terawatt_hour', 9.307788099, 'terawatt_hour'),
 ('POSCO', '988400E5HRVX81AYLM04', 'KR7005490008', 55955872344.0, 'USD', 81.3098, 'CO2 * megametric_ton', 75.63336, 'CO2 * megametric_ton', 77.391479, 'CO2 * megametric_ton', 77.391479, 'CO2 * megametric_ton', 42199000.0, 'Fe_ton', 37207000.0, 'Fe_ton', 37735000.0, 'Fe_ton'),
 ('PG&E Corp.', '8YQ2GSDWYZXO2EDN3511', 'US69331C1080', 17129000000.0, 'USD', 2.216543993, 'CO2 * m

In [10]:
# Read the table back into a DataFrame.  Units are still plain strings in "_units"
# columns at this point, and the rows are not in the order in which they were written.
sql_df = pd.read_sql(f"select * from {ingest_schema}.{ingest_table}", con=engine)
sql_df

Unnamed: 0,company_name,company_lei,comapny_isin,2019_revenue,2019_revenue_units,2016_ghg_s1,2016_ghg_s1_units,2017_ghg_s1,2017_ghg_s1_units,2018_ghg_s1,2018_ghg_s1_units,2019_ghg_s1,2019_ghg_s1_units,2016_production,2016_production_units,2017_production,2017_production_units,2018_production,2018_production_units
0,"PNM Resources, Inc.",5493003JOBJGLZSDDQ28,US69349H1077,1457603000.0,USD,6.337251,CO2 * megametric_ton,6.488769,CO2 * megametric_ton,5.217896,CO2 * megametric_ton,,CO2 * megametric_ton,10.23168,terawatt_hour,10.17097,terawatt_hour,9.307788,terawatt_hour
1,POSCO,988400E5HRVX81AYLM04,KR7005490008,55955870000.0,USD,81.3098,CO2 * megametric_ton,75.63336,CO2 * megametric_ton,77.391479,CO2 * megametric_ton,77.391479,CO2 * megametric_ton,42199000.0,Fe_ton,37207000.0,Fe_ton,37735000.0,Fe_ton
2,PPL Corp.,9N3UAJSNOUXFKQLF3V18,US69351T1060,7769000000.0,USD,30.088487,CO2 * megametric_ton,30.248371,CO2 * megametric_ton,31.611469,CO2 * megametric_ton,,CO2 * megametric_ton,34.61322,terawatt_hour,33.53287,terawatt_hour,35.57197,terawatt_hour
3,PG&E Corp.,8YQ2GSDWYZXO2EDN3511,US69331C1080,17129000000.0,USD,2.216544,CO2 * megametric_ton,2.251192,CO2 * megametric_ton,2.45115,CO2 * megametric_ton,2.45115,CO2 * megametric_ton,32.99329,terawatt_hour,34.49022,terawatt_hour,32.28122,terawatt_hour


In [11]:
# Because this DF comes from reading a Trino table, and because column names must be unique there, we don't have to enumerate to ensure we properly handle columns with duplicated names

def requantify_df(df):
    """Rebuild Pint columns from ``COL`` / ``COL_units`` column pairs.

    The columns are scanned from right to left, so that each ``COL_units`` column is
    seen immediately before the magnitude column ``COL`` it belongs to.  For each pair
    the units column is dropped and ``COL`` is replaced by

    * a ``PintArray`` when every row has the same unit string, or
    * a Series of individual Quantities (magnitudes multiplied by parsed units, which
      avoids UnitStrippedWarning) when the units differ between rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame in the layout produced by ``dequantify_df``.  It is not modified when it
        contains at least one units pair (a new frame is built); with no units pairs
        the same object is returned.

    Returns
    -------
    pd.DataFrame
        The frame with units columns folded back into their magnitude columns.

    Raises
    ------
    ValueError
        If a ``_units`` column is not directly preceded by its matching magnitude
        column, or if the frame has no rows to read the units from.
    """
    units_col = None
    for col in reversed(df.columns):
        if isinstance(col, str) and col.endswith("_units"):
            if units_col:
                # We expect a _units column to follow a non-units column
                raise ValueError(
                    f"units column {units_col!r} is preceded by another units column "
                    f"{col!r} instead of a magnitude column")
            units_col = col
            continue
        if units_col:
            if f"{col}_units" != units_col:
                raise ValueError(
                    f"units column {units_col!r} does not match the preceding column {col!r}")
            units = df[units_col]
            if len(units) == 0:
                raise ValueError(
                    f"cannot determine units for column {col!r}: the frame has no rows")
            # Positional access: the index need not contain the label 0.
            first_unit = units.iloc[0]
            if (units == first_unit).all():
                # Make a PintArray
                new_col = PintArray(df[col], dtype=f"pint[{ureg(first_unit).u}]")
            else:
                # Make a pd.Series of Quantity in a way that does not throw UnitStrippedWarning
                new_col = pd.Series(data=df[col], name=col) * pd.Series(data=units.map(lambda x: ureg(x).u), name=col)
            # drop() returns a new frame, so the caller's frame is left untouched.
            df = df.drop(columns=units_col)
            df[col] = new_col
            units_col = None
    return df

In [12]:
# Rebuild the Pint columns from the magnitude/units pairs, then apply convert_dtypes()
# just as was done when sample_df was built.
new_df = requantify_df(sql_df).convert_dtypes()

In [13]:
# The "_units" columns should be gone again after requantifying.
new_df.columns

Index(['company_name', 'company_lei', 'comapny_isin', '2019_revenue',
       '2016_ghg_s1', '2017_ghg_s1', '2018_ghg_s1', '2019_ghg_s1',
       '2016_production', '2017_production', '2018_production'],
      dtype='object')

In [14]:
# Display the round-tripped data ordered by company name for comparison with sample_df.
new_df.sort_values(by='company_name')

  return np.array(qtys, dtype="object", copy=copy)
  return np.array(qtys, dtype="object", copy=copy)


Unnamed: 0,company_name,company_lei,comapny_isin,2019_revenue,2016_ghg_s1,2017_ghg_s1,2018_ghg_s1,2019_ghg_s1,2016_production,2017_production,2018_production
3,PG&E Corp.,8YQ2GSDWYZXO2EDN3511,US69331C1080,17129000000.0,2.216543993,2.251191566,2.451149772,2.451149772,32.993292 terawatt_hour,34.490224 terawatt_hour,32.28122 terawatt_hour
0,"PNM Resources, Inc.",5493003JOBJGLZSDDQ28,US69349H1077,1457603000.0,6.337250786,6.488768702,5.217895758,,10.2316757 terawatt_hour,10.1709745 terawatt_hour,9.307788099 terawatt_hour
1,POSCO,988400E5HRVX81AYLM04,KR7005490008,55955872344.0,81.3098,75.63336,77.391479,77.391479,42199000.0 Fe_ton,37207000.0 Fe_ton,37735000.0 Fe_ton
2,PPL Corp.,9N3UAJSNOUXFKQLF3V18,US69351T1060,7769000000.0,30.08848723,30.24837146,31.61146904,,34.61322117 terawatt_hour,33.53286848 terawatt_hour,35.57197004 terawatt_hour


In [15]:
# Round-trip check: the rows read back from Trino are in a different order, so both
# frames are sorted and indexed by company_name before being compared.
pd.testing.assert_frame_equal(sample_df.sort_values(by='company_name').set_index('company_name'),
                              new_df.sort_values(by='company_name').set_index('company_name'))

In [16]:
# Column dtypes of the original frame (pint[...] for PintArrays, object for mixed-unit columns).
sample_df.sort_values(by='company_name').set_index('company_name').info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, PG&E Corp. to PPL Corp.
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype                     
---  ------           --------------  -----                     
 0   company_lei      4 non-null      string                    
 1   comapny_isin     4 non-null      string                    
 2   2019_revenue     4 non-null      pint[USD]                 
 3   2016_ghg_s1      4 non-null      pint[CO2 * megametric_ton]
 4   2017_ghg_s1      4 non-null      pint[CO2 * megametric_ton]
 5   2018_ghg_s1      4 non-null      pint[CO2 * megametric_ton]
 6   2019_ghg_s1      2 non-null      pint[CO2 * megametric_ton]
 7   2016_production  4 non-null      object                    
 8   2017_production  4 non-null      object                    
 9   2018_production  4 non-null      object                    
dtypes: object(3), pint[CO2 * megametric_ton](4), pint[USD](1), string(2)
memory usage: 352.0+ bytes


In [17]:
# Column dtypes of the round-tripped frame, for comparison with those of sample_df.
new_df.sort_values(by='company_name').set_index('company_name').info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, PG&E Corp. to PPL Corp.
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype                     
---  ------           --------------  -----                     
 0   company_lei      4 non-null      string                    
 1   comapny_isin     4 non-null      string                    
 2   2019_revenue     4 non-null      pint[USD]                 
 3   2016_ghg_s1      4 non-null      pint[CO2 * megametric_ton]
 4   2017_ghg_s1      4 non-null      pint[CO2 * megametric_ton]
 5   2018_ghg_s1      4 non-null      pint[CO2 * megametric_ton]
 6   2019_ghg_s1      2 non-null      pint[CO2 * megametric_ton]
 7   2016_production  4 non-null      object                    
 8   2017_production  4 non-null      object                    
 9   2018_production  4 non-null      object                    
dtypes: object(3), pint[CO2 * megametric_ton](4), pint[USD](1), string(2)
memory usage: 352.0+ bytes
