Demonstrate writing and reading dataframes containing PINT data

Next step: add uncertainties

In [2]:
import os
import pathlib
from dotenv import load_dotenv

# Load some standard environment variables from a dot-env file, if it exists.
# If no such file can be found, does not fail, and so allows these environment vars to
# be populated in some other way
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path,override=True)

import osc_ingest_trino as osc

import numpy as np
import pandas as pd

import pint
import pint_pandas
import openscm_units

from pint import set_application_registry
from pint_pandas import PintArray, PintType
from openscm_units import unit_registry
PintType.ureg = unit_registry
ureg = unit_registry
set_application_registry(ureg)
Q_ = ureg.Quantity
PA_ = PintArray

ureg.define("CO2e = CO2 = CO2eq = CO2_eq")
ureg.define("Fe_ton = [produced_ton]")

### S3 and boto3

In [3]:
import boto3

s3_source = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
source_bucket = s3_source.Bucket(os.environ['S3_LANDING_BUCKET'])

### Connect to Trino with sqlalchemy

In [4]:
import trino
from sqlalchemy.engine import create_engine

ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'sandbox'

sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ['TRINO_USER'],
    host = os.environ['TRINO_HOST'],
    port = os.environ['TRINO_PORT']
)
sqlargs = {
    'auth': trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD']),
    'http_scheme': 'https',
    'catalog': ingest_catalog,
    'schema': ingest_schema,
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

Create simple dataframe with pint units (megametric tons co2) and numeric data (year expressed as a number)

In [5]:
co2_df = pd.DataFrame({'co2': pd.Series([Q_(1.0, 'Mt CO2'), Q_(0.9, 'Mt CO2'), Q_(0.8, 'Mt CO2')],
                                        dtype='pint[Mt CO2]'),
                       'year': [2016, 2017, 2018]})

Write the dataframe to a Trino table.  This should preserve unit informtion, either by making the data something that triggers the reading of a unitized quantity when parsed, or by creating an appropriately named column for the unit information, or by creating other metadata that can properly restore units to the dataframe when read.

In [6]:
ingest_table = "pint_co2"

engine.execute(f"drop table if exists {ingest_schema}.{ingest_table}")
co2_df.to_sql(ingest_table, con=engine, schema=ingest_schema, if_exists='append', index=False,
              method=osc.TrinoBatchInsert(batch_size = 100, verbose = True, optimize = True))

  return np.array(qtys, dtype="object", copy=copy)


constructed fully qualified table name as: "sandbox.pint_co2"
inserting 3 records
  (1.0 CO2 * megametric_ton, 2016)
  (0.9 CO2 * megametric_ton, 2017)
  (0.8 CO2 * megametric_ton, 2018)


TrinoUserError: TrinoUserError(type=USER_ERROR, name=SYNTAX_ERROR, message="line 2:6: mismatched input 'CO2'. Expecting: ')', ','", query_id=20220427_135010_00188_7eyy5)