In [1]:
#pip install boto3 pandas pyarrow python-dotenv trino

In [2]:
import re
import pandas as pd
# Pre-compiled patterns used by snakify().
_wsdedup = re.compile(r"\s+")
_usdedup = re.compile(r"__+")
_rmpunc = re.compile(r"[,.()&$/+-]+")
# Word -> abbreviation substitutions, applied in this order.
_abbreviations = (
    ("average", "avg"),
    ("maximum", "max"),
    ("minimum", "min"),
    ("absolute", "abs"),
    ("source", "src"),
    ("distribution", "dist"),
    # these are common in the sample names but unsure of standard abbv
    #("inference", "inf"),
    #("emissions", "emis"),
    #("intensity", "int"),
    #("reported", "rep"),
    #("revenue", "rev"),
)
# 63 seems to be a common max column name length
def snakify(name, maxlen=63):
    """Convert a name (or a list of names) to a snake_case identifier.

    The conversion case-folds the text, strips surrounding whitespace, turns
    hyphens into underscores, removes the punctuation characters ``,.()&$/+``,
    replaces each run of whitespace with a single underscore, collapses
    repeated underscores, applies a few fixed abbreviations and finally
    truncates the result to ``maxlen`` characters.

    Parameters
    ----------
    name : str or list
        A single name, or a list of names (lists are converted element-wise).
    maxlen : int, default 63
        Maximum length of each returned name.

    Returns
    -------
    str or list of str
        The converted name, or a list of converted names when ``name`` is a list.
    """
    if isinstance(name, list):
        # Pass maxlen along so list input honours the caller's limit as well.
        return [snakify(e, maxlen=maxlen) for e in name]
    w = name.casefold().strip()
    w = w.replace("-", "_")
    w = _rmpunc.sub("", w)
    w = _wsdedup.sub("_", w)
    w = _usdedup.sub("_", w)
    for word, abbreviation in _abbreviations:
        w = w.replace(word, abbreviation)
    return w[:maxlen]

def snakify_columns(df, inplace=False, maxlen=63):
    """Rename the columns of ``df`` to the names produced by ``snakify``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed.
    inplace : bool, default False
        Forwarded to ``DataFrame.rename``.
    maxlen : int, default 63
        Maximum length of each new column name, forwarded to ``snakify``.

    Returns
    -------
    Whatever ``df.rename`` returns for the given ``inplace`` value (pandas
    documents a new DataFrame for ``inplace=False`` and ``None`` otherwise).

    Raises
    ------
    ValueError
        If two or more columns map to the same new name.
    """
    icols = df.columns.to_list()
    ocols = snakify(icols, maxlen=maxlen)
    if len(set(ocols)) < len(ocols):
        raise ValueError("remapped column names were not unique!")
    # Rename with the exact names that passed the uniqueness check, rather than
    # recomputing them with a (possibly different) default maxlen.
    rename_map = dict(zip(icols, ocols))
    return df.rename(columns=rename_map, inplace=inplace)

# Lookup table from pandas dtype name to the SQL column type used for it.
_p2smap = {
    'string': 'varchar',
    'Float64': 'double',
    'Int64': 'bigint',
}

def pandas_type_to_sql(pt):
    """Return the SQL type registered in ``_p2smap`` for pandas dtype name ``pt``.

    Raises ``ValueError`` when ``pt`` has no entry in the table.
    """
    if pt not in _p2smap:
        raise ValueError("unexpected pandas column type '{pt}'".format(pt=pt))
    return _p2smap[pt]

def generate_table_schema_pairs(df, typemap=None):
    """Build the column-definition body of a ``create table`` statement.

    Each column of ``df`` becomes one ``"    <name> <sqltype>"`` line; the
    lines are joined with ``",\\n"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names and dtypes describe the table.
    typemap : dict, optional
        Per-column overrides mapping a column name to the SQL type to use.
        Columns listed here bypass the dtype lookup entirely, so they may
        have dtypes that ``pandas_type_to_sql`` does not know.

    Returns
    -------
    str
        The joined column definitions (empty string for a frame without columns).

    Raises
    ------
    ValueError
        Propagated from ``pandas_type_to_sql`` for a column without an
        override whose dtype is not supported.
    """
    overrides = {} if typemap is None else typemap
    lines = []
    for name, dtype in zip(df.columns.to_list(), df.dtypes.to_list()):
        if name in overrides:
            sqltype = overrides[name]
        else:
            sqltype = pandas_type_to_sql(str(dtype))
        lines.append("    {n} {t}".format(n=name, t=sqltype))
    return ",\n".join(lines)

## Example `credentials.env` file

```
# s3 credentials
S3_ENDPOINT=https://s3.us-east-1.amazonaws.com
S3_BUCKET=ocp-odh-os-demo-s3
S3_ACCESS_KEY=xxx
S3_SECRET_KEY=xxx

# trino credentials
TRINO_USER=xxx
TRINO_PASSWD=xxx
TRINO_HOST=trino-secure-odh-trino.apps.odh-cl1.apps.os-climate.org
TRINO_PORT=443
```

In [3]:
from dotenv import dotenv_values, load_dotenv
import os
import pathlib

# Directory that holds credentials.env: CREDENTIAL_DOTENV_DIR if set,
# otherwise PWD, otherwise a hard-coded default path.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')
# Load the file only when it is present.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [4]:
import boto3
# S3 client configured from the endpoint and key pair found in the environment
s3 = boto3.client(
    "s3",
    aws_access_key_id=os.environ['S3_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_SECRET_KEY'],
    endpoint_url=os.environ['S3_ENDPOINT'],
)

In [5]:
import pandas as pd
obj = s3.get_object(
    Bucket=os.environ['S3_BUCKET'],
    Key='urgentem/UrgentemDataSampleEmissionsTargetsDec2020.csv')

# load the raw file from the bucket and convert its columns to specific
# data types (a single convert_dtypes() pass is enough)
dfEmissions = pd.read_csv(obj['Body']).convert_dtypes()

# map column names to a form that works for SQL; assigning the renamed copy
# (instead of renaming in place) keeps this cell safe to re-run
dfEmissions = snakify_columns(dfEmissions)
#dfEmissions.head()

In [6]:
# a way to examine the structure of a pandas data frame:
# prints one row per column with its name, non-null count and dtype
dfEmissions.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 15 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   company_name                        19 non-null     string 
 1   isin                                19 non-null     string 
 2   target_type                         19 non-null     string 
 3   scope                               19 non-null     string 
 4   coverage_s1                         16 non-null     Float64
 5   coverage_s2                         15 non-null     Float64
 6   coverage_s3                         4 non-null      Int64  
 7   reduction_ambition                  19 non-null     Float64
 8   base_year                           19 non-null     Int64  
 9   end_year                            19 non-null     Int64  
 10  start_year                          19 non-null     Int64  
 11  base_year_ghg_emissions_s1_tco2e    1 non-null 

In [7]:
# parquet has multiple options for appending or updating data
# including adding new files, or appending, sharding directory trees, etc
# Write the frame to a local parquet file, then copy that file into the bucket.
parquet_path = '/tmp/emissions.parquet'
dfEmissions.to_parquet(parquet_path, index=False)
s3.upload_file(
    Filename=parquet_path,
    Bucket=os.environ['S3_BUCKET'],
    Key='urgentem/trino/itr_emissions_sample_test/emissions.parquet',
)

In [8]:
import trino
# Basic-auth credentials and the endpoint all come from the environment.
trino_auth = trino.auth.BasicAuthentication(
    os.environ['TRINO_USER'],
    os.environ['TRINO_PASSWD'],
)
conn = trino.dbapi.connect(
    host=os.environ['TRINO_HOST'],
    port=int(os.environ['TRINO_PORT']),
    auth=trino_auth,
    http_scheme='https',
    verify=True,
)
# Cursor shared by the SQL cells below.
cur = conn.cursor()

In [9]:
# For this demonstration example, we just recreate table from scratch each time.
# in live data platform there will need to be policies and mechanisms for either
# appending new data, or overwriting old data, or saving off conditioned by a versioning number
# this is a data governance topic
# NOTE(review): the drop must name the same catalog.schema.table that the
# create statement uses (default.urgentem...), otherwise "create table if not
# exists" silently keeps a stale definition. This previously targeted the
# `hive` catalog; confirm `default` is the intended catalog.
cur.execute('drop table if exists default.urgentem.itr_emissions_sample_test')
cur.fetchall()

[[True]]

In [10]:
# generate a sql schema that will correspond to the data types
# of columns in the pandas DF
# to-do: add some mechanisms for overriding types, either here
# or on the pandas data-frame itself before we write it out
schema = generate_table_schema_pairs(dfEmissions)

# The external location is built from S3_BUCKET so the table always points at
# the bucket the parquet file was uploaded to, instead of a hard-coded name.
tabledef = """create table if not exists default.urgentem.itr_emissions_sample_test(
{schema}
) with (
    format = 'parquet',
    external_location = 's3a://{bucket}/urgentem/trino/itr_emissions_sample_test/'
)""".format(schema=schema, bucket=os.environ['S3_BUCKET'])
print(tabledef)

# tables created externally may not show up immediately in cloud-beaver
cur.execute(tabledef)
cur.fetchall()

create table if not exists default.urgentem.itr_emissions_sample_test(
    company_name varchar,
    isin varchar,
    target_type varchar,
    scope varchar,
    coverage_s1 double,
    coverage_s2 double,
    coverage_s3 bigint,
    reduction_ambition double,
    base_year bigint,
    end_year bigint,
    start_year bigint,
    base_year_ghg_emissions_s1_tco2e varchar,
    base_year_ghg_emissions_s1s2_tco2e varchar,
    base_year_ghg_emissions_s3_tco2e varchar,
    achieved_reduction double
) with (
    format = 'parquet',
    external_location = 's3a://ocp-odh-os-demo-s3/urgentem/trino/itr_emissions_sample_test/'
)


[[True]]

In [11]:
# test creation of new table
#cur.execute('show tables in default.urgentem')
#cur.fetchall()

In [12]:
# test that we can get data
#cur.execute('select isin from default.urgentem.itr_emissions_sample_test')
#cur.fetchall()