In [None]:
#pip install boto3 pyarrow python-dotenv openpyxl

In [None]:
import re
import pandas as pd
# Pre-compiled patterns used by snakify().
_wsdedup = re.compile(r"\s+")          # any run of whitespace
_usdedup = re.compile(r"__+")          # two or more consecutive underscores
_rmpunc = re.compile(r"[,.()&$/+-]+")  # punctuation that is dropped outright

# Long words that are shortened to keep generated names compact (applied in order).
# These are also common in the sample names, but a standard abbreviation is
# unclear, so they are deliberately left alone for now:
#   inference -> inf, emissions -> emis, intensity -> int,
#   reported -> rep, revenue -> rev
_SNAKIFY_ABBREVIATIONS = (
    ("average", "avg"),
    ("maximum", "max"),
    ("minimum", "min"),
    ("absolute", "abs"),
    ("source", "src"),
    ("distribution", "dist"),
)

# 63 seems to be a common max column name length
def snakify(name, maxlen=63):
    """Convert a name (or a list of names) to a snake_case identifier.

    The value is stringified, case-folded and stripped of surrounding
    whitespace; hyphens become underscores, the punctuation matched by
    ``_rmpunc`` is removed, whitespace runs become a single underscore,
    repeated underscores are collapsed, a few long words are abbreviated,
    and the result is truncated to ``maxlen`` characters.

    Parameters
    ----------
    name : object or list
        A single value (converted with ``str``) or a list of values; a list
        is processed element by element.
    maxlen : int, default 63
        Maximum length of each returned name.

    Returns
    -------
    str or list of str
        The converted name, or a list of converted names when ``name`` is a list.
    """
    if isinstance(name, list):
        # Propagate maxlen so list input honours the caller's limit
        # (previously the recursive call fell back to the default of 63).
        return [snakify(e, maxlen=maxlen) for e in name]
    w = str(name).casefold().strip()
    # Hyphens are turned into separators before the remaining punctuation is removed.
    w = w.replace("-", "_")
    w = _rmpunc.sub("", w)
    w = _wsdedup.sub("_", w)
    w = _usdedup.sub("_", w)
    for long_form, short_form in _SNAKIFY_ABBREVIATIONS:
        w = w.replace(long_form, short_form)
    return w[:maxlen]

def snakify_columns(df, inplace=False, maxlen=63):
    """Rename the columns of ``df`` to their ``snakify`` form.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed.
    inplace : bool, default False
        Passed through to ``DataFrame.rename``.
    maxlen : int, default 63
        Maximum length of each new column name, passed to ``snakify``.

    Returns
    -------
    Whatever ``DataFrame.rename`` returns for the given ``inplace`` value
    (a renamed frame, or ``None`` when ``inplace=True``).

    Raises
    ------
    ValueError
        If two or more columns map to the same new name.
    """
    icols = df.columns.to_list()
    ocols = snakify(icols, maxlen=maxlen)
    if len(set(ocols)) < len(ocols):
        raise ValueError("remapped column names were not unique!")
    # Build the map from the names that were just validated; recomputing them
    # without maxlen could apply different names than the ones checked above.
    rename_map = dict(zip(icols, ocols))
    return df.rename(columns=rename_map, inplace=inplace)

# Maps the string name of a pandas dtype to the SQL column type written into
# the generated table definitions. The keys are the nullable extension dtypes
# that DataFrame.convert_dtypes() can produce.
_p2smap = {
    'string': 'varchar',
    'Float64': 'double',
    'Int64': 'bigint',
    'boolean': 'boolean',
}

def pandas_type_to_sql(pt):
    """Return the SQL type name for a pandas dtype name.

    Parameters
    ----------
    pt : str
        Dtype name as produced by ``str(dtype)``, e.g. ``'Int64'``.

    Returns
    -------
    str
        The corresponding SQL type name.

    Raises
    ------
    ValueError
        If ``pt`` has no entry in the mapping.
    """
    st = _p2smap.get(pt)
    if st is None:
        raise ValueError("unexpected pandas column type '{pt}'".format(pt=pt))
    return st

# add ability to specify optional dict for specific fields?
# if column name is present, use specified value?
def generate_table_schema_pairs(df):
    """Build the column-definition lines of a SQL ``create table`` statement.

    Each column of ``df`` contributes one line of the form
    ``"    <column name> <sql type>"``, where the SQL type comes from
    ``pandas_type_to_sql`` applied to the column's dtype name. The lines are
    joined with ``",\\n"`` and returned as a single string.
    """
    column_names = df.columns.to_list()
    sql_types = [pandas_type_to_sql(str(dtype)) for dtype in df.dtypes.to_list()]
    column_defs = []
    for col_name, sql_type in zip(column_names, sql_types):
        column_defs.append("    {n} {t}".format(n=col_name, t=sql_type))
    return ",\n".join(column_defs)

# this seems to be unnecessary when loading directly from xlsx files
def clean_ref_err(v):
    """Map missing values and spreadsheet error markers to ``pd.NA``.

    Returns ``pd.NA`` when ``v`` is missing (per ``pd.isna``) or equals
    ``'#REF!'`` or ``'#ERROR!'``; any other value is returned unchanged.
    """
    if pd.isna(v):
        return pd.NA
    is_error_marker = (v == '#REF!') or (v == '#ERROR!')
    return pd.NA if is_error_marker else v

In [None]:
# Map bare year column names ('2000' .. '2099') to 'y'-prefixed names
# ('y2000' .. 'y2099').
rename_year_columns = {
    str(year): 'y{yr}'.format(yr=year)
    for year in range(2000, 2100)
}
#rename_year_columns

## Example `credentials.env` file

```
# s3 credentials
DEMO1_S3_ENDPOINT=https://s3.us-east-1.amazonaws.com
DEMO1_S3_BUCKET=ocp-odh-os-demo-s3
DEMO1_S3_ACCESS_KEY=xxx
DEMO1_S3_SECRET_KEY=xxx

# trino credentials
TRINO_USER=xxx
TRINO_PASSWD=xxx
TRINO_HOST=trino-secure-odh-trino.apps.odh-cl1.apps.os-climate.org
TRINO_PORT=443
```

In [None]:
from dotenv import dotenv_values, load_dotenv
import os
import pathlib

# Directory holding credentials.env: CREDENTIAL_DOTENV_DIR if set, otherwise
# PWD, otherwise the hard-coded default.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')

# Load the file only when it exists; its values override variables already set.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [None]:
import boto3
# Build the S3 client from the DEMO1_S3_* environment variables
s3 = boto3.client(
    "s3",
    endpoint_url=os.environ['DEMO1_S3_ENDPOINT'],
    aws_access_key_id=os.environ['DEMO1_S3_ACCESS_KEY'],
    aws_secret_access_key=os.environ['DEMO1_S3_SECRET_KEY'],
)

In [None]:
import trino
# Open an HTTPS connection to Trino using basic auth taken from the
# TRINO_* environment variables, then grab a cursor for the queries below.
trino_auth = trino.auth.BasicAuthentication(
    os.environ['TRINO_USER'],
    os.environ['TRINO_PASSWD'],
)
conn = trino.dbapi.connect(
    auth=trino_auth,
    host=os.environ['TRINO_HOST'],
    port=int(os.environ['TRINO_PORT']),
    http_scheme='https',
    verify=True,
)
cur = conn.cursor()

In [None]:
# Create the schema that the tables below are written into, if it is missing
create_schema_sql = 'create schema if not exists demo1.company_data'
cur.execute(create_schema_sql)
cur.fetchall()

## load xlsx file

Loading directly from xlsx -> pandas yields cleaner data than attempting multiple translations:
xlsx -> gsheet -> csv -> pandas

In [None]:
# loading excel doesn't work from 'get_object' mode,
# so the workbook is copied to a local file first
xlsx_key = 'itr-demo-data/ITR_company_data_minimum_required.xlsx'
local_xlsx = '/tmp/t.xlsx'
s3.download_file(os.environ['DEMO1_S3_BUCKET'], xlsx_key, local_xlsx)

# open the downloaded workbook and list the sheets it contains
xls = pd.ExcelFile(local_xlsx)
xls.sheet_names

## ITR Fundamental Data

In [None]:
# name of the trino table produced by this section
tablename = 'fundamental_data'

# read the sheet, let pandas assess the column data types, then rename the
# columns to forms that sql will handle (snake_case, 'y'-prefixed years)
df = (
    pd.read_excel(xls, 'fundamental_data')
    .convert_dtypes()
    .pipe(snakify_columns)
    .rename(columns=rename_year_columns)
)

In [None]:
# A way to examine the structure of a pandas data frame: prints a summary of
# the frame (verbose=True asks for the full per-column listing).
df.info(verbose=True)

In [None]:
# write the frame to a local parquet file, then upload it under the table's
# own prefix in the bucket
parquet_name = '{tname}.parquet'.format(tname=tablename)
local_parquet = '/tmp/' + parquet_name
df.to_parquet(local_parquet, index=False)
s3.upload_file(
    Filename=local_parquet,
    Bucket=os.environ['DEMO1_S3_BUCKET'],
    Key='trino/company_data/{tname}/'.format(tname=tablename) + parquet_name,
)

In [None]:
# For this demonstration example, we just recreate table from scratch each time.
# In a live data platform there will need to be policies and mechanisms for either
# appending new data, or overwriting old data, or saving off conditioned by a
# versioning number. This is a data governance topic.
drop_sql = 'drop table if exists demo1.company_data.{tname}'.format(tname=tablename)
cur.execute(drop_sql)
cur.fetchall()

In [None]:
# generate a sql schema that will correspond to the data types
# of columns in the pandas DF
# to-do: add some mechanisms for overriding types, either here
# or on the pandas data-frame itself before we write it out
schema = generate_table_schema_pairs(df)

# external table definition pointing at the table's parquet prefix in the bucket
create_table_template = """create table if not exists demo1.company_data.{tname}(
{schema}
) with (
    format = 'parquet',
    external_location = 's3a://{bucket}/trino/company_data/{tname}/'
)"""
tabledef = create_table_template.format(
    tname=tablename,
    schema=schema,
    bucket=os.environ['DEMO1_S3_BUCKET'],
)
print(tabledef)

# tables created externally may not show up immediately in cloud-beaver
cur.execute(tabledef)
cur.fetchall()

In [None]:
# test that we can get data
#cur.execute('select country from demo1.company_data.fundamental_data')
#cur.fetchall()

## ITR Projected Production

In [None]:
# name of the trino table produced by this section
tablename = 'projected_production'

# read the sheet, let pandas assess the column data types, then rename the
# columns to forms that sql will handle (snake_case, 'y'-prefixed years)
df = (
    pd.read_excel(xls, 'projected_production')
    .convert_dtypes()
    .pipe(snakify_columns)
    .rename(columns=rename_year_columns)
)

In [None]:
# a way to examine the structure of a pandas data frame
#df.info(verbose=True)

In [None]:
# write the frame to a local parquet file, then upload it under the table's
# own prefix in the bucket
parquet_name = '{tname}.parquet'.format(tname=tablename)
local_parquet = '/tmp/' + parquet_name
df.to_parquet(local_parquet, index=False)
s3.upload_file(
    Filename=local_parquet,
    Bucket=os.environ['DEMO1_S3_BUCKET'],
    Key='trino/company_data/{tname}/'.format(tname=tablename) + parquet_name,
)

In [None]:
# remove any previous version of the table so it can be recreated below
drop_sql = 'drop table if exists demo1.company_data.{tname}'.format(tname=tablename)
cur.execute(drop_sql)
cur.fetchall()

In [None]:
# column definitions derived from the data frame's dtypes
schema = generate_table_schema_pairs(df)

# external table definition pointing at the table's parquet prefix in the bucket
create_table_template = """create table if not exists demo1.company_data.{tname}(
{schema}
) with (
    format = 'parquet',
    external_location = 's3a://{bucket}/trino/company_data/{tname}/'
)"""
tabledef = create_table_template.format(
    tname=tablename,
    schema=schema,
    bucket=os.environ['DEMO1_S3_BUCKET'],
)
print(tabledef)

# tables created externally may not show up immediately in cloud-beaver
cur.execute(tabledef)
cur.fetchall()

In [None]:
# test that we can get data
#cur.execute('select y2020 from demo1.company_data.projected_production')
#cur.fetchall()

## ITR Projected Target

In [None]:
# name of the trino table produced by this section
tablename = 'projected_target'

# read the sheet, let pandas assess the column data types, then rename the
# columns to forms that sql will handle (snake_case, 'y'-prefixed years)
df = (
    pd.read_excel(xls, 'projected_target')
    .convert_dtypes()
    .pipe(snakify_columns)
    .rename(columns=rename_year_columns)
)

In [None]:
# write the frame to a local parquet file, then upload it under the table's
# own prefix in the bucket
parquet_name = '{tname}.parquet'.format(tname=tablename)
local_parquet = '/tmp/' + parquet_name
df.to_parquet(local_parquet, index=False)
s3.upload_file(
    Filename=local_parquet,
    Bucket=os.environ['DEMO1_S3_BUCKET'],
    Key='trino/company_data/{tname}/'.format(tname=tablename) + parquet_name,
)

In [None]:
# remove any previous version of the table so it can be recreated below
drop_sql = 'drop table if exists demo1.company_data.{tname}'.format(tname=tablename)
cur.execute(drop_sql)
cur.fetchall()

In [None]:
# column definitions derived from the data frame's dtypes
schema = generate_table_schema_pairs(df)

# external table definition pointing at the table's parquet prefix in the bucket
create_table_template = """create table if not exists demo1.company_data.{tname}(
{schema}
) with (
    format = 'parquet',
    external_location = 's3a://{bucket}/trino/company_data/{tname}/'
)"""
tabledef = create_table_template.format(
    tname=tablename,
    schema=schema,
    bucket=os.environ['DEMO1_S3_BUCKET'],
)
print(tabledef)

# tables created externally may not show up immediately in cloud-beaver
cur.execute(tabledef)
cur.fetchall()

## ITR Projected ei in Wh

In [None]:
# name of the trino table produced by this section
tablename = 'projected_ei_in_wh'

# read the sheet, let pandas assess the column data types, then rename the
# columns to forms that sql will handle (snake_case, 'y'-prefixed years)
df = (
    pd.read_excel(xls, 'projected_ei_in_Wh')
    .convert_dtypes()
    .pipe(snakify_columns)
    .rename(columns=rename_year_columns)
)

In [None]:
# write the frame to a local parquet file, then upload it under the table's
# own prefix in the bucket
parquet_name = '{tname}.parquet'.format(tname=tablename)
local_parquet = '/tmp/' + parquet_name
df.to_parquet(local_parquet, index=False)
s3.upload_file(
    Filename=local_parquet,
    Bucket=os.environ['DEMO1_S3_BUCKET'],
    Key='trino/company_data/{tname}/'.format(tname=tablename) + parquet_name,
)

In [None]:
# remove any previous version of the table so it can be recreated below
drop_sql = 'drop table if exists demo1.company_data.{tname}'.format(tname=tablename)
cur.execute(drop_sql)
cur.fetchall()

In [None]:
# column definitions derived from the data frame's dtypes
schema = generate_table_schema_pairs(df)

# external table definition pointing at the table's parquet prefix in the bucket
create_table_template = """create table if not exists demo1.company_data.{tname}(
{schema}
) with (
    format = 'parquet',
    external_location = 's3a://{bucket}/trino/company_data/{tname}/'
)"""
tabledef = create_table_template.format(
    tname=tablename,
    schema=schema,
    bucket=os.environ['DEMO1_S3_BUCKET'],
)
print(tabledef)

# tables created externally may not show up immediately in cloud-beaver
cur.execute(tabledef)
cur.fetchall()