In [1]:
#pip install boto3 pandas pyarrow python-dotenv trino

In [2]:
import re
import pandas as pd
_wsdedup = re.compile(r"\s+")
_usdedup = re.compile(r"__+")
_rmpunc = re.compile(r"[,.()&$/+-]+")
# (long form, abbreviation) pairs applied, in order, to every name.
# These are common in the sample names but unsure of standard abbv, so they
# are deliberately left out for now:
#   inference -> inf, emissions -> emis, intensity -> int,
#   reported -> rep, revenue -> rev
_abbreviations = (
    ("average", "avg"),
    ("maximum", "max"),
    ("minimum", "min"),
    ("absolute", "abs"),
    ("source", "src"),
    ("distribution", "dist"),
)
# 63 seems to be a common max column name length
def snakify(name, maxlen=63):
    """Convert a name (or a list of names) to a snake_case identifier.

    The name is case-folded and stripped, hyphens become underscores, the
    punctuation characters ``, . ( ) & $ / +`` are removed, runs of whitespace
    and of underscores collapse to a single underscore, a few long words are
    abbreviated, and the result is truncated to ``maxlen`` characters.

    Parameters:
        name: a string, or a list whose elements are converted one by one.
        maxlen: maximum length of each returned name.

    Returns:
        The converted string, or a list of converted strings for list input.
    """
    if isinstance(name, list):
        # propagate maxlen so list input honours the caller's limit
        return [snakify(e, maxlen=maxlen) for e in name]
    w = name.casefold().strip()
    w = w.replace("-", "_")
    w = _rmpunc.sub("", w)
    w = _wsdedup.sub("_", w)
    w = _usdedup.sub("_", w)
    for long_form, short_form in _abbreviations:
        w = w.replace(long_form, short_form)
    return w[:maxlen]

def snakify_columns(df, inplace=False, maxlen=63):
    """Rename the columns of ``df`` to their ``snakify`` form.

    Parameters:
        df: the pandas DataFrame whose columns are renamed.
        inplace: passed through to ``DataFrame.rename``.
        maxlen: maximum length of each new column name.

    Returns:
        Whatever ``df.rename(columns=..., inplace=inplace)`` returns.

    Raises:
        ValueError: if two columns map to the same new name.
    """
    icols = df.columns.to_list()
    ocols = snakify(icols, maxlen=maxlen)
    if len(set(ocols)) < len(ocols):
        raise ValueError("remapped column names were not unique!")
    # reuse the names that were just checked for uniqueness, so the applied
    # mapping honours maxlen as well
    rename_map = dict(zip(icols, ocols))
    return df.rename(columns=rename_map, inplace=inplace)

# pandas nullable dtype name -> SQL column type
_p2smap = {
    'string': 'varchar',
    'Float64': 'double',
    'Int64': 'bigint',
}

def pandas_type_to_sql(pt):
    """Return the SQL type registered for the pandas dtype name ``pt``.

    Raises:
        ValueError: if ``pt`` has no entry in the mapping table.
    """
    if pt in _p2smap:
        return _p2smap[pt]
    message = "unexpected pandas column type '{pt}'".format(pt=pt)
    raise ValueError(message)

# Column types come from the pandas dtype of each column; pass `typemap`
# to override the SQL type of specific columns by name.
def generate_table_schema_pairs(df, typemap=None):
    """Render the column definitions for a SQL ``create table`` statement.

    Parameters:
        df: the pandas DataFrame whose columns are described.
        typemap: optional dict of column name -> SQL type. A column listed
            here uses the given type as-is; every other column is mapped
            from its pandas dtype with ``pandas_type_to_sql``.

    Returns:
        One ``"    <name> <type>"`` line per column, joined with ``",\\n"``.

    Raises:
        ValueError: if a column without an override has a dtype that
            ``pandas_type_to_sql`` does not recognise.
    """
    overrides = {} if typemap is None else typemap
    lines = []
    for name, dtype in zip(df.columns.to_list(), df.dtypes.to_list()):
        if name in overrides:
            sqltype = overrides[name]
        else:
            sqltype = pandas_type_to_sql(str(dtype))
        lines.append("    {n} {t}".format(n=name, t=sqltype))
    return ",\n".join(lines)

def clean_ref_err(v):
    """Map missing values and the markers '#REF!' / '#ERROR!' to ``pd.NA``.

    Any other value is returned unchanged.
    """
    if pd.isna(v) or v == '#REF!' or v == '#ERROR!':
        return pd.NA
    return v

## Example `credentials.env` file

```
# s3 credentials
DEMO1_S3_ENDPOINT=https://s3.us-east-1.amazonaws.com
DEMO1_S3_BUCKET=ocp-odh-os-demo-s3
DEMO1_S3_ACCESS_KEY=xxx
DEMO1_S3_SECRET_KEY=xxx

# trino credentials
TRINO_USER=xxx
TRINO_PASSWD=xxx
TRINO_HOST=trino-secure-odh-trino.apps.odh-cl1.apps.os-climate.org
TRINO_PORT=443
```

In [3]:
from dotenv import dotenv_values, load_dotenv
import os
import pathlib

# Directory expected to hold credentials.env: $CREDENTIAL_DOTENV_DIR if set,
# otherwise $PWD, otherwise the default application root.
dotenv_dir = os.environ.get(
    'CREDENTIAL_DOTENV_DIR',
    os.environ.get('PWD', '/opt/app-root/src'),
)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')
# Best effort: when the file is absent, whatever is already in the
# environment is used as-is. Values from the file override existing ones.
if os.path.exists(dotenv_path):
    load_dotenv(override=True, dotenv_path=dotenv_path)

In [4]:
import boto3
# Create an S3 client; endpoint and keys are read from the DEMO1_S3_*
# environment variables (a missing variable raises KeyError here)
s3 = boto3.client(
    service_name="s3",
    endpoint_url=os.environ['DEMO1_S3_ENDPOINT'],
    aws_access_key_id=os.environ['DEMO1_S3_ACCESS_KEY'],
    aws_secret_access_key=os.environ['DEMO1_S3_SECRET_KEY'],
)

In [5]:
import trino
# Connect to Trino over https with basic (user / password) authentication
# and certificate verification enabled; settings come from TRINO_* env vars
conn = trino.dbapi.connect(
    auth=trino.auth.BasicAuthentication(os.environ['TRINO_USER'], os.environ['TRINO_PASSWD']),
    host=os.environ['TRINO_HOST'],
    port=int(os.environ['TRINO_PORT']),
    http_scheme='https',
    verify=True,
)
# single cursor reused by every SQL statement in this notebook
cur = conn.cursor()

In [6]:
# make sure the schema that will hold the tables below exists
cur.execute('create schema if not exists demo1.company_data')
cur.fetchall()

[[True]]

## ITR Fundamental Data

In [7]:
# name of the Trino table this section produces
tablename = 'fundamental_data'

# fetch the raw CSV object from the bucket
obj = s3.get_object(
    Bucket=os.environ['DEMO1_S3_BUCKET'],
    Key='itr-demo-data/itr_fundamental_data.csv')

# parse the CSV and convert the columns to pandas' nullable dtypes
# (one convert_dtypes() pass is enough; repeating it changes nothing)
df = pd.read_csv(obj['Body']).convert_dtypes()

# map column names to a form that works for SQL
df = snakify_columns(df)

In [8]:
# examine the structure of the data frame: one row per column, with its
# name, non-null count and dtype
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   company_name              31 non-null     string 
 1   company_id                31 non-null     string 
 2   isic                      0 non-null      Int64  
 3   country                   31 non-null     string 
 4   region                    31 non-null     string 
 5   industry_level_1          0 non-null      Int64  
 6   industry_level_2          0 non-null      Int64  
 7   industry_level_3          0 non-null      Int64  
 8   industry_level_4          0 non-null      Int64  
 9   sector                    31 non-null     string 
 10  company_revenue           30 non-null     Float64
 11  company_market_cap        30 non-null     Float64
 12  company_enterprise_value  30 non-null     Float64
 13  company_total_assets      30 non-null     Float64
 14  company_cash

In [9]:
# write the frame to a local parquet file, then copy that file into the
# bucket under trino/company_data/<table>/
parquet_path = '/tmp/{tname}.parquet'.format(tname=tablename)
df.to_parquet(parquet_path, index=False)
s3.upload_file(
    Filename=parquet_path,
    Bucket=os.environ['DEMO1_S3_BUCKET'],
    Key='trino/company_data/{tname}/{tname}.parquet'.format(tname=tablename),
)

In [10]:
# For this demonstration we simply recreate the table from scratch each time.
# A live data platform will need policies and mechanisms for appending new
# data, overwriting old data, or keeping older copies keyed by a version
# number; that is a data governance topic.
cur.execute('drop table if exists demo1.company_data.{tname}'.format(tname=tablename))
cur.fetchall()

[[True]]

In [11]:
# Generate the SQL column definitions that correspond to the dtypes of the
# columns in the pandas data frame.
# TODO: add a mechanism for overriding types, either here or on the
# data frame itself before it is written out.
schema = generate_table_schema_pairs(df)

# The table is declared over an external location: the s3a:// prefix under
# trino/company_data/<table>/ in the bucket, stored as parquet.
tabledef = """create table if not exists demo1.company_data.{tname}(
{schema}
) with (
    format = 'parquet',
    external_location = 's3a://{bucket}/trino/company_data/{tname}/'
)""".format(schema=schema,bucket=os.environ['DEMO1_S3_BUCKET'],tname=tablename)
print(tabledef)

# tables created externally may not show up immediately in cloud-beaver
cur.execute(tabledef)
cur.fetchall()

create table if not exists demo1.company_data.fundamental_data(
    company_name varchar,
    company_id varchar,
    isic bigint,
    country varchar,
    region varchar,
    industry_level_1 bigint,
    industry_level_2 bigint,
    industry_level_3 bigint,
    industry_level_4 bigint,
    sector varchar,
    company_revenue double,
    company_market_cap double,
    company_enterprise_value double,
    company_total_assets double,
    company_cash_equivalents double,
    target_probability double
) with (
    format = 'parquet',
    external_location = 's3a://ocp-odh-data-bucket1-s3/trino/company_data/fundamental_data/'
)


[[True]]

In [12]:
# test that we can get data
#cur.execute('select country from demo1.company_data.fundamental_data')
#cur.fetchall()

## ITR Projected Production

In [13]:
# name of the Trino table this section produces
tablename = 'projected_production'

# fetch the raw CSV object from the bucket
obj = s3.get_object(
    Key='itr-demo-data/itr_projected_production.csv',
    Bucket=os.environ['DEMO1_S3_BUCKET'],
)

# parse the file, convert to nullable dtypes, and map the column names
# to a form that works for SQL
df = snakify_columns(pd.read_csv(obj['Body']).convert_dtypes())

In [14]:
# a way to examine the structure of a pandas data frame
#df.info(verbose=True)

In [15]:
# keep the two key columns plus the 2020 values, exposing the year
# column under the name y2020
rename = {'2020': 'y2020'}
df = df[['company_id', 'variable', '2020']].rename(columns=rename)

In [16]:
# turn missing values and the '#REF!' / '#ERROR!' markers into pd.NA
df['y2020'] = (df['y2020']).map(clean_ref_err)

In [17]:
# re-infer column dtypes now that the error markers have been replaced
df = df.convert_dtypes()

In [18]:
# pin the value column to Float64, which the schema generator maps to 'double'
df['y2020'] = df['y2020'].astype('Float64')

In [19]:
# write the frame to a local parquet file, then copy that file into the
# bucket under trino/company_data/<table>/
parquet_path = '/tmp/{tname}.parquet'.format(tname=tablename)
df.to_parquet(parquet_path, index=False)
s3.upload_file(
    Filename=parquet_path,
    Bucket=os.environ['DEMO1_S3_BUCKET'],
    Key='trino/company_data/{tname}/{tname}.parquet'.format(tname=tablename),
)

In [20]:
# remove any previous definition of this table so it is recreated from scratch
cur.execute('drop table if exists demo1.company_data.{tname}'.format(tname=tablename))
cur.fetchall()

[[True]]

In [21]:
# derive the SQL column definitions from the dtypes of the data frame
schema = generate_table_schema_pairs(df)

# The table is declared over an external location: the s3a:// prefix under
# trino/company_data/<table>/ in the bucket, stored as parquet.
tabledef = """create table if not exists demo1.company_data.{tname}(
{schema}
) with (
    format = 'parquet',
    external_location = 's3a://{bucket}/trino/company_data/{tname}/'
)""".format(schema=schema,bucket=os.environ['DEMO1_S3_BUCKET'],tname=tablename)
print(tabledef)

# tables created externally may not show up immediately in cloud-beaver
cur.execute(tabledef)
cur.fetchall()

create table if not exists demo1.company_data.projected_production(
    company_id varchar,
    variable varchar,
    y2020 double
) with (
    format = 'parquet',
    external_location = 's3a://ocp-odh-data-bucket1-s3/trino/company_data/projected_production/'
)


[[True]]

In [22]:
# test that we can get data
#cur.execute('select y2020 from demo1.company_data.projected_production')
#cur.fetchall()

## ITR Projected Target

In [26]:
# name of the Trino table this section produces
tablename = 'projected_target'

# fetch the raw CSV object from the bucket
obj = s3.get_object(
    Key='itr-demo-data/itr_projected_target.csv',
    Bucket=os.environ['DEMO1_S3_BUCKET'],
)

# parse the file and map the column names to a form that works for SQL
df = snakify_columns(pd.read_csv(obj['Body']))

In [27]:
# keep the two key columns plus the 2020 values, exposing the year
# column under the name y2020
rename = {'2020': 'y2020'}
df = df[['company_id', 'variable', '2020']].rename(columns=rename)

In [28]:
# turn missing values and the '#REF!' / '#ERROR!' markers into pd.NA
df['y2020'] = (df['y2020']).map(clean_ref_err)

In [29]:
# infer nullable column dtypes now that the error markers have been replaced
df = df.convert_dtypes()

In [30]:
# pin the value column to Float64, which the schema generator maps to 'double'
df['y2020'] = (df['y2020']).astype('Float64')

In [31]:
# write the frame to a local parquet file, then copy that file into the
# bucket under trino/company_data/<table>/
parquet_path = '/tmp/{tname}.parquet'.format(tname=tablename)
df.to_parquet(parquet_path, index=False)
s3.upload_file(
    Filename=parquet_path,
    Bucket=os.environ['DEMO1_S3_BUCKET'],
    Key='trino/company_data/{tname}/{tname}.parquet'.format(tname=tablename),
)

In [32]:
# remove any previous definition of this table so it is recreated from scratch
cur.execute('drop table if exists demo1.company_data.{tname}'.format(tname=tablename))
cur.fetchall()

[[True]]

In [33]:
# derive the SQL column definitions from the dtypes of the data frame
schema = generate_table_schema_pairs(df)

# The table is declared over an external location: the s3a:// prefix under
# trino/company_data/<table>/ in the bucket, stored as parquet.
tabledef = """create table if not exists demo1.company_data.{tname}(
{schema}
) with (
    format = 'parquet',
    external_location = 's3a://{bucket}/trino/company_data/{tname}/'
)""".format(schema=schema,bucket=os.environ['DEMO1_S3_BUCKET'],tname=tablename)
print(tabledef)

# tables created externally may not show up immediately in cloud-beaver
cur.execute(tabledef)
cur.fetchall()

create table if not exists demo1.company_data.projected_target(
    company_id varchar,
    variable varchar,
    y2020 double
) with (
    format = 'parquet',
    external_location = 's3a://ocp-odh-data-bucket1-s3/trino/company_data/projected_target/'
)


[[True]]

## ITR Projected ei in Wh

In [34]:
# name of the Trino table this section produces
tablename = 'projected_ei_in_wh'

# fetch the raw CSV object from the bucket
obj = s3.get_object(
    Key='itr-demo-data/itr_projected_ei_in_Wh.csv',
    Bucket=os.environ['DEMO1_S3_BUCKET'],
)

# parse the file and map the column names to a form that works for SQL
df = snakify_columns(pd.read_csv(obj['Body']))

In [35]:
# keep the two key columns plus the 2020 values, exposing the year
# column under the name y2020
rename = {'2020': 'y2020'}
df = df[['company_id', 'variable', '2020']].rename(columns=rename)

In [36]:
# turn missing values and the '#REF!' / '#ERROR!' markers into pd.NA
df['y2020'] = (df['y2020']).map(clean_ref_err)

In [37]:
# infer nullable column dtypes now that the error markers have been replaced
df = df.convert_dtypes()

In [38]:
# pin the value column to Float64, which the schema generator maps to 'double'
df['y2020'] = (df['y2020']).astype('Float64')

In [39]:
# write the frame to a local parquet file, then copy that file into the
# bucket under trino/company_data/<table>/
parquet_path = '/tmp/{tname}.parquet'.format(tname=tablename)
df.to_parquet(parquet_path, index=False)
s3.upload_file(
    Filename=parquet_path,
    Bucket=os.environ['DEMO1_S3_BUCKET'],
    Key='trino/company_data/{tname}/{tname}.parquet'.format(tname=tablename),
)

In [40]:
# remove any previous definition of this table so it is recreated from scratch
cur.execute('drop table if exists demo1.company_data.{tname}'.format(tname=tablename))
cur.fetchall()

[[True]]

In [41]:
# derive the SQL column definitions from the dtypes of the data frame
schema = generate_table_schema_pairs(df)

# The table is declared over an external location: the s3a:// prefix under
# trino/company_data/<table>/ in the bucket, stored as parquet.
tabledef = """create table if not exists demo1.company_data.{tname}(
{schema}
) with (
    format = 'parquet',
    external_location = 's3a://{bucket}/trino/company_data/{tname}/'
)""".format(schema=schema,bucket=os.environ['DEMO1_S3_BUCKET'],tname=tablename)
print(tabledef)

# tables created externally may not show up immediately in cloud-beaver
cur.execute(tabledef)
cur.fetchall()

create table if not exists demo1.company_data.projected_ei_in_wh(
    company_id varchar,
    variable varchar,
    y2020 double
) with (
    format = 'parquet',
    external_location = 's3a://ocp-odh-data-bucket1-s3/trino/company_data/projected_ei_in_wh/'
)


[[True]]