In [1]:
#pip install boto3 pyarrow

In [2]:
import re
import pandas as pd
# Pre-compiled patterns used by snakify().
_wsdedup = re.compile(r"\s+")            # any run of whitespace
_usdedup = re.compile(r"__+")            # two or more consecutive underscores
_rmpunc = re.compile(r"[,.()&$/+-]+")    # punctuation that is dropped entirely

# Long words that are shortened, applied in this order.
# Other words are common in the sample names but have no agreed standard
# abbreviation yet, so they are deliberately left alone for now:
#   inference -> inf, emissions -> emis, intensity -> int,
#   reported -> rep, revenue -> rev
_abbrevs = (
    ("average", "avg"),
    ("maximum", "max"),
    ("minimum", "min"),
    ("absolute", "abs"),
    ("source", "src"),
    ("distribution", "dist"),
)

# 63 seems to be a common max column name length
def snakify(name, maxlen=63):
    """Convert a name (or a list of names) to a snake_case identifier.

    The name is case-folded and stripped of surrounding whitespace, hyphens
    become underscores, the punctuation matched by ``_rmpunc`` is removed,
    runs of whitespace and of underscores collapse to a single underscore,
    the words listed in ``_abbrevs`` are abbreviated, and the result is cut
    to at most ``maxlen`` characters.

    Parameters
    ----------
    name : str or list
        A single name, or a (possibly nested) list of names.
    maxlen : int, default 63
        Maximum length of each returned name.

    Returns
    -------
    str or list
        The converted name; for list input, a list of converted names in the
        same order.
    """
    if isinstance(name, list):
        # pass maxlen on, otherwise list input would always be cut at the default
        return [snakify(e, maxlen=maxlen) for e in name]
    w = name.casefold().strip()
    w = w.replace("-", "_")
    w = _rmpunc.sub("", w)
    w = _wsdedup.sub("_", w)
    w = _usdedup.sub("_", w)
    for long_form, short_form in _abbrevs:
        w = w.replace(long_form, short_form)
    return w[:maxlen]

def snakify_columns(df, inplace=False, maxlen=63):
    """Rename the columns of ``df`` to their snakify() form.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed.
    inplace : bool, default False
        Passed through to ``DataFrame.rename``; when True ``df`` itself is
        modified.
    maxlen : int, default 63
        Maximum length of each new column name.

    Returns
    -------
    pandas.DataFrame or None
        Whatever ``DataFrame.rename`` returns: the renamed frame, or None
        when ``inplace=True``.

    Raises
    ------
    ValueError
        If two columns map to the same new name.
    """
    # read the columns of the frame that was passed in, not of a notebook global
    icols = df.columns.to_list()
    ocols = snakify(icols, maxlen=maxlen)
    if len(set(ocols)) < len(ocols):
        raise ValueError("remapped column names were not unique!")
    # reuse the names that were just checked for uniqueness so that the
    # rename honours maxlen as well
    rename_map = dict(zip(icols, ocols))
    return df.rename(columns=rename_map, inplace=inplace)

# pandas dtype name -> SQL column type used in the generated table schema
_p2smap = dict(
    string='varchar',
    Float64='double',
    Int64='bigint',
)

def pandas_type_to_sql(pt):
    """Return the SQL type registered in ``_p2smap`` for the dtype name ``pt``.

    Raises ValueError when ``pt`` has no entry in the map.
    """
    if pt not in _p2smap:
        raise ValueError("unexpected pandas column type '{pt}'".format(pt=pt))
    return _p2smap[pt]

# Open question: accept an optional dict of per-column overrides, and use
# the given SQL type whenever a column name appears in it?
def generate_table_schema_pairs(df):
    """Render the columns of ``df`` as the body of a SQL column list.

    Each column becomes one line of the form ``    <name> <sql type>``, where
    the SQL type is looked up from the string form of the column dtype via
    pandas_type_to_sql(). The lines are joined with ",\\n" and returned as a
    single string.
    """
    rows = []
    for colname, dtype in zip(df.columns.to_list(), df.dtypes.to_list()):
        sqltype = pandas_type_to_sql(str(dtype))
        rows.append("    {n} {t}".format(n=colname, t=sqltype))
    return ",\n".join(rows)

In [3]:
# Security: credentials are never hard-coded in the notebook. They are read
# from a private file holding, one per line, the endpoint URL, the access
# key, the secret key and the bucket name.
with open('/opt/app-root/src/s3-auth.txt', 'r') as file:
    lines = [line.rstrip() for line in file]

s3_endpoint_url, s3_access_key, s3_secret_key, s3_bucket = (
    lines[0], lines[1], lines[2], lines[3]
)
# do not keep the raw file content around in a spare variable
del lines

# display only the non-secret values
[s3_endpoint_url, s3_bucket]

['https://s3.us-east-1.amazonaws.com', 'ocp-odh-os-demo-s3']

In [4]:
# Trino login is read from a private file: user name on the first line,
# password on the second.
with open('/opt/app-root/src/trino-auth.txt', 'r') as file:
    lines = [line.rstrip() for line in file]

trino_user, trino_passwd = lines[0], lines[1]
# do not keep the raw file content around in a spare variable
del lines

# connection target of the Trino service
trino_host = 'trino-secure-odh-trino.apps.odh-cl1.apps.os-climate.org'
trino_port = 443

# display only the user name, never the password
trino_user

'erik'

In [5]:
import boto3
# S3 client bound to the endpoint and the credentials read from s3-auth.txt
s3 = boto3.client(
    "s3",
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=s3_access_key,
    aws_secret_access_key=s3_secret_key,
)

In [6]:
import pandas as pd
# Fetch the sample emissions-targets CSV object from the bucket.
obj = s3.get_object(
    Bucket=s3_bucket,
    Key='urgentem/UrgentemDataSampleEmissionsTargetsDec2020.csv')
# Parse the CSV straight from the response body, then let pandas choose
# dtypes for the columns with convert_dtypes().
dfEmissions = (pd.read_csv(obj['Body'])).convert_dtypes()
# Rename the columns to snake_case on dfEmissions itself (inplace=True).
snakify_columns(dfEmissions, inplace=True)
# Preview the first rows of the renamed frame.
dfEmissions.head()

Unnamed: 0,company_name,isin,target_type,scope,coverage_s1,coverage_s2,coverage_s3,reduction_ambition,base_year,end_year,start_year,base_year_ghg_emissions_s1_tco2e,base_year_ghg_emissions_s1s2_tco2e,base_year_ghg_emissions_s3_tco2e,achieved_reduction
0,3M CO,US88579Y1010,Absolute,S1+S2,1.0,1.0,,0.5,2002,2025,2015,,18300000.0,91500000.0,0.3
1,ADIDAS AG,DE000A1EWWW0,Absolute,S1+S2,0.9,0.9,,0.15,2015,2020,2015,,59132.0,295660.0,1.0
2,BARCLAYS PLC,GB0031348658,Absolute,S1+S2,1.0,1.0,,0.37,2018,2025,2018,,282593.0,1412965.0,0.0
3,DANONE,FR0000120644,Absolute,S1+S2,0.95,0.95,,0.3,2015,2030,2017,,1681235.0,8406175.0,0.68
4,EQUINOR ASA,NO0010096985,Absolute,S1,1.0,,,0.21,2016,2030,2017,9329201.0,,,0.06


In [7]:
# a way to examine the structure of a pandas data frame
#dfEmissions.info(verbose=True)

In [8]:
# Write the frame to a local parquet file (without the index), then copy
# that file into the bucket under the itr_emissions_sample_test prefix.
# Parquet offers several other ways to append or update data (adding new
# files, appending, sharded directory trees, ...); here a single file is written.
dfEmissions.to_parquet('/tmp/emissions.parquet', index=False)
s3.upload_file(
    '/tmp/emissions.parquet',
    s3_bucket,
    'urgentem/trino/itr_emissions_sample_test/emissions.parquet',
)

In [9]:
import trino
# Connect to Trino over HTTPS (certificate verification on) with basic
# authentication, and open the cursor used by the following cells.
conn = trino.dbapi.connect(
    host=trino_host,
    port=trino_port,
    http_scheme='https',
    verify=True,
    auth=trino.auth.BasicAuthentication(trino_user, trino_passwd),
)
cur = conn.cursor()

In [10]:
# For this demonstration example, the table is simply recreated from scratch
# on every run. A live data platform will need policies and mechanisms for
# appending new data, overwriting old data, or saving off copies keyed by a
# version number; this is a data governance topic.
cur.execute('drop table if exists hive.urgentem.itr_emissions_sample_test')
# fetch the result of the statement issued above
cur.fetchall()

[[True]]

In [11]:
# Generate a SQL column list that corresponds to the data types of the
# columns in the pandas DF.
# to-do: add some mechanisms for overriding types, either here
# or on the pandas data-frame itself before we write it out
schema = generate_table_schema_pairs(dfEmissions)

# The external location is built from s3_bucket rather than a hard-coded
# bucket name, so the table always points at the bucket the parquet file
# was uploaded to.
tabledef = """create table if not exists hive.urgentem.itr_emissions_sample_test(
{schema}
) with (
    format = 'parquet',
    external_location = 's3a://{bucket}/urgentem/trino/itr_emissions_sample_test/'
)""".format(schema=schema, bucket=s3_bucket)
print(tabledef)

# tables created externally may not show up immediately in cloud-beaver
cur.execute(tabledef)
cur.fetchall()

create table if not exists hive.urgentem.itr_emissions_sample_test(
    company_name varchar,
    isin varchar,
    target_type varchar,
    scope varchar,
    coverage_s1 double,
    coverage_s2 double,
    coverage_s3 bigint,
    reduction_ambition double,
    base_year bigint,
    end_year bigint,
    start_year bigint,
    base_year_ghg_emissions_s1_tco2e varchar,
    base_year_ghg_emissions_s1s2_tco2e varchar,
    base_year_ghg_emissions_s3_tco2e varchar,
    achieved_reduction double
) with (
    format = 'parquet',
    external_location = 's3a://ocp-odh-os-demo-s3/urgentem/trino/itr_emissions_sample_test/'
)


[[True]]

In [12]:
# test creation of new table
#cur.execute('show tables in hive.urgentem')
#cur.fetchall()

In [13]:
# test that we can get data
#cur.execute('select isin from hive.urgentem.itr_emissions_sample_test')
#cur.fetchall()