Run the following in a notebook cell if you need to install these packages into your notebook environment:
```
%%capture pipoutput
# The 'capture' cell magic above must be the first line of the cell;
# it prevents long outputs from spamming your notebook

# For loading predefined environment variables from files
# Typically used to load sensitive access credentials
%pip install python-dotenv

# Standard python package for interacting with S3 buckets
%pip install boto3

# Interacting with Trino and using Trino with sqlalchemy
%pip install trino sqlalchemy sqlalchemy-trino

# Pandas and parquet file i/o
%pip install pandas pyarrow fastparquet

# OS-Climate utilities to make data ingest easier
%pip install osc-ingest-tools
```

In [1]:
import os
import pathlib
from dotenv import load_dotenv

# Pull standard environment variables (typically access credentials) from a
# dot-env file when one is present. A missing file is not treated as an error,
# which leaves room for these variables to be supplied some other way.
dotenv_dir = os.environ.get(
    'CREDENTIAL_DOTENV_DIR',
    os.environ.get('PWD', '/opt/app-root/src'),
)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')
if os.path.exists(dotenv_path):
    # override=True: values from the file replace any already in the environment
    load_dotenv(override=True, dotenv_path=dotenv_path)

In [2]:
import boto3
# S3 resource built from endpoint and credentials held in the environment;
# a missing variable raises KeyError right here rather than at first use.
s3 = boto3.resource(
    "s3",
    endpoint_url=os.environ["S3_DEV_ENDPOINT"],
    aws_access_key_id=os.environ["S3_DEV_ACCESS_KEY"],
    aws_secret_access_key=os.environ["S3_DEV_SECRET_KEY"],
)

# Handle on the bucket that later cells upload to, list, and delete from
bucket = s3.Bucket(os.environ["S3_DEV_BUCKET"])

In [3]:
import trino
from sqlalchemy.engine import create_engine

# SQLAlchemy URL for Trino; user, host and port all come from the environment
sqlstring = f"trino://{os.environ['TRINO_USER']}@{os.environ['TRINO_HOST']}:{os.environ['TRINO_PORT']}/"

# Extra arguments handed to the Trino driver when a connection is opened.
# NOTE(review): TRINO_PASSWD is passed to JWTAuthentication, so it presumably
# holds a JWT token rather than a password — confirm.
sqlargs = dict(
    auth=trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD']),
    http_scheme='https',
)

engine = create_engine(sqlstring, connect_args=sqlargs)
connection = engine.connect()

In [4]:
# Trino location of the table this notebook works with. Later cells
# interpolate these into SQL as {ingest_catalog}.{ingest_schema}.{ingest_table};
# the schema and table names are also used to build the S3 key prefix
# 'trino/{ingest_schema}/{ingest_table}/'.
ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'demo'
ingest_table = 'parquet_partitions_tutorial'

In [5]:
# Columns to partition the data on. Later cells pass this list to
# `to_parquet(partition_cols=...)` and render it into the table's
# `partitioned_by` property.
partition_columns = ['year']

In [6]:
import pandas as pd
# Small demo dataset: one row per year, each with a single metric value
data = [['2020', 10], ['2021', 15], ['2022', 14]]

# Build the frame and let pandas pick the best-fitting dtype for each column
df1 = pd.DataFrame(data, columns=['year', 'metric']).convert_dtypes()

In [7]:
# to do: put this in osc-ingest-tools
def enforce_partition_column_order(df, pcols, inplace=False):
    """Move the partition columns to the end of ``df``, in ``pcols`` order.

    All other columns keep their relative order.

    Args:
        df: pandas DataFrame whose columns should be reordered.
        pcols: Iterable of column names to place last, in the given order.
        inplace: When False (default) return a reordered frame and leave
            ``df`` untouched. When True reorder ``df`` itself and return None.

    Returns:
        The reordered DataFrame when ``inplace`` is False, otherwise None.

    Raises:
        ValueError: If any name in ``pcols`` is not a column of ``df``.
            Raised before ``df`` is modified.
    """
    cols = list(df.columns.values)
    # Validate up front so the in-place path can never leave df half-reordered
    missing = [c for c in pcols if c not in cols]
    if missing:
        raise ValueError(f'partition column(s) not found in dataframe: {missing}')
    if not inplace:
        for c in pcols:
            cols.remove(c)
            cols.append(c)
        return df[cols]
    # Only the partition columns have to move: popping one and re-assigning it
    # appends it after every remaining column.
    for c in pcols:
        df[c] = df.pop(c)

In [8]:
# Reorder df1 in place so that the partition column(s) come last, then show it.
# NOTE(review): the Trino Hive connector is understood to require partition
# columns to be the last columns of the table definition — confirm against
# the connector documentation.
enforce_partition_column_order(df1, partition_columns, inplace=True)
df1

Unnamed: 0,metric,year
0,10,2020
1,15,2021
2,14,2022


In [9]:
# Remove the table definition from Trino, if there is one
sql = f"""
drop table if exists {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
qres = engine.execute(sql)
print(qres.fetchall())

# The hive connector does not manage the underlying files for you, so
# a table is only truly dropped once its data files are removed by hand.
(
    bucket.objects
    .filter(Prefix=f'trino/{ingest_schema}/{ingest_table}/')
    .delete()
)

[(True,)]


[{'ResponseMetadata': {'RequestId': 'M91YEP0813EZH0ZA',
   'HostId': 'TiYsfaLZOCcZcIo+/n2vN/vWdqFsSzOkrFhd3rTNej6N1GQynjWdRBd1D6OUnoYnJFfjfJIUzPg=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'TiYsfaLZOCcZcIo+/n2vN/vWdqFsSzOkrFhd3rTNej6N1GQynjWdRBd1D6OUnoYnJFfjfJIUzPg=',
    'x-amz-request-id': 'M91YEP0813EZH0ZA',
    'date': 'Fri, 03 Dec 2021 23:15:53 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'trino/demo/parquet_partitions_tutorial/year=2021/c0ded545f68843b8a34903c1aa17f391.parquet'},
   {'Key': 'trino/demo/parquet_partitions_tutorial/year=2020/eeef91d926af44c2b3458697b3243613.parquet'},
   {'Key': 'trino/demo/parquet_partitions_tutorial/year=2022/537b859071174d9fb5955b2264abc793.parquet'}]}]

In [10]:
# todo: add this utility to osc-ingest-tools
import os
def upload_directory_to_s3(path, bucket, prefix):
    """Upload every file under the local directory ``path`` to ``bucket``.

    The directory layout below ``path`` is mirrored under ``prefix``:
    ``<path>/a/b.parquet`` is uploaded with key ``<prefix>/a/b.parquet``.
    Keys are always joined with ``/``, independent of the local OS separator.

    Args:
        path: Local directory to walk recursively. A trailing separator is
            accepted.
        bucket: Object exposing ``upload_file(src, dst)``, e.g. a boto3 S3
            ``Bucket`` resource.
        prefix: Key prefix for the uploaded objects. Trailing slashes are
            ignored; an empty prefix yields keys relative to ``path``.
    """
    key_root = prefix.rstrip('/')
    for subdir, dirs, files in os.walk(path):
        # Use the location of subdir relative to path instead of a substring
        # replace, which breaks when path ends with a separator or occurs
        # more than once inside subdir.
        rel = os.path.relpath(subdir, path)
        rel_parts = [] if rel == os.curdir else rel.split(os.sep)
        for f in files:
            src = os.path.join(subdir, f)
            # Skip empty components so no key gets a leading or doubled '/'
            dst = '/'.join(p for p in (key_root, *rel_parts, f) if p)
            bucket.upload_file(src, dst)

In [11]:
import shutil
tmp = f'/tmp/{ingest_table}'

# Start from a clean local directory: pandas will not clear out a
# pre-existing destination on its own.
shutil.rmtree(tmp, ignore_errors=True)

# Have pandas write a directory tree, split up by the partition columns
df1.to_parquet(tmp, index=False, partition_cols=partition_columns)

# Push the tree to S3.
# `to_parquet` above generates unique file names, so data already present
# on S3 is NOT overwritten: from the point of view of the trino DB this is
# effectively an "append". To overwrite trino's data instead, remove the
# pre-existing objects on S3 before uploading.
upload_directory_to_s3(
    path=tmp,
    bucket=bucket,
    prefix=f'trino/{ingest_schema}/{ingest_table}',
)

In [12]:
# Print the key of every object stored under the table's S3 prefix
for e in bucket.objects.filter(
    Prefix=f'trino/{ingest_schema}/{ingest_table}/'
):
    print(e.key)

trino/demo/parquet_partitions_tutorial/year=2020/ae44418ebbb246b891ce4bdff939b2f5.parquet
trino/demo/parquet_partitions_tutorial/year=2021/d1001551ca81425eaa0eafc4e33468be.parquet
trino/demo/parquet_partitions_tutorial/year=2022/c8db9cc2e12a4521913c100a052a486d.parquet


In [13]:
import osc_ingest_trino as osc
# Column definitions for the table, generated from the dataframe
columnschema = osc.create_table_schema_pairs(df1)

# Render the partition columns as a SQL array body explicitly instead of
# relying on Python's list repr: this also works for tuples and escapes any
# single quote inside a column name.
partitioned_by_sql = ', '.join(
    "'{}'".format(str(c).replace("'", "''")) for c in partition_columns
)

# The table is declared over the S3 prefix the parquet files were uploaded to
tabledef = f"""
create table if not exists {ingest_catalog}.{ingest_schema}.{ingest_table}(
{columnschema}
) with (
    format = 'parquet',
    partitioned_by = array[{partitioned_by_sql}],
    external_location = 's3a://{bucket.name}/trino/{ingest_schema}/{ingest_table}/'
)
"""
print(tabledef)
qres = engine.execute(tabledef)
print(qres.fetchall())


create table if not exists osc_datacommons_dev.demo.parquet_partitions_tutorial(
    metric bigint,
    year varchar
) with (
    format = 'parquet',
    partitioned_by = array['year'],
    external_location = 's3a://ocp-odh-os-demo-s3/trino/demo/parquet_partitions_tutorial/'
)

[(True,)]


In [14]:
# Read the whole table back through Trino into a dataframe and display it
sql = f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
df = pd.read_sql(sql=sql, con=engine)
df

Unnamed: 0,metric,year
