Run these in a notebook cell if you need to install onto your nb env
```
# 'capture' magic prevents long outputs from spamming your notebook
%%capture pipoutput

# For loading predefined environment variables from files
# Typically used to load sensitive access credentials
%pip install python-dotenv

# Standard python package for interacting with S3 buckets
%pip install boto3

# Interacting with Trino and using Trino with sqlalchemy
%pip install trino sqlalchemy sqlalchemy-trino

# Pandas and parquet file i/o
%pip install pandas pyarrow fastparquet

# OS-Climate utilities to make data ingest easier
%pip install osc-ingest-tools
```

In [1]:
%xmode Minimal

Exception reporting mode: Minimal


In [2]:
%%capture pipoutput
%pip install python-dotenv boto3 trino sqlalchemy sqlalchemy-trino pandas pyarrow fastparquet 

In [3]:
%%capture pipoutput
%pip install --upgrade osc-ingest-tools==0.2.1

In [4]:
ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'demo'
ingest_table = 'parquet_partitions_tutorial'

In [5]:
partition_columns = ['year']

In [6]:
iceberg_catalog = 'osc_datacommons_iceberg_dev'
iceberg_schema = 'iceberg_demo'
iceberg_table = 'hive_iceberg_ingest'
iceberg_table_tosql = 'iceberg_tosql'

In [7]:
# this notebook assumes osc-ingest-tools == 0.2.1
import osc_ingest_trino as osc
osc.load_credentials_dotenv()
bucket = osc.attach_s3_bucket('S3_DEV')
engine = osc.attach_trino_engine(catalog = iceberg_catalog, verbose=True)

using connect string: trino://erikerlandson@trino-secure-odh-trino.apps.odh-cl1.apps.os-climate.org:443/osc_datacommons_iceberg_dev


Ingest table is on hive catalog

In [8]:
import random
import pandas as pd

datasize = 100000
yeardata = [random.randrange(2020,2023) for x in range(datasize)]
metricdata = [random.random() for x in range(datasize)]

df1 = pd.DataFrame(data={"year": yeardata, "metric": metricdata})
df1.convert_dtypes()

Unnamed: 0,year,metric
0,2020,0.236327
1,2020,0.284953
2,2020,0.267217
3,2020,0.278004
4,2022,0.663924
...,...,...
99995,2020,0.490976
99996,2020,0.299551
99997,2020,0.167506
99998,2020,0.77609


In [9]:
df1.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   year    100000 non-null  int64  
 1   metric  100000 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.5 MB


In [10]:
osc.enforce_partition_column_order(df1, partition_columns, inplace=True)
df1

Unnamed: 0,metric,year
0,0.236327,2020
1,0.284953,2020
2,0.267217,2020
3,0.278004,2020
4,0.663924,2022
...,...,...
99995,0.490976,2020
99996,0.299551,2020
99997,0.167506,2020
99998,0.776090,2020


In [11]:
osc.drop_unmanaged_table(ingest_catalog, ingest_schema, ingest_table, engine, bucket, verbose=True)

[{'ResponseMetadata': {'RequestId': 'FXX07RMPCDJ8XVGY', 'HostId': 'cLLLzaE1w0ti9uyTHXPvsoN3v9xUJAJLvImQwSFSK0XgE+hzlmyxaeMS0MkILc6olFWJqbr8cJY=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'cLLLzaE1w0ti9uyTHXPvsoN3v9xUJAJLvImQwSFSK0XgE+hzlmyxaeMS0MkILc6olFWJqbr8cJY=', 'x-amz-request-id': 'FXX07RMPCDJ8XVGY', 'date': 'Sat, 04 Jun 2022 18:07:43 GMT', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3', 'connection': 'close'}, 'RetryAttempts': 0}, 'Deleted': [{'Key': 'trino/demo/parquet_partitions_tutorial/year=2021/355edeac7c99443da3e4fa911705583c-0.parquet'}, {'Key': 'trino/demo/parquet_partitions_tutorial/year=2020/355edeac7c99443da3e4fa911705583c-0.parquet'}, {'Key': 'trino/demo/parquet_partitions_tutorial/year=2022/355edeac7c99443da3e4fa911705583c-0.parquet'}]}]


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fc2bd183ca0>

In [12]:
%%time
osc.ingest_unmanaged_parquet(df1, ingest_schema, ingest_table, bucket,
                             partition_columns=partition_columns,
                             verbose=True)

/tmp/parquet_partitions_tutorial/year=2020/b31e4b82e0ba403fac3b3000abce37b5-0.parquet  -->  trino/demo/parquet_partitions_tutorial/year=2020/b31e4b82e0ba403fac3b3000abce37b5-0.parquet
/tmp/parquet_partitions_tutorial/year=2022/b31e4b82e0ba403fac3b3000abce37b5-0.parquet  -->  trino/demo/parquet_partitions_tutorial/year=2022/b31e4b82e0ba403fac3b3000abce37b5-0.parquet
/tmp/parquet_partitions_tutorial/year=2021/b31e4b82e0ba403fac3b3000abce37b5-0.parquet  -->  trino/demo/parquet_partitions_tutorial/year=2021/b31e4b82e0ba403fac3b3000abce37b5-0.parquet
CPU times: user 47 ms, sys: 17.8 ms, total: 64.7 ms
Wall time: 335 ms


In [13]:
for e in bucket.objects \
    .filter(Prefix=f'trino/{ingest_schema}/{ingest_table}/'):
    print(e.key)

trino/demo/parquet_partitions_tutorial/year=2020/b31e4b82e0ba403fac3b3000abce37b5-0.parquet
trino/demo/parquet_partitions_tutorial/year=2021/b31e4b82e0ba403fac3b3000abce37b5-0.parquet
trino/demo/parquet_partitions_tutorial/year=2022/b31e4b82e0ba403fac3b3000abce37b5-0.parquet


In [14]:
sql = osc.unmanaged_parquet_tabledef(df1, ingest_catalog, ingest_schema, ingest_table, bucket,
                                    partition_columns = partition_columns,
                                    verbose = True)
qres = engine.execute(sql)
print(qres.fetchall())

create table if not exists osc_datacommons_dev.demo.parquet_partitions_tutorial (
    metric double,
    year bigint
) with (
    format = 'parquet',
    partitioned_by = array['year'],
    external_location = 's3a://ocp-odh-os-demo-s3/trino/demo/parquet_partitions_tutorial/'
)
[(True,)]


In [15]:
sql = f"""
call {ingest_catalog}.system.sync_partition_metadata('{ingest_schema}', '{ingest_table}', 'FULL')
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(True,)]


In [16]:
sql = f"""
select * from {ingest_catalog}.{ingest_schema}."{ingest_table}$partitions"
"""
df = pd.read_sql(sql, engine)
df

Unnamed: 0,year
0,2020
1,2021
2,2022


In [17]:
sql = f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
where metric < 0.05 and year = 2022
"""
df = pd.read_sql(sql, engine)
df

Unnamed: 0,metric,year
0,0.038092,2022
1,0.025893,2022
2,0.039929,2022
3,0.004726,2022
4,0.008931,2022
...,...,...
1646,0.028098,2022
1647,0.047064,2022
1648,0.041197,2022
1649,0.035970,2022


In [18]:
sql = f"""
drop table if exists {iceberg_catalog}.{iceberg_schema}.{iceberg_table}
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(True,)]


In [19]:
import osc_ingest_trino as osc
columnschema = osc.create_table_schema_pairs(df1)

tabledef = f"""
create table if not exists {iceberg_catalog}.{iceberg_schema}.{iceberg_table}(
{columnschema}
) with (
    format = 'parquet',
    partitioning = array['year']
)
"""
print(tabledef)
qres = engine.execute(tabledef)
print(qres.fetchall())


create table if not exists osc_datacommons_iceberg_dev.iceberg_demo.hive_iceberg_ingest(
    metric double,
    year bigint
) with (
    format = 'parquet',
    partitioning = array['year']
)

[(True,)]


In [20]:
%%time
sql = f"""
insert into {iceberg_catalog}.{iceberg_schema}.{iceberg_table} (metric, year)
select metric, year from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(100000,)]
CPU times: user 14 ms, sys: 966 µs, total: 15 ms
Wall time: 1.43 s


In [21]:
import math
import os
from datetime import datetime

import trino
from sqlalchemy.engine import create_engine
from sqlalchemy.sql import text

class TrinoBatchInsert(object):
    def __init__(self, catalog=None, schema=None, batch_size=1000, optimize=False, verbose=False):
        self.catalog = catalog
        self.schema = schema
        self.batch_size = batch_size
        self.optimize = optimize
        self.verbose = verbose

    # conforms to signature expected by pandas 'callable' value for method kw arg
    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html
    # https://pandas.pydata.org/docs/user_guide/io.html#io-sql-method
    def __call__(self, sqltbl, dbcxn, columns, data_iter):
        fqname = self._full_table_name(sqltbl)
        batch = []
        self.ninserts = 0
        for r in data_iter:
            # each row of data_iter is a python tuple
            # I cannot currently make good use of sqlalchemy ':params'
            # and so I have to do my own "manual" value formatting for insertions
            row = ", ".join([TrinoBatchInsert._sqlform(e) for e in r])
            row = f"({row})"
            batch.append(row)
            # possible alternative: dispatch batches by total batch size in bytes
            if len(batch) >= self.batch_size:
                self._do_insert(dbcxn, fqname, batch)
                batch = []
        if len(batch) > 0:
            self._do_insert(dbcxn, fqname, batch)
        if self.optimize:
            if self.verbose:
                print("optimizing table files")
            sql = text(f"alter table {fqname} execute optimize")
            qres = dbcxn.execute(sql)
            x = qres.fetchall()
            if self.verbose:
                print(f"execute optimize: {x}")

    def _do_insert(self, dbcxn, fqname, batch):
        if self.verbose:
            print(f"inserting {len(batch)} records")
            TrinoBatchInsert._print_batch(batch)
        # trino is not currently supporting sqlalchemy cursor.executemany()
        # and so I am generating an insert command with no ':params' that
        # includes all batch data as literal sql values
        valclause = ",\n".join(batch)
        # injecting raw sql strings is deprecated and will be illegal in sqlalchemy 2.x
        # using text() is the correct way:
        sql = text(f"insert into {fqname} values\n{valclause}")
        # if self.verbose: print(f'{sql}')
        qres = dbcxn.execute(sql)
        x = qres.fetchall()
        if self.verbose:
            print(f"batch insert result: {x}")

    def _full_table_name(self, sqltbl):
        # start with table name
        name = f"{sqltbl.name}"
        # prepend schema - allow override from this class
        name = f"{self.schema or sqltbl.schema}.{name}"
        # prepend catalog, if provided
        if self.catalog is not None:
            name = f"{self.catalog}.{name}"
        if self.verbose:
            print(f'constructed fully qualified table name as: "{name}"')
        return name

    @staticmethod
    def _sqlform(x):
        if x is None:
            return "NULL"
        if isinstance(x, str):
            # escape any single quotes in the string
            t = x.replace("'", "''")
            # colons are mostly a problem for ':some_id_name', which is interpreted as
            # a parameter requiring binding, but just escaping them all works
            t = t.replace(":", "\\:")
            # enclose string with single quotes
            return f"'{t}'"
        if isinstance(x, datetime):
            return f"TIMESTAMP '{x}'"
        if isinstance(x, float):
            if math.isnan(x):
                return "nan()"
            if math.isinf(x):
                if x < 0:
                    return "-infinity()"
                return "infinity()"
        return str(x)

    @staticmethod
    def _print_batch(batch):
        if len(batch) > 5:
            print("\n".join(f"  {e}" for e in batch[:3]))
            print("  ...")
            print(f"  {batch[-1]}")
        else:
            print("\n".join(f"  {e}" for e in batch))

In [22]:
sql = f"""
drop table if exists {iceberg_catalog}.{iceberg_schema}.{iceberg_table_tosql}
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(True,)]


In [23]:
columnschema = osc.create_table_schema_pairs(df1)

tabledef = f"""
create table if not exists {iceberg_catalog}.{iceberg_schema}.{iceberg_table_tosql}(
{columnschema}
) with (
    format = 'parquet',
    partitioning = array['year']
)
"""
print(tabledef)
qres = engine.execute(tabledef)
print(qres.fetchall())


create table if not exists osc_datacommons_iceberg_dev.iceberg_demo.iceberg_tosql(
    metric double,
    year bigint
) with (
    format = 'parquet',
    partitioning = array['year']
)

[(True,)]


In [24]:
%%time
df1.to_sql(iceberg_table_tosql,
           con=engine,
           schema=iceberg_schema,
           if_exists='append',
           index=False,
           method=TrinoBatchInsert(catalog = iceberg_catalog, batch_size = 10000, verbose = True))

constructed fully qualified table name as: "osc_datacommons_iceberg_dev.iceberg_demo.iceberg_tosql"
inserting 10000 records
  (0.2363267296442988, 2020)
  (0.28495342468047025, 2020)
  (0.26721730150989764, 2020)
  ...
  (0.36733658467995256, 2020)
batch insert result: [(10000,)]
inserting 10000 records
  (0.1073199769094757, 2022)
  (0.6040171521454266, 2020)
  (0.8017334232530048, 2020)
  ...
  (0.07489413375778031, 2022)
batch insert result: [(10000,)]
inserting 10000 records
  (0.03276188622718912, 2020)
  (0.29457713400891994, 2021)
  (0.35411996702132553, 2022)
  ...
  (0.7302995205471225, 2021)
batch insert result: [(10000,)]
inserting 10000 records
  (0.8352265518994908, 2020)
  (0.11706624824011358, 2020)
  (0.9951036349813431, 2021)
  ...
  (0.7629798084864959, 2022)
batch insert result: [(10000,)]
inserting 10000 records
  (0.8058771459126449, 2020)
  (0.3004698206676726, 2020)
  (0.15043774796325327, 2020)
  ...
  (0.6674073713470688, 2020)
batch insert result: [(10000,)]
i