Run these in a notebook cell if you need to install onto your nb env
```
# 'capture' magic prevents long outputs from spamming your notebook
%%capture pipoutput

# For loading predefined environment variables from files
# Typically used to load sensitive access credentials
%pip install python-dotenv

# Standard python package for interacting with S3 buckets
%pip install boto3

# Interacting with Trino and using Trino with sqlalchemy
%pip install trino sqlalchemy sqlalchemy-trino

# Pandas and parquet file i/o
%pip install pandas pyarrow fastparquet

# OS-Climate utilities to make data ingest easier
%pip install osc-ingest-tools
```

In [1]:
%xmode Minimal

Exception reporting mode: Minimal


In [2]:
%%capture pipoutput
%pip install python-dotenv boto3 trino sqlalchemy sqlalchemy-trino pandas pyarrow fastparquet 

In [3]:
%%capture pipoutput
%pip install --upgrade osc-ingest-tools==0.2.1

In [4]:
# this notebook assumes osc-ingest-tools == 0.2.1
import osc_ingest_trino as osc
osc.load_credentials_dotenv()
bucket = osc.attach_s3_bucket('S3_DEV')
engine = osc.attach_trino_engine(verbose=True)

using connect string: trino://erikerlandson@trino-secure-odh-trino.apps.odh-cl1.apps.os-climate.org:443


Ingest table is on hive catalog

In [5]:
ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'demo'
ingest_table = 'parquet_partitions_tutorial'

In [6]:
partition_columns = ['year']

In [7]:
import random
import pandas as pd

datasize = 10000000
yeardata = [random.randrange(2020,2023) for x in range(datasize)]
metricdata = [random.random() for x in range(datasize)]

df1 = pd.DataFrame(data={"year": yeardata, "metric": metricdata})
df1.convert_dtypes()

Unnamed: 0,year,metric
0,2021,0.139033
1,2021,0.04874
2,2021,0.220513
3,2022,0.567443
4,2021,0.630204
...,...,...
9999995,2022,0.061562
9999996,2020,0.470861
9999997,2022,0.718266
9999998,2020,0.758396


In [8]:
df1.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 2 columns):
 #   Column  Dtype  
---  ------  -----  
 0   year    int64  
 1   metric  float64
dtypes: float64(1), int64(1)
memory usage: 152.6 MB


In [9]:
osc.enforce_partition_column_order(df1, partition_columns, inplace=True)
df1

Unnamed: 0,metric,year
0,0.139033,2021
1,0.048740,2021
2,0.220513,2021
3,0.567443,2022
4,0.630204,2021
...,...,...
9999995,0.061562,2022
9999996,0.470861,2020
9999997,0.718266,2022
9999998,0.758396,2020


In [10]:
osc.drop_unmanaged_table(ingest_catalog, ingest_schema, ingest_table, engine, bucket, verbose=True)

[{'ResponseMetadata': {'RequestId': '1KCJ99CYJTKT8M29', 'HostId': 'TtEU8WkbS/wlp0O0Il6O6kxfcb8iWItkazI3jf+hfcpQxnWx7y9r4D13+XT6xR5tSNqBcsOBA6U=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'TtEU8WkbS/wlp0O0Il6O6kxfcb8iWItkazI3jf+hfcpQxnWx7y9r4D13+XT6xR5tSNqBcsOBA6U=', 'x-amz-request-id': '1KCJ99CYJTKT8M29', 'date': 'Fri, 03 Jun 2022 23:58:41 GMT', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3', 'connection': 'close'}, 'RetryAttempts': 0}, 'Deleted': [{'Key': 'trino/demo/parquet_partitions_tutorial/year=2021/08238079797f4628b23938b73bb44593-0.parquet'}, {'Key': 'trino/demo/parquet_partitions_tutorial/year=2022/08238079797f4628b23938b73bb44593-0.parquet'}, {'Key': 'trino/demo/parquet_partitions_tutorial/year=2020/08238079797f4628b23938b73bb44593-0.parquet'}]}]


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fa1f642e220>

In [11]:
osc.ingest_unmanaged_parquet(df1, ingest_schema, ingest_table, bucket,
                             partition_columns=partition_columns,
                             verbose=True)

/tmp/parquet_partitions_tutorial/year=2022/2f914d2af2d444aa93a80e8969c11e09-0.parquet  -->  trino/demo/parquet_partitions_tutorial/year=2022/2f914d2af2d444aa93a80e8969c11e09-0.parquet
/tmp/parquet_partitions_tutorial/year=2021/2f914d2af2d444aa93a80e8969c11e09-0.parquet  -->  trino/demo/parquet_partitions_tutorial/year=2021/2f914d2af2d444aa93a80e8969c11e09-0.parquet
/tmp/parquet_partitions_tutorial/year=2020/2f914d2af2d444aa93a80e8969c11e09-0.parquet  -->  trino/demo/parquet_partitions_tutorial/year=2020/2f914d2af2d444aa93a80e8969c11e09-0.parquet


In [12]:
for e in bucket.objects \
    .filter(Prefix=f'trino/{ingest_schema}/{ingest_table}/'):
    print(e.key)

trino/demo/parquet_partitions_tutorial/year=2020/2f914d2af2d444aa93a80e8969c11e09-0.parquet
trino/demo/parquet_partitions_tutorial/year=2021/2f914d2af2d444aa93a80e8969c11e09-0.parquet
trino/demo/parquet_partitions_tutorial/year=2022/2f914d2af2d444aa93a80e8969c11e09-0.parquet


In [13]:
sql = osc.unmanaged_parquet_tabledef(df1, ingest_catalog, ingest_schema, ingest_table, bucket,
                                    partition_columns = partition_columns,
                                    verbose = True)
qres = engine.execute(sql)
print(qres.fetchall())

create table if not exists osc_datacommons_dev.demo.parquet_partitions_tutorial (
    metric double,
    year bigint
) with (
    format = 'parquet',
    partitioned_by = array['year'],
    external_location = 's3a://ocp-odh-os-demo-s3/trino/demo/parquet_partitions_tutorial/'
)
[(True,)]


In [14]:
sql = f"""
call {ingest_catalog}.system.sync_partition_metadata('{ingest_schema}', '{ingest_table}', 'FULL')
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(True,)]


In [15]:
sql = f"""
select * from {ingest_catalog}.{ingest_schema}."{ingest_table}$partitions"
"""
df = pd.read_sql(sql, engine)
df

Unnamed: 0,year
0,2020
1,2021
2,2022


In [16]:
sql = f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
where metric < 0.1 and year = 2022
"""
df = pd.read_sql(sql, engine)
df

Unnamed: 0,metric,year
0,0.083461,2022
1,0.095466,2022
2,0.089612,2022
3,0.080941,2022
4,0.043161,2022
...,...,...
333707,0.001402,2022
333708,0.036057,2022
333709,0.006256,2022
333710,0.041585,2022


In [17]:
iceberg_catalog = 'osc_datacommons_iceberg_dev'
iceberg_schema = 'iceberg_demo'
iceberg_table = 'hive_iceberg_ingest'

In [18]:
sql = f"""
drop table if exists {iceberg_catalog}.{iceberg_schema}.{iceberg_table}
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(True,)]


In [19]:
import osc_ingest_trino as osc
columnschema = osc.create_table_schema_pairs(df1)

tabledef = f"""
create table if not exists {iceberg_catalog}.{iceberg_schema}.{iceberg_table}(
{columnschema}
) with (
    format = 'parquet',
    partitioning = array['year']
)
"""
print(tabledef)
qres = engine.execute(tabledef)
print(qres.fetchall())


create table if not exists osc_datacommons_iceberg_dev.iceberg_demo.hive_iceberg_ingest(
    metric double,
    year bigint
) with (
    format = 'parquet',
    partitioning = array['year']
)

[(True,)]


In [20]:
sql = f"""
insert into {iceberg_catalog}.{iceberg_schema}.{iceberg_table} (metric, year)
select metric, year from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(10000000,)]
