Run the following in a notebook cell if you need to install these packages into your notebook environment:
```
# 'capture' magic prevents long outputs from spamming your notebook
%%capture pipoutput

# For loading predefined environment variables from files
# Typically used to load sensitive access credentials
%pip install python-dotenv

# Standard python package for interacting with S3 buckets
%pip install boto3

# Interacting with Trino and using Trino with sqlalchemy
%pip install trino sqlalchemy sqlalchemy-trino

# Pandas and parquet file i/o
%pip install pandas pyarrow fastparquet

# OS-Climate utilities to make data ingest easier
%pip install osc-ingest-tools
```

In [2]:
# this notebook assumes osc-ingest-tools >= 0.2.0
import osc_ingest_trino as osc
# Presumably loads access credentials from dotenv file(s) so the attach_* calls
# below can find them -- confirm against the osc-ingest-tools documentation.
osc.load_credentials_dotenv()
# 'S3_DEV' selects which S3 configuration to attach
# (assumed to be a credential-name prefix -- TODO confirm).
bucket = osc.attach_s3_bucket('S3_DEV')
# Engine used for every Trino query in the cells below.
engine = osc.attach_trino_engine()

In [3]:
# Destination of the tutorial table, addressed in SQL as <catalog>.<schema>.<table>.
# The schema and table names are also used to build the S3 key prefix
# 'trino/<schema>/<table>/' when listing the bucket.
ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'demo'
ingest_table = 'parquet_partitions_tutorial'

In [4]:
# Columns the parquet data is partitioned by; passed to the osc ingest and
# table-definition helpers in later cells.
partition_columns = ['year']

In [5]:
import pandas as pd
# First batch of sample rows: one metric value per year.
# 'year' is deliberately text, not a number (it is used as the partition column).
data = [['2020', 10], ['2021', 15], ['2022', 14]]
# Build the frame and switch to pandas' nullable extension dtypes in one step.
df1 = pd.DataFrame(data, columns=['year', 'metric']).convert_dtypes()

In [6]:
# Reorder df1 in place for partitioned ingest. As the displayed frame shows,
# the partition column ('year') ends up after the non-partition column.
osc.enforce_partition_column_order(df1, partition_columns, inplace=True)
df1

Unnamed: 0,metric,year
0,10,2020
1,15,2021
2,14,2022


In [7]:
# Start from a clean slate: drop any table left over from a previous run.
# With verbose=True the S3 delete response is printed; it lists the table's
# existing parquet objects that were removed from the bucket.
osc.drop_unmanaged_table(ingest_catalog, ingest_schema, ingest_table, engine, bucket, verbose=True)

[{'ResponseMetadata': {'RequestId': 'RJZQFTWK5B7N7A1M', 'HostId': 'Wx/Hc4BEry+c2i7wjIA5Vo/d2aheuI/YgBotE+P7C7sYnzG2heMbr18vanheETqY1z0RF7wO13Q=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'Wx/Hc4BEry+c2i7wjIA5Vo/d2aheuI/YgBotE+P7C7sYnzG2heMbr18vanheETqY1z0RF7wO13Q=', 'x-amz-request-id': 'RJZQFTWK5B7N7A1M', 'date': 'Sun, 05 Dec 2021 15:15:51 GMT', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3', 'connection': 'close'}, 'RetryAttempts': 0}, 'Deleted': [{'Key': 'trino/demo/parquet_partitions_tutorial/year=2021/4e3ab70e95c14cdab797ec3d436feb11.parquet'}, {'Key': 'trino/demo/parquet_partitions_tutorial/year=2022/de1af663382f4da1b85454b0fd9ace44.parquet'}, {'Key': 'trino/demo/parquet_partitions_tutorial/year=2024/ca8b0f0aa61f48a0a344460cae8bff4d.parquet'}, {'Key': 'trino/demo/parquet_partitions_tutorial/year=2023/1a76544770b342c494b9c6ec578433ab.parquet'}, {'Key': 'trino/demo/parquet_partitions_tutorial/year=2020/672b79c84998434c8af3ba845258

<sqlalchemy.engine.result.ResultProxy at 0x7f2fc4aabe50>

In [8]:
# Upload df1 to the bucket as parquet, split by the partition columns.
# verbose=True prints each local file and the S3 key it is copied to.
osc.ingest_unmanaged_parquet(
    df1,
    ingest_schema,
    ingest_table,
    bucket,
    partition_columns=partition_columns,
    verbose=True,
)

/tmp/parquet_partitions_tutorial/year=2020/3b495ac0d15748268d2e9f9ec7d45642.parquet  -->  trino/demo/parquet_partitions_tutorial/year=2020/3b495ac0d15748268d2e9f9ec7d45642.parquet
/tmp/parquet_partitions_tutorial/year=2021/6928287dbdbd40adb02ee43984bcc21e.parquet  -->  trino/demo/parquet_partitions_tutorial/year=2021/6928287dbdbd40adb02ee43984bcc21e.parquet
/tmp/parquet_partitions_tutorial/year=2022/a41cc3e7c81f44b199d7aecd13aa455d.parquet  -->  trino/demo/parquet_partitions_tutorial/year=2022/a41cc3e7c81f44b199d7aecd13aa455d.parquet


In [9]:
# Show every object currently stored under this table's S3 prefix.
table_prefix = f'trino/{ingest_schema}/{ingest_table}/'
for obj in bucket.objects.filter(Prefix=table_prefix):
    print(obj.key)

trino/demo/parquet_partitions_tutorial/year=2020/3b495ac0d15748268d2e9f9ec7d45642.parquet
trino/demo/parquet_partitions_tutorial/year=2021/6928287dbdbd40adb02ee43984bcc21e.parquet
trino/demo/parquet_partitions_tutorial/year=2022/a41cc3e7c81f44b199d7aecd13aa455d.parquet


In [10]:
# Generate the table definition from df1 (printed because verbose=True),
# then execute it against Trino and show the result rows.
sql = osc.unmanaged_parquet_tabledef(
    df1,
    ingest_catalog,
    ingest_schema,
    ingest_table,
    bucket,
    partition_columns=partition_columns,
    verbose=True,
)
qres = engine.execute(sql)
print(qres.fetchall())

create table if not exists osc_datacommons_dev.demo.parquet_partitions_tutorial (
    metric bigint,
    year varchar
) with (
    format = 'parquet',
    partitioned_by = array['year'],
    external_location = 's3a://ocp-odh-os-demo-s3/trino/demo/parquet_partitions_tutorial/'
)
[(True,)]


In [11]:
# Ask the catalog to reconcile its partition list with what is in storage.
# Per the Trino Hive connector documentation, 'FULL' mode both adds and drops
# partition entries so the metastore matches the table location.
sql = f"""
call {ingest_catalog}.system.sync_partition_metadata('{ingest_schema}', '{ingest_table}', 'FULL')
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(True,)]


In [12]:
# List the partitions the catalog currently knows about by querying the
# "<table>$partitions" table (double-quoted because the name contains '$').
sql = f"""
select * from {ingest_catalog}.{ingest_schema}."{ingest_table}$partitions"
"""
df = pd.read_sql(sql, engine)
df

Unnamed: 0,year
0,2020
1,2021
2,2022


In [13]:
# Read the whole table back into a DataFrame.
# NOTE: the query has no ORDER BY, so the row order shown is whatever the
# engine happens to return and may differ between runs.
sql = f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
df = pd.read_sql(sql, engine)
df

Unnamed: 0,metric,year
0,14,2022
1,15,2021
2,10,2020


In [14]:
# Second batch of sample rows; 2022 overlaps with the first batch,
# while 2023 and 2024 are new partition values.
data = [['2022', 30], ['2023', 35], ['2024', 40]]
# Build the frame and switch to pandas' nullable extension dtypes in one step.
df2 = pd.DataFrame(data, columns=['year', 'metric']).convert_dtypes()
# Reorder columns in place for partitioned ingest, then display the result.
osc.enforce_partition_column_order(df2, partition_columns, inplace=True)
df2

Unnamed: 0,metric,year
0,30,2022
1,35,2023
2,40,2024


In [15]:
# Upload df2 to the same table location as parquet, split by the partition columns.
# verbose=True prints each local file and the S3 key it is copied to.
osc.ingest_unmanaged_parquet(
    df2,
    ingest_schema,
    ingest_table,
    bucket,
    partition_columns=partition_columns,
    verbose=True,
)

/tmp/parquet_partitions_tutorial/year=2022/1377770283354cd7a4874b013ef4dea9.parquet  -->  trino/demo/parquet_partitions_tutorial/year=2022/1377770283354cd7a4874b013ef4dea9.parquet
/tmp/parquet_partitions_tutorial/year=2023/4c5701cf885948c6beb414288a17b19a.parquet  -->  trino/demo/parquet_partitions_tutorial/year=2023/4c5701cf885948c6beb414288a17b19a.parquet
/tmp/parquet_partitions_tutorial/year=2024/378baaefec0a44bea04938ae6f3134e9.parquet  -->  trino/demo/parquet_partitions_tutorial/year=2024/378baaefec0a44bea04938ae6f3134e9.parquet


In [16]:
# Show every object stored under this table's S3 prefix after the second ingest.
table_prefix = f'trino/{ingest_schema}/{ingest_table}/'
for obj in bucket.objects.filter(Prefix=table_prefix):
    print(obj.key)

trino/demo/parquet_partitions_tutorial/year=2020/3b495ac0d15748268d2e9f9ec7d45642.parquet
trino/demo/parquet_partitions_tutorial/year=2021/6928287dbdbd40adb02ee43984bcc21e.parquet
trino/demo/parquet_partitions_tutorial/year=2022/1377770283354cd7a4874b013ef4dea9.parquet
trino/demo/parquet_partitions_tutorial/year=2022/a41cc3e7c81f44b199d7aecd13aa455d.parquet
trino/demo/parquet_partitions_tutorial/year=2023/4c5701cf885948c6beb414288a17b19a.parquet
trino/demo/parquet_partitions_tutorial/year=2024/378baaefec0a44bea04938ae6f3134e9.parquet


In [17]:
# Re-run the partition sync so that partitions added to storage since the
# previous sync are picked up by the catalog.
# Per the Trino Hive connector documentation, 'FULL' mode both adds and drops
# partition entries so the metastore matches the table location.
sql = f"""
call {ingest_catalog}.system.sync_partition_metadata('{ingest_schema}', '{ingest_table}', 'FULL')
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(True,)]


In [18]:
# List the partitions the catalog knows about after the re-sync by querying the
# "<table>$partitions" table (double-quoted because the name contains '$').
sql = f"""
select * from {ingest_catalog}.{ingest_schema}."{ingest_table}$partitions"
"""
df = pd.read_sql(sql, engine)
df

Unnamed: 0,year
0,2020
1,2021
2,2022
3,2023
4,2024


In [19]:
# Read the whole table back into a DataFrame, sorted for a stable display.
# 'year' alone is not a total order here: 2022 has one row from each ingest,
# so 'metric' is added as a tie-breaker to make the row order deterministic.
sql = f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
order by year asc, metric asc
"""
df = pd.read_sql(sql, engine)
df

Unnamed: 0,metric,year
0,10,2020
1,15,2021
2,14,2022
3,30,2022
4,35,2023
5,40,2024
