Run the following commands in a notebook cell if you need to install these packages into your notebook environment.

```python
# 'capture' magic prevents long outputs from spamming your notebook
%%capture pipoutput

# For loading predefined environment variables from files
# Typically used to load sensitive access credentials
%pip install python-dotenv

# Standard python package for interacting with S3 buckets
%pip install boto3

# Interacting with Trino and using Trino with sqlalchemy
%pip install trino sqlalchemy sqlalchemy-trino

# Pandas and parquet file i/o
%pip install pandas pyarrow fastparquet

# OS-Climate utilities to make data ingest easier
%pip install osc-ingest-tools
```

In [4]:
from dotenv import dotenv_values, load_dotenv
import os
import pathlib

# Directory that holds credentials.env: CREDENTIAL_DOTENV_DIR if set,
# otherwise PWD, otherwise a hard-coded default path.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')

# Only load when the file is present. override=True lets values from the
# file replace variables that are already set in the environment.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [5]:
import boto3
# Connection settings for the S3 resource are read from the environment;
# a missing variable raises KeyError.
s3_settings = {
    "service_name": "s3",
    "endpoint_url": os.environ["S3_DEV_ENDPOINT"],
    "aws_access_key_id": os.environ["S3_DEV_ACCESS_KEY"],
    "aws_secret_access_key": os.environ["S3_DEV_SECRET_KEY"],
}
s3 = boto3.resource(**s3_settings)

# Handle on the bucket named by S3_DEV_BUCKET.
bucket = s3.Bucket(os.environ["S3_DEV_BUCKET"])

In [35]:
# Destination of the ingest, used below as catalog.schema.table.
ingest_catalog, ingest_schema, ingest_table = (
    "osc_datacommons_iceberg_dev",  # catalog
    "eje",                          # schema
    "quarterly_data_01",            # table
)

In [37]:
# telling sqlalchemy about catalog has to be done in the sqlstring url:
import trino
from sqlalchemy.engine import create_engine

# Pieces of the connection URL; the catalog is carried in the URL path.
url_parts = {
    'user': os.environ['TRINO_USER'],
    'host': os.environ['TRINO_HOST'],
    'port': os.environ['TRINO_PORT'],
    'catalog': ingest_catalog,
}
sqlstring = 'trino://{user}@{host}:{port}/{catalog}'.format(**url_parts)

# Authenticate with the token stored in TRINO_PASSWD, over https.
jwt_auth = trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD'])
sqlargs = dict(auth=jwt_auth, http_scheme='https')

engine = create_engine(sqlstring, connect_args=sqlargs)
connection = engine.connect()

In [28]:
import pandas as pd
# Sample rows: one [quarter, reduction] pair per entry.
data1 = [['2021Q4', 0.6], ['2021Q4', 0.7], ['2021Q4', 0.8]]

# Build the frame from data1 and let pandas pick its nullable extension
# dtypes for the columns via convert_dtypes().
df1 = pd.DataFrame(data1, columns=['quarter', 'reduction']).convert_dtypes()

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df1.info(verbose=True)
df1

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   quarter    3 non-null      string 
 1   reduction  3 non-null      Float64
dtypes: Float64(1), string(1)
memory usage: 179.0 bytes
None


Unnamed: 0,quarter,reduction
0,2021Q4,0.6
1,2021Q4,0.7
2,2021Q4,0.8


In [38]:
# The schema has to exist before the table is created; otherwise the
# table creation fails in confusing ways.
# NOTE(review): Engine.execute() is deprecated in SQLAlchemy 1.4 and removed
# in 2.0 - confirm the pinned SQLAlchemy version before upgrading.
sql = f"""
create schema if not exists {ingest_catalog}.{ingest_schema}
"""
qres = engine.execute(sql)
schema_rows = qres.fetchall()
print(schema_rows)

[(True,)]


In [39]:
import osc_ingest_trino as osc
# Column definitions for the DDL, generated from the dataframe
# (the recorded output shows "quarter varchar, reduction double").
columnschema = osc.create_table_schema_pairs(df1)

# Table definition: ORC format, partitioned by the 'quarter' column.
tabledef = f"""
create table if not exists {ingest_catalog}.{ingest_schema}.{ingest_table}(
{columnschema}
) with (
    format = 'ORC',
    partitioning = array['quarter']
)
"""
print(tabledef)

qres = engine.execute(tabledef)
table_rows = qres.fetchall()
print(table_rows)


create table if not exists osc_datacommons_iceberg_dev.eje.quarterly_data_01(
    quarter varchar,
    reduction double
) with (
    format = 'ORC',
    partitioning = array['quarter']
)

[(True,)]


In [50]:
# Remove every row from the target table (the statement has no WHERE clause).
sql = f"""
delete from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
qres = engine.execute(sql)
delete_rows = qres.fetchall()
print(delete_rows)

[(None,)]


In [56]:
# Notes on the options passed to to_sql:
#   schema    - must be given here; the catalog was set in the engine URL
#   if_exists - 'append', because the table has already been created
#   index     - False, unless an index column was declared on the table
#   method    - 'multi' is important; the default will not work
to_sql_options = dict(
    con=engine,
    schema=ingest_schema,
    if_exists='append',
    index=False,
    method='multi',
)
df1.to_sql(ingest_table, **to_sql_options)

In [57]:
# Read the whole table back into a dataframe to check what was written.
sql = f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
pd.read_sql(sql, con=engine)

Unnamed: 0,quarter,reduction
0,2021Q4,0.6
1,2021Q4,0.7
2,2021Q4,0.8


In [61]:
# Query the table's "$snapshots" metadata table for the five most recent
# snapshots, newest first (ordered by committed_at descending).
sql = f"""
select snapshot_id, committed_at from {ingest_catalog}.{ingest_schema}."{ingest_table}$snapshots"
    order by committed_at desc
    limit 5
"""
qres = engine.execute(sql)

# Rows are (snapshot_id, committed_at) pairs.
snapshots = qres.fetchall()
snapshots

[(1540796013034024563, '2021-11-24 22:08:40.617 UTC'),
 (4394807411222590725, '2021-11-24 22:05:17.691 UTC'),
 (7463047492821112181, '2021-11-24 22:03:55.906 UTC'),
 (4266993433696443252, '2021-11-24 21:55:22.995 UTC'),
 (1101692520317405417, '2021-11-24 21:40:36.148 UTC')]

In [63]:
# snapshots is ordered newest first, so index 1 is the snapshot just before
# the latest one; its first column is the snapshot id.
second_newest = snapshots[1]
previous_snapshot = second_newest[0]
previous_snapshot

4394807411222590725

In [64]:
# Roll the table back to the snapshot id chosen in previous_snapshot, using
# the catalog's system.rollback_to_snapshot procedure.
sql = f"""
call {ingest_catalog}.system.rollback_to_snapshot('{ingest_schema}', '{ingest_table}', {previous_snapshot})
"""
qres = engine.execute(sql)
rollback_rows = qres.fetchall()
print(rollback_rows)

[(True,)]


In [66]:
# Read the whole table again to see its contents after the rollback call.
sql = f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
pd.read_sql(sql, con=engine)

Unnamed: 0,quarter,reduction
