In [1]:
# Switch IPython's exception reporting to "Minimal" so a failing cell does not
# dump a full stack trace into the tutorial output.
%xmode Minimal

Exception reporting mode: Minimal


In [2]:
%%capture pipoutput

# Install some basic python packages.
# The '%%capture' cell magic above intercepts the pip install log output and
# stores it in `pipoutput` instead of printing it; it has to remain the first
# line of the cell.
# NOTE(review): the packages are installed unpinned, so a re-run may pick up
# different versions.
%pip install python-dotenv boto3 pandas pyarrow fastparquet 

In [3]:
%%capture pipoutput

# Install (or upgrade to the latest release of) the OSC ingest tools.
# The pip log is captured rather than printed. This cell reuses the
# `pipoutput` name, so it replaces the output captured by the earlier
# install cell.
%pip install --upgrade osc-ingest-tools

In [4]:
# We will be loading sql queries directly into pandas data frames
import pandas as pd

In [5]:
# We will use some OSC ingest tools to load our Trino credentials
# and then attach a SQLAlchemy engine to the OSC Trino db.
import osc_ingest_trino as osc
# Presumably reads the credentials from a dotenv file -- confirm the expected
# file name/location against the osc-ingest-tools documentation.
osc.load_credentials_dotenv()

# `engine` is the connection handle passed to every pd.read_sql call below.
# With verbose=True the call prints the connect string it uses (user name and
# host included), so review or clear this cell's output before sharing.
engine = osc.attach_trino_engine(verbose=True)

using connect string: trino://erikerlandson@trino-secure-odh-trino.apps.odh-cl2.apps.os-climate.org:443


In [6]:
# Start with a basic query to list the catalogs defined in the Trino db.
# For this tutorial we will be using the 'riskthinking' catalog.
# The query is a plain string literal: it has no placeholders to interpolate,
# so no f-string prefix is needed.
sql = """
show catalogs
"""
# Leaving the DataFrame as the last expression lets the notebook render it.
pd.read_sql(sql, engine)

Unnamed: 0,Catalog
0,jmx
1,kafka_fx
2,kafka_osclimate
3,osc_datacommons_dev
4,osc_datacommons_hive_ingest
5,riskthinking
6,system
7,uwm_prometheus


In [7]:
# List the schemas inside the riskthinking BigQuery catalog.
# Currently the only schema that works with Trino is 'rtai_indices'.
# Plain string literal: the query contains nothing to interpolate.
sql = """
show schemas in riskthinking
"""
pd.read_sql(sql, engine)

Unnamed: 0,Schema
0,information_schema
1,osclimate
2,riskthinking_osclimate_big_query
3,rtai_indices
4,test


In [8]:
# List the tables under the riskthinking 'rtai_indices' schema.
# You can see that there are several hundred tables.
# Plain string literal: the query contains nothing to interpolate.
sql = """
show tables in riskthinking.rtai_indices
"""
# Keep the result in `tables` so later cells can look up table names from it.
tables = pd.read_sql(sql, engine)
tables

Unnamed: 0,Table
0,consecutive_frost_days-djf-historical-hx3
1,consecutive_frost_days-djf-ssp119-hx3
2,consecutive_frost_days-djf-ssp126-hx3
3,consecutive_frost_days-djf-ssp245-hx3
4,consecutive_frost_days-djf-ssp370-hx3
...,...
574,wind_max_daily_mean-ys-ssp245-hx3
575,wind_max_daily_mean-ys-ssp370-hx3
576,wind_max_daily_mean-ys-ssp434-hx3
577,wind_max_daily_mean-ys-ssp460-hx3


In [9]:
# Let's look at the first table name in this list.
# .iloc[0] selects by position, so this is the first row regardless of how
# the frame is indexed (a bare [0] is a label lookup on the index).
tables['Table'].iloc[0]

'consecutive_frost_days-djf-historical-hx3'

In [10]:
# List the first five rows in the first table.
# NOTE: BigQuery table names may include characters such as '-' (dash) that
# are not standard SQL identifier chars, so we enclose the table name in
# double quotes so Trino will not complain.
# The query is a plain (non-f) string literal: nothing is interpolated, and a
# literal table name containing '{' or '}' would otherwise be misread as a
# placeholder.
sql = """
select * from riskthinking.rtai_indices."consecutive_frost_days-djf-historical-hx3" limit 5
"""
pd.read_sql(sql, engine)

Unnamed: 0,hex_id,giss_e2_1_g,bcc_esm1,mpi_esm1_2_lr,gfdl_cm4,mpi_esm_1_2_ham,taiesm1,mpi_esm1_2_hr,cmcc_esm2,nesm3,...,fgoals_f3_l,miroc6,noresm2_mm,access_esm1_5,awi_cm_1_1_mr,bcc_csm2_mr,fgoals_g3,ipsl_cm6a_lr,noresm2_lm,timestamp
0,589973818928267263,67.18449,67.18449,63.523415,57.979715,63.863451,25.266594,67.18449,20.565836,67.18449,...,67.18449,67.18449,67.18449,59.520141,67.099174,67.18449,67.18449,46.697125,67.18449,1950-12-01 00:00:00.000 UTC
1,589974093806174207,67.18449,67.18449,47.119071,46.103194,50.96762,14.183392,67.18449,20.155347,67.18449,...,67.18449,62.229504,43.827301,38.973883,64.931378,67.18449,67.18449,25.900684,57.506326,1950-12-01 00:00:00.000 UTC
2,589974162525650943,67.18449,67.18449,60.94044,58.470312,63.125178,16.680061,67.18449,22.454019,67.18449,...,67.18449,67.18449,56.550181,54.782116,67.105203,67.18449,67.18449,53.28732,67.18449,1950-12-01 00:00:00.000 UTC
3,589989212091056127,64.047527,67.18449,14.285428,24.61988,10.845663,10.610088,23.313532,13.006809,31.34126,...,67.18449,19.077533,20.990096,36.792411,28.939532,67.18449,67.18449,13.556732,23.224574,1950-12-01 00:00:00.000 UTC
4,589989280810532863,64.380924,67.18449,10.788894,18.534346,8.466927,8.666372,22.067101,13.149396,26.061907,...,67.18449,18.228818,20.476592,31.436757,27.692107,67.18449,67.18449,13.714369,23.100953,1950-12-01 00:00:00.000 UTC
