Install the Trino client for Python.

In [None]:
!pip install trino

Import the packages and load the configuration.

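The _config_ configuration is read from `config.ini` in the working directory and must define `scheme`, `host` and `port` in a `[default]` section.

The _creds_ configuration contains your secrets; you must provide them yourself (in `~/.ssh/trino.ini` in this example):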
```
[default]
user=your username
pass=your password
```

In [None]:
import configparser
import os
from trino.auth import BasicAuthentication
from trino.dbapi import connect

In [None]:
# Secrets (user / pass) are kept outside the project, in the user's home directory.
# Interpolation is disabled so that a password containing '%' is returned verbatim
# by creds.get(...) instead of raising an interpolation error.
creds = configparser.ConfigParser(interpolation=None)
# expanduser resolves '~' without requiring the HOME environment variable to be set.
creds.read(os.path.expanduser('~/.ssh/trino.ini'))

# Non-secret connection settings (scheme, host, port) come from the working directory.
config = configparser.RawConfigParser()
config.read('config.ini')

Open a connection with Trino

In [None]:
# Read every connection setting once so the printed URL and the actual
# connection cannot drift apart.
scheme = config.get('default', 'scheme')
host = config.get('default', 'host')
# getint fails early on a malformed port and hands connect() an integer
# (NOTE(review): the trino client examples pass an int port - confirm).
port = config.getint('default', 'port')
user = creds.get('default', 'user')

print(f"Connect to {scheme}://{host}:{port}/")
conn = connect(
    http_scheme=scheme,
    host=host,
    port=port,
    user=user,
    # The same user name is used for the session and for basic authentication.
    auth=BasicAuthentication(user, creds.get('default', 'pass'))
)

Prepare the SQL query:
- _stop_: names and geolocations of SBB stops
- _shape_: geospatial shapes of the Swiss city boundaries (administrative zones)
- _geo_tagged_stop_: table derived from _stop_ and _shape_ placing stops in their respective cities
- _geo_tagged_istdaten_: actual arrival and departure delays, with information about day of week, hour and city containing the stop (from geo_tagged_stop)

In [None]:
# Plain (non-f) string: the query has no placeholders, and a regular literal
# stays valid if the SQL ever needs a literal '{' or '}'.
# The final SELECT keeps rows with day_week between 1 and 5 and aggregates the
# delays per (name, hour_day).
aggregatePercentiles = """

WITH
    stop  AS (
        SELECT TRY(CAST(substr(stop_id,1,7) as INTEGER)) as bpuic, stop_lat, stop_lon
        FROM iceberg.com490_ice.sbb_stops_parquet_part
        WHERE year=2024 AND month=9 AND day=9
    ),
    shape AS (
        SELECT ST_GeomFromBinary(wkb_geometry) as geometry, name
        FROM iceberg.com490_ice.geo_parquet
        WHERE level='city'
    ),
    geo_tagged_stop AS (
        SELECT stop.bpuic, stop.stop_lat, stop.stop_lon, shape.name
        FROM stop
        JOIN shape ON ST_Contains(shape.geometry, ST_Point(stop.stop_lon, stop.stop_lat))
    ),
    geo_tagged_istdaten AS (
        SELECT day_of_week(istdaten.arr_actual) as day_week,
               hour(istdaten.arr_actual) hour_day,
               (istdaten.arr_actual - istdaten.arr_time) as arr_delay,
               (istdaten.dep_actual - istdaten.dep_time) as dep_delay,
               geo_tagged_stop.name
        FROM iceberg.com490_ice.sbb_istdaten_parquet_part AS istdaten
        JOIN geo_tagged_stop USING (bpuic)
    )

SELECT AVG(arr_delay) as arr_delay, AVG(dep_delay) as dep_delay,
       COUNT() as num, approx_percentile(to_milliseconds(arr_delay)/1000,
       ARRAY[0.25,0.5,0.75]) as percentiles, hour_day, name
       FROM geo_tagged_istdaten WHERE day_week>=1 AND day_week <= 5  GROUP BY name,hour_day ORDER BY name,hour_day
"""

Execute the query and get the results.

You can for instance iterate the cursor and write the rows to file or create a pandas DataFrame.

```
%%time
from contextlib import closing
import pandas as pd
# closing() calls cur.close() when the with-block exits, even on error.
with closing(conn.cursor()) as cur:
    cur.execute(aggregatePercentiles)
    # The first field of each cur.description entry is used as the column name.
    columns = [col[0] for col in cur.description]
    # The cursor itself is handed to pandas as the source of rows.
    df = pd.DataFrame(cur, columns=columns)
```

You can also use `pandas.read_sql_query` directly. With this second option you will need to suppress a warning, because pandas does not know that Trino is DBAPI2 compliant.

In [None]:
%%time
import pandas as pd
import warnings

with warnings.catch_warnings():
    # Catch UserWarning: pandas works without warnings with sqlite and SQLAlchemy only
    warnings.simplefilter("ignore", category=UserWarning)
    df = pd.read_sql_query(aggregatePercentiles, conn)

In [None]:
# Show the query result as the cell output.
df

In [None]:
# Close the Trino connection opened earlier in the notebook.
conn.close()