# Trino Iceberg Datalakehouse - Getting Started

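This notebook walks through Iceberg tables on Trino end to end: basic DDL and DML, Iceberg metadata tables (snapshots, manifests, files), hidden partitioning, compaction with `OPTIMIZE`, time travel, and inspecting the table DDL.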

In [1]:
# Install Trino client if not already in the Jupyter image
# %pip install trino sqlalchemy pandas

Note: you may need to restart the kernel to use updated packages.


In [1]:
from sqlalchemy import create_engine, text
import pandas as pd

# --- Connection settings shared by every cell below ---

# Service name in docker-compose
TRINO_HOST = 'trino-coordinator'
# Port used in the connection URL for the coordinator
TRINO_PORT = 8080
# Can be any string, Trino by default doesn't enforce auth in this setup
TRINO_USER = 'testuser'
# Catalog name as defined in iceberg.properties
CATALOG = 'iceberg'

# Connection string for Trino, shaped as trino://<user>@<host>:<port>/<catalog>.
# No schema is part of the URL, so later queries qualify names with the catalog
# (and schema) explicitly.
trino_conn_str = f'trino://{TRINO_USER}@{TRINO_HOST}:{TRINO_PORT}/{CATALOG}'
# Single engine object reused by the query helper for every statement.
engine = create_engine(trino_conn_str)

In [2]:
def run_trino_query(query, fetch_results=True):
    """Run one SQL statement on Trino through the shared SQLAlchemy engine.

    Args:
        query: SQL text of a single statement. It is passed to ``text()``
            unchanged, so callers are responsible for its contents.
        fetch_results: When True (default) the result is returned as a
            DataFrame. When False the statement is only executed and a
            confirmation line is printed.

    Returns:
        - ``fetch_results=True`` and the statement yields a result set:
          a DataFrame with one column per result key.
        - ``fetch_results=True`` and there is no result set (e.g. DDL):
          an empty DataFrame with no columns.
        - ``fetch_results=False``: ``None``.
    """
    with engine.connect() as connection:
        result_proxy = connection.execute(text(query))
        # Read the flag once, before the cursor is consumed or closed.
        returns_rows = result_proxy.returns_rows

        if not fetch_results:
            if returns_rows:
                # DML such as INSERT hands its row count back as a result set.
                # Drain it so the statement has been driven to completion
                # before success is reported.
                # NOTE(review): some Trino client versions only finish a query
                # once its results are fetched -- confirm for the pinned client.
                result_proxy.fetchall()
            print(f"Query executed successfully (returns_rows={returns_rows}).")
            # rowcount is deliberately not reported: it might not be reliable
            # for all statements/drivers.
            return None

        if returns_rows:
            column_names = list(result_proxy.keys())
            return pd.DataFrame(result_proxy.fetchall(), columns=column_names)

        # No result set at all: hand back an empty frame so callers can still
        # use DataFrame methods such as .empty on the return value.
        return pd.DataFrame(columns=[])

# NOTE: this line only echoes the configured URL. No query has been sent yet,
# so it does not prove the coordinator is reachable; the first
# run_trino_query() call is what actually talks to Trino.
print(f"Connected to Trino: {trino_conn_str}")

Connected to Trino: trino://testuser@trino-coordinator:8080/iceberg


## 1. Create Schema (Namespace in Iceberg)

In [3]:
SCHEMA_NAME = 'trino_schema'

# DDL is assembled first so the statement is readable on its own; the schema is
# pinned to a folder of the same name inside the warehouse bucket.
create_schema_sql = (
    f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA_NAME} "
    f"WITH (location = 's3a://iceberg-warehouse/{SCHEMA_NAME}/')"
)
run_trino_query(query=create_schema_sql, fetch_results=False)
print(f"Schema '{SCHEMA_NAME}' created or already exists.")

# List every schema in the catalog to confirm the new one is visible.
list_schemas_sql = f"SHOW SCHEMAS FROM {CATALOG}"
print("\nAvailable schemas in Iceberg catalog:")
schemas_df = run_trino_query(list_schemas_sql)
print(schemas_df)

Query executed successfully (returns_rows=False).
Schema 'trino_schema' created or already exists.

Available schemas in Iceberg catalog:
               Schema
0  information_schema
1              system
2        trino_schema


## 2. Create an Iceberg Table

In [5]:
# Peek at the tables currently registered in the demo schema. The call is left
# as the cell's last expression so Jupyter renders the returned DataFrame.
show_tables_sql = f"SHOW TABLES FROM {CATALOG}.{SCHEMA_NAME}"
run_trino_query(show_tables_sql)

Unnamed: 0,Table


In [4]:
# Demo table name and its fully qualified form (catalog.schema.table).
TABLE_NAME = 'employees'
FQN_TABLE_NAME = ".".join([CATALOG, SCHEMA_NAME, TABLE_NAME])

# Parquet-backed Iceberg table, partitioned by the department column.
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS {FQN_TABLE_NAME} (
    id INT,
    name VARCHAR,
    department VARCHAR,
    salary DECIMAL(10, 2),
    hire_date DATE
)
WITH (
    format = 'PARQUET',
    partitioning = ARRAY['department']
)
"""
run_trino_query(query=create_table_sql, fetch_results=False)
print(f"Table '{FQN_TABLE_NAME}' created or already exists.")

# Confirm the table is now listed in the schema.
list_tables_sql = f"SHOW TABLES FROM {CATALOG}.{SCHEMA_NAME}"
print(f"\nTables in schema '{SCHEMA_NAME}':")
tables_df = run_trino_query(list_tables_sql)
print(tables_df)

Query executed successfully (returns_rows=False).
Table 'iceberg.trino_schema.employees' created or already exists.

Tables in schema 'trino_schema':
       Table
0  employees


## 3. Insert Data (DML)

In [5]:
# Seed rows covering three departments, i.e. three partitions of the table.
# NOTE: this is a plain INSERT with fixed VALUES and no existence check, so
# every re-run of the cell appends the same five rows again.
insert_sql = f"""
INSERT INTO {FQN_TABLE_NAME} VALUES
(1, 'Alice Smith', 'Engineering', 90000.00, DATE '2020-01-15'),
(2, 'Bob Johnson', 'Engineering', 85000.00, DATE '2019-07-01'),
(3, 'Charlie Brown', 'HR', 70000.00, DATE '2021-03-10'),
(4, 'Diana Green', 'Sales', 95000.00, DATE '2018-05-22'),
(5, 'Edward Black', 'Sales', 105000.00, DATE '2017-11-30')
"""
# Nothing to read back from the INSERT, so the helper is told not to build a frame.
run_trino_query(query=insert_sql, fetch_results=False)
print(f"Data inserted into {FQN_TABLE_NAME}.")

Query executed successfully (returns_rows=True).
Data inserted into iceberg.trino_schema.employees.


## 4. Select Data

In [6]:
# Unfiltered read of the whole table.
select_all_sql = f"SELECT * FROM {FQN_TABLE_NAME}"
print("All employees:")
all_employees_df = run_trino_query(select_all_sql)
print(all_employees_df)

# Same read restricted to one value of the partition column (department).
select_engineering_sql = f"SELECT * FROM {FQN_TABLE_NAME} WHERE department = 'Engineering'"
print("\nEngineering department employees (filter pushdown check):")
eng_employees_df = run_trino_query(select_engineering_sql)
print(eng_employees_df)

All employees:
   id           name   department     salary   hire_date
0   3  Charlie Brown           HR   70000.00  2021-03-10
1   4    Diana Green        Sales   95000.00  2018-05-22
2   1    Alice Smith  Engineering   90000.00  2020-01-15
3   2    Bob Johnson  Engineering   85000.00  2019-07-01
4   5   Edward Black        Sales  105000.00  2017-11-30

Engineering department employees (filter pushdown check):
   id         name   department    salary   hire_date
0   1  Alice Smith  Engineering  90000.00  2020-01-15
1   2  Bob Johnson  Engineering  85000.00  2019-07-01


## 5. Iceberg Table Metadata (Snapshots, Manifests, Files)

In [7]:
# Iceberg metadata tables are addressed as "<table>$<name>". The "$" makes the
# identifier non-standard, so it must be double-quoted in SQL; single-quoted
# Python f-strings are used here so those double quotes need no escaping.
history_sql = f'SELECT * FROM {CATALOG}.{SCHEMA_NAME}."{TABLE_NAME}$history"'
manifests_sql = f'SELECT * FROM {CATALOG}.{SCHEMA_NAME}."{TABLE_NAME}$manifests"'
files_sql = f'SELECT file_path, record_count, partition FROM {CATALOG}.{SCHEMA_NAME}."{TABLE_NAME}$files"'

print("\nTable snapshots (history):")
history_df = run_trino_query(history_sql)
print(history_df)

print("\nTable manifest files:")
manifests_df = run_trino_query(manifests_sql)
print(manifests_df)

print("\nTable data files:")
files_df = run_trino_query(files_sql)
print(files_df)


Table snapshots (history):
                   made_current_at          snapshot_id     parent_id  \
0 2025-07-10 13:10:39.842000+00:00  2081820213866878126           NaN   
1 2025-07-10 13:11:50.079000+00:00  1527397584328712094  2.081820e+18   

   is_current_ancestor  
0                 True  
1                 True  

Table manifest files:
                                                path  length  \
0  s3a://iceberg-warehouse/trino_schema/employees...    7775   

   partition_spec_id    added_snapshot_id  added_data_files_count  \
0                  0  1527397584328712094                       3   

   added_rows_count  existing_data_files_count  existing_rows_count  \
0                 5                          0                    0   

   deleted_data_files_count  deleted_rows_count  \
0                         0                   0   

                                 partition_summaries  
0  [(contains_null: False, contains_nan: False, l...  

Table data files:
           

## 6. Hidden Partitioning Example

In [8]:
# Second demo table: partitioned by a transform of a column (day of event_ts)
# rather than by a column of its own.
EVENTS_TABLE_NAME = 'events'
FQN_EVENTS_TABLE = ".".join([CATALOG, SCHEMA_NAME, EVENTS_TABLE_NAME])

create_hidden_partition_table_sql = f"""
CREATE TABLE IF NOT EXISTS {FQN_EVENTS_TABLE} (
    event_id VARCHAR,
    event_type VARCHAR,
    event_ts TIMESTAMP(6),  -- High precision timestamp
    user_id INT
)
WITH (
    format = 'PARQUET',
    partitioning = ARRAY['day(event_ts)'] -- Hidden partitioning on event_ts by day
)
"""
run_trino_query(query=create_hidden_partition_table_sql, fetch_results=False)
print(f"Table '{FQN_EVENTS_TABLE}' with hidden partitioning created.")

# Four events spread over two calendar days, i.e. two day-partitions.
insert_events_sql = f"""
INSERT INTO {FQN_EVENTS_TABLE} VALUES
('event1', 'click', TIMESTAMP '2023-10-26 10:00:00.123456', 101),
('event2', 'view', TIMESTAMP '2023-10-26 11:30:00.654321', 102),
('event3', 'purchase', TIMESTAMP '2023-10-27 09:15:00.000000', 101),
('event4', 'click', TIMESTAMP '2023-10-27 14:00:00.987654', 103)
"""
run_trino_query(query=insert_events_sql, fetch_results=False)
print(f"Data inserted into '{FQN_EVENTS_TABLE}'.")

# The filter is written against event_ts itself; no partition column is named.
events_26_sql = (
    f"SELECT * FROM {FQN_EVENTS_TABLE} "
    "WHERE event_ts >= TIMESTAMP '2023-10-26 00:00:00' "
    "AND event_ts < TIMESTAMP '2023-10-27 00:00:00'"
)
print("\nEvents from 2023-10-26 (filter pushdown on hidden partition):")
events_26_df = run_trino_query(events_26_sql)
print(events_26_df)

# Best-effort look at the $partitions metadata table; a failure is reported
# rather than raised so the rest of the notebook can continue.
event_partitions_sql = f'SELECT * FROM {CATALOG}.{SCHEMA_NAME}."{EVENTS_TABLE_NAME}$partitions"'
print("\nPartitions for events table (shows transformed partition values):")
try:
    event_partitions_df = run_trino_query(event_partitions_sql)
    print(event_partitions_df)
except Exception as e:
    print(f"Could not query partitions directly: {e}")

Query executed successfully (returns_rows=False).
Table 'iceberg.trino_schema.events' with hidden partitioning created.
Query executed successfully (returns_rows=True).
Data inserted into 'iceberg.trino_schema.events'.

Events from 2023-10-26 (filter pushdown on hidden partition):
  event_id event_type                   event_ts  user_id
0   event1      click 2023-10-26 10:00:00.123456      101
1   event2       view 2023-10-26 11:30:00.654321      102

Partitions for events table (shows transformed partition values):
                                     partition  record_count  file_count  \
0  (event_ts_day: datetime.date(2023, 10, 26))             2           1   
1  (event_ts_day: datetime.date(2023, 10, 27))             2           1   

   total_size                                               data  
0         652  (event_id: (min: 'event1', max: 'event2', null...  
1         660  (event_id: (min: 'event3', max: 'event4', null...  


## 7. Data Compaction (OPTIMIZE)

In [9]:
# Insert more data to potentially create smaller files in employees table
insert_more_employees_sql = f"""
INSERT INTO {FQN_TABLE_NAME} VALUES
(6, 'Fiona White', 'Engineering', 75000.00, DATE '2023-01-10'),
(7, 'George Yellow', 'HR', 65000.00, DATE '2023-03-15')
"""
run_trino_query(insert_more_employees_sql, fetch_results=False) # New snapshot
run_trino_query(insert_more_employees_sql, fetch_results=False) # Another new snapshot
print("Inserted more data into employees table to create more files/snapshots.")

print("\nTable files before OPTIMIZE:")
files_before_optimize_df = run_trino_query(f"SELECT file_path, record_count FROM {CATALOG}.{SCHEMA_NAME}.\"{TABLE_NAME}$files\"")
print(files_before_optimize_df)

print("\nRunning OPTIMIZE (minor compaction by default on Trino):")
try:
    run_trino_query(f"ALTER TABLE {FQN_TABLE_NAME} EXECUTE OPTIMIZE", fetch_results=False)
    print("OPTIMIZE command executed.")
    print("\nTable files after OPTIMIZE:")
    files_after_optimize_df = run_trino_query(f"SELECT file_path, record_count FROM {CATALOG}.{SCHEMA_NAME}.\"{TABLE_NAME}$files\"")
    print(files_after_optimize_df)
    
    print("\nTable snapshots after OPTIMIZE (should show a 'replace' operation):")
    history_after_optimize_df = run_trino_query(f"SELECT snapshot_id, operation FROM {CATALOG}.{SCHEMA_NAME}.\"{TABLE_NAME}$snapshots\" ORDER BY committed_at DESC")
    print(history_after_optimize_df.head())
except Exception as e:
    print(f"OPTIMIZE command failed or is not fully supported for this setup: {e}")

Query executed successfully (returns_rows=True).
Query executed successfully (returns_rows=True).
Inserted more data into employees table to create more files/snapshots.

Table files before OPTIMIZE:
                                           file_path  record_count
0  s3a://iceberg-warehouse/trino_schema/employees...             1
1  s3a://iceberg-warehouse/trino_schema/employees...             1
2  s3a://iceberg-warehouse/trino_schema/employees...             1
3  s3a://iceberg-warehouse/trino_schema/employees...             1
4  s3a://iceberg-warehouse/trino_schema/employees...             2
5  s3a://iceberg-warehouse/trino_schema/employees...             2
6  s3a://iceberg-warehouse/trino_schema/employees...             1

Running OPTIMIZE (minor compaction by default on Trino):
Query executed successfully (returns_rows=True).
OPTIMIZE command executed.

Table files after OPTIMIZE:
                                           file_path  record_count
0  s3a://iceberg-warehouse/trino_s

## 8. Time Travel / Snapshot Reading

In [10]:
# Oldest snapshot first, so "first match" below means "earliest commit".
snapshots_df = run_trino_query(f"SELECT * FROM {CATALOG}.{SCHEMA_NAME}.\"{TABLE_NAME}$snapshots\" ORDER BY committed_at ASC")
print("\nAvailable snapshots for 'employees':")
print(snapshots_df)


def _snapshot_row_count(summary):
    """Return the record count stored in a snapshot's summary map, or 0 if absent/unreadable.

    Prefers the table-wide 'total-records' entry and falls back to
    'added-records'; both are stored as strings in the summary.
    """
    try:
        entries = dict(summary or {})
        return int(entries.get('total-records', entries.get('added-records', 0)))
    except (TypeError, ValueError):
        return 0


if len(snapshots_df) > 1:
    # The CREATE TABLE commit is itself recorded with operation 'append' but
    # holds no rows, so matching on the operation alone would time-travel to an
    # empty table. Require the snapshot to actually contain data as well.
    is_append = snapshots_df['operation'] == 'append'
    has_rows = snapshots_df['summary'].map(_snapshot_row_count) > 0
    # Select straight from the column (not via iterrows) so the 64-bit snapshot
    # ID keeps its integer dtype and cannot be widened to a lossy float.
    append_ids_with_data = snapshots_df.loc[is_append & has_rows, 'snapshot_id']

    if not append_ids_with_data.empty:
        first_append_snapshot_id = int(append_ids_with_data.iloc[0])
        print(f"\nQuerying data from snapshot ID {first_append_snapshot_id} (first append snapshot that contains data):")
        query_snapshot_sql = f"SELECT * FROM {FQN_TABLE_NAME} FOR VERSION AS OF {first_append_snapshot_id}"
        snapshot_data_df = run_trino_query(query_snapshot_sql)
        print(snapshot_data_df)
    else:
        first_append_snapshot_id = None
        print("\nCould not find an 'append' snapshot for time travel example.")
else:
    print("\nNot enough snapshots to demonstrate time travel.")


Available snapshots for 'employees':
                      committed_at          snapshot_id     parent_id  \
0 2025-07-10 13:10:39.842000+00:00  2081820213866878126           NaN   
1 2025-07-10 13:11:50.079000+00:00  1527397584328712094  2.081820e+18   
2 2025-07-10 13:14:18.876000+00:00  3222837768186949944  1.527398e+18   
3 2025-07-10 13:14:19.038000+00:00  2108518526208650987  3.222838e+18   
4 2025-07-10 13:14:19.238000+00:00  1253729581276335010  2.108519e+18   

  operation                                      manifest_list  \
0    append  s3a://iceberg-warehouse/trino_schema/employees...   
1    append  s3a://iceberg-warehouse/trino_schema/employees...   
2    append  s3a://iceberg-warehouse/trino_schema/employees...   
3    append  s3a://iceberg-warehouse/trino_schema/employees...   
4   replace  s3a://iceberg-warehouse/trino_schema/employees...   

                                             summary  
0  {'trino_query_id': '20250710_130746_00032_9eri...  
1  {'trino_query

## 9. Show Table DDL

In [11]:
# SHOW CREATE TABLE comes back as a one-row, one-column frame; print the raw
# statement text rather than the frame so the DDL keeps its line breaks.
show_ddl_sql = f"SHOW CREATE TABLE {FQN_TABLE_NAME}"
print(f"\nShow create table for '{FQN_TABLE_NAME}':")
create_table_stmt_df = run_trino_query(show_ddl_sql)
if create_table_stmt_df.empty:
    print("Could not retrieve DDL.")
else:
    print(create_table_stmt_df.iat[0, 0])


Show create table for 'iceberg.trino_schema.employees':
CREATE TABLE iceberg.trino_schema.employees (
   id integer,
   name varchar,
   department varchar,
   salary decimal(10, 2),
   hire_date date
)
WITH (
   format = 'PARQUET',
   format_version = 2,
   location = 's3a://iceberg-warehouse/trino_schema/employees-df981e8adc1e49f9a8a37fd8ab04ceee',
   max_commit_retry = 4,
   partitioning = ARRAY['department']
)


## 10. Clean up (Optional)

In [14]:
# Optional teardown, left commented out so a plain "Run All" keeps the demo
# objects. Uncomment to remove them; the statements are ordered tables first,
# then the schema that contained them.
# Note: with fetch_results=False the helper returns None, so each outer print()
# would just emit "None" after the helper's own confirmation line.
# print(run_trino_query(f"DROP TABLE IF EXISTS {FQN_TABLE_NAME}", fetch_results=False))
# print(run_trino_query(f"DROP TABLE IF EXISTS {FQN_EVENTS_TABLE}", fetch_results=False))
# print(run_trino_query(f"DROP SCHEMA IF EXISTS {CATALOG}.{SCHEMA_NAME}", fetch_results=False))
# print("\nSchemas after potential cleanup:")
# schemas_after_cleanup_df = run_trino_query(f"SHOW SCHEMAS FROM {CATALOG}")
# print(schemas_after_cleanup_df)

# Final marker so the end of a full run is easy to spot in the output.
print("\nTrino Iceberg Datalakehouse Demo (Phase 1) completed.")


Trino Iceberg Datalakehouse Demo (Phase 1) completed.
