In [1]:
from functools import partial
from pathlib import Path
from pprint import pprint
from time import sleep

In [2]:
import psycopg2
from psycopg2.extras import RealDictCursor

In [3]:
from ingestion import (
    DBNAME,
    HOST,
    PASSWORD,
    PORT,
    USER,
    ingest_data_files,
    reset_db_structures,
)

## Reset DB Structures

In [4]:
# Directory that holds the CSV fixtures; edit this one constant when the data
# lives somewhere else instead of touching every path below.
DATA_DIR = Path("/workspaces/data-playground/data")

# Small, single-file data set.
input_csvs_small = [DATA_DIR / "user_events.small.csv"]
# Large data set, split across two part files.
input_csvs_large = [
    DATA_DIR / "user_events.large.part01.csv",
    DATA_DIR / "user_events.large.part02.csv",
]

In [None]:
# Presumably resets the schema and then loads the small CSV set; both helpers
# are imported from the local `ingestion` module, so confirm their exact
# effects there. `input_csvs_large` is defined above but not used here.
reset_db_structures()
ingest_data_files(input_csvs_small)

## Utility Functions

In [6]:
# Rebind `pprint` to a pre-configured variant: keep dict key order as-is and
# render integers with underscore digit separators.
# NOTE: this shadows the function imported from the `pprint` module for the
# rest of the notebook.
pprint = partial(pprint, sort_dicts=False, underscore_numbers=True)

In [7]:
def execute_trans_query(
    query: str, *, fetch_size: int = 0
) -> list[dict] | dict | None:
    """Run ``query`` inside a transaction and return up to ``fetch_size`` rows.

    A fresh connection is opened for every call and always closed afterwards.
    The transaction is committed when the statement succeeds and rolled back
    when it raises.

    Args:
        query: SQL text to execute.
        fetch_size: How many rows to fetch.
            ``0`` fetches nothing and returns ``None``;
            ``1`` returns a single row as a ``dict`` (``None`` when the
            statement produced no row);
            any other value returns a list of at most ``fetch_size`` dicts.

    Returns:
        ``None``, one row dict, or a list of row dicts, depending on
        ``fetch_size``.

    Raises:
        psycopg2.Error: propagated unchanged from connecting or executing.
    """
    conn = psycopg2.connect(
        host=HOST,
        port=PORT,
        user=USER,
        password=PASSWORD,
        database=DBNAME,
        cursor_factory=RealDictCursor,
    )
    try:
        # In psycopg2, `with conn` only scopes the transaction (commit on
        # success, rollback on error); it does NOT close the connection, hence
        # the explicit close in `finally`.
        with conn, conn.cursor() as cur:
            cur.execute(query)
            if fetch_size == 0:
                return None
            if fetch_size == 1:
                row = cur.fetchone()
                # fetchone() yields None on an empty result set.
                return dict(row) if row is not None else None
            return [dict(row) for row in cur.fetchmany(fetch_size)]
    finally:
        conn.close()

In [8]:
def execute_non_trans_query(
    query: str, *, fetch_size: int = 0
) -> list[dict] | dict | None:
    """Run ``query`` in autocommit mode, i.e. outside an explicit transaction.

    Intended for statements that must not run inside a transaction block.
    Connection settings and the dict-style cursor match
    ``execute_trans_query`` so both helpers return rows in the same shape.

    Args:
        query: SQL text to execute.
        fetch_size: How many rows to fetch.
            ``0`` fetches nothing and returns ``None``;
            ``1`` returns a single row as a ``dict`` (``None`` when the
            statement produced no row);
            any other value returns a list of at most ``fetch_size`` dicts.

    Returns:
        ``None``, one row dict, or a list of row dicts, depending on
        ``fetch_size``.

    Raises:
        psycopg2.Error: propagated unchanged from connecting or executing.
    """
    conn = psycopg2.connect(
        host=HOST,
        port=PORT,
        user=USER,
        password=PASSWORD,
        dbname=DBNAME,
        # Without a dict cursor the rows are plain tuples and dict(row) fails.
        cursor_factory=RealDictCursor,
    )
    try:
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(query)
            if fetch_size == 0:
                return None
            if fetch_size == 1:
                row = cur.fetchone()
                # fetchone() yields None on an empty result set.
                return dict(row) if row is not None else None
            return [dict(row) for row in cur.fetchmany(fetch_size)]
    finally:
        # Single cleanup point; runs on success and on error alike.
        conn.close()

In [9]:
def print_count(table_name="t_user_events"):
    """Print the row count of ``table_name`` with underscore digit grouping.

    ``table_name`` is interpolated into the SQL text as-is, so only pass
    trusted identifiers.
    """
    row = execute_trans_query(f"SELECT count(1) FROM {table_name}", fetch_size=1)
    total = row["count"]
    print(f"Count: {total:_}")

In [10]:
def print_current_hypertables(fetch_size=3):
    """Pretty-print rows from ``timescaledb_information.hypertables``.

    Args:
        fetch_size: Maximum number of hypertable rows to fetch; defaults to 3.
            It is forwarded to ``execute_trans_query``.
    """
    query = """
        SELECT hypertable_name, num_dimensions, num_chunks, compression_enabled
        FROM timescaledb_information.hypertables;"""
    res = execute_trans_query(query, fetch_size=fetch_size)
    print("Current hypertables:")
    pprint(res)

In [11]:
def print_chunk_info(table_name="t_user_events", order_by_stmt="", fetch_size=3):
    """Pretty-print chunk metadata of one hypertable.

    Args:
        table_name: Hypertable whose chunks are listed; interpolated into the
            SQL text as-is, so only pass trusted values.
        order_by_stmt: Optional raw ``ORDER BY ...`` clause appended to the
            query; empty means no explicit ordering.
        fetch_size: Row limit forwarded to ``execute_trans_query``.
    """
    chunk_rows = execute_trans_query(
        f"""
        SELECT chunk_name, is_compressed, range_start, range_end
        FROM timescaledb_information.chunks
        WHERE hypertable_name = '{table_name}'
        {order_by_stmt};""",
        fetch_size=fetch_size,
    )
    print("Current chunks info:")
    pprint(chunk_rows)

In [12]:
def print_compression_settings(fetch_size=10, table_name="t_user_events"):
    """Pretty-print the compression settings recorded for one hypertable.

    Args:
        fetch_size: Row limit forwarded to ``execute_trans_query``.
        table_name: Hypertable to inspect; defaults to ``t_user_events``.
            It is interpolated into the SQL text as-is, so only pass trusted
            values.
    """
    query = f"""
        SELECT *
        FROM timescaledb_information.compression_settings
        WHERE hypertable_name = '{table_name}';"""
    res = execute_trans_query(query, fetch_size=fetch_size)
    print("Compression settings:")
    pprint(res)

## Query Statements

### Setup

In [None]:
# Look up the installed TimescaleDB extension version in pg_extension.
query = """
    SELECT extversion
    FROM pg_extension
    WHERE extname = 'timescaledb';"""
version = execute_trans_query(query, fetch_size=1)["extversion"]
print(f"TimescaleDB version: {version}")

In [None]:
# Hypertable listing before the create_hypertable call further down.
print_current_hypertables()

In [None]:
# Chunk listing for the default table (t_user_events) with default limits.
print_chunk_info()

In [16]:
# Convert t_user_events into a hypertable partitioned on event_time with
# 1-day chunks. migrate_data moves the rows already in the table into chunks;
# if_not_exists avoids an error when the table is already a hypertable.
query = """
    SELECT create_hypertable(
                't_user_events',
                'event_time',
                chunk_time_interval => INTERVAL '1 day',
                migrate_data => TRUE,
                if_not_exists => TRUE);"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# Hypertable listing again, after the conversion.
print_current_hypertables()

In [None]:
# Chunk listing again, after the conversion.
print_chunk_info()

In [None]:
# Query plan for a single-user lookup restricted to one day.
# Only the first 10 returned rows are fetched, so a longer plan is cut off.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE user_id = 789
        AND event_time > '2024-02-01'
        AND event_time < '2024-02-02';"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [20]:
# Define a continuous aggregate with hourly event counts per event_type.
# NOTE(review): run through the autocommit helper, presumably because the
# statement cannot execute inside a transaction block — confirm.
# The statement has no IF NOT EXISTS, so re-running it against an existing
# view raises.
query = """
    CREATE MATERIALIZED VIEW mvw_event_counts
    WITH (timescaledb.continuous) AS
    SELECT
        time_bucket('1 hour', event_time) AS bucket,
        event_type,
        COUNT(*) AS event_count
    FROM t_user_events
    GROUP BY bucket, event_type;"""
res = execute_non_trans_query(query, fetch_size=0)

In [21]:
# Schedule a daily refresh of mvw_event_counts covering the window from
# 100 years ago up to 1 hour ago.
query = """
    SELECT add_continuous_aggregate_policy(
                'mvw_event_counts',
                start_offset => INTERVAL '100 years',
                end_offset => INTERVAL '1 hour',
                schedule_interval => INTERVAL '1 day');"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# First six rows of the continuous aggregate, oldest bucket first.
query = """
    SELECT *
    FROM mvw_event_counts
    ORDER BY bucket ASC;"""
res = execute_trans_query(query, fetch_size=6)
pprint(res)

In [None]:
# Query plan for the full ordered read of the continuous aggregate.
query = """
    EXPLAIN
    SELECT *
    FROM mvw_event_counts
    ORDER BY bucket ASC;"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [None]:
# Query plan for reading a single one-hour bucket range from the aggregate.
query = """
    EXPLAIN
    SELECT *
    FROM mvw_event_counts
    WHERE bucket >= '2020-09-24 11:00:00'
        AND bucket < '2020-09-24 12:00:00';"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [25]:
# Change the chunk interval of t_user_events from 1 day to 1 hour.
query = """
    SELECT set_chunk_time_interval('t_user_events', INTERVAL '1 hour');"""
res = execute_non_trans_query(query, fetch_size=0)
# NOTE: the new chunk interval applies only to future chunks

In [None]:
# Up to three chunks after the interval change (no explicit ordering).
print_chunk_info(fetch_size=3)

In [27]:
# Copy every existing row back into the table with event_time moved 10 years
# into the past (presumably to create chunks under the new 1-hour interval —
# confirm).
# NOTE: not idempotent; each re-run inserts another shifted copy of every row
# currently in the table, including earlier copies.
query = """
    INSERT INTO t_user_events
    SELECT
        event_time - (INTERVAL '10 years'),
        event_type,
        product_id,
        category_id,
        category_code,
        brand,
        price,
        user_id,
        user_session
    FROM t_user_events;"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# The three chunks with the earliest range_end.
print_chunk_info(
    table_name="t_user_events",
    order_by_stmt="ORDER BY range_end ASC",
    fetch_size=3,
)

In [None]:
# Up to three events of one user dated after 2015-01-01.
query = """
    SELECT
        event_time,
        event_type,
        category_code,
        user_id
    FROM t_user_events
    WHERE user_id = 1515915625519380411
    AND event_time > '2015-01-01';"""
res = execute_trans_query(query, fetch_size=3)
pprint(res)

In [None]:
# Average price per brand for events after 2015-01-01.
# Only five groups are fetched and there is no ORDER BY, so which five come
# back is unspecified.
query = """
    SELECT
        brand,
        avg(price) AS avg_price
    FROM t_user_events
    WHERE event_time > '2015-01-01'
    GROUP BY brand;"""
res = execute_trans_query(query, fetch_size=5)
pprint(res)

In [None]:
# The five users with the most events after 2015-01-01.
query = """
    SELECT
        user_id,
        count(*) AS event_count
    FROM t_user_events
    WHERE event_time > '2015-01-01'
    GROUP BY user_id
    ORDER BY count(*) DESC;"""
res = execute_trans_query(query, fetch_size=5)
pprint(res)

In [None]:
# Distinct event types of one user, before the UPDATE in the next cell.
query = """
    SELECT DISTINCT event_type
    FROM t_user_events
    WHERE user_id = 1515915625554995474
        AND event_time > '2015-01-01';"""
res = execute_trans_query(query, fetch_size=5)
pprint(res)

In [33]:
# Overwrite event_type with 'hover' for that user's events after 2015-01-01.
query = """
    UPDATE t_user_events
    SET event_type = 'hover'
    WHERE user_id = 1515915625554995474
        AND event_time > '2015-01-01';"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# Re-check the distinct event types after the UPDATE.
# fetch_size=1 makes the helper return a single dict rather than a list.
query = """
    SELECT DISTINCT event_type
    FROM t_user_events
    WHERE user_id = 1515915625554995474
        AND event_time > '2015-01-01';"""
res = execute_trans_query(query, fetch_size=1)
pprint(res)

In [None]:
# Count the rows dated before 2015-01-01 (before deleting them).
query = """
    SELECT count(*)
    FROM t_user_events
    WHERE event_time < '2015-01-01';"""
res = execute_trans_query(query, fetch_size=1)
pprint(res)

In [None]:
# Delete all rows dated before 2015-01-01.
# fetch_size=0 makes the helper return None, so the pprint below prints None.
query = """
    DELETE FROM t_user_events
    WHERE event_time < '2015-01-01';"""
res = execute_trans_query(query, fetch_size=0)
pprint(res)

In [None]:
# Same count again, to confirm the effect of the DELETE.
query = """
    SELECT count(*)
    FROM t_user_events
    WHERE event_time < '2015-01-01';"""
res = execute_trans_query(query, fetch_size=1)
pprint(res)

In [None]:
# Up to three purchase events between '2020-12-08' and 05:00 UTC that day.
# The lower bound carries no explicit time zone; the upper bound is in UTC.
query = """
    SELECT event_time, user_session
    FROM t_user_events
    WHERE event_type = 'purchase'
        AND event_time > '2020-12-08'
        AND event_time < '2020-12-08 05:00:00 UTC';"""
res = execute_trans_query(query, fetch_size=3)
pprint(res)

In [39]:
# Insert three hand-written sample events dated 2024-02-01; later cells look
# them up by user_id 789 and by user_session.
# NOTE: re-running the cell inserts the same three rows again.
query = """
    INSERT INTO t_user_events (
        event_time, event_type, product_id,
        category_id, category_code, brand,
        price, user_id, user_session)
VALUES
    ('2024-02-01 12:34:56', 'purchase', 123,
        456, 'electronics.smartphone', 'BrandX',
        599.99, 789, 'a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11'),
    ('2024-02-01 13:45:12', 'view', 124,
        457, 'electronics.tablet', 'BrandY',
        299.99, 790, 'b1eebc99-9c0b-4ef8-bb6d-6bb9bd380a12'),
    ('2024-02-01 14:56:23', 'cart', 125,
        458, 'electronics.laptop', 'BrandZ',
        999.99, 791, 'c2eebc99-9c0b-4ef8-bb6d-6bb9bd380a13');"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# Read back up to three rows from the 2024-02-01 day window.
query = """
    SELECT *
    FROM t_user_events
    WHERE event_time > '2024-02-01'
        AND event_time < '2024-02-02';"""
res = execute_trans_query(query, fetch_size=3)
pprint(res)

In [41]:
# Set the price of every row with product_id 124.
# NOTE(review): the value is written as the quoted literal '00.99'; assumes
# the price column is numeric and the literal is coerced to 0.99 — confirm.
# There is no event_time predicate, so the UPDATE is not limited to a time
# range.
query = """
    UPDATE t_user_events
    SET price = '00.99'
    WHERE product_id = 124;"""
res = execute_trans_query(query, fetch_size=0)

In [42]:
# Single-column index on user_id. No IF NOT EXISTS, so a re-run against an
# existing index raises.
query = """
    CREATE INDEX ix_events_user_id ON t_user_events (user_id);"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# Plan for a lookup by user_id only, with ix_events_user_id in place.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE user_id = 789;"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [None]:
# Plan for user_id plus a one-day range, with ix_events_user_id in place.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE user_id = 789
        AND event_time > '2024-02-01'
        AND event_time < '2024-02-02';"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [45]:
# Composite index on (event_time DESC, user_id).
query = """
    CREATE INDEX ix_events_time_user_id ON t_user_events (event_time DESC, user_id);"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# Plan for the user_id-only lookup, now that the composite index also exists.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE user_id = 789;"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [None]:
# Plan for user_id plus a one-day range, now that the composite index exists.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE user_id = 789
        AND event_time > '2024-02-01'
        AND event_time < '2024-02-02';"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [48]:
# Pause for 60 seconds, then drop the single-column user_id index.
# NOTE(review): the reason for the pause is not evident from this cell —
# document it or remove it.
sleep(60)
query = """
    DROP INDEX IF EXISTS ix_events_user_id;"""
res = execute_trans_query(query, fetch_size=0)

In [49]:
# Composite index on (event_time DESC, event_type), created with the
# timescaledb.transaction_per_chunk option and therefore sent through the
# autocommit helper instead of the transactional one.
query = """
    CREATE INDEX idx_event_time_type ON t_user_events(event_time DESC, event_type)
        WITH (timescaledb.transaction_per_chunk);"""
res = execute_non_trans_query(query, fetch_size=0)

In [None]:
# Plan for the user_id-only lookup after ix_events_user_id was dropped.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE user_id = 789;"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [None]:
# Plan for user_id plus a one-day range after ix_events_user_id was dropped.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE user_id = 789
        AND event_time > '2024-02-01'
        AND event_time < '2024-02-02';"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [None]:
# Plan for a lookup by user_session only, before any index on that column.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE user_session = 'c2eebc99-9c0b-4ef8-bb6d-6bb9bd380a13';"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [None]:
# Plan for user_session plus a one-day range, before any index on
# user_session.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE user_session = 'c2eebc99-9c0b-4ef8-bb6d-6bb9bd380a13'
        AND event_time > '2024-02-01'
        AND event_time < '2024-02-02';"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [54]:
# Hash index on user_session (supports the equality lookups used here).
query = """
    CREATE INDEX idx_event_session ON t_user_events USING HASH(user_session);"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# Plan for the user_session-only lookup, with the hash index in place.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE user_session = 'c2eebc99-9c0b-4ef8-bb6d-6bb9bd380a13';"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [None]:
# Plan for user_session plus a one-day range, with the hash index in place.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE user_session = 'c2eebc99-9c0b-4ef8-bb6d-6bb9bd380a13'
        AND event_time > '2024-02-01'
        AND event_time < '2024-02-02';"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [None]:
# Plan for a pure one-day time-range scan, before idx_event_time is created.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE event_time > '2024-02-01'
        AND event_time < '2024-02-02';"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [58]:
# Single-column index on event_time.
query = """
    CREATE INDEX idx_event_time ON t_user_events (event_time);"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# Plan for the one-day time-range scan, with idx_event_time in place.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE event_time > '2024-02-01'
        AND event_time < '2024-02-02';"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [None]:
# First five results of show_chunks for t_user_events.
query = """
    SELECT show_chunks('t_user_events');"""
res = execute_trans_query(query, fetch_size=5)
pprint(res)

In [None]:
# Partitioning dimensions of t_user_events, before add_dimension is called.
query = """
    SELECT *
    FROM timescaledb_information.dimensions
    WHERE hypertable_name = 't_user_events';"""
res = execute_trans_query(query, fetch_size=5)
pprint(res)

In [None]:
# Number of distinct category_code values.
# The aggregate yields one row; with fetch_size=5 it comes back as a
# one-element list.
query = """
    SELECT COUNT(DISTINCT category_code) FROM t_user_events"""
res = execute_trans_query(query, fetch_size=5)
pprint(res)

In [None]:
# Plan for an equality filter on category_code.
query = """
    EXPLAIN
    SELECT *
    FROM t_user_events
    WHERE category_code = 'computers.peripherals.wifi';"""
res = execute_trans_query(query, fetch_size=10)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [None]:
# Plan for an hourly average price per category_code over rows dated before
# 2015-01-01; up to 20 plan rows are fetched here.
query = """
    EXPLAIN
    SELECT time_bucket('1 hour', event_time) AS bucket,
        category_code,
        AVG(price) AS avg_price
    FROM t_user_events
    WHERE event_time < '2015-01-01'
    GROUP BY bucket, category_code;"""
res = execute_trans_query(query, fetch_size=20)
# Print every value of every returned row on its own line.
print(*[v for dct in res for _, v in dct.items()], sep="\n")

In [None]:
# Add a hash-partitioning dimension on category_code with 110 partitions.
# NOTE(review): TimescaleDB restricts adding dimensions to hypertables that
# already contain data — confirm this call can succeed at this point in the
# notebook.
query = """
    SELECT add_dimension(
                't_user_events',
                by_hash('category_code', 110));"""
res = execute_trans_query(query, fetch_size=5)
pprint(res)

In [None]:
# Partitioning dimensions again, after the add_dimension call.
query = """
    SELECT *
    FROM timescaledb_information.dimensions
    WHERE hypertable_name = 't_user_events';"""
res = execute_trans_query(query, fetch_size=5)
pprint(res)

In [None]:
# Scratch cell for ad-hoc queries; it replaces a run of empty placeholder
# cells. Sending a whitespace-only statement makes psycopg2 raise
# ProgrammingError ("can't execute an empty query"), which aborted
# Restart & Run All, so the query is only executed when it contains SQL.
query = """
    """
if query.strip():
    res = execute_trans_query(query, fetch_size=5)
    pprint(res)

## DB-Breaking Statements

### Setup

In [31]:
# Create a second, plain table whose column list matches the INSERT into
# t_user_events used earlier; it becomes a hypertable with monthly chunks
# further down.
# No IF NOT EXISTS, so a re-run against an existing table raises.
query = """
    CREATE TABLE t_user_events_month_chunk (
        event_time TIMESTAMPTZ,
        event_type VARCHAR(100),
        product_id INT,
        category_id BIGINT,
        category_code VARCHAR(100),
        brand VARCHAR(100),
        price NUMERIC,
        user_id BIGINT,
        user_session VARCHAR(100)
    );"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# Chunk listing for the new table, before it is converted to a hypertable.
print_chunk_info(table_name="t_user_events_month_chunk")

In [33]:
# Convert the new table into a hypertable on event_time with 1-month chunks.
# Unlike the earlier call, no if_not_exists is passed here.
query = """
    SELECT create_hypertable(
                't_user_events_month_chunk',
                'event_time',
                chunk_time_interval => INTERVAL '1 month'
    );"""
res = execute_trans_query(query, fetch_size=0)

In [34]:
# Copy every row of t_user_events into the monthly-chunk table.
# SELECT * relies on both tables having the same column order.
query = """
    INSERT INTO t_user_events_month_chunk
    SELECT * FROM t_user_events;"""
res = execute_trans_query(query, fetch_size=0)

In [35]:
# Drop the chunks of the monthly table that are older than 6 months.
# This removes data and belongs to the deliberately destructive section.
query = """
    SELECT drop_chunks(
            't_user_events_month_chunk',
            older_than => INTERVAL '6 months');"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# Wait 60 seconds, then print the remaining row count of the monthly table.
# NOTE(review): the reason for the pause is not evident from this cell —
# document it or remove it.
sleep(60)
print_count(table_name="t_user_events_month_chunk")

In [None]:
# Chunk listing for the monthly table after drop_chunks.
print_chunk_info(table_name="t_user_events_month_chunk")

In [None]:
# Compression settings of t_user_events before compression is enabled.
print_compression_settings()

In [39]:
# Enable compression on t_user_events: order by event_time DESC and segment
# by (user_id, product_id).
query = """
    ALTER TABLE t_user_events SET (
        timescaledb.compress,
        timescaledb.compress_orderby = 'event_time DESC',
        timescaledb.compress_segmentby = 'user_id, product_id');"""
res = execute_trans_query(query, fetch_size=0)

In [40]:
# Add a compression policy for t_user_events with a 30-day interval.
query = """
    SELECT add_compression_policy(
        't_user_events',
        INTERVAL '30 days');"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# Compression settings again, after enabling compression.
print_compression_settings()

In [None]:
# Wait 60 seconds, then list up to three chunks (including is_compressed).
# NOTE(review): presumably the pause gives the compression policy job time to
# run — confirm.
sleep(60)
print_chunk_info(fetch_size=3)

In [None]:
# Row count of t_user_events after the compression policy was added.
print_count()

In [44]:
# Add a retention policy with a 100-year window to t_user_events.
query = """
    SELECT add_retention_policy(
                't_user_events',
                INTERVAL '100 years',
                if_not_exists => TRUE);"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# Row count after adding the 100-year retention policy.
print_count()

In [46]:
# Remove the retention policy again so a different one can be added next.
query = """
    SELECT remove_retention_policy('t_user_events');"""
res = execute_trans_query(query, fetch_size=0)

In [47]:
# Add a retention policy with a 1-hour window to t_user_events.
# WARNING: destructive; this sits in the "DB-Breaking Statements" section on
# purpose.
query = """
    SELECT add_retention_policy(
                't_user_events',
                INTERVAL '1 hour',
                if_not_exists => TRUE);"""
res = execute_trans_query(query, fetch_size=0)

In [None]:
# Wait 60 seconds, then print the row count of t_user_events.
# NOTE(review): presumably the pause gives the retention policy job time to
# run — confirm.
sleep(60)
print_count()