In [1]:
# 1. Configuration & Connection

import duckdb
import gc # Garbage Collector
from pathlib import Path

# --- CONFIGURATION ---
BASE_DIR = Path(r"C:\Users\websi\OneDrive - UT Cloud\Semester\3. WS2025_26\DS500 Data Science Project (12 ECTS)\tankerkoenig_repo\tankerkoenig-data")
DB_PATH = BASE_DIR / "fuel_price_preparation.duckdb"
OUTPUT_PARQUET = BASE_DIR / "derived" / "features_sampled_diesel_2023_2024.parquet"

# Globs
PRICE_GLOBS = [
    str(BASE_DIR / "prices" / "2023" / "*" / "*-prices.csv"),
    str(BASE_DIR / "prices" / "2024" / "*" / "*-prices.csv")
]
STATION_GLOBS = [
    str(BASE_DIR / "stations" / "2023" / "*" / "*-stations.csv"),
    str(BASE_DIR / "stations" / "2024" / "*" / "*-stations.csv")
]

# --- ROBUST CONNECTION ---
# 1. Force close any existing connection variables
try:
    if 'con' in globals():
        con.close()
except:
    pass

# 2. Force Python to clear the memory (kill the "Zombie")
gc.collect()

# 3. Create the new connection
try:
    con = duckdb.connect(str(DB_PATH))
    con.execute("PRAGMA threads=8;")
    con.execute("SELECT setseed(0.42);")
    print(f"SUCCESS: Connected to {DB_PATH.name}")
except Exception as e:
    print("ERROR: Could not connect. If using OneDrive, pause syncing or restart the kernel.")
    print(e)

print(f"Connected to: {DB_PATH}")

SUCCESS: Connected to fuel_price_preparation.duckdb
Connected to: C:\Users\websi\OneDrive - UT Cloud\Semester\3. WS2025_26\DS500 Data Science Project (12 ECTS)\tankerkoenig_repo\tankerkoenig-data\fuel_price_preparation.duckdb


In [2]:
# 2. Data Ingestion (Bronze Layer)

# --- Prices ---
con.execute("DROP TABLE IF EXISTS prices_raw;")
con.execute(
    """
    CREATE TABLE prices_raw AS
    SELECT *
    FROM read_csv_auto(?, union_by_name = true);
    """,
    [PRICE_GLOBS],
)

# --- Stations ---
con.execute("DROP TABLE IF EXISTS stations_raw;")
con.execute(
    """
    CREATE TABLE stations_raw AS
    SELECT *
    FROM read_csv_auto(?, filename = true, union_by_name = true);
    """,
    [STATION_GLOBS],
)

print("Ingestion complete.")
con.sql(
    """
    SELECT 'Prices' AS type, COUNT(*) AS n FROM prices_raw
    UNION ALL
    SELECT 'Stations', COUNT(*) FROM stations_raw;
    """
).show()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Ingestion complete.
┌──────────┬───────────┐
│   type   │     n     │
│ varchar  │   int64   │
├──────────┼───────────┤
│ Prices   │ 273916339 │
│ Stations │  12523064 │
└──────────┴───────────┘



In [3]:
# 3. Station Processing: latest snapshot -> sample of 500 stations

con.execute("DROP TABLE IF EXISTS stations_final_sample;")

con.execute(
    """
    CREATE TABLE stations_final_sample AS
    WITH parsed_stations AS (
        SELECT
            *,
            CAST(
                regexp_extract(filename, '([0-9]{4}-[0-9]{2}-[0-9]{2})', 1)
                AS DATE
            ) AS snapshot_date
        FROM stations_raw
    ),
    latest_snapshot AS (
        SELECT * EXCLUDE (rn)
        FROM (
            SELECT
                *,
                ROW_NUMBER() OVER (
                    PARTITION BY uuid
                    ORDER BY snapshot_date DESC
                ) AS rn
            FROM parsed_stations
        )
        WHERE rn = 1
    ),
    sampled_stations AS (
        -- Random 500-station sample to keep data manageable
        SELECT *
        FROM latest_snapshot
        ORDER BY random()
        LIMIT 500
    )
    SELECT
        s.uuid AS station_uuid
    FROM sampled_stations s;
    """
)

con.sql("SELECT COUNT(*) AS n_stations FROM stations_final_sample").show()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌────────────┐
│ n_stations │
│   int64    │
├────────────┤
│        500 │
└────────────┘



In [6]:
# 4. Price Gridding and Forward Fill  (DIESEL only)

con.execute("DROP TABLE IF EXISTS grid_sampled_diesel_prepared;")

con.execute(
    """
    CREATE TABLE grid_sampled_diesel_prepared AS
    WITH

    -- 1. Filter prices to our 500 stations and diesel fuel only
    relevant_prices AS (
        SELECT
            p.station_uuid,
            CAST(p.date AS TIMESTAMP) AS ts_raw,            -- ensure scalar TIMESTAMP
            CAST(p.diesel AS DOUBLE) AS price_raw
        FROM prices_raw p
        WHERE p.station_uuid IN (SELECT station_uuid FROM stations_final_sample)
          AND p.diesel IS NOT NULL
          AND p.diesel > 0
    ),

    -- 2. Round timestamps to 30-minute floor, take average if multiple updates in 30 min
    prices_30min AS (
        SELECT
            station_uuid,
            date_trunc('hour', ts_raw)
                + INTERVAL (CASE WHEN EXTRACT(MINUTE FROM ts_raw) < 30
                                  THEN 0
                                  ELSE 30
                             END) MINUTE AS ts_30,
            AVG(price_raw) AS price_event
        FROM relevant_prices
        GROUP BY 1, 2
    ),

    -- 3. Build the per-station time grid (min to max timestamp)
    station_bounds AS (
        SELECT
            station_uuid,
            MIN(ts_30) AS min_ts,
            MAX(ts_30) AS max_ts
        FROM prices_30min
        GROUP BY 1
    ),

    full_grid AS (
        SELECT
            sb.station_uuid,
            gs.ts_30
        FROM station_bounds sb,
             -- IMPORTANT: table version of generate_series → scalar TIMESTAMP, not TIMESTAMP[]
             generate_series(sb.min_ts, sb.max_ts, INTERVAL 30 MINUTE) AS gs(ts_30)
    ),

    -- 4. Join grid with events and forward fill
    joined_grid AS (
        SELECT
            g.station_uuid,
            g.ts_30,
            p.price_event
        FROM full_grid g
        LEFT JOIN prices_30min p
          ON g.station_uuid = p.station_uuid
         AND g.ts_30 = p.ts_30
    )

    -- Final selection with forward fill
    SELECT
        station_uuid,
        ts_30,
        LAST_VALUE(price_event IGNORE NULLS) OVER (
            PARTITION BY station_uuid
            ORDER BY ts_30
        ) AS price
    FROM joined_grid;
    """
)

# Validation: ensure we didn't lose data
con.sql("SELECT COUNT(*) AS n_non_null FROM grid_sampled_diesel_prepared WHERE price IS NOT NULL").show()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌────────────┐
│ n_non_null │
│   int64    │
├────────────┤
│   15501865 │
└────────────┘



In [9]:
# 5. Feature engineering: daily lags + intra-day (cell) lags, export to parquet

con.execute("DROP TABLE IF EXISTS features_final;")

con.execute(
    """
    CREATE TABLE features_final AS
    WITH prepared_with_time AS (
        SELECT
            g.station_uuid,
            g.ts_30,
            timezone('Europe/Berlin', g.ts_30) AS ts_local,
            CAST(timezone('Europe/Berlin', g.ts_30) AS DATE) AS date,
            -- Time cell 0–47 (30-minute buckets)
            (EXTRACT(HOUR   FROM timezone('Europe/Berlin', g.ts_30)) * 2) +
            (EXTRACT(MINUTE FROM timezone('Europe/Berlin', g.ts_30)) / 30) AS time_cell,
            g.price
        FROM grid_sampled_diesel_prepared g
        WHERE g.price IS NOT NULL
    ),

    -- DAILY LAGS: same time_cell on previous days
    daily_lags AS (
        SELECT
            *,
            LAG(price, 1) OVER (
                PARTITION BY station_uuid, time_cell
                ORDER BY date
            ) AS price_lag_1d,
            LAG(price, 2) OVER (
                PARTITION BY station_uuid, time_cell
                ORDER BY date
            ) AS price_lag_2d,
            LAG(price, 3) OVER (
                PARTITION BY station_uuid, time_cell
                ORDER BY date
            ) AS price_lag_3d,
            LAG(price, 7) OVER (
                PARTITION BY station_uuid, time_cell
                ORDER BY date
            ) AS price_lag_7d
        FROM prepared_with_time
    ),

    -- INTRA-DAY LAGS: previous cells on the SAME day (today's within-day history)
    intraday_lags AS (
        SELECT
            *,
            LAG(price, 1) OVER (
                PARTITION BY station_uuid, date
                ORDER BY time_cell
            ) AS price_lag_1cell,
            LAG(price, 2) OVER (
                PARTITION BY station_uuid, date
                ORDER BY time_cell
            ) AS price_lag_2cell,
            LAG(price, 3) OVER (
                PARTITION BY station_uuid, date
                ORDER BY time_cell
            ) AS price_lag_3cell,
            LAG(price, 4) OVER (
                PARTITION BY station_uuid, date
                ORDER BY time_cell
            ) AS price_lag_4cell
        FROM daily_lags
    )

    SELECT
        station_uuid,
        ts_30,
        ts_local,
        date,
        time_cell,
        price,
        price_lag_1d,
        price_lag_2d,
        price_lag_3d,
        price_lag_7d,
        price_lag_1cell,
        price_lag_2cell,
        price_lag_3cell,
        price_lag_4cell
    FROM intraday_lags
    WHERE price_lag_1d IS NOT NULL
      AND price_lag_7d IS NOT NULL;
    """
)

output_dir = OUTPUT_PARQUET.parent
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Exporting to {OUTPUT_PARQUET} ...")
con.execute(
    f"COPY features_final TO '{OUTPUT_PARQUET}' (FORMAT PARQUET, COMPRESSION ZSTD);"
)
print("Done.")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Exporting to C:\Users\websi\OneDrive - UT Cloud\Semester\3. WS2025_26\DS500 Data Science Project (12 ECTS)\tankerkoenig_repo\tankerkoenig-data\derived\features_sampled_diesel_2023_2024.parquet ...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Done.
