# Kalshi Weather Market — Snapshot Analysis

Interactive exploration of live-collected market snapshots and orderbook data.  
All parquet files under `collector/data/` are auto-detected; new snapshots appear on re-run.

In [35]:
import re
from pathlib import Path
from datetime import datetime

import pandas as pd
import pyarrow.parquet as pq
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import ipywidgets as widgets
from IPython.display import display, Markdown

pd.set_option("display.max_columns", 30)
pd.set_option("display.max_rows", 100)
pd.set_option("display.float_format", "{:.2f}".format)

## 1. Auto-detect & load all parquet files

In [36]:
# ---------- paths (auto-detect) ----------
DATA_ROOT    = Path("../collector/data")
MKT_DIR      = DATA_ROOT / "market_snapshots"
OB_DIR       = DATA_ROOT / "orderbook_snapshots"
HIST_CANDLES = DATA_ROOT / "historical" / "candlesticks"
HIST_TRADES  = DATA_ROOT / "historical" / "trades"


def load_all_parquets(directory: Path) -> pd.DataFrame:
    """Read and concatenate every .parquet file in *directory*."""
    files = sorted(directory.glob("*.parquet")) if directory.exists() else []
    if not files:
        return pd.DataFrame()
    
    # Use pandas concat to handle varying schemas gracefully (like int vs float in ASOS tmpf)
    dfs = []
    for f in files:
        dfs.append(pd.read_parquet(f))
    return pd.concat(dfs, ignore_index=True)


def reconstruct_orderbooks(raw: pd.DataFrame) -> pd.DataFrame:
    """Stitch baseline + delta rows into a full book state at every snapshot timestamp.

    The collector writes two row types in snapshot_type:
      'baseline' : full book dump every N snapshots
      'delta'    : only changed price levels (quantity==0 means level removed)

    Returns a DataFrame with snapshot_type='reconstructed' where every row is
    the complete quantity at that price level as of that snapshot timestamp.
    Legacy files without snapshot_type are returned as-is.
    """
    if raw.empty:
        return raw.copy()
    if "snapshot_type" not in raw.columns:
        return raw.copy()

    raw = raw.sort_values("snapshot_ts").reset_index(drop=True)
    book: dict = {}   # book[market_ticker][side][price_cents] = quantity
    rows: list = []

    for ts in raw["snapshot_ts"].unique():
        snap = raw[raw["snapshot_ts"] == ts]
        snap_type = snap["snapshot_type"].iloc[0]

        if snap_type == "baseline":
            for tk in snap["market_ticker"].unique():
                book[tk] = {"yes": {}, "no": {}}
                for _, r in snap[snap["market_ticker"] == tk].iterrows():
                    book[tk][r["side"]][int(r["price_cents"])] = r["quantity"]
        else:  # delta -- apply only changed levels
            for _, r in snap.iterrows():
                tk = r["market_ticker"]
                book.setdefault(tk, {"yes": {}, "no": {}})
                p = int(r["price_cents"])
                if r["quantity"] == 0:
                    book[tk][r["side"]].pop(p, None)
                else:
                    book[tk][r["side"]][p] = r["quantity"]

        for tk, sides in book.items():
            for side, levels in sides.items():
                for price, qty in levels.items():
                    if qty > 0:
                        rows.append({
                            "snapshot_ts":   ts,
                            "market_ticker": tk,
                            "side":          side,
                            "price_cents":   price,
                            "quantity":      qty,
                            "snapshot_type": "reconstructed",
                        })

    return pd.DataFrame(rows)


# ----- load data -----
mkt_df     = load_all_parquets(MKT_DIR)
ob_raw_df  = load_all_parquets(OB_DIR)         # compact storage: baseline + delta rows
ob_df      = reconstruct_orderbooks(ob_raw_df)  # full book at every snapshot timestamp

candle_df  = load_all_parquets(HIST_CANDLES)
trade_df   = load_all_parquets(HIST_TRADES)

n_ob_files = len(list(OB_DIR.glob("*.parquet")))
print(f"Market snapshots     : {mkt_df.shape[0]:>8,} rows  from {len(list(MKT_DIR.glob('*.parquet')))} file(s)")
print(f"Orderbook (raw)      : {ob_raw_df.shape[0]:>8,} rows  from {n_ob_files} file(s)  [baseline + delta]")
print(f"Orderbook (rebuilt)  : {ob_df.shape[0]:>8,} rows  [full book at every snapshot]")
print(f"Hist candlesticks    : {candle_df.shape[0]:>8,} rows")
print(f"Hist trades          : {trade_df.shape[0]:>8,} rows")

if not ob_raw_df.empty and "snapshot_type" in ob_raw_df.columns:
    counts = ob_raw_df["snapshot_type"].value_counts()
    saved_pct = (1 - ob_raw_df.shape[0] / max(ob_df.shape[0], 1)) * 100
    print(f"\nRaw breakdown: {dict(counts)}  →  delta compression saves ~{saved_pct:.0f}% of rows")


Market snapshots     :   42,123 rows  from 3 file(s)
Orderbook (raw)      :    5,670 rows  from 3 file(s)  [baseline + delta]
Orderbook (rebuilt)  :    5,651 rows  [full book at every snapshot]
Hist candlesticks    :        0 rows
Hist trades          :        0 rows

Raw breakdown: {'baseline': np.int64(5481), 'delta': np.int64(189)}  →  delta compression saves ~-0% of rows


## 2. Parse event tickers & enrich data

Event ticker format: `KXHIGHCHI-26FEB11`  
- **Series** = `KXHIGHCHI` (Kalshi High Temp — Chicago)  
- **Date code** = `26FEB11` → 2026-02-11  

We extract city, target date, and a human-friendly label for every event.

In [37]:
CITY_CODES = {
    "CHI": "Chicago",
    "NY":  "New York",
    "MIA": "Miami",
    "DEN": "Denver",
    "AUS": "Austin",
    "HOU": "Houston",
    "PHL": "Philadelphia",
}

_EVENT_RE = re.compile(
    r"^(?P<series>KXHIGH(?P<city>[A-Z]+))"
    r"-(?P<yy>\d{2})(?P<mon>[A-Z]{3})(?P<dd>\d{2})$"
)


def parse_event_ticker(ticker: str) -> dict:
    """Extract structured fields from an event ticker string."""
    m = _EVENT_RE.match(ticker)
    if not m:
        return {"series": ticker, "city_code": "", "city": ticker,
                "target_date": None, "event_label": ticker}
    city_code = m.group("city")
    city = CITY_CODES.get(city_code, city_code)
    target_date = datetime.strptime(
        f"20{m.group('yy')}-{m.group('mon')}-{m.group('dd')}", "%Y-%b-%d"
    ).date()
    label = f"{city} — {target_date:%b %d, %Y}"
    return {
        "series": m.group("series"),
        "city_code": city_code,
        "city": city,
        "target_date": target_date,
        "event_label": label,
    }


def enrich_market_df(df: pd.DataFrame) -> pd.DataFrame:
    """Add parsed event metadata columns to a market-snapshot dataframe."""
    if df.empty:
        return df
    parsed = df["event_ticker"].map(parse_event_ticker).apply(pd.Series)
    df = pd.concat([df, parsed], axis=1)
    # Cents → probability
    df["mid_price"] = (df["yes_bid"] + df["yes_ask"]) / 2
    df["spread"]    = df["yes_ask"] - df["yes_bid"]
    return df


mkt = enrich_market_df(mkt_df)
if not mkt.empty:
    display(Markdown("### Detected events"))
    summary = (
        mkt.groupby(["event_ticker", "event_label"])
        .agg(
            contracts=("market_ticker", "nunique"),
            snapshots=("snapshot_ts", "nunique"),
            first_snap=("snapshot_ts", "min"),
            last_snap=("snapshot_ts", "max"),
        )
        .reset_index()
    )
    display(summary)

### Detected events

Unnamed: 0,event_ticker,event_label,contracts,snapshots,first_snap,last_snap
0,KXHIGHCHI-26FEB19,"Chicago — Feb 19, 2026",6,3513,2026-02-19 03:40:10.443637+00:00,2026-02-20 00:43:43.664683+00:00
1,KXHIGHNY-26FEB19,"New York — Feb 19, 2026",6,3510,2026-02-19 03:40:10.443637+00:00,2026-02-20 00:43:43.664683+00:00


## 3. Event selector

Choose which event to explore.  All downstream cells react to this choice.

In [38]:
event_tickers = sorted(mkt["event_ticker"].unique()) if not mkt.empty else []

event_dropdown = widgets.Dropdown(
    options=[(f"{t}  ({parse_event_ticker(t)['event_label']})", t) for t in event_tickers],
    description="Event:",
    style={"description_width": "60px"},
    layout=widgets.Layout(width="500px"),
)
display(event_dropdown)


def selected_event() -> str:
    return event_dropdown.value


def event_mkt() -> pd.DataFrame:
    """Market snapshot rows for the selected event."""
    return mkt[mkt["event_ticker"] == selected_event()].copy()


def event_ob() -> pd.DataFrame:
    """Reconstructed (delta-stitched) orderbook rows for the selected event.

    Every row is the full quantity at a price level as of that snapshot.
    Use this for all plotting and analysis.
    """
    tickers = event_mkt()["market_ticker"].unique()
    return ob_df[ob_df["market_ticker"].isin(tickers)].copy()


def event_ob_raw() -> pd.DataFrame:
    """Raw orderbook rows (baseline + delta) -- for debugging only.

    Delta rows contain only *changed* price levels, not the full book.
    Prefer event_ob() for any analysis or plotting.
    """
    tickers = event_mkt()["market_ticker"].unique()
    return ob_raw_df[ob_raw_df["market_ticker"].isin(tickers)].copy()


Dropdown(description='Event:', layout=Layout(width='500px'), options=(('KXHIGHCHI-26FEB19  (Chicago — Feb 19, …

## 4. Snapshot summary for selected event

In [39]:
em = event_mkt()
display(Markdown(f"### {selected_event()}  —  {parse_event_ticker(selected_event())['event_label']}"))
display(Markdown(f"**{em['snapshot_ts'].nunique()}** snapshots · "
                 f"**{em['market_ticker'].nunique()}** contracts · "
                 f"time range: `{em['snapshot_ts'].min()}` → `{em['snapshot_ts'].max()}`"))
display(Markdown("---"))

# Latest snapshot for each contract
latest = em.sort_values("snapshot_ts").groupby("market_ticker").last().reset_index()
latest_display = latest[[
    "market_ticker", "subtitle", "yes_bid", "yes_ask", "mid_price",
    "spread", "last_price", "volume", "open_interest",
]].sort_values("mid_price", ascending=False)

display(Markdown("#### Latest contract prices (cents = implied probability %)"))
display(latest_display.style.format({
    "mid_price": "{:.1f}¢",
    "spread": "{:.0f}¢",
    "volume": "{:,.0f}",
    "open_interest": "{:,.0f}",
}).bar(subset=["mid_price"], color="#5fba7d", vmin=0, vmax=100))

### KXHIGHCHI-26FEB19  —  Chicago — Feb 19, 2026

**3513** snapshots · **6** contracts · time range: `2026-02-19 03:40:10.443637+00:00` → `2026-02-20 00:43:43.664683+00:00`

---

#### Latest contract prices (cents = implied probability %)

Unnamed: 0,market_ticker,subtitle,yes_bid,yes_ask,mid_price,spread,last_price,volume,open_interest
0,KXHIGHCHI-26FEB19-B57.5,57° to 58°,99,100,99.5¢,1¢,22,113710,38946
1,KXHIGHCHI-26FEB19-B59.5,59° to 60°,0,1,0.5¢,1¢,6,60694,25582
2,KXHIGHCHI-26FEB19-B61.5,61° to 62°,0,1,0.5¢,1¢,3,52390,33040
3,KXHIGHCHI-26FEB19-B63.5,63° to 64°,0,1,0.5¢,1¢,1,30636,27564
4,KXHIGHCHI-26FEB19-T57,56° or below,0,1,0.5¢,1¢,74,217914,121954
5,KXHIGHCHI-26FEB19-T64,65° or above,0,1,0.5¢,1¢,1,5194,4868


## 5. Price evolution over time

Shows `yes_bid`, `yes_ask`, and `mid_price` for every contract across all snapshots.

In [40]:
em = event_mkt()

if em["snapshot_ts"].nunique() < 2:
    display(Markdown("> **Only 1 snapshot available** — price evolution chart will be "
                     "more useful once more snapshots are collected.  "
                     "Showing current values as a bar chart instead."))
    fig = px.bar(
        em.sort_values("mid_price", ascending=False),
        x="subtitle", y="mid_price",
        color="subtitle",
        title=f"Current implied probabilities — {selected_event()}",
        labels={"mid_price": "Mid price (¢ = implied %)", "subtitle": "Contract"},
    )
    fig.update_layout(showlegend=False, yaxis_range=[0, 100])
    fig.show()
else:
    fig = px.line(
        em.sort_values("snapshot_ts"),
        x="snapshot_ts", y="mid_price",
        color="subtitle",
        title=f"Mid-price evolution — {selected_event()}",
        labels={"mid_price": "Mid price (¢)", "snapshot_ts": "Time (UTC)"},
        markers=True,
    )
    fig.update_layout(
        yaxis_range=[0, 100],
        hovermode="x unified",
        legend_title_text="Contract",
    )
    fig.show()

    # Bid-ask spread over time
    fig2 = px.line(
        em.sort_values("snapshot_ts"),
        x="snapshot_ts", y="spread",
        color="subtitle",
        title=f"Bid-ask spread over time — {selected_event()}",
        labels={"spread": "Spread (¢)", "snapshot_ts": "Time (UTC)"},
        markers=True,
    )
    fig2.update_layout(hovermode="x unified", legend_title_text="Contract")
    fig2.show()

## 5b. Bucket probability distribution over time

Visualises how the **implied probability of each temperature bucket** evolves across
all collected snapshots for the selected event.  

- **Stacked area chart** — shows how probability mass shifts between buckets over time.  
- **Line chart** — same data in a standard multi-line view for easier comparison of individual buckets.  

Mid-price (avg of `yes_bid` and `yes_ask`) is used as the implied probability (in %).
Buckets are ordered by the lower temperature bound so the colour stack follows a
natural temperature gradient.

In [41]:
em = event_mkt()

if em.empty:
    display(Markdown("> No market data for the selected event."))
else:
    # --- prep ---
    def _bucket_sort_key(subtitle: str) -> float:
        """Extract numeric lower-bound from subtitle like '42° to 43°' or '39° or below'."""
        nums = re.findall(r"(\d+(?:\.\d+)?)", subtitle)
        return float(nums[0]) if nums else 0.0

    # Compute implied probability (%) = mid_price since mid_price is already in cents
    prob = em[["snapshot_ts", "subtitle", "mid_price"]].copy()
    prob["implied_prob"] = prob["mid_price"]  # cents ≈ %
    prob["_sort"] = prob["subtitle"].map(_bucket_sort_key)
    prob = prob.sort_values(["snapshot_ts", "_sort"])

    # Ordered bucket list (low temp → high temp)
    bucket_order = (
        prob[["subtitle", "_sort"]]
        .drop_duplicates()
        .sort_values("_sort")["subtitle"]
        .tolist()
    )

    # Pivot: rows = snapshot_ts, columns = subtitle (bucket)
    pivot = (
        prob.pivot_table(
            index="snapshot_ts", columns="subtitle",
            values="implied_prob", aggfunc="mean",
        )
        .reindex(columns=bucket_order)
        .sort_index()
        .fillna(0)
    )

    # ---------- 1. Stacked area chart ----------
    fig_area = go.Figure()
    for bucket in bucket_order:
        fig_area.add_trace(go.Scatter(
            x=pivot.index, y=pivot[bucket],
            mode="lines",
            name=bucket,
            stackgroup="prob",
            hovertemplate=f"{bucket}<br>%{{x|%H:%M:%S}}<br>%{{y:.1f}}%<extra></extra>",
        ))
    fig_area.update_layout(
        title=f"Bucket probability distribution over time — {selected_event()}",
        xaxis_title="Time (UTC)",
        yaxis_title="Implied probability (%)",
        yaxis_range=[0, 105],
        hovermode="x unified",
        legend_title_text="Bucket",
        height=500,
    )
    fig_area.show()

    # ---------- 2. Line chart (same data, easier to read individual buckets) ----------
    fig_line = go.Figure()
    for bucket in bucket_order:
        fig_line.add_trace(go.Scatter(
            x=pivot.index, y=pivot[bucket],
            mode="lines+markers",
            name=bucket,
            hovertemplate=f"{bucket}<br>%{{x|%H:%M:%S}}<br>%{{y:.1f}}%<extra></extra>",
        ))
    fig_line.update_layout(
        title=f"Per-bucket probability over time — {selected_event()}",
        xaxis_title="Time (UTC)",
        yaxis_title="Implied probability (%)",
        yaxis_range=[0, 100],
        hovermode="x unified",
        legend_title_text="Bucket",
        height=500,
    )
    fig_line.show()

    # ---------- 3. Summary table: prob change from first to last snapshot ----------
    if len(pivot) >= 2:
        first_snap = pivot.iloc[0]
        last_snap  = pivot.iloc[-1]
        delta = last_snap - first_snap
        summary = pd.DataFrame({
            "First snapshot (%)": first_snap,
            "Last snapshot (%)": last_snap,
            "Change (pp)": delta,
        }).rename_axis("Bucket")
        summary = summary.sort_values("Last snapshot (%)", ascending=False)
        display(Markdown("#### Probability shift: first → last snapshot"))
        display(
            summary.style
            .format("{:.1f}")
            .bar(subset=["Change (pp)"], color=["#d65f5f", "#5fba7d"], align="zero")
        )
    else:
        display(Markdown("> Only 1 snapshot — probability change table needs at least 2."))


#### Probability shift: first → last snapshot

Unnamed: 0_level_0,First snapshot (%),Last snapshot (%),Change (pp)
Bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
57° to 58°,26.5,99.5,73.0
56° or below,69.5,0.5,-69.0
59° to 60°,5.5,0.5,-5.0
61° to 62°,2.5,0.5,-2.0
63° to 64°,1.5,0.5,-1.0
65° or above,0.5,0.5,0.0


## 6. Volume & open interest

In [42]:
em = event_mkt()

if em["snapshot_ts"].nunique() >= 2:
    fig = make_subplots(
        rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.08,
        subplot_titles=("Cumulative volume", "Open interest"),
    )
    for sub in em["subtitle"].unique():
        s = em[em["subtitle"] == sub].sort_values("snapshot_ts")
        fig.add_trace(go.Scatter(
            x=s["snapshot_ts"], y=s["volume"], name=sub, mode="lines+markers",
            legendgroup=sub,
        ), row=1, col=1)
        fig.add_trace(go.Scatter(
            x=s["snapshot_ts"], y=s["open_interest"], name=sub, mode="lines+markers",
            legendgroup=sub, showlegend=False,
        ), row=2, col=1)
    fig.update_layout(
        height=600, title_text=f"Volume & OI — {selected_event()}",
        hovermode="x unified",
    )
    fig.show()
else:
    latest = em.sort_values("snapshot_ts").groupby("market_ticker").last().reset_index()
    fig = px.bar(
        latest.sort_values("volume", ascending=True),
        y="subtitle", x="volume", orientation="h",
        color="subtitle",
        title=f"Volume by contract — {selected_event()}",
        labels={"volume": "Volume (contracts)", "subtitle": ""},
    )
    fig.update_layout(showlegend=False)
    fig.show()

    fig2 = px.bar(
        latest.sort_values("open_interest", ascending=True),
        y="subtitle", x="open_interest", orientation="h",
        color="subtitle",
        title=f"Open interest by contract — {selected_event()}",
        labels={"open_interest": "Open interest", "subtitle": ""},
    )
    fig2.update_layout(showlegend=False)
    fig2.show()

## 7. Orderbook depth (latest snapshot)

In [43]:
# event_ob() returns delta-stitched data: full book at every snapshot timestamp.
eob = event_ob()

if eob.empty:
    display(Markdown("> No orderbook data for this event."))
else:
    latest_ts_per = eob.groupby("market_ticker")["snapshot_ts"].max().reset_index()
    eob_latest = eob.merge(latest_ts_per, on=["market_ticker", "snapshot_ts"])

    sub_map = (
        mkt.drop_duplicates("market_ticker").set_index("market_ticker")["subtitle"].to_dict()
        if not mkt.empty else {}
    )

    contracts = sorted(eob_latest["market_ticker"].unique())
    n_contracts = len(contracts)
    cols = min(n_contracts, 3)
    rows_grid = (n_contracts + cols - 1) // cols

    fig = make_subplots(
        rows=rows_grid, cols=cols,
        subplot_titles=[sub_map.get(c, c) for c in contracts],
        horizontal_spacing=0.08, vertical_spacing=0.14,
    )

    for idx, contract in enumerate(contracts):
        r, c = divmod(idx, cols)
        cdf = eob_latest[eob_latest["market_ticker"] == contract]

        yes = cdf[cdf["side"] == "yes"].sort_values("price_cents", ascending=False).copy()
        if not yes.empty:
            yes["cum_qty"] = yes["quantity"].cumsum()
            fig.add_trace(go.Scatter(
                x=yes["price_cents"], y=yes["cum_qty"],
                mode="lines", fill="tozeroy", name="yes bids",
                marker_color="#2ca02c", legendgroup="yes", showlegend=(idx == 0),
                hovertemplate="yes %{x}\u00a2 \u2014 cum %{y:.0f}<extra></extra>",
            ), row=r+1, col=c+1)

        no = cdf[cdf["side"] == "no"].sort_values("price_cents", ascending=True).copy()
        if not no.empty:
            no["cum_qty"] = no["quantity"].cumsum()
            fig.add_trace(go.Scatter(
                x=no["price_cents"], y=no["cum_qty"],
                mode="lines", fill="tozeroy", name="no bids",
                marker_color="#d62728", legendgroup="no", showlegend=(idx == 0),
                hovertemplate="no %{x}\u00a2 \u2014 cum %{y:.0f}<extra></extra>",
            ), row=r+1, col=c+1)

    snap_ts = latest_ts_per["snapshot_ts"].max()
    fig.update_layout(
        height=320 * rows_grid,
        title_text=(
            f"Orderbook depth (cumulative) \u2014 {selected_event()}<br>"
            f"<sup>Snapshot: {snap_ts}  |  data: delta-stitched</sup>"
        ),
        legend_title="Side",
    )
    fig.update_xaxes(title_text="Price (\u00a2)", range=[0, 100])
    fig.update_yaxes(title_text="Cumulative quantity")
    fig.show()


## 7b. Orderbook depth — evolution over time

Shows how bid/ask walls shift across all collected snapshots for a chosen contract.  
Darker lines = more recent snapshots.


In [44]:
eob = event_ob()

if eob.empty:
    display(Markdown("> No orderbook data for this event."))
else:
    sub_map = (
        mkt.drop_duplicates("market_ticker").set_index("market_ticker")["subtitle"].to_dict()
        if not mkt.empty else {}
    )
    contracts = sorted(eob["market_ticker"].unique())

    contract_dd = widgets.Dropdown(
        options=[(sub_map.get(c, c), c) for c in contracts],
        description="Contract:",
        style={"description_width": "80px"},
        layout=widgets.Layout(width="500px"),
    )
    display(contract_dd)


    def plot_ob_evolution(contract):
        cdf = eob[eob["market_ticker"] == contract].copy()
        timestamps = sorted(cdf["snapshot_ts"].unique())
        n_ts = len(timestamps)
        if n_ts == 0:
            display(Markdown("> No snapshots for this contract."))
            return
        fig = go.Figure()

        def ramp(i, total, rgb):
            a = 0.2 + 0.8 * (i + 1) / total
            return f"rgba({rgb[0]},{rgb[1]},{rgb[2]},{a:.2f})"

        for i, ts in enumerate(timestamps):
            snap = cdf[cdf["snapshot_ts"] == ts]
            lbl = str(ts)[:19]

            yes = snap[snap["side"] == "yes"].sort_values("price_cents", ascending=False).copy()
            if not yes.empty:
                yes["cum_qty"] = yes["quantity"].cumsum()
                fig.add_trace(go.Scatter(
                    x=yes["price_cents"], y=yes["cum_qty"], mode="lines",
                    name=f"yes {lbl}", line=dict(color=ramp(i, n_ts, (44, 160, 44)), width=1.5),
                    legendgroup="yes", showlegend=(i == n_ts - 1),
                    legendgrouptitle_text="yes bids" if i == n_ts - 1 else None,
                    hovertemplate=f"{lbl}<br>yes %{{x}}\u00a2 cum %{{y:.0f}}<extra></extra>",
                ))

            no = snap[snap["side"] == "no"].sort_values("price_cents", ascending=True).copy()
            if not no.empty:
                no["cum_qty"] = no["quantity"].cumsum()
                fig.add_trace(go.Scatter(
                    x=no["price_cents"], y=no["cum_qty"], mode="lines",
                    name=f"no {lbl}", line=dict(color=ramp(i, n_ts, (214, 39, 40)), width=1.5),
                    legendgroup="no", showlegend=(i == n_ts - 1),
                    legendgrouptitle_text="no bids" if i == n_ts - 1 else None,
                    hovertemplate=f"{lbl}<br>no %{{x}}\u00a2 cum %{{y:.0f}}<extra></extra>",
                ))

        fig.update_layout(
            title=(
                f"Orderbook evolution \u2014 {sub_map.get(contract, contract)}<br>"
                f"<sup>{n_ts} snapshots  |  data: delta-stitched  |  darker = more recent</sup>"
            ),
            xaxis_title="Price (\u00a2)", yaxis_title="Cumulative quantity",
            xaxis=dict(range=[0, 100]), height=500,
        )
        fig.show()


    out = widgets.Output()
    display(out)

    def _on_contract_change(change):
        from IPython.display import clear_output
        with out:
            clear_output(wait=True)
            plot_ob_evolution(change["new"])

    contract_dd.observe(_on_contract_change, names="value")
    with out:
        plot_ob_evolution(contracts[0])


Dropdown(description='Contract:', layout=Layout(width='500px'), options=(('57° to 58°', 'KXHIGHCHI-26FEB19-B57…

Output()

## 8. Cross-event comparison

Compare the latest implied probability distribution across **all** events.  
Each event is a separate facet; the x-axis is the temperature bucket.

In [45]:
if mkt.empty:
    display(Markdown("> No market data loaded."))
else:
    # Latest snapshot per event × contract
    latest_all = (
        mkt.sort_values("snapshot_ts")
        .groupby(["event_ticker", "market_ticker"])
        .last()
        .reset_index()
    )

    # Sort subtitles by the numeric lower bound for a natural temperature axis
    def _sort_key(subtitle: str) -> float:
        """Extract a numeric sort key from subtitle like '42° to 43°' or '44° or above'."""
        nums = re.findall(r"(\d+(?:\.\d+)?)", subtitle)
        if nums:
            return float(nums[0])
        return 0.0

    latest_all["_sort"] = latest_all["subtitle"].map(_sort_key)
    latest_all = latest_all.sort_values(["event_label", "_sort"])

    n_events = latest_all["event_ticker"].nunique()

    fig = px.bar(
        latest_all,
        x="subtitle", y="mid_price",
        color="subtitle",
        facet_col="event_label",
        facet_col_wrap=min(n_events, 3),
        title="Implied probability distribution — all events",
        labels={"mid_price": "Mid price (¢ ≈ %)", "subtitle": "Temp range"},
        category_orders={"subtitle": latest_all["subtitle"].unique().tolist()},
    )
    fig.update_layout(
        showlegend=False,
        height=400 * ((n_events + 2) // 3),
        yaxis_range=[0, 100],
    )
    fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
    fig.show()

## 9. Snapshot cadence & data health

In [46]:
if mkt.empty:
    display(Markdown("> No data."))
else:
    snap_times = (
        mkt.drop_duplicates(subset=["snapshot_ts", "event_ticker"])
        .sort_values("snapshot_ts")
    )

    for evt in snap_times["event_ticker"].unique():
        s = snap_times[snap_times["event_ticker"] == evt]["snapshot_ts"].sort_values()
        deltas = s.diff().dropna().dt.total_seconds()
        info = parse_event_ticker(evt)
        display(Markdown(f"### {evt} — {info['event_label']}"))
        display(Markdown(
            f"- **Snapshots:** {len(s)}\n"
            f"- **Time span:** {s.min()} → {s.max()}\n"
            f"- **Interval — mean:** {deltas.mean():.1f}s, "
            f"**median:** {deltas.median():.1f}s, "
            f"**min:** {deltas.min():.1f}s, "
            f"**max:** {deltas.max():.1f}s"
        ))

    # Timeline dot plot — one dot per snapshot
    fig = px.strip(
        snap_times,
        x="snapshot_ts", y="event_label",
        color="event_label",
        title="Snapshot timeline",
        labels={"snapshot_ts": "UTC", "event_label": ""},
    )
    fig.update_traces(marker_size=5)
    fig.update_layout(showlegend=False, height=200 + 60 * len(event_tickers))
    fig.show()

### KXHIGHCHI-26FEB19 — Chicago — Feb 19, 2026

- **Snapshots:** 3513
- **Time span:** 2026-02-19 03:40:10.443637+00:00 → 2026-02-20 00:43:43.664683+00:00
- **Interval — mean:** 21.6s, **median:** 10.0s, **min:** 0.0s, **max:** 2878.3s

### KXHIGHNY-26FEB19 — New York — Feb 19, 2026

- **Snapshots:** 3510
- **Time span:** 2026-02-19 03:40:10.443637+00:00 → 2026-02-20 00:43:43.664683+00:00
- **Interval — mean:** 21.6s, **median:** 10.0s, **min:** 0.0s, **max:** 3085.6s

## 10. Historical candlesticks & trades (if backfilled)

In [47]:
if candle_df.empty and trade_df.empty:
    display(Markdown(
        "> No historical data yet.  Run the backfill script to populate:\n"
        "> ```bash\n"
        "> pred_env/bin/python pred_market_src/collector/backfill.py --start 2026-02-01\n"
        "> ```"
    ))
else:
    if not candle_df.empty:
        display(Markdown("### Candlestick data"))
        display(Markdown(f"Rows: {len(candle_df):,}  |  Events: {candle_df['event_ticker'].nunique()}"))

        # OHLC chart per event
        for evt in candle_df["event_ticker"].unique():
            edf = candle_df[candle_df["event_ticker"] == evt]
            for tk in edf["market_ticker"].unique():
                tdf = edf[edf["market_ticker"] == tk].sort_values("timestamp")
                fig = go.Figure(go.Candlestick(
                    x=tdf["timestamp"],
                    open=tdf["open_price"], high=tdf["high_price"],
                    low=tdf["low_price"], close=tdf["close_price"],
                ))
                fig.update_layout(title=f"{tk}", xaxis_title="Time", yaxis_title="Price")
                fig.show()

    if not trade_df.empty:
        display(Markdown("### Trade data"))
        display(Markdown(f"Rows: {len(trade_df):,}  |  Events: {trade_df['event_ticker'].nunique()}"))
        display(trade_df.head(20))

> No historical data yet.  Run the backfill script to populate:
> ```bash
> pred_env/bin/python pred_market_src/collector/backfill.py --start 2026-02-01
> ```

## 11. Raw data explorer

Quick peek at the raw dataframes for debugging.

In [48]:
display(Markdown("### Market snapshots (first 20 rows)"))
display(mkt.head(20))

display(Markdown("### Orderbook — raw storage (first 20 rows)"))
display(Markdown(
    "_Contains `baseline` (full book dump) and `delta` (changed levels only) rows. "
    "Not suitable for direct analysis — use the reconstructed view below._"
))
if not ob_raw_df.empty and "snapshot_type" in ob_raw_df.columns:
    display(ob_raw_df.head(20))
    display(ob_raw_df["snapshot_type"].value_counts().rename("row count").to_frame())
else:
    display(ob_raw_df.head(20))

display(Markdown("### Orderbook — reconstructed / delta-stitched (first 20 rows)"))
display(Markdown(
    "_Full book state at every snapshot timestamp after replaying baselines + deltas. "
    "Use `ob_df` / `event_ob()` for all analysis and plotting._"
))
display(ob_df.head(20))

if not ob_raw_df.empty and not ob_df.empty:
    raw_rows, rec_rows = ob_raw_df.shape[0], ob_df.shape[0]
    saved = (1 - raw_rows / max(rec_rows, 1)) * 100
    sign = "-" if saved > 0 else "+"
    display(Markdown(
        f"**Storage efficiency**: {raw_rows:,} raw rows → {rec_rows:,} reconstructed rows "
        f"({sign}{abs(saved):.0f}% via delta compression)"
    ))


### Market snapshots (first 20 rows)

Unnamed: 0,snapshot_ts,event_ticker,market_ticker,subtitle,yes_bid,yes_ask,last_price,volume,open_interest,trigger,series,city_code,city,target_date,event_label,mid_price,spread
0,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19,KXHIGHCHI-26FEB19-B61.5,61° to 62°,2,3,2,1690,1474,periodic,KXHIGHCHI,CHI,Chicago,2026-02-19,"Chicago — Feb 19, 2026",2.5,1
1,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19,KXHIGHCHI-26FEB19-B59.5,59° to 60°,5,6,6,1752,874,periodic,KXHIGHCHI,CHI,Chicago,2026-02-19,"Chicago — Feb 19, 2026",5.5,1
2,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19,KXHIGHCHI-26FEB19-T57,56° or below,69,70,69,14046,10720,periodic,KXHIGHCHI,CHI,Chicago,2026-02-19,"Chicago — Feb 19, 2026",69.5,1
3,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19,KXHIGHCHI-26FEB19-B63.5,63° to 64°,1,2,1,978,888,periodic,KXHIGHCHI,CHI,Chicago,2026-02-19,"Chicago — Feb 19, 2026",1.5,1
4,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19,KXHIGHCHI-26FEB19-B57.5,57° to 58°,26,27,26,3068,2924,periodic,KXHIGHCHI,CHI,Chicago,2026-02-19,"Chicago — Feb 19, 2026",26.5,1
5,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19,KXHIGHCHI-26FEB19-T64,65° or above,0,1,1,1848,1524,periodic,KXHIGHCHI,CHI,Chicago,2026-02-19,"Chicago — Feb 19, 2026",0.5,1
6,2026-02-19 03:40:10.443637+00:00,KXHIGHNY-26FEB19,KXHIGHNY-26FEB19-T47,48° or above,0,1,1,2288,2006,periodic,KXHIGHNY,NY,New York,2026-02-19,"New York — Feb 19, 2026",0.5,1
7,2026-02-19 03:40:10.443637+00:00,KXHIGHNY-26FEB19,KXHIGHNY-26FEB19-B42.5,42° to 43°,16,17,19,13080,7492,periodic,KXHIGHNY,NY,New York,2026-02-19,"New York — Feb 19, 2026",16.5,1
8,2026-02-19 03:40:10.443637+00:00,KXHIGHNY-26FEB19,KXHIGHNY-26FEB19-B40.5,40° to 41°,51,52,53,15468,13342,periodic,KXHIGHNY,NY,New York,2026-02-19,"New York — Feb 19, 2026",51.5,1
9,2026-02-19 03:40:10.443637+00:00,KXHIGHNY-26FEB19,KXHIGHNY-26FEB19-B44.5,44° to 45°,4,5,5,3168,2862,periodic,KXHIGHNY,NY,New York,2026-02-19,"New York — Feb 19, 2026",4.5,1


### Orderbook — raw storage (first 20 rows)

_Contains `baseline` (full book dump) and `delta` (changed levels only) rows. Not suitable for direct analysis — use the reconstructed view below._

Unnamed: 0,snapshot_ts,market_ticker,side,price_cents,quantity,snapshot_type
0,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,yes,2,59.0,baseline
1,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,yes,1,258.0,baseline
2,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,no,97,146.0,baseline
3,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,no,96,190.0,baseline
4,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,no,95,326.0,baseline
5,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,no,94,7.0,baseline
6,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,no,91,3.0,baseline
7,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B59.5,yes,5,95.0,baseline
8,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B59.5,yes,4,76.0,baseline
9,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B59.5,yes,3,450.0,baseline


Unnamed: 0_level_0,row count
snapshot_type,Unnamed: 1_level_1
baseline,5481
delta,189


### Orderbook — reconstructed / delta-stitched (first 20 rows)

_Full book state at every snapshot timestamp after replaying baselines + deltas. Use `ob_df` / `event_ob()` for all analysis and plotting._

Unnamed: 0,snapshot_ts,market_ticker,side,price_cents,quantity,snapshot_type
0,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,yes,2,59.0,reconstructed
1,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,yes,1,258.0,reconstructed
2,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,no,91,3.0,reconstructed
3,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,no,94,7.0,reconstructed
4,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,no,95,326.0,reconstructed
5,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,no,96,190.0,reconstructed
6,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19-B61.5,no,97,146.0,reconstructed
7,2026-02-19 03:40:10.443637+00:00,KXHIGHNY-26FEB19-B40.5,yes,47,300.0,reconstructed
8,2026-02-19 03:40:10.443637+00:00,KXHIGHNY-26FEB19-B40.5,yes,48,25.0,reconstructed
9,2026-02-19 03:40:10.443637+00:00,KXHIGHNY-26FEB19-B40.5,yes,49,42.0,reconstructed


**Storage efficiency**: 5,670 raw rows → 5,651 reconstructed rows (+0% via delta compression)

In [49]:
mkt.head()

Unnamed: 0,snapshot_ts,event_ticker,market_ticker,subtitle,yes_bid,yes_ask,last_price,volume,open_interest,trigger,series,city_code,city,target_date,event_label,mid_price,spread
0,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19,KXHIGHCHI-26FEB19-B61.5,61° to 62°,2,3,2,1690,1474,periodic,KXHIGHCHI,CHI,Chicago,2026-02-19,"Chicago — Feb 19, 2026",2.5,1
1,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19,KXHIGHCHI-26FEB19-B59.5,59° to 60°,5,6,6,1752,874,periodic,KXHIGHCHI,CHI,Chicago,2026-02-19,"Chicago — Feb 19, 2026",5.5,1
2,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19,KXHIGHCHI-26FEB19-T57,56° or below,69,70,69,14046,10720,periodic,KXHIGHCHI,CHI,Chicago,2026-02-19,"Chicago — Feb 19, 2026",69.5,1
3,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19,KXHIGHCHI-26FEB19-B63.5,63° to 64°,1,2,1,978,888,periodic,KXHIGHCHI,CHI,Chicago,2026-02-19,"Chicago — Feb 19, 2026",1.5,1
4,2026-02-19 03:40:10.443637+00:00,KXHIGHCHI-26FEB19,KXHIGHCHI-26FEB19-B57.5,57° to 58°,26,27,26,3068,2924,periodic,KXHIGHCHI,CHI,Chicago,2026-02-19,"Chicago — Feb 19, 2026",26.5,1


In [52]:
import plotly.express as px
import pandas as pd

# 1. Prep the data
# Convert timestamp and filter for the specific event
mkt['snapshot_ts'] = pd.to_datetime(mkt['snapshot_ts'])
event_id = 'KXHIGHCHI-26FEB19'
df_filtered = mkt[mkt['event_ticker'] == event_id].sort_values('snapshot_ts')

# 2. Create the Interactive Plot
fig = px.line(
    df_filtered, 
    x='snapshot_ts', 
    y='yes_ask', 
    color='subtitle',  # This creates a separate line/legend entry for each range
    title=f"Price Trends: {event_id}",
    labels={
        'snapshot_ts': 'Time',
        'yes_ask': 'Yes Ask Price (¢)',
        'subtitle': 'Temperature Range'
    },
    markers=True,
    template='plotly_white'
)

# 3. Optimize for "Checkbox" behavior and Zooming
fig.update_layout(
    hovermode='x unified', # Shows all active prices at once when hovering
    xaxis=dict(
        rangeslider=dict(visible=True), # Adds a 'zoom' slider at the bottom
        type='date'
    ),
    yaxis=dict(range=[0, 100]), # Standard 0-100 cent scale
    legend=dict(
        itemclick="toggle",      # Single click = Toggle (Checkbox behavior)
        itemdoubleclick="toggleothers" # Double click = Isolate range
    )
)

# 4. Display the plot
fig.show()

## 6. Weather Data Integration\n\nLoading METAR, ASOS 1-minute, and Daily Climate datasets.

In [None]:
# ---------- 6. Load Weather Data ----------
WEATHER_DIR = DATA_ROOT / "weather_obs"

metar_df = load_all_parquets(WEATHER_DIR / "metar")
asos_df = load_all_parquets(WEATHER_DIR / "asos_1min")
cli_df = load_all_parquets(WEATHER_DIR / "daily_climate")

print(f"METAR             : {metar_df.shape[0]:>8,} rows")
print(f"ASOS 1-min        : {asos_df.shape[0]:>8,} rows")
print(f"Daily Climate     : {cli_df.shape[0]:>8,} rows")

# Convert timestamp columns to DatetimeIndex to align with orderbook
if not metar_df.empty and 'valid_utc' in metar_df.columns:
    metar_df['valid_utc'] = pd.to_datetime(metar_df['valid_utc'])
if not asos_df.empty and 'valid_utc' in asos_df.columns:
    asos_df['valid_utc'] = pd.to_datetime(asos_df['valid_utc'])
if not cli_df.empty and 'valid_utc' in cli_df.columns:
    cli_df['valid_utc'] = pd.to_datetime(cli_df['valid_utc'])


## 7. Market vs Weather\n\nComparing orderbook trends with actual weather temperatures for the target location.

In [None]:
# ---------- 7. Market vs Weather ----------
# Filtering weather for the targeted city
# The event city code is available in selected_event() or em via parse_event_ticker
evt_info = parse_event_ticker(selected_event())
city_code = evt_info['city_code']
target_date = pd.to_datetime(evt_info['target_date']).tz_localize('UTC')

# Maps NY -> KNYC, CHI -> KMDW
STATION_MAP = {
    'NY': 'KNYC',
    'CHI': 'KMDW',
    'MIA': 'KMIA',
    'DEN': 'KDEN',
    'AUS': 'KAUS',
    'HOU': 'KIAH',
    'PHL': 'KPHL',
}
station = STATION_MAP.get(city_code)

if station and not metar_df.empty and not asos_df.empty:
    evt_metar = metar_df[metar_df['station'] == station].copy()
    evt_asos = asos_df[asos_df['station'] == station].copy()
    
    # Filter by target date (+/- 1 day) for better plotting
    evt_metar = evt_metar[
        (evt_metar['valid_utc'] >= target_date - pd.Timedelta(days=1)) & 
        (evt_metar['valid_utc'] <= target_date + pd.Timedelta(days=1))
    ].sort_values('valid_utc')
    
    evt_asos = evt_asos[
        (evt_asos['valid_utc'] >= target_date - pd.Timedelta(days=1)) & 
        (evt_asos['valid_utc'] <= target_date + pd.Timedelta(days=1))
    ].sort_values('valid_utc')
else:
    evt_metar = pd.DataFrame()
    evt_asos = pd.DataFrame()


In [None]:

# Now plot weather in parallel with orderbook
em_plot = event_mkt()
if not em_plot.empty:
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    
    colors = px.colors.qualitative.Plotly
    
    # Plot contracts mid prices
    for i, tk in enumerate(em_plot['market_ticker'].unique()):
        tk_df = em_plot[em_plot['market_ticker'] == tk].sort_values('snapshot_ts')
        subtitle = tk_df['subtitle'].iloc[0] if 'subtitle' in tk_df.columns else tk
        fig.add_trace(
            go.Scatter(x=tk_df['snapshot_ts'], y=tk_df['mid_price'], mode='lines', 
                       name=f"Contract: {subtitle}", line=dict(color=colors[i % len(colors)])),
            secondary_y=False,
        )
        
    # Plot weather
    if not evt_metar.empty and 'temp_f' in evt_metar.columns:
        fig.add_trace(
            go.Scatter(x=evt_metar['valid_utc'], y=evt_metar['temp_f'], mode='markers', 
                       name='METAR Temp (F)', marker=dict(symbol='square', size=8, color='black')),
            secondary_y=True,
        )
    if not evt_asos.empty and 'tmpf' in evt_asos.columns:
        fig.add_trace(
            go.Scatter(x=evt_asos['valid_utc'], y=evt_asos['tmpf'], mode='lines', 
                       name='ASOS 1-min Temp (F)', line=dict(dash='dot', color='red', width=2)),
            secondary_y=True,
        )
        
    fig.update_layout(
        title=f"Market vs Weather for {selected_event()}", 
        hovermode="x unified",
        height=600,
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=1.05)
    )
    fig.update_yaxes(title_text="Implied Probability (cents)", secondary_y=False, range=[0, 100])
    fig.update_yaxes(title_text="Temperature (°F)", secondary_y=True)
    
    fig.show()
else:
    print("No orderbook data available to plot.")
