In [None]:
from datetime import datetime
from itertools import chain, count, repeat
from pathlib import Path
from typing import Iterator

import ipywidgets as widgets
import pandas as pd
import seaborn.objects as so
from itables import init_notebook_mode

# Switch the notebook to itables' interactive rendering; all_interactive=True is
# meant to apply it to every DataFrame/Series output -- confirm against the
# installed itables version.
init_notebook_mode(all_interactive=True)

In [None]:
type Event = tuple[datetime, str, int, int | None, int | None, int | None, int | None]


def parse_entry(line: str, lines_iter: Iterator[str]) -> Iterator[Event]:
    # example with 3 log entries
    #
    # 2025-10-08T17:36:08.987-03:00 debug id 207 value 0
    # w: 0 39 36 74 146 301 597 1029 0 0 0 0 0 0 0 0 0
    # s: 66777 65256 2078 918 13016 30650 33489 0 0 0 0 0 0 0 0 0 0
    # 2025-10-08T17:36:09.237-03:00 detected level 2
    # 2025-10-08T17:36:09.237-03:00 debug id 208 value 0
    # w: 0 39 36 74 149 301 597 1029 0 0 0 0 0 0 0 0 0
    # s: 66777 66777 2087 919 13016 30650 33489 0 0 0 0 0 0 0 0 0 0
    words = line.split()

    timestamp, entry_type = datetime.fromisoformat(words[0]), words[1]
    match entry_type:
        case "detected":
            assert words[2] == "level"
            level = int(words[3])

            yield (
                timestamp,
                entry_type,
                level,
                None,
                None,
                None,
                None,
            )

        case "debug":
            assert words[2] == "id"
            idx = int(words[3])

            assert words[4] == "value"
            sample = int(words[5])

            w_header, *ws = next(lines_iter).split()
            s_header, *ss = next(lines_iter).split()

            assert w_header == "w:"
            assert s_header == "s:"
            assert len(ws) == len(ss)

            # basically explode w and s columns
            yield from zip(
                repeat(timestamp),
                repeat(entry_type),
                count(),  # 0, 1, 2...
                repeat(idx),
                repeat(sample),
                [int(w) for w in ws],
                [int(s) for s in ss],
            )

        case _:
            raise Exception("non-conforming log or bug in parser")


def parse_file(file_name: Path) -> Iterator[tuple]:
    """Parse one log file and yield its rows, each prefixed with the file name.

    Args:
        file_name: Path of the log file to read.

    Yields:
        ``(str(file_name), *event)`` for every event produced by
        ``parse_entry``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the generator is exhausted (or closed), instead of being left to the
    garbage collector.
    """
    capture = str(file_name)
    with open(file_name) as log_file:
        # A file object is its own iterator, so parse_entry can pull an
        # entry's continuation lines from the same stream this loop advances.
        for line in log_file:
            for entry in parse_entry(line, log_file):
                yield (capture, *entry)


# Every regular file in results/ except the *.err files is treated as a log.
# iterdir() yields entries in arbitrary order, so the list is sorted to make
# the row order (and everything derived from it, e.g. the order of
# df["capture"].unique()) reproducible across runs and machines.
log_files = sorted(
    f
    for f in Path("results").iterdir()
    if (f.is_file() and not f.name.endswith(".err"))
)

# One row per parsed event; parse_file prepends the file name as "capture".
df = pd.DataFrame(
    chain.from_iterable(parse_file(log_file) for log_file in log_files),
    columns=[
        "capture",
        "timestamp",
        "type",
        "decomposition_level",
        "index",
        "sample",
        "w",
        "s",
    ],
)

dtype_mapping = {
    "capture": "category",
    "type": "category",
    # Use nullable unsigned integers for columns that may contain missing values
    "decomposition_level": "UInt32",
    "index": "UInt32",
    "sample": "UInt32",
    "w": "UInt32",
    "s": "UInt32",
}

# Apply the type conversions
df = df.astype(dtype_mapping)

In [None]:
# Boolean mask selecting the rows that come from "debug" log entries.
df_debug = df["type"] == "debug"

# Per capture: sum of `sample`, counting each distinct
# (capture, index, sample) combination once.
(
    df.loc[df_debug]
    .drop_duplicates(subset=["capture", "index", "sample"])
    .groupby("capture", observed=True)["sample"]
    .sum()
)

In [None]:
# Coefficients of the drop test below: a row is flagged when
# beta * (s of the previous row in its group) > alpha * (its own s).
alpha, beta = 3, 2

# Order the rows so that, inside one (capture, index) group, consecutive rows
# are consecutive decomposition levels.
df = df.sort_values(by=["capture", "index", "decomposition_level"])

# `s` of the preceding row within the same (capture, index) group; missing for
# the first row of every group.
prev_s = df.groupby(["capture", "index"], observed=True)["s"].shift(1)

df["drop"] = (prev_s * beta) > (df["s"] * alpha)

In [None]:
captures = df["capture"].unique()

# Slider options per capture.  Every log entry contributes many rows with the
# same timestamp, so the values are de-duplicated (one slider position per
# distinct timestamp) and sorted chronologically.
timestamps = {
    capture: (
        df.loc[df["capture"] == capture, "timestamp"]
        .drop_duplicates()
        .sort_values()
        .tolist()
    )
    for capture in captures
}

# The dropdown starts on the first capture, so the slider starts with that
# capture's timestamps.
initial_capture = captures[0]
initial_timestamps = timestamps[initial_capture]

capture_dropdown = widgets.Dropdown(options=captures, description="Capture:")

timestamp_slider = widgets.SelectionSlider(
    options=initial_timestamps, description="Timestamp", layout={"width": "800px"}
)


def update_timestamp_options(change):
    """Load the newly selected capture's timestamps into the slider.

    ``change`` is the change notification delivered by ``observe``; its
    ``"new"`` entry is the capture now selected in the dropdown.
    """
    timestamp_slider.options = timestamps[change["new"]]


# Re-populate the slider whenever the dropdown's value changes.
capture_dropdown.observe(update_timestamp_options, names="value")


def update_plot(capture, timestamp):
    """Plot `s` against `decomposition_level` for one capture at one timestamp.

    The rows are drawn as a line, with a dot per row coloured by the "drop"
    column.
    """
    selection = (df["capture"] == capture) & (df["timestamp"] == timestamp)

    plot = so.Plot(df[selection], "decomposition_level", "s")
    plot = plot.add(so.Line())
    plot = plot.add(so.Dot(), color="drop")
    plot = plot.scale(color=["red", "blue"])
    display(plot)


# Bind the dropdown and the slider to update_plot; as the last expression of
# the cell, the resulting widget is what gets rendered.
widgets.interactive(update_plot, capture=capture_dropdown, timestamp=timestamp_slider)

In [None]:
# Mask of the rows whose "index" equals the largest "index" of their capture,
# i.e. the last debug entry of each capture.  Computed once and reused below
# instead of being rebuilt on every loop iteration.
is_last_entry = df["index"] == df.groupby("capture", observed=True)[
    "index"
].transform("max")

# Overview: the last entry of every capture in one figure.  display() is
# required because only the last expression of a cell is rendered
# automatically, and this cell ends with a loop.
display(
    so.Plot(
        df[is_last_entry],
        "decomposition_level",
        "s",
        color="capture",
    ).add(so.Line())
)


# One figure per capture, shown inline and saved as PDF.
for capture in df["capture"].unique():
    filtered = df[(df["capture"] == capture) & is_last_entry]
    p = (
        so.Plot(filtered, "decomposition_level", "s")
        .add(so.Line())
        .add(so.Dot(), color="drop")
        .scale(color=["red", "blue"])
        .label(title=capture)
    )
    display(p)

    pdf_name = f"../plots/{capture}.pdf"
    # `capture` is the log file's path as a string and may contain directory
    # components, so make sure the target directory exists before saving.
    Path(pdf_name).parent.mkdir(parents=True, exist_ok=True)
    p.save(pdf_name, format="pdf")