In [None]:
import re
from datetime import datetime
from itertools import chain, count, islice, repeat
from pathlib import Path
from typing import Iterator

import ipywidgets as widgets
import matplotlib.pyplot as plt
import pandas as pd
import seaborn.objects as so
from itables import init_notebook_mode

# Switch itables on for the whole notebook; all_interactive=True asks itables to
# render every displayed DataFrame/Series as an interactive table.
init_notebook_mode(all_interactive=True)

In [None]:
type Event = tuple[str, str, str, str | None, str | None, str | None, str | None]


def parse_entry(line: str, lines_iter: Iterator[str]) -> Iterator[Event]:
    # example with 3 log entries
    #
    # 2025-10-08T17:36:08.987-03:00 debug id 207 value 0
    # w: 0 39 36 74 146 301 597 1029 0 0 0 0 0 0 0 0 0
    # s: 66777 65256 2078 918 13016 30650 33489 0 0 0 0 0 0 0 0 0 0
    # 2025-10-08T17:36:09.237-03:00 detected level 2
    # 2025-10-08T17:36:09.237-03:00 debug id 208 value 0
    # w: 0 39 36 74 149 301 597 1029 0 0 0 0 0 0 0 0 0
    # s: 66777 66777 2087 919 13016 30650 33489 0 0 0 0 0 0 0 0 0 0
    words = line.split()

    timestamp, entry_type = words[0], words[1]
    match entry_type:
        case "detected":
            assert words[2] == "level"
            level = words[3]

            yield (
                timestamp,
                entry_type,
                level,
                None,
                None,
                None,
                None,
            )

        case "debug":
            assert words[2] == "id"
            idx = words[3]

            assert words[4] == "value"
            sample = words[5]

            w_header, *ws = next(lines_iter).split()
            s_header, *ss = next(lines_iter).split()

            assert w_header == "w:"
            assert s_header == "s:"
            assert len(ws) == len(ss)

            # basically explode w and s columns
            yield from zip(
                repeat(timestamp),
                repeat(entry_type),
                map(str, count()),  # 0, 1, 2...
                repeat(idx),
                repeat(sample),
                ws,
                ss,
            )

        case _:
            raise Exception("non-conforming log or bug in parser")


def parse_file(file_name: Path):
    interval_ns = re.search(r"-t(\d+)", str(file_name)).group(1)

    lines_iter = iter(open(file_name))
    while (line := next(lines_iter, None)) is not None:
        yield from [
            (str(file_name), interval_ns, *entry)
            for entry in parse_entry(line, lines_iter)
        ]


# Every regular file directly under results/ is a capture log, except the
# *.err companions.
log_files = list(
    filter(
        lambda path: path.is_file() and not path.name.endswith(".err"),
        Path("results").iterdir(),
    )
)

# One row per parsed event; all fields are still raw strings (or None) here.
df = pd.DataFrame(
    chain.from_iterable(map(parse_file, log_files)),
    columns=[
        "capture",
        "interval_ns",
        "timestamp",
        "type",
        "decomposition_level",
        "index",
        "sample",
        "w",
        "s",
    ],
)

dtype_mapping = {
    "capture": "category",
    "interval_ns": "uint64",
    "type": "category",
    "decomposition_level": "uint16",
    # These four are None on "detected" rows, hence the nullable unsigned dtype
    **dict.fromkeys(("index", "sample", "w", "s"), "UInt64"),
}

# Convert the string columns to their real dtypes, then parse the timestamps
df = df.astype(dtype_mapping)
df["timestamp"] = pd.to_datetime(df["timestamp"], format="ISO8601")

In [None]:
# Boolean mask selecting the "debug" rows
df_debug = df["type"] == "debug"

# Keep one row per distinct (capture, index, sample) and total the sample
# values per capture.
(
    df.loc[df_debug]
    .drop_duplicates(subset=["capture", "index", "sample"])
    .groupby("capture", observed=True)["sample"]
    .sum()
)

In [None]:
alpha, beta = 3, 2

# Order rows so that levels ascend inside each (capture, index) group
df = df.sort_values(by=["capture", "index", "decomposition_level"])

# s of the preceding row within the same (capture, index) group
prev_s = df.groupby(["capture", "index"], observed=True)["s"].shift()

# Flag rows where beta * previous s exceeds 2 * alpha * current s
df["drop"] = beta * prev_s > 2 * alpha * df["s"]

In [None]:
captures = df["capture"].unique()

# Distinct timestamps of each capture, in chronological order. The raw column
# repeats a timestamp once per exploded row, which would give the slider many
# identical stops per entry and order them by the frame's current sort instead
# of by time.
timestamps = {
    capture: df.loc[df["capture"] == capture, "timestamp"]
    .drop_duplicates()
    .sort_values()
    for capture in captures
}

initial_capture = captures[0]
initial_timestamps = timestamps[initial_capture]

capture_dropdown = widgets.Dropdown(options=captures, description="Capture:")

timestamp_slider = widgets.SelectionSlider(
    options=initial_timestamps, description="Timestamp", layout={"width": "800px"}
)


def update_timestamp_options(change):
    """Point the timestamp slider at the timestamps of the newly chosen capture.

    ``change`` is the observer payload; only its ``"new"`` entry is read.
    """
    timestamp_slider.options = timestamps[change["new"]]


# Re-run the callback whenever the dropdown's value changes
capture_dropdown.observe(update_timestamp_options, names="value")


def update_plot(capture, timestamp):
    """Display s against decomposition level for one capture at one timestamp.

    The rows are drawn as a line, with dots coloured by the "drop" column.
    """
    in_capture = df["capture"] == capture
    at_timestamp = df["timestamp"] == timestamp
    filtered_data = df[in_capture & at_timestamp]

    plot = so.Plot(filtered_data, x="decomposition_level", y="s")
    plot = plot.add(so.Line()).add(so.Dot(), color="drop")
    plot = plot.scale(color=["red", "blue"])
    display(plot)


# Bind the two widgets to update_plot's arguments and show the interactive UI
widgets.interactive(update_plot, capture=capture_dropdown, timestamp=timestamp_slider)

In [None]:
# Rows belonging to the highest "index" of each capture. Computed once: it does
# not depend on the loop variable below.
is_last_entry = df["index"] == df.groupby("capture", observed=True)["index"].transform(
    "max"
)

# Overview: one line per capture. A bare expression is only rendered when it is
# the last statement of a cell, so this plot needs an explicit display().
display(
    so.Plot(
        df[is_last_entry],
        "decomposition_level",
        "s",
        color="capture",
    ).add(so.Line())
)

# Detail: one figure per capture, dots coloured by the "drop" column
for capture in df["capture"].unique():
    filtered = df[(df["capture"] == capture) & is_last_entry]
    p = (
        so.Plot(filtered, "decomposition_level", "s")
        .add(so.Line())
        .add(so.Dot(), color="drop")
        .scale(color=["red", "blue"])
        .label(title=capture)
    )
    display(p)
# p.save(f"./plots/{capture}.pdf", format="pdf")

In [None]:
import numpy as np
import numpy.ma as ma


def level_to_period(interval):
    """Return a function mapping a level ``x`` to ``2**x * interval``."""

    def to_period(x):
        return np.exp2(x) * interval

    return to_period


def period_to_level(interval):
    """Return a function mapping a period ``x`` to ``log2(x / interval)``.

    The masked-array log2 is used, so the result is a numpy.ma value.
    """

    def to_level(x):
        ratio = x / interval
        return ma.log2(ratio)

    return to_level


def ns_to_s(ns, pos):
    """Convert nanoseconds to seconds.

    ``pos`` is accepted but unused; the two-argument shape lets the function be
    handed to matplotlib as a tick formatter.
    """
    nanoseconds_per_second = 10**9
    return ns / nanoseconds_per_second

In [None]:
import math

import matplotlib.ticker as ticker
import numpy as np
import scienceplots

# Publication-style figures: for each selected capture, plot s against the
# decomposition level of its highest-index entry and save the figure as a PDF.
# The non-default style names below presumably come from the scienceplots
# import above -- TODO confirm.
plt.style.use(["default", "science", "grid", "ieee"])

# Per-row maximum "index" of the row's capture, aligned with df so it can be
# compared element-wise inside the loop.
max_index_series = df.groupby("capture", observed=True)["index"].transform("max")

captures = df["capture"].unique()

# Substrings identifying the captures to plot; commented-out entries are skipped.
selected_captures = [
    # 'cobalt-strike-filtered-trimmed_-t250000000',
    # 'heartbleed-full_-t250000000',
    "trickbot-a-filtered-trimmed-further_-t60000000",
    "trickbot-b-filtered-trimmed-further_-t60000000",
]
# Keep captures whose name contains any selected substring. Note this rebinds
# `captures` to a one-shot filter iterator: it is exhausted after the loop.
captures = filter(lambda x: any(s in x for s in selected_captures), captures)

for capture in captures:
    # Rows of this capture's highest-index entry, limited to levels 0..13.
    # NOTE(review): the cutoff 13 is unexplained in this file -- confirm intent.
    filtered = df[
        (df["capture"] == capture)
        & (df["index"] == max_index_series)
        & (df["decomposition_level"] <= 13)
    ]
    # Taken from the first matching row; raises IndexError if the filter above
    # matched nothing.
    interval = filtered["interval_ns"].iloc[0]

    fig, ax = plt.subplots()

    ax.set_yscale("symlog")

    ax.plot(filtered["decomposition_level"], filtered["s"])
    # colors = filtered["drop"].fillna(False).map({False: "blue", True: "red"}).tolist()
    # ax.scatter(filtered["decomposition_level"], filtered["s"], c=colors)
    ax.xaxis.minorticks_off()

    # One major tick per integer decomposition level
    ax.xaxis.set_major_locator(ticker.MultipleLocator(base=1))
    # ax.set_title(capture)
    ax.set_xlabel("Decomposition Level")
    ax.set_ylabel("S value")

    # Secondary axis at the top showing period in seconds for each decomposition level
    secax = ax.secondary_xaxis(
        "top", functions=(level_to_period(interval), period_to_level(interval))
    )
    secax.tick_params("x", rotation=90)
    secax.set_xlabel("Period (s)")
    secax.xaxis.minorticks_off()
    # Place the top ticks at the periods matching the bottom axis' level ticks;
    # tick values are in nanoseconds and ns_to_s turns them into second labels.
    secax.set_xticks(level_to_period(interval)(ax.get_xticks()), minor=False)
    secax.xaxis.set_major_formatter(ns_to_s)

    # plt.show()
    # NOTE(review): only a literal "results/" prefix is stripped; with any other
    # directory or path separator the capture's directory part stays in the
    # output path -- confirm this is acceptable.
    fig.savefig(f"{capture.removeprefix('results/')}.pdf")