In [1]:
# End-to-end time-series plotting with Plotly (no resampling, no label restrictions).
import os
import pandas as pd
import numpy as np
from pandas.api.types import is_datetime64_any_dtype, is_datetime64tz_dtype
import plotly.express as px

try:
    from caas_jupyter_tools import display_dataframe_to_user
except Exception:
    # The notebook helper is unavailable here; fall back to a plain-text preview.
    def display_dataframe_to_user(name, dataframe):
        """Print a bracketed title followed by the first rows of ``dataframe``."""
        print(f"[{name}]")
        print(dataframe.head())

# ---- Config ----
# Path to the input CSV. The load step tries it as given, then the same
# relative path under /mnt/data; if neither exists, demo data is generated.
CSV_PATH = "archive/Payload_data_UNSW.csv"

# ---- Helpers (patched) ----
def _guess_time_series(s: pd.Series) -> pd.Series:
    """Return pandas datetime series from a wide range of possible time formats."""
    if is_datetime64_any_dtype(s.dtype) or is_datetime64tz_dtype(s.dtype):
        return pd.to_datetime(s, utc=True, errors="coerce")
    dt = pd.to_datetime(s, utc=True, errors="coerce", infer_datetime_format=True)
    if dt.notna().mean() > 0.6:
        return dt
    sn = pd.to_numeric(s, errors="coerce")
    if sn.notna().sum() == 0:
        return dt
    q = sn.quantile(0.5)
    if q > 1e12:
        return pd.to_datetime(sn, unit="ns", utc=True, errors="coerce")
    elif q > 1e10:
        return pd.to_datetime(sn, unit="ms", utc=True, errors="coerce")
    elif q > 1e8:
        return pd.to_datetime(sn, unit="s", utc=True, errors="coerce")
    else:
        return pd.to_datetime(sn, unit="s", utc=True, errors="coerce")

def _find_time_column(df: pd.DataFrame) -> str | None:
    """Heuristic name-based search for a time column."""
    candidates = ["time","timestamp","ts","datetime","date","flow_start","start_time","start"]
    cols_map = {str(c).lower(): c for c in df.columns}
    for k in candidates:
        if k in cols_map:
            return cols_map[k]
    for c in df.columns:
        if "time" in str(c).lower():
            return c
    return None

def _auto_detect_time_column(df: pd.DataFrame) -> tuple[str, pd.Series] | tuple[None, None]:
    """
    Scan every column for the one that best parses as datetimes.

    Each column is run through ``_guess_time_series``; the column with the
    highest share of non-null parsed values wins, provided at least half of
    its entries parse. On ties the earliest column is kept. Columns whose
    parsing raises are skipped.

    Returns ``(column, parsed_series)``, or ``(None, None)`` if no column qualifies.
    """
    top_coverage = -1.0
    chosen_col, chosen_dt = None, None

    for col in df.columns:
        try:
            parsed = _guess_time_series(df[col])
            coverage = parsed.notna().mean()
        except Exception:
            continue
        # Require at least 50% parse coverage and a strict improvement.
        if coverage >= 0.5 and coverage > top_coverage:
            top_coverage = coverage
            chosen_col, chosen_dt = col, parsed

    if chosen_col is None:
        return None, None
    return chosen_col, chosen_dt

# ---- Load ----
# Prefer the configured path; otherwise try the same relative path under /mnt/data.
if not os.path.exists(CSV_PATH):
    alt = os.path.join("/mnt/data", CSV_PATH)
    if os.path.exists(alt):
        CSV_PATH = alt

if os.path.exists(CSV_PATH):
    df = pd.read_csv(CSV_PATH)
else:
    # No input file: build a reproducible demo set (7 days, 1-min cadence,
    # ten labels) so the rest of the script still runs end to end.
    n_demo = 7 * 24 * 60
    rng = np.random.default_rng(0)
    # Timestamp.now(tz="UTC") replaces the deprecated Timestamp.utcnow().
    t0 = pd.Timestamp.now(tz="UTC").floor("min") - pd.Timedelta(days=7)
    labels = ["normal","generic","exploits","fuzzers","reconnaissance","dos","backdoor","analysis","shellcode","worms"]
    # Sampling weight for each entry of `labels`, in the same order (sums to 1).
    label_probs = np.array([0.45,0.15,0.06,0.06,0.08,0.08,0.03,0.03,0.03,0.03])
    demo = pd.DataFrame({
        # "min" replaces the deprecated "T" frequency alias.
        "time": pd.date_range(t0, periods=n_demo, freq="min"),
        "label_str": rng.choice(labels, size=n_demo, p=label_probs),
        "record_volume": rng.poisson(lam=5, size=n_demo),
    })
    df = demo

# ---- Validate / enrich label ----
# Guarantee a string "label_str" column: derive it from "label" when present,
# otherwise tag every row as "unknown".
if "label_str" not in df.columns:
    if "label" in df.columns:
        raw_label = df["label"]
        # Replace missing values BEFORE casting to str; casting first turns
        # NaN into the literal "nan" and leaves nothing for the fill to match.
        df["label_str"] = (
            raw_label.astype(object)
                     .where(raw_label.notna(), "unknown")
                     .astype(str)
        )
    else:
        df["label_str"] = "unknown"

# ---- Detect/parse time (no name restrictions; try auto-detect when needed) ----
# Step 1: look for a time column by name and parse it.
time_col = _find_time_column(df)
dt_series = None
if time_col is not None:
    dt_series = _guess_time_series(df[time_col])

# Step 2: no named column, or nothing in it parsed -> try every column.
if time_col is None or dt_series.notna().sum() == 0:
    auto_col, auto_dt = _auto_detect_time_column(df)
    if auto_col is not None:
        time_col, dt_series = auto_col, auto_dt

# Fallback if still nothing usable
if time_col is None or dt_series is None or dt_series.notna().sum() == 0:
    # Create a synthetic monotonic time axis (one row per second, starting
    # now) to avoid dropping data. Timestamp.now(tz="UTC") and freq="s"
    # replace the deprecated Timestamp.utcnow() and "S" alias.
    time_col = "time"
    dt_series = pd.date_range(pd.Timestamp.now(tz="UTC").floor("min"), periods=len(df), freq="s")

# Bind parsed time and drop nulls
df = df.assign(__dt=dt_series).dropna(subset=["__dt"])
df = df.sort_values("__dt")  # ensure ordered time

# ---- Build wide frame without resampling (no time/label restrictions) ----
# Use "record_volume" as the metric when the data has it; otherwise count rows.
metric_col = "record_volume" if "record_volume" in df.columns else None

_by_time_and_label = df.groupby(["__dt", "label_str"], as_index=False)
if metric_col:
    # Sum the metric over rows sharing a timestamp/label (no downsampling).
    _long = _by_time_and_label[metric_col].sum()
    _value_col = metric_col
else:
    # Count events per timestamp/label; the count lands in a "size" column.
    _long = _by_time_and_label.size()
    _value_col = "size"

# One row per timestamp, one column per label; absent combinations become 0.
wide = _long.pivot(index="__dt", columns="label_str", values=_value_col)
wide = wide.sort_index()
wide = wide.fillna(0.0)

# ---- Show table (all rows; may be large) ----
# Best-effort preview: a display failure must not stop the plot below, but
# the reason is reported instead of being silently discarded.
try:
    display_dataframe_to_user(
        "Time x label_str (no resampling)",
        wide.reset_index().rename(columns={"__dt": "time"})
    )
except Exception as exc:
    print(f"Could not display table preview: {exc!r}")

# ---- Plot (Plotly + WebGL, rangeslider) ----
# Reshape the wide table to long form (one row per time/label pair), which is
# the layout px.line needs to draw one trace per label.
plot_df = (
    wide.reset_index()
        .rename(columns={"__dt": "time"})
        .melt(id_vars="time", var_name="label_str", value_name="value")
)

# Keep the axis label and the title in step with what was aggregated:
# summed "record_volume" when available, otherwise plain row counts.
y_title = "Record volume" if metric_col else "Record count"
title_metric = "Record Volume" if metric_col else "Record Count"

fig = px.line(
    plot_df,
    x="time",
    y="value",
    color="label_str",
    title=f"{title_metric} over Time by label_str (no resampling)",
    render_mode="webgl"  # enables Scattergl for large data
)

fig.update_layout(
    hovermode="x unified",
    legend_title_text="label_str",
    margin=dict(l=40, r=20, t=60, b=40),
)

fig.update_xaxes(
    title_text="Time (UTC)",
    rangeslider_visible=True,
    # Quick zoom presets, each measured backward from the end of the visible range.
    rangeselector=dict(
        buttons=[
            dict(count=1, label="1h", step="hour", stepmode="backward"),
            dict(count=6, label="6h", step="hour", stepmode="backward"),
            dict(count=1, label="1d", step="day", stepmode="backward"),
            dict(count=7, label="7d", step="day", stepmode="backward"),
            dict(step="all", label="All"),
        ]
    )
)
fig.update_yaxes(title_text=y_title)

fig.show()


  "time": pd.date_range(t0, periods=7*24*60, freq="T"),


[Time x label_str (no resampling)]
label_str                      time  analysis  backdoor  dos  exploits  \
0         2025-08-19 13:45:00+00:00       0.0       0.0  0.0       4.0   
1         2025-08-19 13:46:00+00:00       0.0       0.0  0.0       0.0   
2         2025-08-19 13:47:00+00:00       0.0       0.0  0.0       0.0   
3         2025-08-19 13:48:00+00:00       0.0       0.0  0.0       0.0   
4         2025-08-19 13:49:00+00:00       0.0       0.0  4.0       0.0   

label_str  fuzzers  generic  normal  reconnaissance  shellcode  worms  
0              0.0      0.0     0.0             0.0        0.0    0.0  
1              0.0      0.0     6.0             0.0        0.0    0.0  
2              0.0      0.0     7.0             0.0        0.0    0.0  
3              0.0      0.0     4.0             0.0        0.0    0.0  
4              0.0      0.0     0.0             0.0        0.0    0.0  
