In [None]:
from sensession.campaign import CampaignProcessor

import polars as pl
import hvplot.polars  # noqa: F401

import matplotlib.pyplot as plt
from IPython.display import display

# Data loading

## Zeroth preprocessing: "Cleaning"

The first steps only involve discarding invalid subcarriers and dropping values irrelevant for this notebook to save some memory.

In [None]:
# One boolean filter expression per receiver device in the campaign.
# NOTE(review): not referenced by the cells visible here; kept in case other
# cells rely on it.
all_receivers_expr = [
    pl.col("receiver_name") == receiver
    for receiver in (
        "asus1",
        "asus2",
        "qca",
        "ESP1",
        "ESP2",
        "ax210",
        "iwl5300",
        "x310",
    )
]

# CSI samples and the per-session metadata of the "min" data set.
csi = pl.read_parquet("../../data/min/csi.parquet")
meta = pl.read_parquet("../../data/min/meta.parquet")

# Derive a session number from the modified subcarrier index and repetition.
# NOTE(review): the +31 offset presumably shifts modified_idx into the
# non-negative range before the UInt32 cast; the factor 10 then assumes fewer
# than ten repetitions per index - confirm both.
meta = meta.with_columns(
    session_nr=(pl.col("modified_idx") + 31).cast(pl.UInt32) * 10 + pl.col("rep_nr")
)
proc = CampaignProcessor(
    csi,
    meta,
    meta_attach_cols={"modified_idx", "session_nr"},
    lazy=False,
)

# tbl_cols=-1 lifts the column limit so the whole metadata schema is visible.
with pl.Config(tbl_cols=-1):
    display(proc.meta.head())
    # display(proc.csi.head())

## Raw Amplitudes

Next, we take a look at the raw amplitudes during warmup sessions.

Warmup sessions consist of repeatedly ($500$ times) retransmitting the exact same unmodified WiFi frame, i.e. where we expect flat CSI.

In [None]:
# Columns dropped after the RSSI/AGC correction because the rest of this
# notebook does not read them.
redundant_cols = [
    "timestamp",
    proc.capture_index,
    proc.antenna_index,
    "stream_idxs",
    "antenna_rssi",
    "rssi",
]

# Cleaning pipeline: AGC-based RSSI correction, unwrap, column pruning, then
# removal of guard and DC subcarriers.
proc2 = proc.correct_rssi_by_agc().unwrap().drop(*redundant_cols)
proc2 = proc2.remove_guard_subcarriers().remove_dc_subcarrier()

# Amplitude per subcarrier for all warmup sessions, one panel per
# receiver/antenna combination.
warmup_raw = proc2.csi.filter(pl.col("collection_name").str.contains("warmup"))
warmup_raw.hvplot.scatter(
    groupby=["receiver_name", "antenna_idxs"],
    by="collection_name",
    x="subcarrier_idxs",
    y="csi_abs",
)

## Correlation

We next check how the CSI magnitudes of the warmup sessions are correlated. This means that, for each subcarrier $i$, we interpret the corresponding CSI Amplitude $A_i$ as a random variable. We then compute the Pearson Correlation Coefficients between all of them:

$$
\begin{align}
\rho_{i, j} = \frac{
        \mathbb{E}\left[(A_i - \mu_i)(A_j - \mu_j)\right]
    }{
        \sigma_i \sigma_j
    }
\end{align}
$$

where $\mu_i$ and $\sigma_i$ are the mean value and standard deviation of $A_i$, respectively.

The idea is to find out how the "natural fluctuations" are correlated. Does an increase in amplitude in subcarrier $i$ imply an increase in amplitude in subcarrier $j$?

In [None]:
def plot_corr(df: pl.DataFrame):
    """Plot per-receiver heatmaps of subcarrier amplitude correlations (warmup only).

    Each subcarrier's ``csi_abs`` is treated as a random variable with one
    sample per capture. For every receiver separately, the correlation matrix
    between all subcarrier pairs is computed and shown as a heatmap.

    NOTE: Only antenna 0 is used, and the pivot assumes a single stream, i.e.
    at most one ``csi_abs`` value per (capture, receiver, subcarrier).

    Args:
        df: Long-format CSI frame containing the columns ``collection_name``,
            ``antenna_idxs``, ``stream_capture_num``, ``receiver_name``,
            ``subcarrier_idxs`` and ``csi_abs``.

    Returns:
        The hvplot heatmap of C_ij(r), selectable by receiver r.
    """
    warmup = df.filter(
        pl.col("collection_name").str.contains("warmup")
        & (pl.col("antenna_idxs") == 0)
    )

    # Pivot to wide format: one row per capture, one column per subcarrier.
    # receiver_name is carried through the pivot index so the label comes from
    # `df` itself; no join against notebook-global state is needed, and rows
    # are not duplicated by joining on a non-unique key.
    wide = warmup.pivot(
        index=["stream_capture_num", "receiver_name"],
        on="subcarrier_idxs",
        values="csi_abs",
    ).drop("stream_capture_num")

    def _receiver_corr(group: pl.DataFrame) -> pl.DataFrame:
        # receiver_name must not take part in the correlation itself; it is
        # re-attached afterwards together with the row index `i` of C_ij.
        amplitudes = group.drop("receiver_name")
        return amplitudes.corr().with_columns(
            receiver_name=pl.lit(group["receiver_name"][0]),
            subcarrier_idx_a=pl.Series(amplitudes.columns),
        )

    # Correlation between all subcarrier pairs, for every receiver separately.
    # DataFrame.corr() is used because pl.corr() only handles two variables.
    # NaN entries (e.g. from constant columns) are replaced by 0.
    corr = wide.group_by("receiver_name").map_groups(_receiver_corr).fill_nan(0)

    # Unpivot to long format with columns: r | i | j | C_ij(r)
    corr = corr.unpivot(
        index=["receiver_name", "subcarrier_idx_a"],
        variable_name="subcarrier_idx_b",
        value_name="correlation",
    )

    return corr.hvplot.heatmap(
        x="subcarrier_idx_a",
        y="subcarrier_idx_b",
        C="correlation",
        groupby="receiver_name",
        colormap="rainbow",
    )


# Correlation heatmaps for the cleaned, not yet scaled warmup amplitudes (proc2).
plot_corr(proc2.csi)

## First preprocessing: Scaling and detrending

Next, we normalize every CSI packet amplitude individually, using

$$
\begin{align}
A_i \mapsto \frac{A_i}{\sum_j A_j}
\end{align}
$$

The sum across subcarriers is a voltage quantity, hence this normalization is akin to ignoring voltage-scaling.

We also normalize phases $\varphi_i$ by fixing the outermost subcarrier phases to zero and applying a phase correction in between.
Assume a symmetric subcarrier presence from $-K$ to $K$, then:

$$
\begin{align}
\varphi_i \mapsto \varphi_i - (i+K) \cdot \frac{\varphi_K - \varphi_{-K}}{2K} - \varphi_{-K}
\end{align}
$$

We investigate both amplitudes, a few phases, as well as the correlation from above after these steps.

In [None]:
# Subcarriers more than one index away from the modified subcarrier; passed to
# scale_magnitude() as its `exclude_expr`.
# NOTE(review): how scale_magnitude() uses the excluded rows is not visible
# here - confirm against CampaignProcessor.
sc_idx = pl.col("subcarrier_idxs")
mod_idx = pl.col("modified_idx")
exclude_expr = (sc_idx < (mod_idx - 1)) | (sc_idx > (mod_idx + 1))

display(proc2.csi)
proc3 = proc2.scale_magnitude(exclude_expr=exclude_expr)
proc3 = proc3.detrend_phase()

# Warmup amplitudes after scaling, one panel per receiver/antenna combination.
warmup_scaled = proc3.csi.filter(pl.col("collection_name").str.contains("warmup"))
warmup_scaled.hvplot.scatter(
    groupby=["receiver_name", "antenna_idxs"],
    by="collection_name",
    x="subcarrier_idxs",
    y="csi_abs",
)

In [None]:
# Phases of the warmup frames with sequence numbers 90..99 (ten frames per
# warmup session).
# NOTE(review): this reads from `proc`, the processor created at load time,
# not from the detrended `proc3` - confirm which one is intended here.
proc.csi.filter(
    pl.col("sequence_number").is_in(range(90, 100))
    & pl.col("collection_name").str.contains("warmup")
).hvplot.scatter(
    groupby=["receiver_name", "antenna_idxs"],
    by="collection_name",
    x="subcarrier_idxs",
    y="csi_phase",
)

In [None]:
# Phases of the frame with sequence number 999 in every non-warmup session.
# NOTE(review): reads from `proc`, not from the detrended `proc3` - confirm.
proc.csi.filter(
    (pl.col("sequence_number") == 999)
    & ~pl.col("collection_name").str.contains("warmup")
).hvplot.scatter(
    groupby=["receiver_name", "antenna_idxs"],
    by="collection_name",
    x="subcarrier_idxs",
    y="csi_phase",
)

In [None]:
import numpy as np

# Inspect the unwrapped phase of one modified (non-warmup) frame received by
# "asus1".
# NOTE(review): relies on `csi` / `meta` being the frames loaded at the top of
# the notebook; a later cell rebinds both names, so run cells in order.
test = (
    csi.join(meta, on="meta_id")
    .filter(pl.col("receiver_name") == "asus1")
    .filter(~pl.col("collection_name").str.contains("warmup"))
)
# maintain_order=True keeps the sessions in order of first appearance. Without
# it the group order is arbitrary and the `[-1]` below would pick a different
# session from run to run.
test = test.group_by("meta_id", maintain_order=True).agg("csi_phase")

phases = np.array(test.get_column("csi_phase").to_list())

# Last session, first entry of that session.
# NOTE(review): assumes all sessions hold equally many, equally shaped phase
# entries (otherwise the array is ragged) - confirm.
phases = phases.squeeze()[-1][0]
# phases[19] -= np.pi
# phases = np.unwrap(phases)
phases = np.unwrap(phases, period=np.pi)

# Trailing semicolon suppresses the Line2D repr in the cell output.
plt.plot(phases);

In [None]:
# Same correlation heatmaps, now for the scaled and phase-detrended data (proc3).
plot_corr(proc3.csi)

## Second Preprocessing: Shape equalization

In every experiment, we first run a warmup session with $M=500$ unmodified frames, followed by the actual run with $1000$ modified frames. From the warmup frames, we compute an average shape profile for magnitudes and phases:

$$
\begin{align}
\bar{\varphi}_i &= \frac{1}{M} \sum_{t=1}^{M} \varphi_i(t) \\
\bar{A}_i &= \frac{1}{M} \sum_{t=1}^{M} A_i(t)
\end{align}
$$

Then, we equalize each subcarrier of each packet individually to "flatten" out the inherent average shape, likely introduced by filters and similar hardware components:

$$
\begin{align}
\varphi_i(t) &\mapsto \frac{\varphi_i(t)}{\bar{\varphi}_i} \\
A_i(t) &\mapsto \frac{A_i(t)}{\bar{A}_i}
\end{align}
$$

In [None]:
# NOTE(review): the equalization step is currently disabled (commented out
# below), so the following plots show `proc` as created at load time.
# proc = (
#     proc3
#     .equalize_magnitude()
#     .equalize_phase()
# )

# All frames of the modified (non-warmup) sessions.
filtered = proc.csi.filter(
    pl.col("collection_name").str.contains("warmup").not_()
)

# Amplitude per subcarrier, one panel per receiver/antenna combination.
filtered.hvplot.scatter(
    groupby=["receiver_name", "antenna_idxs"],
    by="collection_name",
    x="subcarrier_idxs",
    y="csi_abs",
)

In [None]:
# Phase per subcarrier for the non-warmup frames selected in the previous cell.
filtered.hvplot.scatter(
    groupby=["receiver_name", "antenna_idxs"],
    by="collection_name",
    x="subcarrier_idxs",
    y="csi_phase",
)

In [None]:
# Mean amplitude per subcarrier for every session (collection), receiver and
# antenna.
# NOTE(review): no sessions are sub-selected here; all collections are drawn,
# which may make the plot crowded.
mean_abs = proc.csi.group_by(
    ["receiver_name", "antenna_idxs", "collection_name", "subcarrier_idxs"],
    maintain_order=True,
).agg(pl.mean("csi_abs"))

mean_abs.hvplot.line(
    groupby=["receiver_name", "antenna_idxs"],
    by="collection_name",
    x="subcarrier_idxs",
    y="csi_abs",
    ylim=[0, 4.8],
)

In [None]:
# Mean phase per subcarrier for every session (collection), receiver and
# antenna.
mean_phase = proc.csi.group_by(
    ["receiver_name", "antenna_idxs", "collection_name", "subcarrier_idxs"],
    maintain_order=True,
).agg(pl.mean("csi_phase"))

mean_phase.hvplot.line(
    groupby=["receiver_name", "antenna_idxs"],
    by="collection_name",
    x="subcarrier_idxs",
    y="csi_phase",
)

## Scaling detection

Finally, we check how well the per-subcarrier scaling is detected. That is, we plot the average detected scaling after all our normalization steps together with the theoretical scaling that was applied. The applied scaling is drawn with a square marker, the detected one with a cross. Colors encode the different scale values.

In [None]:
import numpy as np

# NOTE(review): `csi` and `meta` rebind the names of the frames loaded at the
# top of the notebook; the following cells read `csi` as defined here.
meta = proc.meta

# Scale-factor increment per sequence number, taken from the first metadata
# row.
# NOTE(review): 1000 is presumably the number of modified frames per session,
# and scale_range is assumed identical for all sessions - confirm.
diff = meta.item(0, "scale_range") / 1000

csi = (
    proc.csi.filter(~pl.col("collection_name").str.contains("warmup"))
    .with_columns(scale_factor=pl.col("sequence_number") * diff)
    .join(proc.meta.select("meta_id", "modified_idx"), on="meta_id")
    # Keep only the subcarrier that was modified in the respective session.
    .filter(pl.col("subcarrier_idxs") == pl.col("modified_idx"))
)

modality = "csi_abs"

# Average the measured value per session, antenna, receiver, modified
# subcarrier and applied scale factor.
mean_csi = csi.group_by(
    [
        "collection_name",
        "antenna_idxs",
        "receiver_name",
        "modified_idx",
        "scale_factor",
    ],
    maintain_order=True,
).agg(pl.col(modality).mean())


def _scale_scatter(y, marker):
    # Both layers share everything except the y column and the marker shape.
    return mean_csi.hvplot.scatter(
        x="modified_idx",
        y=y,
        c="scale_factor",
        groupby=["receiver_name", "antenna_idxs"],
        marker=marker,
        size=10,
        cmap="rainbow",
    )


scat1 = _scale_scatter(modality, "x")  # detected value
scat2 = _scale_scatter("scale_factor", "s")  # applied scale factor
display(scat1 * scat2)

In [None]:
# Squared deviation between the measured amplitude and the applied scale factor.
csi2 = csi.with_columns(err=(pl.col("csi_abs") - pl.col("scale_factor")) ** 2)
# Same frame restricted to sequence numbers up to and including 998.
csi3 = csi2.filter(pl.col("sequence_number") <= 998)

# csi2 = csi2.group_by("receiver_name", "antenna_idxs", maintain_order=True).mean().select("err", "receiver_name")
csi2.hvplot.scatter(
    groupby=["receiver_name", "antenna_idxs"],
    x="scale_factor",
    y="csi_abs",
)

In [None]:
import polars as pl
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, kendalltau
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import r2_score
from sklearn.feature_selection import mutual_info_regression
import pingouin as pg

# Fixed seed for the mutual-information estimate, which is otherwise
# randomized and would change between runs.
MI_RANDOM_STATE = 0

# (record key, pingouin method name) of the robust correlation estimators.
ROBUST_CORR_METHODS = (
    ("pct_bend", "percbend"),
    ("biweight", "bicor"),
    ("shepherd", "shepherd"),
    ("skipped", "skipped"),
)

data = []

# Evaluate all metrics on both variants: csi2 and the truncated csi3.
# The loop variable is deliberately not called `csi`, so the notebook-global
# `csi` used by earlier cells is not overwritten.
for ds_name, ds_csi in [("csi2", csi2), ("csi3", csi3)]:
    ds_csi = ds_csi.filter(~pl.col("csi_abs").is_nan())

    for (receiver_name, modified_idx, antenna_idx), group in ds_csi.group_by(
        ["receiver_name", "modified_idx", "antenna_idxs"], maintain_order=True
    ):
        x = group["scale_factor"].to_numpy().ravel()
        y = group["csi_abs"].to_numpy().ravel()
        x_col = x.reshape(-1, 1)  # 2-D feature matrix for scikit-learn
        x_series, y_series = pd.Series(x), pd.Series(y)

        # ---- rank metrics ----
        rho, _ = spearmanr(x, y)
        tau, _ = kendalltau(x, y)
        # Fisher z-transform; saturates at +inf / -inf for perfect
        # correlation / anti-correlation.
        fisher_z = (
            np.sign(rho) * np.inf
            if abs(rho) == 1
            else 0.5 * np.log((1 + rho) / (1 - rho))
        )
        # -log10(1 - rho), capped at 15. np.maximum propagates NaN, so an
        # undefined rho stays NaN (and is filtered out below) instead of being
        # turned into the best possible score by the builtin max().
        spearman = -np.log10(np.maximum(1e-15, 1 - rho))
        robust = {
            key: pg.corr(x_series, y_series, method=method)["r"].iat[0]
            for key, method in ROBUST_CORR_METHODS
        }

        # ---- model-based metrics ----
        huber = HuberRegressor().fit(x_col, y)
        r2_huber = r2_score(y, huber.predict(x_col))
        mi = mutual_info_regression(
            x_col, y, discrete_features=False, random_state=MI_RANDOM_STATE
        )[0]

        # collect into long form
        metrics = {
            "spearman": spearman,
            "mutual": mi,
            **robust,
            "fisher_z": fisher_z,
            "huber": r2_huber,
            "kendall": tau,
        }
        for kind, val in metrics.items():
            data.append(
                {
                    "dataset": ds_name,
                    "receiver_name": receiver_name,
                    # Part of the group key; recorded so that rows of the same
                    # receiver/antenna remain distinguishable.
                    "modified_idx": modified_idx,
                    "antenna_idxs": antenna_idx,
                    "kind": kind,
                    "value": val,
                }
            )

# Long-format result table; undefined (NaN) metric values are dropped.
df = pl.DataFrame(data).filter(~pl.col("value").is_nan())

# Display order of the receivers; receivers not listed here are sorted last.
order = ["x310", "qca", "ax210", "iwl5300", "asus1", "asus2", "ESP1", "ESP2"]
order_df = pl.DataFrame(
    {
        "receiver_name": order,
        "order_rank": list(range(len(order))),
    }
)

df_sorted = (
    df.join(order_df, on="receiver_name", how="left")
    .with_columns(
        pl.col("order_rank").fill_null(len(order)),
        pl.col("antenna_idxs").cast(pl.UInt8),
    )
    .sort("order_rank")
    .drop("order_rank")
)

In [None]:
# Number of distinct metrics in the long-format table.
# NOTE(review): currently unused - the layout below is fixed to two columns.
n_kinds = df_sorted["kind"].n_unique()

# One subplot per (kind, dataset) combination, wrapped into two columns;
# points are colored by antenna index.
df_sorted.hvplot.scatter(
    by=["kind", "dataset"],
    subplots=True,
    x="receiver_name",
    y="value",
    color="antenna_idxs",
    cmap="Dark2",
).cols(2)