In [None]:
# Imports
import numpy as np
import polars as pl
from datetime import timedelta

# Loading the data

First load and inspect the data

**NOTE:** I removed the antenna dependency for now, since we are only considering data captures with one antenna. We should probably have multiple-antenna captures as well, though. In the simplest case, the code should still work, since we can flatten the CSI values (instead of $54$ values per packet we would have $n \cdot 54$, with $n$ being the number of antennas).

**NOTE2:** I have not removed the phases, since we are really interested in them. So far, however, the curves are only used to scale the amplitudes, since phases are a bit more complex. We should not forget that they should be part of the equation, though.

**Question:** If we use multiple antennas, which provide very important information about the received radiation (e.g. directivity), would we also change the network architectures in some way to target that, similar to how networks sometimes have separate layers for the R, G and B values of an image? We should keep that in mind.

In [None]:
# Example parameters for this notebook: paths of the two parquet files that
# are read below (relative paths, resolved against the working directory).
curves_file = "../data/random_dyncurves_30db/curves.parquet"
exp_db_file = "../data/random_dyncurves_30db/preprocessed.parquet"

# Read both parquet files eagerly into polars data frames. `curve_df` is the
# source of the "curve" column and `csi_df` of the per-packet CSI columns
# used further down in this notebook.
curve_df = pl.read_parquet(curves_file)
csi_df = pl.read_parquet(exp_db_file)


def glimpse(df: pl.DataFrame, num_vals: int = 5):
    """
    Print a short banner followed by the first ``num_vals`` rows of ``df``.

    The table is rendered with a temporary polars display configuration so
    that the settings do not leak into the rest of the notebook.
    """
    separator = "-------------------------------------------------------------\n"
    headline = f"A glimpse of the first {num_vals} entries in the data frame:"
    print(separator + headline)

    # The Config context manager restores the previous settings on exit.
    with pl.Config() as table_cfg:
        # A negative column count tells polars to show every column.
        table_cfg.set_tbl_cols(-1)
        table_cfg.set_tbl_width_chars(140)
        preview = df.limit(num_vals)
        print(preview)
        # Blank line to separate consecutive glimpses.
        print()


# Print the first two rows of each frame as a quick look at their columns.
glimpse(curve_df, 2)
glimpse(csi_df, 2)

# Note on dimensions

We now always have $700$ values for every session, some of which will be null. To illustrate this, I searched for one such entry by hand:

In [None]:
# Row index of an entry with missing values; it was picked by hand for this
# particular data set and is not guaranteed to hold for other captures.
missing_datapoint_idx = 88
with pl.Config() as cfg:
    # Show every column and widen the table so the row is printed in full.
    cfg.set_tbl_cols(-1)
    cfg.set_tbl_width_chars(140)
    print(csi_df[missing_datapoint_idx])

# Subsampling

I rewrote the subsampling to be a bit more configurable and easier to understand. I am not using it by default, but feel free to!

In [None]:
def subsample(
    df: pl.DataFrame,
    target_num: int,
    num_packets_total: int = 700,
    safe_period_ms: int = 500,
) -> pl.DataFrame:
    """
    Subsample each packet CSI sequence to a target of ``target_num`` packets.
    This helps in fighting data loss.

    Rows containing null values are dropped first. The remaining rows of
    every (``session_id``, ``receiver_name``) group are then bucketed into
    time windows, and the first packet of each window is kept.

    Args:
        df: Per-packet frame with a ``timestamp`` column and the columns
            ``csi_abs``, ``csi_phase``, ``session_id``, ``receiver_name``,
            ``num_curve``, ``subcarrier_idxs`` and ``antenna_rssi``.
        target_num: Number of packets to keep per sequence. Must satisfy
            ``0 < target_num < num_packets_total``.
        num_packets_total: Nominal number of packets per sequence before
            subsampling. Only used for validation and for the log message.
        safe_period_ms: Length of each sampling window in milliseconds.

    Returns:
        A frame holding ``timestamp`` plus the columns listed above, with at
        most ``target_num`` rows per group.

    Raises:
        ValueError: If ``target_num`` is not strictly between zero and
            ``num_packets_total``.
    """
    # Ensure we are actually performing "sub" sampling. An explicit raise is
    # used instead of ``assert`` so the check survives ``python -O``.
    if target_num >= num_packets_total:
        raise ValueError("can not supersample!")
    # A non-positive target would otherwise only fail later, with a division
    # by zero deep inside the per-group callback.
    if target_num <= 0:
        raise ValueError(f"target_num must be positive, got {target_num}")

    # Period in which there should be at least one value. Missing values are
    # essentially substituted by the closest available ones. If that
    # "closest" is actually far away, subsampling doesn't make much sense.
    # NOTE(review): verify how polars treats windows without any sample; if
    # such windows are skipped, a group can end up with fewer than
    # ``target_num`` rows instead of raising an error.
    safe_period = timedelta(milliseconds=safe_period_ms)

    print(
        "Subsampling series to circumvent missing values: \n"
        + f"Subsampling ratio: {target_num}/{num_packets_total}"
    )

    # NOTE: The dynamic groupby itself does not perform subsampling, it simply
    # performs windowed grouping. That means that every "subsample_timedelta",
    # it will group all values within a "safe_period" from then.
    # To subsample, we take the very first appearing values in all the windows.
    aggs = [
        pl.col("csi_abs").first(),
        pl.col("csi_phase").first(),
        pl.col("session_id").first(),
        pl.col("receiver_name").first(),
        pl.col("num_curve").first(),
        pl.col("subcarrier_idxs").first(),
        pl.col("antenna_rssi").first(),
    ]

    def grouper(group: pl.DataFrame) -> pl.DataFrame:
        # Spread the window starts evenly over the time span of this group.
        min_time = group.select(pl.min("timestamp")).item()
        max_time = group.select(pl.max("timestamp")).item()
        total_time = max_time - min_time
        subsample_timedelta = total_time / target_num

        return (
            # Dynamic grouping requires the index column to be sorted.
            group.sort("timestamp")
            .group_by_dynamic(
                "timestamp", every=subsample_timedelta, period=safe_period
            )
            .agg(*aggs)
            # Truncate in case the windowing produced more rows than wanted.
            .head(target_num)
        )

    # Drop null values, then perform the subsampling per session/receiver.
    return (
        df.drop_nulls()
        .group_by("session_id", "receiver_name", maintain_order=True)
        .map_groups(grouper)
    )


# Opt-in to subsampling by uncommenting this line.
# Feel free to change the target dimension as well, 152 is arbitrary.
# csi_df = subsample(csi_df, 152)

# Numpy Conversion

Conversion to numpy is actually straightforward now. This produces a few arrays, namely:

- Both the absolute CSI values and the phases
- RSSI, a single valued quantity per packet describing signal strength
- The ground-truth curve array the CSI is meant to represent (in this case only the absolute value)
- Labels corresponding to the receiver and curve

**NOTE:** Importantly, this data still contains None values if subsampling wasn't performed. Imputation should be considered.

In [None]:
def col_to_numpy(df: pl.DataFrame, column: str) -> np.ndarray:
    """
    Convert one column of ``df`` into a numpy array.

    The column is first turned into a (possibly nested) Python list and only
    then handed to numpy.
    """
    series = df.get_column(column)
    nested_values = series.to_list()
    return np.array(nested_values)


# -- Session regrouping
# Collapse the per-packet rows into one row per group; the aggregated columns
# become lists with one entry per packet of that group.
numpy_df = csi_df.group_by(
    "session_id",
    "receiver_name",
    "num_curve",
    "subcarrier_idxs",
    maintain_order=True,
).agg("csi_abs", "csi_phase", "antenna_rssi", "timestamp")


# ---------------------------------------------------------------------
# Now convert to array, first the data ...
# NOTE: to_numpy doesnt do nested lists, so instead we take the roundabout way over lists
csi_abs = col_to_numpy(numpy_df, "csi_abs")
csi_phase = col_to_numpy(numpy_df, "csi_phase")
rssi = col_to_numpy(numpy_df, "antenna_rssi")

# ... then the labels
curve_labels = col_to_numpy(numpy_df, "num_curve")
receiver_labels = col_to_numpy(numpy_df, "receiver_name")
# One (receiver, curve) pair per row of numpy_df.
labels = np.array(list(zip(receiver_labels, curve_labels)))

# ... and the ground truth curves
# NOTE: Curves are currently one-dimensional, but that is just due to the experiment.
# Technically, we could use different curves for all 54 subcarriers.
# Hence, we should treat the curves as being 700x54 dimensional!
num_subcs = 54
curves = col_to_numpy(curve_df, "curve")
# Append a subcarrier axis and copy each curve value across all subcarriers.
# The repeat count comes from num_subcs so the constant lives in one place.
curves = np.repeat(curves[:, :, np.newaxis], num_subcs, axis=2)
# Pick the curve belonging to every session.
# NOTE(review): assumes "num_curve" is the row position of the curve in
# curve_df - confirm against the data export.
curve_groundtruth = curves[curve_labels, :, :]

# Note: num_capture is just a time index or the sequence number, if you will
print(
    "Dimensions: session x num_capture x csi_subcarrier\n"
    + f"CSI absolute values shape                        : {csi_abs.shape}\n"
    + f"CSI phase values shape                           : {csi_phase.shape}\n"
    + f"Ground truth of curves shape                     : {curve_groundtruth.shape}\n"
    + f"Labels shape (receiver name and curve identifier): {labels.shape}"
)