In [None]:
# Imports
import numpy as np
import polars as pl

# import seaborn as sns
from datetime import timedelta
import scipy.io

In [None]:
def glimpse(df: pl.DataFrame, num_vals: int = 5):
    """
    Print a banner followed by the first ``num_vals`` rows of ``df``.

    The table is printed while a ``pl.Config`` context is active, with the
    column limit set to -1 and the table width set to 140 characters. A blank
    line is emitted after the table.
    """
    banner = (
        "-------------------------------------------------------------\n"
        + f"A glimpse of the first {num_vals} entries in the data frame:"
    )
    print(banner)

    # The display options are only set inside this context manager
    with pl.Config() as table_cfg:
        table_cfg.set_tbl_cols(-1)
        table_cfg.set_tbl_width_chars(140)
        preview = df.limit(num_vals)
        # Table, its newline, and one extra empty line
        print(preview, end="\n\n")


# Example parameters for this notebook: the two parquet inputs
curves_file = "../data/random_doppler/curves.parquet"
exp_db_file = "../data/random_doppler/preprocessed.parquet"

# Read both tables into memory ...
curve_df = pl.read_parquet(curves_file)
csi_df = pl.read_parquet(exp_db_file)

# ... and preview the first two rows of each
glimpse(curve_df, num_vals=2)
glimpse(csi_df, num_vals=2)

In [None]:
def subsample(df: pl.DataFrame, target_num: int, num_packets_total: int = 2000):
    """
    Subsample every (session_id, receiver_name) packet sequence in ``df`` to a
    target of ``target_num`` packets. This helps in fighting data loss.

    Rows containing nulls are dropped first. For each sequence, the time span
    between its earliest and latest ``timestamp`` is split into ``target_num``
    equally long steps; every step opens a 500 ms window and the first packet
    inside that window is kept. The result is truncated to ``target_num`` rows
    per sequence.

    Parameters
    ----------
    df
        Frame with a ``timestamp`` column and the columns ``csi_abs``,
        ``csi_phase``, ``session_id``, ``receiver_name``, ``num_curve``,
        ``subcarrier_idxs`` and ``antenna_rssi``.
    target_num
        Number of packets to keep per sequence. Must be positive and smaller
        than ``num_packets_total``.
    num_packets_total
        Nominal number of packets per sequence. Only used for the sanity check
        and the progress message.

    Returns
    -------
    pl.DataFrame
        One frame holding at most ``target_num`` rows per sequence, with the
        ``timestamp`` column plus the aggregated columns listed above.
        NOTE(review): with polars' default window labelling the returned
        ``timestamp`` is presumably the window boundary rather than the
        original packet timestamp -- confirm if exact timestamps matter.

    Raises
    ------
    ValueError
        If ``target_num`` is not positive, or (from the per-group callback) if
        a sequence spans too little time to be split into ``target_num`` steps.
    AssertionError
        If ``target_num`` is not smaller than ``num_packets_total``.
    """
    # A non-positive target would otherwise surface as a ZeroDivisionError or
    # a negative window step deep inside the per-group callback.
    if target_num <= 0:
        raise ValueError(f"target_num must be a positive integer, got {target_num}")

    # Ensure we are actually performing "sub" sampling
    assert target_num < num_packets_total, "can not supersample!"

    # Look-ahead period of each window. It has to be long enough for every
    # window to contain at least one value: missing values are essentially
    # substituted by the closest available ones, and if that "closest" is
    # actually far away, subsampling doesn't make much sense.
    # NOTE(review): windows without any value are presumably omitted from the
    # result, leaving that sequence shorter than target_num -- confirm.
    safe_period = timedelta(milliseconds=500)

    print(
        "Subsampling series to circumvent missing values: \n"
        + f"Subsampling ratio: {target_num}/{num_packets_total}"
    )

    # NOTE: The dynamic groupby itself does not perform subsampling, it simply
    # performs windowed grouping. That means that every "subsample_timedelta",
    # it will group all values within a "500ms" period from then.
    # To subsample, we take the very first appearing values in all the windows.
    aggs = [
        pl.col("csi_abs").first(),
        pl.col("csi_phase").first(),
        pl.col("session_id").first(),
        pl.col("receiver_name").first(),
        pl.col("num_curve").first(),
        pl.col("subcarrier_idxs").first(),
        pl.col("antenna_rssi").first(),
    ]

    def grouper(group: pl.DataFrame) -> pl.DataFrame:
        """Subsample one (session_id, receiver_name) sequence."""
        min_time = group.select(pl.min("timestamp")).item()
        max_time = group.select(pl.max("timestamp")).item()
        total_time = max_time - min_time
        subsample_timedelta = total_time / target_num

        # A single-packet sequence, or one whose span is too short to be
        # divided, yields a zero step that cannot be used as a window step.
        if not subsample_timedelta:
            session_id = group.get_column("session_id")[0]
            receiver_name = group.get_column("receiver_name")[0]
            raise ValueError(
                f"can not subsample session {session_id!r} / receiver "
                f"{receiver_name!r}: its time span ({total_time}) is too short "
                f"to be split into {target_num} steps"
            )

        # Windowed grouping, then truncate to the target subsampling number
        return (
            group.sort("timestamp")
            .group_by_dynamic(
                "timestamp", every=subsample_timedelta, period=safe_period
            )
            .agg(*aggs)
            .head(target_num)
        )

    # Drop null values, then subsample each sequence independently
    return (
        df.drop_nulls()
        .group_by("session_id", "receiver_name", maintain_order=True)
        .map_groups(grouper)
    )


# Subsample every sequence to a target of 192 packets. NOTE: this rebinds
# csi_df, so re-running the cell feeds the already subsampled frame back in.
csi_df = subsample(csi_df, target_num=192)

In [None]:
# Show csi_df as it stands after the cells above have run
print(csi_df)

In [None]:
def col_to_numpy(df: pl.DataFrame, column: str) -> np.ndarray:
    """
    Return the contents of ``column`` from ``df`` as a NumPy array.

    The column is first converted to a Python list via ``to_list()`` and that
    list is then passed to ``np.array``.
    """
    values = df.get_column(column).to_list()
    return np.array(values)


# -- Session regrouping
# Collapse the per-packet rows into one row per (session_id, receiver_name,
# num_curve, subcarrier_idxs); the aggregated columns become per-group lists.
numpy_df = csi_df.group_by(
    "session_id",
    "receiver_name",
    "num_curve",
    "subcarrier_idxs",
    maintain_order=True,
).agg("csi_abs", "csi_phase", "antenna_rssi", "timestamp")


# ---------------------------------------------------------------------
# Now convert to array, first the data ...
# The slice drops the two outermost entries on either end of the last axis
# (presumably edge subcarriers -- TODO confirm). The swap then exchanges axes
# 1 and 2, which puts the subcarrier axis in front of the capture axis.
csi_abs = col_to_numpy(numpy_df, "csi_abs")[:, :, 2:-2]
csi_abs = np.swapaxes(csi_abs, 1, 2)
csi_phase = col_to_numpy(numpy_df, "csi_phase")[:, :, 2:-2]
csi_phase = np.swapaxes(csi_phase, 1, 2)

# ... then the labels
curve_labels = col_to_numpy(numpy_df, "num_curve")
receiver_labels = col_to_numpy(numpy_df, "receiver_name")
# One (receiver, curve) pair per regrouped row
labels = np.array(list(zip(receiver_labels, curve_labels)))

# Note: num_capture is just a time index or the sequence number, if you will.
# The axis order printed here is the one left behind by the swapaxes calls.
print(
    "Dimensions: session x csi_subcarrier x num_capture\n"
    + f"CSI absolute values shape                        : {csi_abs.shape}\n"
    + f"CSI phase values shape                           : {csi_phase.shape}\n"
    + f"Labels shape (receiver name and curve identifier): {labels.shape}"
)

# Amplitude and phase go into separate .mat files that share the same labels
mat_dict_abs = {
    "train_data": csi_abs,
    "train_activity_label": curve_labels,
    "train_receiver_label": receiver_labels,
}

mat_dict_phs = {
    "train_data": csi_phase,
    "train_activity_label": curve_labels,
    "train_receiver_label": receiver_labels,
}

scipy.io.savemat("../data/random_doppler/train_data_split_amp.mat", mat_dict_abs)
scipy.io.savemat("../data/random_doppler/train_data_split_pha.mat", mat_dict_phs)