# Time-Ordered Data Preparation for Multi-Gateway RSSI Fingerprinting

### Goals
- Load the cleaned multi-gateway dataset.
- Enforce strict time ordering.
- Leakage-safe 80/20 split by physical uplink (all gateway receptions on one side).
- 5 time-series folds within the training slice (no shuffling).
- Save time-ordered train/test splits and fold assignments for downstream models.

In [16]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

# ---- Notebook display ----
pd.set_option("display.max_columns", 120)

# ---- Input / output locations ----
SAVE_DIR = "../LoRaWAN Localization - Advanced Files"
DATA_PATH = "../LoRaWAN Localization - Advanced Files/all3_gateways_kalman_filtered.csv"

# ---- Split parameters ----
TEST_FRAC = 0.20  # share of the data (most recent part) held out as the test set
N_FOLDS = 5       # number of TimeSeriesSplit folds built inside the training slice
GAP = 0           # forwarded to TimeSeriesSplit(gap=...); 0 means no buffer

# ---- Column names used to identify a reception / an uplink ----
gateway_col = "gateway"
device_col = "device_id"
f_cnt_col = "f_count"

# ---- Regression target (change here if a different target column is wanted) ----
target_col = "exp_pl_gw0"

# ---- Model input columns, in the order they appear in X_train / X_test ----
feature_names = [
    "co2",
    "humidity",
    "pm25",
    "pressure",
    "temperature",
    "rssi",
    "snr",
    "SF",
    "frequency",
    "f_count",
    "p_count",
    "toa",
    "distance_gw0",
    "c_walls_gw0",
    "w_walls_gw0",
    "n_power",
    "esp",
]

# ---- Non-feature columns carried along into the saved CSVs when present ----
context_cols = ["time", gateway_col, device_col, "uplink_id"]

### Load, Normalize Time, and Sort

In [17]:
# Load the cleaned multi-gateway dataset. low_memory=False makes pandas read
# the file in one pass instead of chunk-wise type inference.
df = pd.read_csv(DATA_PATH, low_memory=False)

if "time" not in df.columns:
    raise KeyError("Column 'time' not found.")

# Parse timestamps as timezone-aware UTC. errors="coerce" turns unparseable
# values into NaT instead of raising, so they have to be accounted for below.
df["time"] = pd.to_datetime(df["time"], errors="coerce", utc=True)

# Report rows that are about to be discarded rather than losing them silently.
n_bad_time = int(df["time"].isna().sum())
if n_bad_time:
    print(f"Dropped {n_bad_time} rows with missing or unparseable 'time'.")

# Mergesort is stable: rows sharing a timestamp keep their file order, so the
# row order (and any row-order-based split) does not depend on the sort
# algorithm's tie handling.
df = (
    df.dropna(subset=["time"])
    .sort_values("time", kind="mergesort")
    .reset_index(drop=True)
)

print("Data shape after load/sort:", df.shape)
print("Time span:", df["time"].min(), "->", df["time"].max())
print("Columns:", df.columns.tolist())

Data shape after load/sort: (3411800, 25)
Time span: 2024-10-01 00:01:07.420593+00:00 -> 2025-09-30 23:59:55.971870+00:00
Columns: ['time', 'gateway', 'device_id', 'co2', 'humidity', 'pm25', 'pressure', 'temperature', 'rssi', 'snr', 'SF', 'frequency', 'f_count', 'p_count', 'toa', 'distance_gw0', 'c_walls_gw0', 'w_walls_gw0', 'exp_pl_gw0', 'n_power', 'esp', 'filtered_rssi', 'kf_rssi', 'exp_pl_filtered_gw0', 'exp_pl_kf_gw0']


### Uplink Identity (prevents cross-gateway leakage)

In [18]:
# Build an identifier shared by all gateway receptions of one uplink, unless
# the dataset already carries an "uplink_id" column.
#
# NOTE(review): the key uses the timestamp floored to one second. If "time" is
# a per-gateway reception time, two receptions of the same uplink that fall on
# either side of a second boundary would get different ids -- confirm how
# "time" is produced before relying on this for leakage control.
df = df.copy()
if "uplink_id" not in df.columns:
    _have_id_parts = {device_col, f_cnt_col}.issubset(df.columns)
    if not _have_id_parts:
        # No way to group receptions; downstream cells detect the all-NA column.
        df["uplink_id"] = pd.NA
        print("uplink_id not built (missing device/f_count); splits will use row order only.")
    else:
        _device_part = df[device_col].fillna("na_device").astype(str)
        _counter_part = df[f_cnt_col].fillna(-1).astype(int).astype(str)
        _second_part = df["time"].dt.floor("1s").astype(str)
        df["uplink_id"] = _device_part + "_" + _counter_part + "_" + _second_part
        print("uplink_id constructed from device + f_count + floored time.")

uplink_id constructed from device + f_count + floored time.


### Leakage-Safe 80/20 Time Split by Uplink

In [19]:
# Chronological hold-out: the latest TEST_FRAC of the data becomes the test set.
has_uplink_ids = not df["uplink_id"].isna().all()

if has_uplink_ids:
    # Rank uplinks by their earliest reception and cut the *uplink* list, so
    # every row of a given uplink_id lands on the same side of the split.
    uplink_first_time = (
        df.groupby("uplink_id", dropna=False)["time"]
        .min()
        .sort_values()
    )
    split_uplink_idx = int(len(uplink_first_time) * (1 - TEST_FRAC))
    ordered_uplink_ids = uplink_first_time.index
    train_uplinks = set(ordered_uplink_ids[:split_uplink_idx])
    test_uplinks = set(ordered_uplink_ids[split_uplink_idx:])
    in_train = df["uplink_id"].isin(train_uplinks)
    in_test = df["uplink_id"].isin(test_uplinks)
    train_df = df[in_train].copy()
    test_df = df[in_test].copy()
else:
    # No uplink ids at all: fall back to cutting the time-sorted rows directly.
    split_idx = int(len(df) * (1 - TEST_FRAC))
    train_df, test_df = df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy()

# Re-establish time order and a clean 0..n-1 index on both parts.
train_df = train_df.sort_values("time").reset_index(drop=True)
test_df = test_df.sort_values("time").reset_index(drop=True)

print(f"Train samples: {len(train_df)}, Test samples: {len(test_df)}")
print(f"Train window: {train_df['time'].min()} -> {train_df['time'].max()}")
print(f"Test  window: {test_df['time'].min()} -> {test_df['time'].max()}")

Train samples: 2714781, Test samples: 697019
Train window: 2024-10-01 00:01:07.420593+00:00 -> 2025-08-12 14:10:35.483231+00:00
Test  window: 2025-08-12 14:10:39.834945+00:00 -> 2025-09-30 23:59:55.971870+00:00


In [20]:
# Sanity check: no uplink_id may appear in both the train and the test part.
train_uids = set(train_df["uplink_id"].dropna().unique())
test_uids = set(test_df["uplink_id"].dropna().unique())
overlap = train_uids.intersection(test_uids)
print("Overlap uplink_ids:", len(overlap))  # expect 0

# Count distinct uplinks once per frame and reuse the numbers below.
n_uplinks_total = df["uplink_id"].nunique()
n_uplinks_train = train_df["uplink_id"].nunique()
n_uplinks_test = test_df["uplink_id"].nunique()

print("Unique uplinks total:", n_uplinks_total)
print("Train unique uplinks:", n_uplinks_train)
print("Test unique uplinks :", n_uplinks_test)
# The two parts should partition the full set of uplinks.
print("Train + Test equals total:", n_uplinks_train + n_uplinks_test == n_uplinks_total)

Overlap uplink_ids: 0
Unique uplinks total: 2111204
Train unique uplinks: 1688963
Test unique uplinks : 422241
Train + Test equals total: True


### Feature/Target Matrices

In [21]:
# Fail early if the configured feature or target columns are not in the data.
missing_feats = [name for name in feature_names if name not in df.columns]
if missing_feats:
    raise KeyError(f"Missing features in data: {missing_feats}")
if target_col not in df.columns:
    raise KeyError(f"Target column '{target_col}' not found.")


def _to_arrays(part):
    """Return (features, target, timestamps) of ``part`` as NumPy arrays."""
    return (
        part[feature_names].to_numpy(),
        part[target_col].to_numpy(),
        part["time"].to_numpy(),
    )


X_train, y_train, time_train = _to_arrays(train_df)
X_test, y_test, time_test = _to_arrays(test_df)

print("X_train:", X_train.shape, "| X_test:", X_test.shape)
print("y_train:", y_train.shape, "| y_test:", y_test.shape)

X_train: (2714781, 17) | X_test: (697019, 17)
y_train: (2714781,) | y_test: (697019,)


### Save Time-Ordered Train/Test Splits

In [22]:
# Make sure the output directory exists before anything is written to it;
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(SAVE_DIR, exist_ok=True)

def assemble_out(df_part):
    """Return a copy of ``df_part`` limited to the columns that get exported.

    Column order is: every entry of ``feature_names``, then ``target_col``,
    then ``"time"``, followed by each entry of ``context_cols`` that exists in
    ``df_part`` and is not already selected (each such column added once).
    """
    base_cols = [*feature_names, target_col, "time"]
    # dict.fromkeys keeps first-seen order and drops repeated context columns.
    extra_cols = dict.fromkeys(
        col
        for col in context_cols
        if col not in base_cols and col in df_part.columns
    )
    return df_part[base_cols + list(extra_cols)].copy()

# Reduce both parts to the export columns, then write them next to the input.
train_out, test_out = assemble_out(train_df), assemble_out(test_df)

train_csv_path = f"{SAVE_DIR}/train.csv"
test_csv_path = f"{SAVE_DIR}/test.csv"

# index=False: the positional index carries no information worth persisting.
train_out.to_csv(train_csv_path, index=False)
test_out.to_csv(test_csv_path, index=False)
print(f"Saved train.csv and test.csv to {SAVE_DIR}")

Saved train.csv and test.csv to ../LoRaWAN Localization - Advanced Files


### Time-Series Cross-Validation Folds (Training Slice Only)

In [23]:
# Time-series folds grouped by uplink_id (training slice only).
#
# fold_assignments[i] is the fold number (1..N_FOLDS) in which row i of
# train_df serves as *validation* data. Rows that keep the initial value 0 lie
# before the first validation window: TimeSeriesSplit never places the earliest
# block in a test set, so those rows are only ever used for training.
#
# Only the validation indices are stored. TimeSeriesSplit applies `gap` by
# trimming the end of each *train* index set, so GAP does not change the array
# saved below; consumers of train_folds.npy must apply the gap themselves.
tscv = TimeSeriesSplit(n_splits=N_FOLDS, gap=GAP)
fold_assignments = np.zeros(len(train_df), dtype=int)

if train_df["uplink_id"].isna().all():
    # Fallback: row-wise folds (still time-ordered)
    for fold_num, (_, val_idx) in enumerate(tscv.split(train_df), start=1):
        fold_assignments[val_idx] = fold_num
        val_start, val_end = time_train[val_idx].min(), time_train[val_idx].max()
        print(f"Fold {fold_num}: val window {val_start} -> {val_end}")
else:
    # Split the time-ordered list of uplinks rather than the rows, so that all
    # receptions of one uplink share a fold.
    uplink_first = train_df.groupby("uplink_id", dropna=False)["time"].min().sort_values()
    uplinks_ordered = uplink_first.index.to_numpy()

    # The train-side uplink indices are not needed for the assignment, so they
    # are discarded instead of being materialized on every fold.
    for fold_num, (_, val_u_idx) in enumerate(tscv.split(uplinks_ordered), start=1):
        val_mask = train_df["uplink_id"].isin(uplinks_ordered[val_u_idx]).to_numpy()
        fold_assignments[val_mask] = fold_num

        val_times = train_df.loc[val_mask, "time"]
        print(f"Fold {fold_num}: val window {val_times.min()} -> {val_times.max()}")

np.save(f"{SAVE_DIR}/train_folds.npy", fold_assignments)
print(f"Saved time-ordered {N_FOLDS}-fold assignments to {SAVE_DIR}/train_folds.npy")

Fold 1: val window 2024-11-22 21:14:17.215715+00:00 -> 2025-01-12 19:32:13.537631+00:00
Fold 2: val window 2025-01-12 19:32:48.589100+00:00 -> 2025-03-16 00:02:36.281205+00:00
Fold 3: val window 2025-03-16 00:02:42.676390+00:00 -> 2025-05-06 12:10:28.102176+00:00
Fold 4: val window 2025-05-06 12:10:46.114562+00:00 -> 2025-06-29 23:09:03.071242+00:00
Fold 5: val window 2025-06-29 23:09:15.793512+00:00 -> 2025-08-12 14:10:35.483231+00:00
Saved time-ordered 5-fold assignments to ../LoRaWAN Localization - Advanced Files/train_folds.npy


### Notes
- Encode `gateway` before modeling (one-hot/embedding/target encoding); raw string is kept for context.
- If you expect gateway RSSI bias, calibrate per gateway before training.
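- Increase `GAP` if you want a temporal buffer between train/val folds. Note that `TimeSeriesSplit` applies the gap to the training indices only, while `train_folds.npy` stores just the validation fold of each row, so downstream code must drop the same gap when it rebuilds the training side of each fold.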
- For strict leakage control, keep the uplink-based split and apply the same grouping when batching for models.