In [1]:
# imports
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

In [2]:
# Read the dataset, order it chronologically, and cut a time-based
# train/test split (no shuffling: the last rows in time become the test set).
DATA_PATH = '../all_data_files/cleaned_dataset_per_device.csv'
SAVE_DIR = '../Comprehensive ML - Files & Plots etc'
TEST_SIZE = 0.2  # fraction of rows, taken from the end of the timeline, held out

# Parse 'time' on load, sort by it, and renumber the rows 0..n-1 so that
# positional slicing below follows time order.
df = (
    pd.read_csv(DATA_PATH, parse_dates=['time'])
    .sort_values('time')
    .reset_index(drop=True)
)

# Columns fed to the models, and the column they predict.
feature_names = [
    'distance',
    'frequency',
    'c_walls',
    'w_walls',
    'co2',
    'humidity',
    'pm25',
    'pressure',
    'temperature',
    'snr',
]
target_col = 'exp_pl'

# Rows before split_idx go to training, rows from split_idx onward to test.
split_idx = int(len(df) * (1 - TEST_SIZE))
train_df, test_df = df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy()

# NumPy arrays of features, target and timestamps for each partition.
X_train, y_train, time_train = (
    train_df[feature_names].to_numpy(),
    train_df[target_col].to_numpy(),
    train_df['time'].to_numpy(),
)
X_test, y_test, time_test = (
    test_df[feature_names].to_numpy(),
    test_df[target_col].to_numpy(),
    test_df['time'].to_numpy(),
)

# Report partition sizes and the time span each one covers.
train_start, train_end = train_df['time'].min(), train_df['time'].max()
test_start, test_end = test_df['time'].min(), test_df['time'].max()
print(f"Training samples: {len(train_df)}, Test samples: {len(test_df)}")
print(f"Train window: {train_start} -> {train_end}")
print(f"Test  window: {test_start} -> {test_end}")

Training samples: 1663627, Test samples: 415907
Train window: 2024-10-01 00:01:07.420593+00:00 -> 2025-08-12 17:18:53.293125+00:00
Test  window: 2025-08-12 17:19:02.126782+00:00 -> 2025-09-30 23:59:55.971870+00:00


In [3]:
# Save the time-ordered train/test splits as CSV files in SAVE_DIR.
# Each file holds the feature columns, the target under the name 'PL',
# and the 'time' column.
os.makedirs(SAVE_DIR, exist_ok=True)


def _split_for_export(split_df, feature_cols, target, exported_target='PL'):
    """Return the columns of one split in the layout written to disk.

    Parameters
    ----------
    split_df : pandas.DataFrame
        One partition (train or test) containing the feature columns,
        the `target` column and a 'time' column.
    feature_cols : list of str
        Feature column names, written first and in this order.
    target : str
        Name of the target column inside `split_df`.
    exported_target : str, default 'PL'
        Name the target column gets in the exported frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns `feature_cols + [exported_target, 'time']`.
    """
    # Always export the declared target column. Picking up a pre-existing
    # 'PL' column from the raw data would make the saved files disagree
    # with y_train / y_test, which are built from target_col.
    selected = split_df[list(feature_cols) + [target, 'time']]
    return selected.rename(columns={target: exported_target})


train_df_out = _split_for_export(train_df, feature_names, target_col)
test_df_out = _split_for_export(test_df, feature_names, target_col)

train_df_out.to_csv(f"{SAVE_DIR}/train.csv", index=False)
test_df_out.to_csv(f"{SAVE_DIR}/test.csv", index=False)
print(f"Saved train.csv and test.csv to {SAVE_DIR}")

Saved train.csv and test.csv to ../Comprehensive ML - Files & Plots etc


In [None]:
# time-series folds for CV (training set only)
#
# Each row of the training set gets the index (0..N_SPLITS-1) of the fold in
# which it is a validation row. TimeSeriesSplit never validates on the
# earliest rows, so those keep the NOT_VALIDATED label instead of being
# mistaken for fold 0's validation rows.
N_SPLITS = 5
GAP = 0  # number of samples (rows, not time units) excluded between each train set and its validation window
NOT_VALIDATED = -1  # label for rows that are never part of a validation window

tscv = TimeSeriesSplit(n_splits=N_SPLITS, gap=GAP)
fold_assignments = np.full(len(train_df), NOT_VALIDATED, dtype=int)

for fold_num, (_, val_idx) in enumerate(tscv.split(train_df)):
    fold_assignments[val_idx] = fold_num
    val_start, val_end = time_train[val_idx].min(), time_train[val_idx].max()
    print(f"Fold {fold_num+1}: val window {val_start} -> {val_end}")

# Only validation membership is stored; a non-zero GAP is not encoded in the
# saved array and has to be re-applied by whoever rebuilds the train sets.
n_train_only = int((fold_assignments == NOT_VALIDATED).sum())
print(f"Rows never used for validation (label {NOT_VALIDATED}): {n_train_only}")

np.save(f"{SAVE_DIR}/train_folds.npy", fold_assignments)
print(f"Saved time-ordered {N_SPLITS}-fold assignments to {SAVE_DIR}/train_folds.npy")

Fold 1: val window 2024-11-22 16:14:36.579002+00:00 -> 2025-01-12 19:00:02.336990+00:00
Fold 2: val window 2025-01-12 19:00:12.120847+00:00 -> 2025-03-16 11:49:03.885341+00:00
Fold 3: val window 2025-03-16 11:49:27.286563+00:00 -> 2025-05-07 02:59:11.265952+00:00
Fold 4: val window 2025-05-07 02:59:25.336805+00:00 -> 2025-06-30 14:22:42.066099+00:00
Fold 5: val window 2025-06-30 14:22:54.211608+00:00 -> 2025-08-12 17:18:53.293125+00:00
Saved time-ordered 5-fold assignments to ../Comprehensive ML - Files & Plots etc/train_folds.npy
