### DATA PREPARATION: Mainly Feature Selection and Train/Test Split

In [11]:
#  Packages and Libraries 
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold

In [None]:
# Load the cleaned per-device dataset into `df` and print a short overview.

# Path to the dataset file (relative to the notebook's working directory)
dataset_path = '../all_data_files/cleaned_dataset_per_device.csv'

# Load the dataset.
# On a missing file, report the path that was tried and re-raise the original
# FileNotFoundError so the cell stops with a real traceback, rather than
# ending through sys.exit() / SystemExit.
try:
    df = pd.read_csv(dataset_path)
except FileNotFoundError:
    print(f"File not found at the specified path: {dataset_path}")
    raise
else:
    print("\nDataset loaded successfully!\n")

# NOTE(review): the output stored with this cell shows `time` as
# datetime64[ns, UTC], an index running from 22623, and a "Time window (UTC)"
# summary. None of these is produced by the code in this cell, so the stored
# output presumably comes from an earlier version -- re-run and confirm.

# Display dataset information.
print("\nDataset Information:")
df.info()

print("\nFirst Five Rows of the Dataset:")
display(df.head())


Dataset loaded successfully!


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
Index: 2079735 entries, 22623 to 2102357
Data columns (total 20 columns):
 #   Column       Dtype              
---  ------       -----              
 0   time         datetime64[ns, UTC]
 1   device_id    object             
 2   co2          float64            
 3   humidity     float64            
 4   pm25         float64            
 5   pressure     float64            
 6   temperature  float64            
 7   rssi         float64            
 8   snr          float64            
 9   SF           int64              
 10  frequency    float64            
 11  f_count      float64            
 12  p_count      float64            
 13  toa          float64            
 14  distance     int64              
 15  c_walls      int64              
 16  w_walls      int64              
 17  exp_pl       float64            
 18  n_power      float64            
 19  esp          float64            

Unnamed: 0,time,device_id,co2,humidity,pm25,pressure,temperature,rssi,snr,SF,frequency,f_count,p_count,toa,distance,c_walls,w_walls,exp_pl,n_power,esp
22623,2024-10-01 00:00:07.374717+00:00,ED5,443.0,45.5,0.49,313.78,22.4,-102.0,10.5,9,868.3,9867.0,10755.0,0.246784,40,2,2,119.26,-112.870778,-102.370778
22624,2024-10-01 00:00:25.715822+00:00,ED2,452.0,40.6,0.45,314.79,22.85,-63.0,9.5,10,868.5,5814.0,6641.0,0.452608,23,0,2,80.26,-72.961836,-63.461836
22625,2024-10-01 00:00:50.271706+00:00,ED4,434.0,37.33,0.04,315.56,24.08,-81.0,8.5,10,867.3,9334.0,10752.0,0.452608,37,0,5,98.26,-90.073822,-81.573822
22626,2024-10-01 00:01:07.420593+00:00,ED5,440.0,45.46,0.46,313.8,22.41,-102.0,10.2,9,868.1,9868.0,10756.0,0.246784,40,2,2,119.26,-112.596121,-102.396121
22627,2024-10-01 00:01:25.764905+00:00,ED2,451.0,40.59,0.43,314.81,22.83,-66.0,11.0,10,867.9,5815.0,6642.0,0.452608,23,0,2,83.26,-77.331956,-66.331956



Time window (UTC): 2024-10-01 00:00:00+00:00 to 2025-10-01 00:00:00+00:00 (exclusive end)
Min time in df: 2024-10-01 00:00:07.374717+00:00 | Max time in df: 2025-09-30 23:59:55.971870+00:00


In [None]:
# Feature Extraction & Train-Test Split
#
# Builds the feature matrix and the target from `df`, makes a seeded random
# 80/20 train/test split, and writes both splits (features + PL + time +
# device_id) to CSV files in `save_dir`.

# Extract columns from DataFrame (kept as individual arrays so every name
# stays available to other cells)
time_all        = df['time'].values
device_all      = df['device_id'].values
d_all           = df['distance'].values
frequency_all   = df['frequency'].values
c_walls_all     = df['c_walls'].values
w_walls_all     = df['w_walls'].values
PL_all          = df['exp_pl'].values
co2_all         = df['co2'].values
humidity_all    = df['humidity'].values
pm25_all        = df['pm25'].values
pressure_all    = df['pressure'].values
temperature_all = df['temperature'].values
snr_all         = df['snr'].values
SF_all          = df['SF'].values
toa_all         = df['toa'].values
rssi_all        = df['rssi'].values
n_power_all     = df['n_power'].values
esp_all         = df['esp'].values
# NOTE: 'f_count' and 'p_count' are excluded

# Each output column name is declared right next to the array it labels, so
# the header written to train.csv / test.csv cannot drift out of sync with
# the column order of X_all.
feature_columns = [
    ('distance',    d_all),
    ('frequency',   frequency_all),
    ('c_walls',     c_walls_all),
    ('w_walls',     w_walls_all),
    ('co2',         co2_all),
    ('humidity',    humidity_all),
    ('pm25',        pm25_all),
    ('pressure',    pressure_all),
    ('temperature', temperature_all),
    ('snr',         snr_all),
    ('SF',          SF_all),
    ('toa',         toa_all),
    ('rssi',        rssi_all),
    ('n_power',     n_power_all),
    ('esp',         esp_all),
]
feature_names = [name for name, _ in feature_columns]

# Combine all features into one array (N x 15).
# Stacking integer and float columns yields a single float matrix, which is
# why distance / c_walls / w_walls / SF show up as float64 in df_train.
X_all = np.column_stack([values for _, values in feature_columns])

y_all = PL_all  # Target

# NOTE(review): in the five sample rows displayed by the load cell,
# exp_pl + rssi equals 17.26 in every row (e.g. 119.26 + (-102.0)). If that
# holds for the whole dataset, 'rssi' determines the target exactly --
# confirm that keeping it (and the related snr / n_power / esp columns) in
# the saved feature set is intended.

# Train-test split (keep metadata aligned): passing all four arrays to one
# call applies the same shuffled row selection to each of them.
X_train, X_test, y_train, y_test, time_train, time_test, dev_train, dev_test = train_test_split(
    X_all, y_all, time_all, device_all, test_size=0.2, random_state=50
)

# Sanity check: the two splits together must account for every input row.
assert len(X_train) + len(X_test) == len(X_all), "train/test split lost or duplicated rows"

print(f"\nTraining samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}\n")

# Prepare folder and save
save_dir = '../Extended Parametric Regression Files+Plots'
os.makedirs(save_dir, exist_ok=True)


def _build_split_frame(features, target, timestamps, device_ids):
    """Assemble one split into a DataFrame.

    Columns are the entries of `feature_names` (in order), followed by
    'PL' (target), 'time' and 'device_id'. All four inputs must be
    row-aligned.
    """
    frame = pd.DataFrame(features, columns=feature_names)
    frame['PL']        = target
    frame['time']      = timestamps
    frame['device_id'] = device_ids
    return frame


# Save train and test splits as CSVs (include device_id + time)
df_train = _build_split_frame(X_train, y_train, time_train, dev_train)
df_test  = _build_split_frame(X_test, y_test, time_test, dev_test)

df_train.to_csv(os.path.join(save_dir, "train.csv"), index=False)
df_test.to_csv(os.path.join(save_dir, "test.csv"), index=False)
print(f"Saved train.csv and test.csv to {save_dir}\n")


Training samples: 1663788, Test samples: 415947

Saved train.csv and test.csv to ../Extended Parametric Regression Files+Plots



In [14]:
# Print the column names, non-null counts and dtypes of the training split
# as a quick check on the frame that was written to train.csv.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1663788 entries, 0 to 1663787
Data columns (total 18 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   distance     1663788 non-null  float64       
 1   frequency    1663788 non-null  float64       
 2   c_walls      1663788 non-null  float64       
 3   w_walls      1663788 non-null  float64       
 4   co2          1663788 non-null  float64       
 5   humidity     1663788 non-null  float64       
 6   pm25         1663788 non-null  float64       
 7   pressure     1663788 non-null  float64       
 8   temperature  1663788 non-null  float64       
 9   snr          1663788 non-null  float64       
 10  SF           1663788 non-null  float64       
 11  toa          1663788 non-null  float64       
 12  rssi         1663788 non-null  float64       
 13  n_power      1663788 non-null  float64       
 14  esp          1663788 non-null  float64       
 15  PL           16

In [None]:
# Generate and Save 5-Fold Indices for Training Set
#
# Every row of X_train receives a fold id in 0..4: the number of the fold in
# which that row belongs to the validation subset. The array is positional,
# i.e. entry i refers to row i of X_train.

kf = KFold(n_splits=5, shuffle=True, random_state=50)

n_train_rows = len(X_train)
fold_assignments = np.zeros(n_train_rows, dtype=int)

for fold_id, split in enumerate(kf.split(X_train)):
    # Each split is (train indices, validation indices); only the
    # validation indices are needed to label the rows.
    holdout_rows = split[1]
    fold_assignments[holdout_rows] = fold_id

# Save fold assignments as numpy array
folds_path = f"{save_dir}/train_folds.npy"
np.save(folds_path, fold_assignments)
print(f"\nSaved 5-fold assignments as train_folds.npy in {save_dir}\n")


Saved 5-fold assignments as train_folds.npy in ../Extended Parametric Regression Files+Plots

