In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold

In [2]:
# Define the path to the dataset
dataset_path = '../all_data_files/cleaned_dataset_per_device.csv'

# Load the dataset.
# A missing file is re-raised with the path in the message instead of calling
# sys.exit(): a bare sys.exit() reports exit status 0 (success) when this code
# is run as a script, and in a notebook it shows up as a SystemExit rather than
# the underlying error. Only the read itself is guarded by the try block.
try:
    df = pd.read_csv(dataset_path)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        f"File not found at the specified path: {dataset_path}"
    ) from exc

print("\nDataset loaded successfully.\n")

# Display dataset information (column names, non-null counts, dtypes)
print("\nDataset Information:")
df.info()

# Rich preview of the first rows; display() is provided by IPython/Jupyter
print("\nFirst Five Rows of the Dataset:")
display(df.head())


Dataset loaded successfully.


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1415654 entries, 0 to 1415653
Data columns (total 20 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   time         1415654 non-null  object 
 1   device_id    1415654 non-null  object 
 2   co2          1415654 non-null  float64
 3   humidity     1415654 non-null  float64
 4   pm25         1415654 non-null  float64
 5   pressure     1415654 non-null  float64
 6   temperature  1415654 non-null  float64
 7   rssi         1415654 non-null  float64
 8   snr          1415654 non-null  float64
 9   SF           1415654 non-null  int64  
 10  frequency    1415654 non-null  float64
 11  f_count      1415654 non-null  float64
 12  p_count      1415654 non-null  float64
 13  toa          1415654 non-null  float64
 14  distance     1415654 non-null  int64  
 15  c_walls      1415654 non-null  int64  
 16  w_walls      1415654 non-null  int64 

Unnamed: 0,time,device_id,co2,humidity,pm25,pressure,temperature,rssi,snr,SF,frequency,f_count,p_count,toa,distance,c_walls,w_walls,exp_pl,n_power,esp
0,2024-09-26 11:01:52.383162+00:00,ED3,645.0,54.18,0.32,300.48,23.87,-74.0,12.5,9,867.7,95.0,105.0,0.246784,18,1,2,91.26,-86.737602,-74.237602
1,2024-09-26 11:02:08.387851+00:00,ED0,539.0,49.34,0.39,299.69,24.57,-48.0,8.5,9,867.5,82.0,109.0,0.246784,10,0,0,65.26,-57.073822,-48.573822
2,2024-09-26 11:02:14.517123+00:00,ED1,471.0,45.29,0.32,300.8,25.64,-56.0,12.5,9,867.1,81.0,107.0,0.246784,8,1,0,73.26,-68.737602,-56.237602
3,2024-09-26 11:02:24.466271+00:00,ED2,659.0,49.09,0.44,301.18,24.57,-70.0,9.0,8,868.5,77.0,113.0,0.133632,23,0,2,87.26,-79.514969,-70.514969
4,2024-09-26 11:02:26.872032+00:00,ED4,423.0,38.86,0.13,301.69,27.71,-90.0,11.5,8,867.9,3597.0,4224.0,0.133632,37,0,5,107.26,-101.797062,-90.297062


In [3]:
# =========== Feature Extraction & Train-Test Split ===========

# Names of the model inputs, listed in the same order as the columns of X_all
feature_names = [
    'distance', 'frequency', 'c_walls', 'w_walls', 'co2', 'humidity',
    'pm25', 'pressure', 'temperature', 'snr'
]

# Pull every column of interest out of the DataFrame as a NumPy array
time_all = df['time'].values
PL_all = df['exp_pl'].values
d_all = df['distance'].values
frequency_all = df['frequency'].values
c_walls_all = df['c_walls'].values
w_walls_all = df['w_walls'].values
co2_all = df['co2'].values
humidity_all = df['humidity'].values
pm25_all = df['pm25'].values
pressure_all = df['pressure'].values
temperature_all = df['temperature'].values
snr_all = df['snr'].values

# Feature matrix: one row per sample, one column per entry of feature_names
X_all = np.column_stack([
    d_all,
    frequency_all,
    c_walls_all,
    w_walls_all,
    co2_all,
    humidity_all,
    pm25_all,
    pressure_all,
    temperature_all,
    snr_all,
])

# Regression target
y_all = PL_all

# Hold out 20% of the rows with a fixed seed. The timestamps are passed through
# the same split so each row's time can be written next to its features.
(
    X_train, X_test,
    y_train, y_test,
    time_train, time_test,
) = train_test_split(X_all, y_all, time_all, random_state=50, test_size=0.2)
print(f"\nTraining samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}\n")

# ==== Prepare folder and save ====
save_dir = '../Comprehensive ML - Files & Plots etc.'
os.makedirs(save_dir, exist_ok=True)

# Rebuild labelled tables: feature columns first, then target 'PL', then 'time'
df_train = pd.DataFrame(X_train, columns=feature_names).assign(
    PL=y_train, time=time_train
)
df_test = pd.DataFrame(X_test, columns=feature_names).assign(
    PL=y_test, time=time_test
)

# Write both splits as CSV without the positional index
df_train.to_csv(f"{save_dir}/train.csv", index=False)
df_test.to_csv(f"{save_dir}/test.csv", index=False)
print(f"Saved train.csv and test.csv to {save_dir}\n")


Training samples: 1132523, Test samples: 283131

Saved train.csv and test.csv to ../Comprehensive ML - Files & Plots etc.



In [4]:
# =========== Generate and Save 5-Fold Indices for Training Set ===========

# Shuffled 5-way splitter with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=50)

# fold_assignments[i] holds the number of the fold in which training row i
# appears in the validation indices
fold_assignments = np.zeros(X_train.shape[0], dtype=int)
for fold_num, (_, val_idx) in enumerate(kf.split(X_train)):
    np.put(fold_assignments, val_idx, fold_num)

# Persist the per-row fold labels as a NumPy array next to the saved splits
folds_file = f"{save_dir}/train_folds.npy"
np.save(folds_file, fold_assignments)
print(f"\nSaved 5-fold assignments as train_folds.npy in {save_dir}\n")


Saved 5-fold assignments as train_folds.npy in ../Comprehensive ML - Files & Plots etc.

