In [6]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.nn.utils import spectral_norm
from scipy import signal
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random
from scipy import stats
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from scipy import signal

def set_seeds(seed=42):
    torch.manual_seed(seed)
    torch.mps.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

set_seeds()

device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: mps


### Setup Dataset Class

In [7]:
class SensorDataset(Dataset):
    def __init__(self, X, y, session_ids, metadata, window_size=3):
        self.X = []
        self.y = []
        self.metadata = metadata
        self.session_ids = session_ids
        self.window_size = window_size

        # Group data by session
        session_groups = {}
        for idx, session_id in enumerate(session_ids):
            session_groups.setdefault(session_id, []).append((X[idx], y[idx]))

        # Create sliding windows within each session
        for session_id, session_data in session_groups.items():
            for i in range(len(session_data) - window_size + 1):
                steps = [data[0] for data in session_data[i:i+window_size]]
                window_X = np.vstack(steps)
                window_y = session_data[i+window_size-1][1]
                self.X.append(window_X)
                self.y.append(window_y)

        # Now extract features for all windows, then append duration
        self.X, self.y = self.extract_features()
    
    def extract_features(self):
        features = []
        labels = []
        for i in range(len(self.X)):
            X_win, y_val = self.X[i], self.y[i]
            sequence_features = []
            # Compute basic statistical features for each sensor
            for j in range(X_win.shape[1]):
                sensor_data = X_win[:, j]
                freqs, psd = signal.welch(sensor_data, fs=100)
                sequence_features.extend([
                    np.mean(sensor_data),
                    np.std(sensor_data),
                    np.min(sensor_data),
                    np.max(sensor_data),
                    np.median(sensor_data),
                    stats.skew(sensor_data),
                    stats.kurtosis(sensor_data),
                    np.percentile(sensor_data, 25),
                    np.percentile(sensor_data, 75),
                    np.ptp(sensor_data),
                    np.sum(psd),
                    np.mean(psd),
                    np.max(psd),
                    freqs[np.argmax(psd)]
                ])
            # Compute trend and dynamics features for each sensor
            for j in range(X_win.shape[1]):
                sensor_data = X_win[:, j]
                if len(sensor_data) > 5:
                    detrended = signal.detrend(sensor_data)
                    trend = sensor_data - detrended
                    sequence_features.append(np.mean(trend))
                    first_diff = np.diff(sensor_data)
                    sequence_features.extend([
                        np.mean(np.abs(first_diff)),
                        np.std(first_diff)
                    ])
                    if len(first_diff) > 1:
                        second_diff = np.diff(first_diff)
                        sequence_features.append(np.mean(np.abs(second_diff)))
            features.append(sequence_features)
            labels.append(y_val.item())
        
        # Append the session-level duration as a new feature for all samples
        features = self.add_mean_duration_feature(features)
        return np.array(features), np.array(labels)
    
    def add_mean_duration_feature(self, X_features):
        # Use the same number of session_ids as there are samples
        session_ids = self.session_ids[:len(X_features)]
        mean_durations = np.zeros(len(X_features))
        for session_id in np.unique(session_ids):
            session_mask = session_ids == session_id
            session_duration = self.metadata.loc[self.metadata['session_id'] == session_id, 'duration'].mean()
            mean_durations[session_mask] = session_duration
        mean_durations = mean_durations.reshape(-1, 1)
        return np.hstack((X_features, mean_durations))
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return (torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.y[idx], dtype=torch.long))

### Load Data

In [8]:
X_original = np.load('sensor_data.npy')  # Shape: (num_steps, time_steps, num_sensors)
metadata = pd.read_csv('combined_metadata.csv')
y = metadata['has_ms'].values

### Split Data

In [9]:
# Split by session for sequence integrity while stratifying
sessions = metadata['session_id'].values
unique_sessions = np.unique(sessions)

# Create a mapping of session_id to MS status
session_to_ms_status = {}
for session_id in unique_sessions:
    # Get all rows for this session
    session_mask = metadata['session_id'] == session_id
    # If any row has MS, the whole session is labeled as MS
    has_ms = any(metadata.loc[session_mask, 'has_ms'] == 1)
    session_to_ms_status[session_id] = 1 if has_ms else 0

# Create lists of session IDs by MS status
ms_sessions = [s for s, status in session_to_ms_status.items() if status == 1]
non_ms_sessions = [s for s, status in session_to_ms_status.items() if status == 0]

# Perform stratified split on MS and non-MS sessions separately
train_ms, temp_ms = train_test_split(ms_sessions, test_size=0.3, random_state=42, shuffle=True)
train_non_ms, temp_non_ms = train_test_split(non_ms_sessions, test_size=0.3, random_state=42, shuffle=True)

# Further split temp sets into validation and test
val_ms, test_ms = train_test_split(temp_ms, test_size=0.5, random_state=42, shuffle=True)
val_non_ms, test_non_ms = train_test_split(temp_non_ms, test_size=0.5, random_state=42, shuffle=True)

# Combine MS and non-MS sessions for each split
train_sessions = train_ms + train_non_ms
val_sessions = val_ms + val_non_ms
test_sessions = test_ms + test_non_ms

train_indices = metadata['session_id'].isin(train_sessions)
val_indices = metadata['session_id'].isin(val_sessions)
test_indices = metadata['session_id'].isin(test_sessions)

In [10]:
# Downsample X to have 10 timesteps instead of 100
X = X_original.copy()#reshape(X_original.shape[0], 25, -1, X_original.shape[2]).mean(axis=2)

X_train, X_val, X_test = X[train_indices], X[val_indices], X[test_indices]
y_train, y_val, y_test = y[train_indices], y[val_indices], y[test_indices]

# Normalize data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.reshape(-1, X.shape[2])).reshape(X_train.shape)
X_val = scaler.transform(X_val.reshape(-1, X.shape[2])).reshape(X_val.shape)
X_test = scaler.transform(X_test.reshape(-1, X.shape[2])).reshape(X_test.shape)

# Session IDs for reference
train_sessions_ids = sessions[train_indices]
val_sessions_ids = sessions[val_indices]
test_sessions_ids = sessions[test_indices]

window_size = 4

# Create datasets
train_dataset = SensorDataset(X_train, y_train, train_sessions_ids, metadata, window_size=window_size)
val_dataset = SensorDataset(X_val, y_val, val_sessions_ids, metadata, window_size=window_size)
test_dataset = SensorDataset(X_test, y_test, test_sessions_ids, metadata, window_size=window_size)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

KeyboardInterrupt: 

### Traditional

In [None]:
X_train_features, y_train = train_dataset.X, train_dataset.y
X_val_features, y_val = val_dataset.X, val_dataset.y
X_test_features, y_test = test_dataset.X, test_dataset.y

In [None]:
X_combined = np.vstack((X_train_features, X_val_features))
y_combined = np.concatenate((y_train, y_val))

# Best parameters: {'learning_rate': 0.2, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 1.0}
optimized_gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.2,
    max_depth=5,
    min_samples_split=2,
    random_state=42
)

optimized_gb.fit(X_combined, y_combined)
y_test_pred = optimized_gb.predict(X_test_features)

# Calculate metrics
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

Accuracy: 0.9519
Precision: 0.9020
Recall: 0.9604
F1 Score: 0.9303

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96      1509
           1       0.90      0.96      0.93       757

    accuracy                           0.95      2266
   macro avg       0.94      0.95      0.95      2266
weighted avg       0.95      0.95      0.95      2266



#### GB Regular CV:
Accuracy: 0.8883 ± 0.0505
Precision: 0.8854 ± 0.0345
Recall: 0.7929 ± 0.1382
F1 Score: 0.8308 ± 0.0894

In [None]:
import pickle

# Save the model to a file
with open('optimized_gb_model.pkl', 'wb') as file:
    pickle.dump(optimized_gb, file)

print("Model saved to 'optimized_gb_model2.pkl'")

Model saved to 'optimized_gb_model.pkl'


In [42]:
import numpy as np
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Combine training and validation sets
X_combined = np.vstack((X_train_features, X_val_features))
y_combined = np.concatenate((y_train, y_val))

# Define the base estimator with your tuned parameters
base_gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.2,
    max_depth=3,
    min_samples_split=2,
    random_state=42
)

# Create an ensemble of multiple instances of your GradientBoostingClassifier
ensemble_gb = BaggingClassifier(
    estimator=base_gb,
    n_estimators=5,          # number of copies
    max_samples=0.8,         # each model trains on 80% of the combined data (bootstrapped)
    bootstrap=True,
    random_state=42
)

# Train the ensemble on the combined training set
ensemble_gb.fit(X_combined, y_combined)

# Evaluate on the test set
y_test_pred = ensemble_gb.predict(X_test_features)

accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

print(f"Ensemble Accuracy: {accuracy:.4f}")
print(f"Ensemble Precision: {precision:.4f}")
print(f"Ensemble Recall: {recall:.4f}")
print(f"Ensemble F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

Ensemble Accuracy: 0.9629
Ensemble Precision: 0.9170
Ensemble Recall: 0.9775
Ensemble F1 Score: 0.9463

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      1509
           1       0.92      0.98      0.95       757

    accuracy                           0.96      2266
   macro avg       0.95      0.97      0.96      2266
weighted avg       0.96      0.96      0.96      2266



In [1]:
from data_preprocessing import get_train_test_datasets
import numpy as np
import pandas as pd
import pickle
import random
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import torch


def set_seeds(seed=42):
    torch.manual_seed(seed)
    torch.mps.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

set_seeds()

device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(f"Using device: {device}")

def evaluate_traditional_model(model, test_dataset):
    X_test, y_test = test_dataset.X, test_dataset.y
    y_test_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_test_pred))

def save_model(model, output_path):
    with open(output_path, 'wb') as file:
        pickle.dump(model, file)

    print(f"Model saved to {output_path}")

Using device: mps


In [None]:
X = np.load('sensor_data.npy')
metadata = pd.read_csv('combined_metadata.csv')
y = metadata['has_ms'].values

train_dataset, test_dataset, scaler = get_train_test_datasets(X, y, metadata, test_size=0.1)

In [3]:
def train_traditional_model(train_dataset, test_dataset=None, save_path=''):
    X_train, y_train = train_dataset.X, train_dataset.y

    base_gb = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.2,
        max_depth=3,
        min_samples_split=2,
        random_state=42
    )

    # Create an ensemble of multiple instances of your GradientBoostingClassifier
    ensemble_gb = BaggingClassifier(
        estimator=base_gb,
        n_estimators=5,
        max_samples=0.9,
        bootstrap=True,
        random_state=42
    )

    # Train the ensemble on the training set
    ensemble_gb.fit(X_train, y_train)

    if test_dataset:
        evaluate_traditional_model(ensemble_gb, test_dataset)

    if save_path != '':
        save_model(ensemble_gb, save_path)

    return ensemble_gb

model = train_traditional_model(train_dataset, test_dataset)

Accuracy: 0.9541
Precision: 0.9568
Recall: 0.9061
F1 Score: 0.9307

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      1937
           1       0.96      0.91      0.93      1001

    accuracy                           0.95      2938
   macro avg       0.95      0.94      0.95      2938
weighted avg       0.95      0.95      0.95      2938



In [None]:
from data_preprocessing import prep_collection_for_inference

print(prep_collection_for_inference('./data/test', scaler))

1


In [3]:
from sklearn.model_selection import GridSearchCV

gb = GradientBoostingClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_leaf': [1, 3, 5]
}

grid = GridSearchCV(gb, param_grid, cv=3, n_jobs=-1)
grid.fit(train_dataset.X, train_dataset.y)
best_gb = grid.best_estimator_

In [6]:
from sklearn.feature_selection import RFECV

gb = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.2,
        max_depth=3,
        min_samples_split=2,
        random_state=42
    )

selector = RFECV(estimator=gb, step=1, cv=5, n_jobs=-1)
selector.fit(train_dataset.X, train_dataset.y)

selector.ranking_

[97,  1,  2, 13, 36, 67,  1, 22,  9,  1, 57,  8, 34, 72, 84, 17, 43,
    7, 66, 14,  1,  1, 19, 46, 74, 65, 85, 33, 51,  1, 55, 29, 32, 87,
    1,  1,  1,  1, 93, 94, 90, 95, 40,  1, 41, 49, 39, 35,  1,  1,  1,
    25, 24, 81, 54, 92, 96,  1, 18, 45, 37, 77,  1,  1,  1,  1, 91, 89,
    53, 70, 83,  6, 15, 38, 44, 23,  1,  1,  1, 16, 88, 86, 52, 61, 63,
    1, 27, 62, 20, 21,  1,  1, 12,  3, 58, 71, 64, 50, 82,  5, 10,  1,
    73, 75, 28, 78, 80, 11, 26, 60, 68,  1, 31,  1, 56, 69,  4, 30, 79,
    1, 59, 47, 76, 48,  1, 42,  1]

In [7]:
selector.ranking_

array([97,  1,  2, 13, 36, 67,  1, 22,  9,  1, 57,  8, 34, 72, 84, 17, 43,
        7, 66, 14,  1,  1, 19, 46, 74, 65, 85, 33, 51,  1, 55, 29, 32, 87,
        1,  1,  1,  1, 93, 94, 90, 95, 40,  1, 41, 49, 39, 35,  1,  1,  1,
       25, 24, 81, 54, 92, 96,  1, 18, 45, 37, 77,  1,  1,  1,  1, 91, 89,
       53, 70, 83,  6, 15, 38, 44, 23,  1,  1,  1, 16, 88, 86, 52, 61, 63,
        1, 27, 62, 20, 21,  1,  1, 12,  3, 58, 71, 64, 50, 82,  5, 10,  1,
       73, 75, 28, 78, 80, 11, 26, 60, 68,  1, 31,  1, 56, 69,  4, 30, 79,
        1, 59, 47, 76, 48,  1, 42,  1])

selector.support:

[False,  True, False, False, False, False,  True, False, False,
    True, False, False, False, False, False, False, False, False,
    False, False,  True,  True, False, False, False, False, False,
    False, False,  True, False, False, False, False,  True,  True,
    True,  True, False, False, False, False, False,  True, False,
    False, False, False,  True,  True,  True, False, False, False,
    False, False, False,  True, False, False, False, False,  True,
    True,  True,  True, False, False, False, False, False, False,
    False, False, False, False,  True,  True,  True, False, False,
    False, False, False, False,  True, False, False, False, False,
    True,  True, False, False, False, False, False, False, False,
    False, False,  True, False, False, False, False, False, False,
    False, False, False,  True, False,  True, False, False, False,
    False, False,  True, False, False, False, False,  True, False,
    True]

In [8]:
selector.support_ 

array([False,  True, False, False, False, False,  True, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False,  True,  True, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True,  True,
        True,  True, False, False, False, False, False,  True, False,
       False, False, False,  True,  True,  True, False, False, False,
       False, False, False,  True, False, False, False, False,  True,
        True,  True,  True, False, False, False, False, False, False,
       False, False, False, False,  True,  True,  True, False, False,
       False, False, False, False,  True, False, False, False, False,
        True,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False,  True, False,  True, False, False, False,
       False, False,  True, False, False, False, False,  True, False,
        True])

In [10]:
total_features = train_dataset.X.shape[1]
num_sensors = (total_features - 1) // 14  # each sensor contributes 14 features
base_features = ["mean", "std", "min", "max", "median", "skew", "kurtosis",
                 "25th_percentile", "75th_percentile", "ptp", "sum_psd", "mean_psd",
                 "max_psd", "freq_at_max_psd"]
feature_mapping = {}
idx = 0
for sensor in range(num_sensors):
    for feat in base_features:
        feature_mapping[idx] = f"sensor_{sensor}_{feat}"
        idx += 1
feature_mapping[idx] = "window_duration_mean"

for i in range(total_features):
    print(f"Feature index {i}: {feature_mapping[i]}, Ranking: {selector.ranking_[i]}, Selected: {selector.support_[i]}")

Feature index 0: sensor_0_mean, Ranking: 97, Selected: False
Feature index 1: sensor_0_std, Ranking: 1, Selected: True
Feature index 2: sensor_0_min, Ranking: 2, Selected: False
Feature index 3: sensor_0_max, Ranking: 13, Selected: False
Feature index 4: sensor_0_median, Ranking: 36, Selected: False
Feature index 5: sensor_0_skew, Ranking: 67, Selected: False
Feature index 6: sensor_0_kurtosis, Ranking: 1, Selected: True
Feature index 7: sensor_0_25th_percentile, Ranking: 22, Selected: False
Feature index 8: sensor_0_75th_percentile, Ranking: 9, Selected: False
Feature index 9: sensor_0_ptp, Ranking: 1, Selected: True
Feature index 10: sensor_0_sum_psd, Ranking: 57, Selected: False
Feature index 11: sensor_0_mean_psd, Ranking: 8, Selected: False
Feature index 12: sensor_0_max_psd, Ranking: 34, Selected: False
Feature index 13: sensor_0_freq_at_max_psd, Ranking: 72, Selected: False
Feature index 14: sensor_1_mean, Ranking: 84, Selected: False
Feature index 15: sensor_1_std, Ranking: 17,

In [5]:
support = [False,  True, False, False, False, False,  True, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False,  True,  True, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True,  True,
        True,  True, False, False, False, False, False,  True, False,
       False, False, False,  True,  True,  True, False, False, False,
       False, False, False,  True, False, False, False, False,  True,
        True,  True,  True, False, False, False, False, False, False,
       False, False, False, False,  True,  True,  True, False, False,
       False, False, False, False,  True, False, False, False, False,
        True,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False,  True, False,  True, False, False, False,
       False, False,  True, False, False, False, False,  True, False,
        True]

X_train_selected = train_dataset.X[:, support]
X_test_selected = test_dataset.X[:, support]

gb = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.2,
        max_depth=3,
        min_samples_split=2,
        random_state=42
    )

ensemble_gb = BaggingClassifier(
    estimator=gb,
    n_estimators=5,
    max_samples=0.9,
    bootstrap=True,
    random_state=42
)

# Train the ensemble on the training set
ensemble_gb.fit(X_train_selected, train_dataset.y)

y_test = test_dataset.y
y_test_pred = ensemble_gb.predict(X_test_selected)

accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))


Accuracy: 0.9568
Precision: 0.9679
Recall: 0.9031
F1 Score: 0.9344

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      1937
           1       0.97      0.90      0.93      1001

    accuracy                           0.96      2938
   macro avg       0.96      0.94      0.95      2938
weighted avg       0.96      0.96      0.96      2938

