In [1]:
print("Installing necessary libraries...")
import sys
!{sys.executable} -m pip install -q pandas scikit-learn xgboost imbalanced-learn
print("Libraries installed successfully.")

Installing necessary libraries...
Libraries installed successfully.


In [1]:
print("Connecting to Google Drive...")

# Mount point under which Drive contents become visible to the notebook.
DRIVE_MOUNT_POINT = '/content/drive'

try:
    # Imported lazily: the module is only importable inside a Colab runtime.
    from google.colab import drive as colab_drive
    colab_drive.mount(DRIVE_MOUNT_POINT, force_remount=True)
except Exception as mount_error:
    # Report the reason, then re-raise so later cells do not run without data.
    print(f"Failed to connect to Google Drive: {mount_error}")
    raise
else:
    print("Google Drive successfully mounted.")

Connecting to Google Drive...
Mounted at /content/drive
Google Drive successfully mounted.


In [2]:
# Google Drive locations of the training CSV and of the folder holding the test-subject CSVs.
# NOTE(review): both constants are assigned again, with identical values, in the
# configuration section of the following cell; this cell looks redundant.
TRAINING_CSV_PATH = '/content/drive/MyDrive/ANPHY/master_training_dataset_imputed.csv'
TEST_DATA_FOLDER = '/content/drive/MyDrive/ANPHY/test_subjects/'

In [3]:
import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# --- Core ML Libraries ---
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier

# --- Models ---
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression

# --- Imbalance Handling ---
from imblearn.over_sampling import ADASYN

# Suppress warnings for a cleaner output
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation and convergence warnings from the libraries imported above.
warnings.filterwarnings('ignore')

# =============================================================================
# --- Configuration ---
# =============================================================================
# CSV file read as the training set.
TRAINING_CSV_PATH = '/content/drive/MyDrive/ANPHY/master_training_dataset_imputed.csv'
# Folder that is globbed for '*.csv' files, one evaluation run per file.
TEST_DATA_FOLDER = '/content/drive/MyDrive/ANPHY/test_subjects/'
# Name of the label column used as the classification target.
TARGET_COLUMN = 'sleep_stage'
# Number of consecutive rows (epochs) in each rolling window of the trend features.
ROLLING_WINDOW_SIZE = 5 # 5 epochs = 2.5 minutes

# =============================================================================
# --- Helper Functions ---
# =============================================================================
def create_trend_features(df, window_size):
    """
    Add rolling-mean and rolling-std trend columns for the key signals.

    The key signals are 'emg_rms', 'eog_std', 'eog_kurtosis' (each only if
    present) plus every column whose name contains 'hjorth'. Windows are
    computed separately for each 'subject_id' group so that a window never
    spans two subjects.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; must contain a 'subject_id' column. It is not modified.
    window_size : int
        Number of consecutive rows in each rolling window.

    Returns
    -------
    pandas.DataFrame
        All subjects concatenated (in groupby order, with a fresh 0..n-1 index),
        carrying the original columns plus '<col>_rolling_mean_<window_size>'
        and '<col>_rolling_std_<window_size>' for every key signal.

    Raises
    ------
    KeyError
        If 'subject_id' is missing.
    ValueError
        If `df` yields no subject groups (nothing to concatenate).
    """
    print("Creating advanced trend features...")
    # Define the key features we want to create trends for
    key_features = ['emg_rms', 'eog_std', 'eog_kurtosis'] + [col for col in df.columns if 'hjorth' in col]
    # Resolve once which of them actually exist instead of re-checking per subject.
    present_features = [col for col in key_features if col in df.columns]

    all_subjects_data = []
    for _, subject_df in df.groupby('subject_id'):
        subject_df = subject_df.reset_index(drop=True)
        for col in present_features:
            # min_periods=1 lets the first rows produce a mean from a partial window;
            # the std of a single observation is still NaN and is filled below.
            windowed = subject_df[col].rolling(window=window_size, min_periods=1)
            subject_df[f'{col}_rolling_mean_{window_size}'] = windowed.mean()
            subject_df[f'{col}_rolling_std_{window_size}'] = windowed.std()

        # Fill the leading NaNs (backward first, then forward for anything left).
        # DataFrame.bfill()/ffill() replace the deprecated fillna(method=...) form.
        subject_df = subject_df.bfill().ffill()
        all_subjects_data.append(subject_df)

    return pd.concat(all_subjects_data, ignore_index=True)

# =============================================================================
# --- Main Execution ---
# =============================================================================
print("--- Starting State-of-the-Art Stacking Ensemble Pipeline ---")

# --- 1. Load, Engineer, and Preprocess Training Data ---
print("\n--- [Phase 1] Preparing Training Data ---")
train_df_raw = pd.read_csv(TRAINING_CSV_PATH)
# Add the rolling trend features (computed per subject)
train_df = create_trend_features(train_df_raw, ROLLING_WINDOW_SIZE)

# Everything except identifiers and label columns is a model input. The configured
# TARGET_COLUMN is excluded explicitly so that changing the target can never
# leak the label into the feature matrix.
non_feature_cols = {'subject_id', 'sleep_stage', 'is_rem', TARGET_COLUMN}
feature_cols = [c for c in train_df.columns if c not in non_feature_cols]
X_train = train_df[feature_cols]
y_train_text = train_df[TARGET_COLUMN]

# Encode the text labels as integers; class_names keeps the index -> label mapping.
le = LabelEncoder()
y_train = le.fit_transform(y_train_text)
class_names = le.classes_
print(f"Labels encoded. Class order: {list(class_names)}")

# The scaler is fitted on the training data only and reused for every test subject.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Oversample the training set with synthetic samples to balance the classes.
print("Handling class imbalance with ADASYN...")
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_scaled, y_train)
print(f"Resampled training data shape: {X_train_resampled.shape}")

# --- 2. Build and Train the Stacking Ensemble Model ---
print("\n--- [Phase 2] Building and Training the Ensemble of Experts ---")

# Base learners ("experts"), each built with a fixed seed.
xgb_expert = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.2,
    subsample=0.7,
    colsample_bytree=0.7,
    random_state=42,
    n_jobs=-1,
)
lgbm_expert = lgb.LGBMClassifier(random_state=42, n_jobs=-1)
rf_expert = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)
mlp_expert = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300, random_state=42)

estimators = [
    ('xgb', xgb_expert),
    ('lgbm', lgbm_expert),
    ('rf', rf_expert),
    ('mlp', mlp_expert),
]

# Meta-learner ("manager"): combines the base learners' predictions.
meta_model = LogisticRegression(max_iter=1000)

# The stacker produces the meta-learner's training inputs via 3-fold cross-validation.
# NOTE(review): it is fitted on the already-oversampled data, so synthetic samples
# may end up in a different fold than the real samples they were derived from --
# confirm this is acceptable for the internal CV.
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_model,
    cv=3,
    n_jobs=-1,
)

print("Training the full Stacking Ensemble... (this may take some time)")
stacking_model.fit(X_train_resampled, y_train_resampled)
print("Ensemble model trained successfully.")

# --- 3. Final Evaluation on Each Unseen Test Subject ---
print("\n--- [Phase 3] Evaluating Tuned Model on Unseen Test Subjects ---")
# Sorted so that subjects are evaluated in the same order on every run.
test_files = sorted(glob.glob(os.path.join(TEST_DATA_FOLDER, '*.csv')))
all_results = []

if not test_files:
    print(f"No test CSV files found in: {TEST_DATA_FOLDER}")

# Encoded ids of every training class. Passing these to classification_report
# keeps it aligned with class_names even when a subject lacks some stage;
# without `labels`, a subject missing a class raises a ValueError.
all_label_ids = np.arange(len(class_names))
# Encoded id of the REM class, used for the REM-only F1 score.
rem_label_id = list(class_names).index('REM')

for test_file in test_files:
    # The subject id is the part of the file name before the first underscore.
    subject_id = os.path.basename(test_file).split('_')[0]
    print("\n" + "="*60)
    print(f"--- Testing on Subject: {subject_id} ---")

    # Load and prepare this single subject's data
    raw_test_df = pd.read_csv(test_file)
    test_df = create_trend_features(raw_test_df, ROLLING_WINDOW_SIZE)

    # Select the training feature columns, in training order, and the labels
    X_test = test_df[feature_cols]
    y_test_text = test_df[TARGET_COLUMN]

    # Preprocess with the encoder and scaler fitted on the training data
    y_test = le.transform(y_test_text)
    X_test_scaled = scaler.transform(X_test)

    # Evaluate the trained ensemble on this subject
    y_pred = stacking_model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    rem_f1 = f1_score(y_test, y_pred, labels=[rem_label_id], average='macro', zero_division=0)

    print(f"Performance for {subject_id}: Accuracy={accuracy:.3f}, REM F1={rem_f1:.3f}")
    print("\nClassification Report:")
    print(classification_report(
        y_test,
        y_pred,
        labels=all_label_ids,
        target_names=class_names,
        zero_division=0,
    ))

    all_results.append({'Subject': subject_id, 'Accuracy': accuracy, 'REM_F1_Score': rem_f1})

# --- 4. Grand Finale: Averaged Performance Summary ---
print("\n" + "="*60)
print("--- FINAL RESULT: AVERAGED PERFORMANCE ACROSS ALL TEST SUBJECTS ---")
# Declaring the columns up front keeps the frame well-formed when no subject
# was evaluated; otherwise the column lookups below raise a KeyError.
results_df = pd.DataFrame(all_results, columns=['Subject', 'Accuracy', 'REM_F1_Score'])
if results_df.empty:
    print("No test subjects were evaluated; the averages below are undefined (nan).")
avg_accuracy = results_df['Accuracy'].mean()
# Sample standard deviation: nan when fewer than two subjects were evaluated.
std_accuracy = results_df['Accuracy'].std()
avg_rem_f1 = results_df['REM_F1_Score'].mean()
std_rem_f1 = results_df['REM_F1_Score'].std()
print(f"Average Overall Accuracy: {avg_accuracy:.4f} ± {std_accuracy:.4f}")
print(f"Average REM F1-Score:   {avg_rem_f1:.4f} ± {std_rem_f1:.4f}")
print("="*60)


--- Starting State-of-the-Art Stacking Ensemble Pipeline ---

--- [Phase 1] Preparing Training Data ---
Creating advanced trend features...
Labels encoded. Class order: ['N1', 'N2', 'N3', 'REM', 'Wake']
Handling class imbalance with ADASYN...
Resampled training data shape: (41112, 153)

--- [Phase 2] Building and Training the Ensemble of Experts ---
Training the full Stacking Ensemble... (this may take some time)
Ensemble model trained successfully.

--- [Phase 3] Evaluating Tuned Model on Unseen Test Subjects ---

--- Testing on Subject: EPCTL04 ---
Creating advanced trend features...
Performance for EPCTL04: Accuracy=0.852, REM F1=0.792

Classification Report:
              precision    recall  f1-score   support

          N1       0.55      0.44      0.49        95
          N2       0.84      0.88      0.86       214
          N3       0.99      0.90      0.94       124
         REM       0.79      0.80      0.79        74
        Wake       0.91      0.97      0.94       255

   

In [4]:
import joblib
import os

# =============================================================================
# --- Configuration ---
# =============================================================================
# Destination folder for the persisted pipeline components (created on demand).
MODEL_OUTPUT_FOLDER = '/content/drive/MyDrive/ANPHY/final_model/'
os.makedirs(MODEL_OUTPUT_FOLDER, exist_ok=True)

# One file per pipeline component.
MODEL_FILE_PATH = os.path.join(MODEL_OUTPUT_FOLDER, 'sleep_staging_model.joblib')
SCALER_FILE_PATH = os.path.join(MODEL_OUTPUT_FOLDER, 'feature_scaler.joblib')
LABEL_ENCODER_PATH = os.path.join(MODEL_OUTPUT_FOLDER, 'label_encoder.joblib')


def _dump_with_log(artifact, destination, announcement, confirmation):
    """Print the announcement with the target path, dump the object, then confirm."""
    print(f"{announcement}{destination}")
    joblib.dump(artifact, destination)
    print(confirmation)


# =============================================================================
# --- Main Execution ---
# =============================================================================
banner = "=" * 60
print("\n" + banner)
print("--- SAVING THE FINAL TRAINED MODEL AND PIPELINE COMPONENTS ---")
print(banner)

# --- 1. The stacking ensemble fitted in the training cell ---
_dump_with_log(
    stacking_model,
    MODEL_FILE_PATH,
    "Saving the trained Stacking Ensemble model to: ",
    "Model saved successfully.",
)

# --- 2. The feature scaler ---
# New data has to be scaled with the statistics learned from the training set.
_dump_with_log(
    scaler,
    SCALER_FILE_PATH,
    "\nSaving the feature scaler to: ",
    "Scaler saved successfully.",
)

# --- 3. The label encoder ---
# Needed to translate the model's integer predictions back to stage names.
_dump_with_log(
    le,
    LABEL_ENCODER_PATH,
    "\nSaving the label encoder to: ",
    "Label encoder saved successfully.",
)

print("\n--- Saving Complete ---")
print("You can now load these three files in a new script to make predictions on new subjects without retraining.")




--- SAVING THE FINAL TRAINED MODEL AND PIPELINE COMPONENTS ---
Saving the trained Stacking Ensemble model to: /content/drive/MyDrive/ANPHY/final_model/sleep_staging_model.joblib
Model saved successfully.

Saving the feature scaler to: /content/drive/MyDrive/ANPHY/final_model/feature_scaler.joblib
Scaler saved successfully.

Saving the label encoder to: /content/drive/MyDrive/ANPHY/final_model/label_encoder.joblib
Label encoder saved successfully.

--- Saving Complete ---
You can now load these three files in a new script to make predictions on new subjects without retraining.
