In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02010030rest 20160324 1054..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02020025rest 20150713 1519..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02010013rest 20150703 1333..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02020016rest 20150701 1040..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02020015_rest 20150630 1527.csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02010022restnew 20150724 14.csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02020027rest 20150713 1049..csv
/kaggle/input/preprocessed-raw-mat-csv/mat-csv-actual/mat-csv-actual/csv_files_from_mat2/02010008_rest 20150619 1653.csv
/kaggle/input/preprocessed-raw-m

# RAW CSV USING leave-one-out cross-validation with paramter tuning

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, LeaveOneGroupOut, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_curve, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.calibration import CalibratedClassifierCV
from scipy.stats import randint, uniform
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# --- 1. SETUP AND DATA LOADING ---
directory_path = '/kaggle/input/preprocessed-raw-mat-csv/raw-csv-actual/raw-csv-actual/csv_from_raw1'
output_dir = '/kaggle/working'
os.makedirs(output_dir, exist_ok=True)

subject_labels = {
    # --- Depressed (MDD) Patients - Label 1 ---
    '0201': 1, '0202': 1, # Add all MDD subject ID prefixes here
    
    # --- Healthy Controls (HC) - Label 0 ---
    '0203': 0, '0204': 0, # Add all HC subject ID prefixes here
}

print("Loading, sampling, and labeling data...")
data_frames = []
for filename in os.listdir(directory_path):
    subject_prefix = filename[:4]
    if subject_prefix in subject_labels:
        data = pd.read_csv(os.path.join(directory_path, filename))
        data['label'] = subject_labels[subject_prefix]
        data['subject'] = subject_prefix
        sample = data.sample(frac=0.25, random_state=42)
        data_frames.append(sample)

combined_data = pd.concat(data_frames, ignore_index=True)
num_subjects = len(combined_data['subject'].unique())
print(f"Loaded {combined_data.shape[0]} rows of data from {num_subjects} unique subjects.")

# --- NEW CODE BLOCK: VERIFY THE LABELED DATA ---
print("\n--- Verifying the Combined and Labeled DataFrame ---")
print("First 5 rows of the dataset:")
print(combined_data.head())
print("\nLast 5 rows of the dataset:")
print(combined_data.tail())
print("\nDistribution of Labels (0=Healthy, 1=Depressed):")
print(combined_data['label'].value_counts())
print("--- Verification Complete ---\n")
# --- END OF NEW CODE BLOCK ---

if 'time' in combined_data.columns: combined_data.drop(columns=['time'], inplace=True)

# --- 2. FEATURE/TARGET DEFINITION AND PREPROCESSING ---
y = combined_data['label']
groups = combined_data['subject']
X = combined_data.drop(columns=['label', 'subject'])

if X.isnull().sum().sum() > 0:
    print(f"Found {X.isnull().sum().sum()} missing values. Imputing with column means.")
    X = X.fillna(X.mean())

# --- 3. HYPERPARAMETER TUNING ---
print("\n--- Starting Hyperparameter Tuning ---")
param_dist = {
    'KNN': {'pca__n_components': randint(20, 50), 'knn__n_neighbors': randint(3, 20)},
    'Random Forest': {'n_estimators': randint(100, 500), 'max_depth': [10, 20, 30, None]},
    'XGBoost': {'n_estimators': randint(100, 500), 'learning_rate': uniform(0.01, 0.2), 'max_depth': randint(3, 10)},
    'Linear SVM': {'base_estimator__C': uniform(0.1, 10)},
    'Logistic Regression': {'C': uniform(0.1, 10)}
}

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

best_params = {}
models_to_tune = [ 'Random Forest', 'XGBoost', 'Linear SVM', 'Logistic Regression']

for name in models_to_tune:
    print(f"Tuning {name}...")
    if name == 'KNN':
        model_instance = Pipeline([('pca', PCA()), ('knn', KNeighborsClassifier(n_jobs=-1))])
    else:
        model_instance = {
        'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1),
        'Linear SVM': CalibratedClassifierCV(LinearSVC(random_state=42, dual=False, max_iter=3000)),
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
    }[name]
    
    X_train_tune, _, y_train_tune, _ = train_test_split(X_scaled, y, train_size=0.5, stratify=y, random_state=42)
    
    random_search = RandomizedSearchCV(estimator=model_instance, param_distributions=param_dist[name], 
                                       n_iter=10, cv=2, scoring='accuracy', n_jobs=-1, random_state=42, verbose=1)
    random_search.fit(X_train_tune, y_train_tune)
    best_params[name] = random_search.best_params_
    print(f"Best params for {name}: {best_params[name]}")

# --- 4. DEFINE MODELS WITH TUNED PARAMETERS ---
# tuned_knn_pipeline = Pipeline([
#     ('pca', PCA(n_components=best_params.get('KNN', {}).get('pca__n_components', 30))),
#     ('knn', KNeighborsClassifier(n_neighbors=best_params.get('KNN', {}).get('knn__n_neighbors', 5), n_jobs=-1))
# ])

tuned_models = {
    'Tuned KNN (PCA)': tuned_knn_pipeline,
    'Tuned Linear SVM': CalibratedClassifierCV(LinearSVC(C=best_params.get('Linear SVM', {}).get('base_estimator__C', 1.0), random_state=42, dual=False, max_iter=3000)),
    'Tuned RF': RandomForestClassifier(random_state=42, n_jobs=-1, **best_params.get('Random Forest', {})),
    'Tuned XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1, **best_params.get('XGBoost', {})),
    'Tuned LogReg': LogisticRegression(max_iter=1000, random_state=42, **best_params.get('Logistic Regression', {})),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'LDA': LinearDiscriminantAnalysis()
}
cmap = plt.colormaps.get('tab10')
colors = [cmap(i) for i in np.linspace(0, 1, len(tuned_models))]

# --- 5. GENERATE PLOTS FROM A SINGLE TRAIN-TEST SPLIT (for visualization) ---
print("\n--- Generating Reports and ROC Curves from a single split ---")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

fig_reports, axes_reports = plt.subplots(4, 2, figsize=(18, 24)); axes_reports = axes_reports.flatten()
fig_reports.suptitle('Classification Reports for All Models (Raw EEG Data)', fontsize=24, y=1.0)
if len(tuned_models) % 2 != 0: fig_reports.delaxes(axes_reports[-1])

fig_roc, ax_roc = plt.subplots(figsize=(12, 10))
ax_roc.set_title('ROC Curves for Depression Classification (Raw EEG Data)', fontsize=16)

for i, (model_name, model) in enumerate(tuned_models.items()):
    print(f"Fitting {model_name} for plotting...")
    pipeline_plot = Pipeline([('scaler', StandardScaler()), ('model', model)])
    pipeline_plot.fit(X_train, y_train)
    y_pred = pipeline_plot.predict(X_test)
    
    report_text = classification_report(y_test, y_pred, target_names=['Healthy', 'Depressed'])
    axes_reports[i].axis('off'); axes_reports[i].set_title(model_name, fontsize=16, pad=20)
    axes_reports[i].text(0.01, 0.95, report_text, family='monospace', fontsize=14, va='top')
    
    if hasattr(pipeline_plot, "predict_proba"):
        y_proba = pipeline_plot.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        roc_auc = roc_auc_score(y_test, y_proba)
        ax_roc.plot(fpr, tpr, color=colors[i], lw=2.5, label=f'{model_name} (AUC = {roc_auc:.3f})')

fig_reports.tight_layout(pad=3.0)
fig_reports.savefig(os.path.join(output_dir, "all_classification_reports_raw_csv.png"))
plt.close(fig_reports)
print(f"Saved consolidated classification reports to: {os.path.join(output_dir, 'all_classification_reports_raw_csv.png')}")

ax_roc.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Guess')
ax_roc.legend(loc='lower right', fontsize=12)
ax_roc.grid(alpha=0.5)
fig_roc.savefig(os.path.join(output_dir, "all_roc_curves_raw_csv.png"))
plt.close(fig_roc)
print(f"Saved consolidated ROC curves to: {os.path.join(output_dir, 'all_roc_curves_raw_csv.png')}")

# --- 6. LEAVE-ONE-SUBJECT-OUT CROSS-VALIDATION ---
print("\n--- Performing Leave-One-Subject-Out Cross-Validation ---")
logo = LeaveOneGroupOut()
cv_scores = {}
for model_name, model in tuned_models.items():
    print(f"Evaluating {model_name} with LOSO-CV...")
    pipeline_cv = Pipeline([('scaler', StandardScaler()), ('model', model)])
    scores = cross_val_score(pipeline_cv, X, y, cv=logo, groups=groups, scoring='accuracy', n_jobs=-1)
    cv_scores[model_name] = scores
    print(f"{model_name} Mean Accuracy: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# --- 7. VISUALIZE LOSO-CV RESULTS ---
fig_cv, ax_cv = plt.subplots(figsize=(14, 9))
model_names = list(cv_scores.keys())
mean_accuracies = [np.mean(s) for s in cv_scores.values()]
std_devs = [np.std(s) for s in cv_scores.values()]
ax_cv.bar(model_names, mean_accuracies, yerr=std_devs, capsize=5, color=colors, alpha=0.8)
ax_cv.set_title(f'Leave-One-Subject-Out Cross-Validation Accuracy (n={num_subjects} subjects)', fontsize=16)
ax_cv.set_ylabel('Mean Accuracy', fontsize=14); ax_cv.set_ylim(0, 1.05)
plt.xticks(rotation=45, ha="right")
for i, acc in enumerate(mean_accuracies): ax_cv.text(i, acc + 0.05, f"{acc:.3f}", ha='center')
fig_cv.tight_layout()
fig_cv.savefig(os.path.join(output_dir, "loso_cv_accuracy_barchart.png"))
plt.close(fig_cv)
print(f"Saved LOSO-CV results chart to: {os.path.join(output_dir, 'loso_cv_accuracy_barchart.png')}")

# --- 8. GENERATE LEARNING CURVES ---
print("\n--- Generating Learning Curves ---")
fig_lc, axes_lc = plt.subplots(4, 2, figsize=(18, 24), sharey=True)
if len(tuned_models) % 2 != 0: fig_lc.delaxes(axes_lc.flatten()[-1])
axes_lc = axes_lc.flatten()
fig_lc.suptitle('Learning Curves for All Models', fontsize=24, y=1.0)
train_sizes_abs = np.linspace(0.1, 1.0, 7)

for i, (model_name, model) in enumerate(tuned_models.items()):
    print(f"Generating learning curve for {model_name}...")
    pipeline_lc = Pipeline([('scaler', StandardScaler()), ('model', model)])
    train_sizes, train_scores, test_scores = learning_curve(
        pipeline_lc, X, y, cv=5, n_jobs=-1, train_sizes=train_sizes_abs, scoring='accuracy')
    
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    
    ax = axes_lc[i]
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    ax.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    ax.set_title(model_name, fontsize=16)
    ax.set_xlabel("Training examples"); ax.set_ylabel("Accuracy")
    ax.grid(); ax.legend(loc="best")

fig_lc.tight_layout(pad=3.0)
fig_lc.savefig(os.path.join(output_dir, "all_learning_curves_raw_csv.png"))
plt.close(fig_lc)
print(f"Saved learning curves to: {os.path.join(output_dir, 'all_learning_curves_raw_csv.png')}")

print("\n--- Process Complete ---")

Loading, sampling, and labeling data...
Loaded 2434650 rows of data from 3 unique subjects.

--- Verifying the Combined and Labeled DataFrame ---
First 5 rows of the dataset:
         E1        E2        E3        E4            E5        E6  \
0  0.000004 -0.000004 -0.000010 -0.000004  4.862534e-06 -0.000009   
1  0.000009 -0.000008 -0.000006 -0.000002 -9.369657e-07  0.000002   
2  0.000003 -0.000004 -0.000009 -0.000002  9.702576e-08  0.000003   
3 -0.000008 -0.000002 -0.000007 -0.000001 -3.968677e-05  0.000007   
4 -0.000016 -0.000021 -0.000003 -0.000002  4.623818e-06 -0.000003   

             E7        E8        E9       E10  ...      E122      E123  \
0  1.255296e-07 -0.000012 -0.000010 -0.000009  ...  0.000010 -0.000011   
1  2.801928e-06 -0.000012 -0.000008 -0.000006  ... -0.000009 -0.000010   
2  4.612941e-06 -0.000014 -0.000010 -0.000011  ... -0.000004  0.000002   
3  1.789049e-06 -0.000012  0.000003  0.000003  ... -0.000010 -0.000004   
4  2.116167e-06 -0.000019 -0.000007 -0.0

KeyboardInterrupt: 

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, LeaveOneGroupOut, learning_curve, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.calibration import CalibratedClassifierCV
from scipy.stats import randint, uniform
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# --- 1. SETUP AND DATA LOADING ---
directory_path = '/kaggle/input/preprocessed-raw-mat-csv/raw-csv-actual/raw-csv-actual/csv_from_raw1'
output_dir = '/kaggle/working'
os.makedirs(output_dir, exist_ok=True)

subject_labels = {
    # --- Depressed (MDD) Patients - Label 1 ---
    '0201': 1, '0202': 1, # Add all MDD subject ID prefixes here
    
    # --- Healthy Controls (HC) - Label 0 ---
    '0203': 0, '0204': 0, # Add all HC subject ID prefixes here
}

print("Loading, sampling, and labeling data...")
data_frames = []
for filename in os.listdir(directory_path):
    subject_prefix = filename[:4]
    if subject_prefix in subject_labels:
        data = pd.read_csv(os.path.join(directory_path, filename))
        data['label'] = subject_labels[subject_prefix]
        data['subject'] = subject_prefix
        sample = data.sample(frac=0.1, random_state=42)  # CHANGED: Reduced frac to 0.1 from 0.25 to shrink data size overall
        data_frames.append(sample)

combined_data = pd.concat(data_frames, ignore_index=True)
num_subjects = len(combined_data['subject'].unique())
print(f"Loaded {combined_data.shape[0]} rows of data from {num_subjects} unique subjects.")

# --- NEW CODE BLOCK: VERIFY THE LABELED DATA ---
print("\n--- Verifying the Combined and Labeled DataFrame ---")
print("First 5 rows of the dataset:")
print(combined_data.head())
print("\nLast 5 rows of the dataset:")
print(combined_data.tail())
print("\nDistribution of Labels (0=Healthy, 1=Depressed):")
print(combined_data['label'].value_counts())
print("--- Verification Complete ---\n")
# --- END OF NEW CODE BLOCK ---

if 'time' in combined_data.columns: combined_data.drop(columns=['time'], inplace=True)

# --- 2. FEATURE/TARGET DEFINITION AND PREPROCESSING ---
y = combined_data['label']
groups = combined_data['subject']
X = combined_data.drop(columns=['label', 'subject'])

if X.isnull().sum().sum() > 0:
    print(f"Found {X.isnull().sum().sum()} missing values. Imputing with column means.")
    X = X.fillna(X.mean())

# --- 3. HYPERPARAMETER TUNING ---
print("\n--- Starting Hyperparameter Tuning ---")
param_dist = {
    'KNN': {'pca__n_components': randint(20, 50), 'knn__n_neighbors': randint(3, 10)},  # CHANGED: Narrowed n_neighbors range to 3-10 for faster tuning
    'Random Forest': {'n_estimators': randint(100, 500), 'max_depth': [10, 20, 30, None]},
    'XGBoost': {'n_estimators': randint(100, 500), 'learning_rate': uniform(0.01, 0.2), 'max_depth': randint(3, 10)},
    'Linear SVM': {'base_estimator__C': uniform(0.1, 10)},
    'Logistic Regression': {'C': uniform(0.1, 10)}
}

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

best_params = {}
models_to_tune = ['KNN', 'Random Forest', 'XGBoost', 'Linear SVM', 'Logistic Regression']

for name in models_to_tune:
    print(f"Tuning {name}...")
    if name == 'KNN':
        model_instance = Pipeline([('pca', PCA()), ('knn', KNeighborsClassifier(n_jobs=-1))])
    else:
        model_instance = {
            'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1),
            'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1),
            'Linear SVM': CalibratedClassifierCV(LinearSVC(random_state=42, dual=False, max_iter=3000)),
            'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
        }[name]
    
    # CHANGED: Further subsample for tuning to 20% of full data to speed up, especially for KNN
    X_tune, _, y_tune, _ = train_test_split(X_scaled, y, train_size=0.2, stratify=y, random_state=42)
    X_train_tune, _, y_train_tune, _ = train_test_split(X_tune, y_tune, train_size=0.8, stratify=y_tune, random_state=42)  # 80% of 20% = 16% effective
    
    random_search = RandomizedSearchCV(estimator=model_instance, param_distributions=param_dist[name], 
                                       n_iter=10 if name == 'KNN' else 15,  # CHANGED: Reduced n_iter to 10 for KNN only
                                       cv=2 if name == 'KNN' else 3,  # CHANGED: Reduced cv to 2 for KNN only
                                       scoring='accuracy', n_jobs=-1, random_state=42, verbose=2)  # CHANGED: verbose=2 for progress
    random_search.fit(X_train_tune, y_train_tune)
    best_params[name] = random_search.best_params_
    print(f"Best params for {name}: {best_params[name]}")

# --- 4. DEFINE MODELS WITH TUNED PARAMETERS ---
tuned_knn_pipeline = Pipeline([
    ('pca', PCA(n_components=best_params.get('KNN', {}).get('pca__n_components', 30))),
    ('knn', KNeighborsClassifier(n_neighbors=best_params.get('KNN', {}).get('knn__n_neighbors', 5), n_jobs=-1))
])

tuned_models = {
    'Tuned KNN (PCA)': tuned_knn_pipeline,
    'Tuned Linear SVM': CalibratedClassifierCV(LinearSVC(C=best_params.get('Linear SVM', {}).get('base_estimator__C', 1.0), random_state=42, dual=False, max_iter=3000)),
    'Tuned RF': RandomForestClassifier(random_state=42, n_jobs=-1, **best_params.get('Random Forest', {})),
    'Tuned XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1, **best_params.get('XGBoost', {})),
    'Tuned LogReg': LogisticRegression(max_iter=1000, random_state=42, **best_params.get('Logistic Regression', {})),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'LDA': LinearDiscriminantAnalysis()
}
cmap = plt.colormaps.get('tab10')
colors = [cmap(i) for i in np.linspace(0, 1, len(tuned_models))]

# --- 5. GENERATE PLOTS FROM A SINGLE TRAIN-TEST SPLIT (for visualization) ---
print("\n--- Generating Reports, Confusion Matrices, and ROC Curves from a single split ---")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Classification Reports Figure (unchanged)
fig_reports, axes_reports = plt.subplots(4, 2, figsize=(18, 24)); axes_reports = axes_reports.flatten()
fig_reports.suptitle('Classification Reports for All Models (Raw EEG Data)', fontsize=24, y=1.0)
if len(tuned_models) % 2 != 0: fig_reports.delaxes(axes_reports[-1])

# NEW: Confusion Matrices Figure
fig_cm, axes_cm = plt.subplots(4, 2, figsize=(18, 24)); axes_cm = axes_cm.flatten()
fig_cm.suptitle('Confusion Matrices for All Models (Raw EEG Data)', fontsize=24, y=1.0)
if len(tuned_models) % 2 != 0: fig_cm.delaxes(axes_cm[-1])

# ROC Figure (unchanged)
fig_roc, ax_roc = plt.subplots(figsize=(12, 10))
ax_roc.set_title('ROC Curves for Depression Classification (Raw EEG Data)', fontsize=16)

for i, (model_name, model) in enumerate(tuned_models.items()):
    print(f"Fitting {model_name} for plotting...")
    pipeline_plot = Pipeline([('scaler', StandardScaler()), ('model', model)])
    pipeline_plot.fit(X_train, y_train)
    y_pred = pipeline_plot.predict(X_test)
    
    # Classification Report
    report_text = classification_report(y_test, y_pred, target_names=['Healthy', 'Depressed'])
    axes_reports[i].axis('off'); axes_reports[i].set_title(model_name, fontsize=16, pad=20)
    axes_reports[i].text(0.01, 0.95, report_text, family='monospace', fontsize=14, va='top')
    
    # NEW: Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes_cm[i], cbar=False)
    axes_cm[i].set_title(model_name, fontsize=16)
    axes_cm[i].set_xlabel('Predicted'); axes_cm[i].set_ylabel('True')
    axes_cm[i].set_xticklabels(['Healthy', 'Depressed']); axes_cm[i].set_yticklabels(['Healthy', 'Depressed'])
    
    # ROC Curve
    if hasattr(pipeline_plot, "predict_proba"):
        y_proba = pipeline_plot.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        roc_auc = roc_auc_score(y_test, y_proba)
        ax_roc.plot(fpr, tpr, color=colors[i], lw=2.5, label=f'{model_name} (AUC = {roc_auc:.3f})')

# Save Reports
fig_reports.tight_layout(pad=3.0)
fig_reports.savefig(os.path.join(output_dir, "all_classification_reports_raw_csv.png"))
plt.close(fig_reports)
print(f"Saved consolidated classification reports to: {os.path.join(output_dir, 'all_classification_reports_raw_csv.png')}")

# Save Confusion Matrices
fig_cm.tight_layout(pad=3.0)
fig_cm.savefig(os.path.join(output_dir, "all_confusion_matrices_raw_csv.png"))
plt.close(fig_cm)
print(f"Saved consolidated confusion matrices to: {os.path.join(output_dir, 'all_confusion_matrices_raw_csv.png')}")

# Save ROC
ax_roc.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Guess')
ax_roc.legend(loc='lower right', fontsize=12)
ax_roc.grid(alpha=0.5)
fig_roc.savefig(os.path.join(output_dir, "all_roc_curves_raw_csv.png"))
plt.close(fig_roc)
print(f"Saved consolidated ROC curves to: {os.path.join(output_dir, 'all_roc_curves_raw_csv.png')}")

# --- 6. LEAVE-ONE-SUBJECT-OUT CROSS-VALIDATION ---
print("\n--- Performing Leave-One-Subject-Out Cross-Validation ---")
logo = LeaveOneGroupOut()
cv_scores = {}
for model_name, model in tuned_models.items():
    print(f"Evaluating {model_name} with LOSO-CV...")
    pipeline_cv = Pipeline([('scaler', StandardScaler()), ('model', model)])
    scores = cross_val_score(pipeline_cv, X, y, cv=logo, groups=groups, scoring='accuracy', n_jobs=-1)
    cv_scores[model_name] = scores
    print(f"{model_name} Mean Accuracy: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

# --- 7. VISUALIZE LOSO-CV RESULTS ---
fig_cv, ax_cv = plt.subplots(figsize=(14, 9))
model_names = list(cv_scores.keys())
mean_accuracies = [np.mean(s) for s in cv_scores.values()]
std_devs = [np.std(s) for s in cv_scores.values()]
ax_cv.bar(model_names, mean_accuracies, yerr=std_devs, capsize=5, color=colors, alpha=0.8)
ax_cv.set_title(f'Leave-One-Subject-Out Cross-Validation Accuracy (n={num_subjects} subjects)', fontsize=16)
ax_cv.set_ylabel('Mean Accuracy', fontsize=14); ax_cv.set_ylim(0, 1.05)
plt.xticks(rotation=45, ha="right")
for i, acc in enumerate(mean_accuracies): ax_cv.text(i, acc + 0.05, f"{acc:.3f}", ha='center')
fig_cv.tight_layout()
fig_cv.savefig(os.path.join(output_dir, "loso_cv_accuracy_barchart.png"))
plt.close(fig_cv)
print(f"Saved LOSO-CV results chart to: {os.path.join(output_dir, 'loso_cv_accuracy_barchart.png')}")

# --- 8. GENERATE LEARNING CURVES ---
print("\n--- Generating Learning Curves ---")
fig_lc, axes_lc = plt.subplots(4, 2, figsize=(18, 24), sharey=True)
if len(tuned_models) % 2 != 0: fig_lc.delaxes(axes_lc.flatten()[-1])
axes_lc = axes_lc.flatten()
fig_lc.suptitle('Learning Curves for All Models', fontsize=24, y=1.0)
train_sizes_abs = np.linspace(0.1, 1.0, 7)

for i, (model_name, model) in enumerate(tuned_models.items()):
    print(f"Generating learning curve for {model_name}...")
    pipeline_lc = Pipeline([('scaler', StandardScaler()), ('model', model)])
    train_sizes, train_scores, test_scores = learning_curve(
        pipeline_lc, X, y, cv=5, n_jobs=-1, train_sizes=train_sizes_abs, scoring='accuracy')
    
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    
    ax = axes_lc[i]
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    ax.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    ax.set_title(model_name, fontsize=16)
    ax.set_xlabel("Training examples"); ax.set_ylabel("Accuracy")
    ax.grid(); ax.legend(loc="best")

fig_lc.tight_layout(pad=3.0)
fig_lc.savefig(os.path.join(output_dir, "all_learning_curves_raw_csv.png"))
plt.close(fig_lc)
print(f"Saved learning curves to: {os.path.join(output_dir, 'all_learning_curves_raw_csv.png')}")

print("\n--- Process Complete ---")

Loading, sampling, and labeling data...
Loaded 973858 rows of data from 3 unique subjects.

--- Verifying the Combined and Labeled DataFrame ---
First 5 rows of the dataset:
         E1        E2        E3        E4            E5        E6  \
0  0.000004 -0.000004 -0.000010 -0.000004  4.862534e-06 -0.000009   
1  0.000009 -0.000008 -0.000006 -0.000002 -9.369657e-07  0.000002   
2  0.000003 -0.000004 -0.000009 -0.000002  9.702576e-08  0.000003   
3 -0.000008 -0.000002 -0.000007 -0.000001 -3.968677e-05  0.000007   
4 -0.000016 -0.000021 -0.000003 -0.000002  4.623818e-06 -0.000003   

             E7        E8        E9       E10  ...      E122      E123  \
0  1.255296e-07 -0.000012 -0.000010 -0.000009  ...  0.000010 -0.000011   
1  2.801928e-06 -0.000012 -0.000008 -0.000006  ... -0.000009 -0.000010   
2  4.612941e-06 -0.000014 -0.000010 -0.000011  ... -0.000004  0.000002   
3  1.789049e-06 -0.000012  0.000003  0.000003  ... -0.000010 -0.000004   
4  2.116167e-06 -0.000019 -0.000007 -0.00



[CV] END ...........knn__n_neighbors=7, pca__n_components=34; total time=  56.1s
[CV] END ...........knn__n_neighbors=5, pca__n_components=27; total time=  51.8s
[CV] END ...........knn__n_neighbors=9, pca__n_components=45; total time= 1.0min
[CV] END ...........knn__n_neighbors=5, pca__n_components=30; total time=  52.7s
[CV] END ...........knn__n_neighbors=5, pca__n_components=41; total time=  54.9s
[CV] END .....................max_depth=30, n_estimators=448; total time=27.3min
[CV] END ...........knn__n_neighbors=9, pca__n_components=39; total time=  57.7s
[CV] END ...........knn__n_neighbors=7, pca__n_components=40; total time=  57.1s
[CV] END ...........knn__n_neighbors=5, pca__n_components=42; total time=  58.6s
[CV] END ...........knn__n_neighbors=7, pca__n_components=23; total time=  50.0s
[CV] END ...........knn__n_neighbors=7, pca__n_components=21; total time=  49.3s
[CV] END .....................max_depth=30, n_estimators=448; total time=27.4min
[CV] END ...........knn__n_n