In [1]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, matthews_corrcoef,
    precision_score, recall_score, f1_score, confusion_matrix
)
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd
import numpy as np

# Mount Google Drive at /content/gdrive; the CSV loaded in the next cell is
# read from a path under this mount point, so this must run first.
# NOTE(review): google.colab is Colab-specific — presumably this cell will not
# run outside a Colab runtime; confirm before reusing the notebook elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [2]:
# Read the staging CSV from the mounted Drive folder into a DataFrame.
# NOTE(review): the file name ends in "I_II", while the following cell keeps
# only 'Stage I' and 'Stage III' rows — confirm this is the intended file.
csv_path = "/content/gdrive/MyDrive/Bashar_Staging/TCGA LUAD staging merged_data_I_II.csv"
data = pd.read_csv(csv_path)

In [4]:
# Binary staging experiment: Stage I (encoded 0, negative) vs Stage III
# (encoded 1, positive). Pipeline: filter -> stratified split -> scale ->
# SMOTE on the training split only -> AdaBoost -> hold-out metrics.

# Tunable settings for this cell, gathered in one place.
STAGES = ['Stage I', 'Stage III']
TEST_SIZE = 0.24
SEED = 42
N_ESTIMATORS = 150
LEARNING_RATE = 0.8

# Filter binary classes: Stage I and Stage III
data_binary = data[data['Stage'].isin(STAGES)]

# Fail early with a clear message if a stage is absent from the file; otherwise
# the problem only surfaces later (e.g. inside SMOTE) with an unrelated error.
missing_stages = sorted(set(STAGES) - set(data_binary['Stage'].unique()))
if missing_stages:
    raise ValueError(f"No rows found in 'Stage' for: {missing_stages}")

# Split features and target
# NOTE(review): assumes every column other than 'Stage' is a numeric feature
# (StandardScaler is applied to all of them) — TODO confirm there are no
# ID / text columns in the CSV.
X = data_binary.drop(columns=['Stage'])
y = data_binary['Stage']
le = LabelEncoder()
# LabelEncoder orders classes by sorted label, so Stage I -> 0, Stage III -> 1.
y_encoded = le.fit_transform(y)

# Train-test split (stratified so both splits keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=TEST_SIZE, stratify=y_encoded, random_state=SEED
)

# Feature scaling: statistics are fitted on the training split only and then
# reused for the test split, so no test information leaks into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to handle class imbalance (training split only; the test split
# keeps its original class distribution).
smote = SMOTE(random_state=SEED)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

# Display resampled distribution
resampled_counts = pd.Series(y_resampled).value_counts()
# Decode all class ids in a single call instead of one call per element.
resampled_labels = list(le.inverse_transform(resampled_counts.index.to_numpy()))
print("Resampled class distribution:")
for label, count in zip(resampled_labels, resampled_counts):
    print(f"{label}: {count}")
print()

# Train AdaBoost Classifier on the balanced, scaled training data
model = AdaBoostClassifier(
    n_estimators=N_ESTIMATORS, learning_rate=LEARNING_RATE, random_state=SEED
)
model.fit(X_resampled, y_resampled)

# Predict
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)[:, 1]  # Probability for class 1

# Metrics (precision / recall / F1 are computed for the positive class, 1)
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
mcc = matthews_corrcoef(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Confusion Matrix and Specificity
# labels=[0, 1] pins the matrix to 2x2 (rows = true, columns = predicted), so
# the four-way unpacking below cannot break on a degenerate label set.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

# Print all metrics
print("Confusion Matrix:")
print(conf_matrix)
print(f"\nAccuracy: {acc:.4f}")
print(f"AUC: {auc:.4f}")
print(f"MCC: {mcc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall/Sensitivity: {recall:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"F1 Score: {f1:.4f}")

Resampled class distribution:
Stage I: 353
Stage III: 353

Confusion Matrix:
[[105   7]
 [ 20   3]]

Accuracy: 0.8000
AUC: 0.5396
MCC: 0.0975
Precision: 0.3000
Recall/Sensitivity: 0.1304
Specificity: 0.9375
F1 Score: 0.1818
