In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, matthews_corrcoef,
    precision_score, recall_score, f1_score, confusion_matrix
)
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd
import numpy as np
from scipy.stats import randint

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [12]:
data = pd.read_csv("/content/gdrive/MyDrive/Bashar_Staging/TCGA LUAD staging merged_data_I_II.csv")

In [13]:
data_binary = data[data['Stage'].isin(['Stage I', 'Stage III'])]

# Encode labels
X = data_binary.drop(columns=['Stage'])
y = data_binary['Stage']
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

# Hyperparameter tuning
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(10, 50),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'class_weight': ['balanced', 'balanced_subsample'],
}

base_rf = RandomForestClassifier(random_state=42)

search = RandomizedSearchCV(
    base_rf, param_distributions=param_dist,
    n_iter=30, scoring='f1', cv=StratifiedKFold(n_splits=5),
    verbose=1, random_state=42, n_jobs=-1
)

search.fit(X_resampled, y_resampled)
rf_model = search.best_estimator_

# Predict
y_pred = rf_model.predict(X_test_scaled)
y_proba = rf_model.predict_proba(X_test_scaled)[:, 1]

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Metrics
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
mcc = matthews_corrcoef(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Specificity calculation
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

# Print results
print("Best RF Parameters:", search.best_params_)
print("Confusion Matrix:\n", conf_matrix)
print(f"Accuracy: {acc:.4f}")
print(f"AUC: {auc:.4f}")
print(f"MCC: {mcc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall/Sensitivity: {recall:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"F1 Score: {f1:.4f}")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best RF Parameters: {'class_weight': 'balanced', 'max_depth': 44, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 360}
Confusion Matrix:
 [[93  0]
 [20  0]]
Accuracy: 0.8230
AUC: 0.5812
MCC: 0.0000
Precision: 0.0000
Recall/Sensitivity: 0.0000
Specificity: 1.0000
F1 Score: 0.0000
