In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, roc_auc_score, matthews_corrcoef,
    precision_score, recall_score, f1_score, confusion_matrix
)
from imblearn.over_sampling import SMOTE

In [12]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime; the dataset is read from
# beneath this directory in a later cell.
gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [13]:
# Location of the merged TCGA LUAD staging table on the mounted Drive.
staging_csv_path = "/content/gdrive/MyDrive/Bashar_Staging/TCGA LUAD staging merged_data_II_III.csv"
df = pd.read_csv(staging_csv_path)

In [14]:
# Filter specific stages
# `isin` silently ignores any requested value that has no rows in `df`, which
# changes the number of classes seen downstream (e.g. an intended 3-class
# problem becomes binary). Keep the request in one place and report any
# stage that matched nothing so the reader is not surprised later.
requested_stages = ['Stage I', 'Stage III', 'Stage IV']
df_filtered = df[df['Stage'].isin(requested_stages)].copy()

present_stages = set(df_filtered['Stage'])
missing_stages = [stage for stage in requested_stages if stage not in present_stages]
if missing_stages:
    available_stages = df['Stage'].dropna().unique().tolist()
    print(
        f"Warning: no rows found for requested stage(s) {missing_stages}; "
        f"values present in 'Stage': {available_stages}"
    )
print("Original class distribution:\n", df_filtered['Stage'].value_counts(), "\n")

# Encode target labels
# The fitted encoder is kept so integer codes can be mapped back to stage
# names when reporting results.
label_encoder = LabelEncoder()
df_filtered['Target'] = label_encoder.fit_transform(df_filtered['Stage'])

# Define features and labels
# Every column other than 'Stage' and 'Target' is used as a feature.
# NOTE(review): assumes all remaining columns are numeric (no ID/text
# columns) -- confirm against the CSV schema.
X = df_filtered.drop(['Stage', 'Target'], axis=1).values
y = df_filtered['Target'].values

# Train-test split
# 80/20 hold-out, stratified on the target so both parts keep the class ratio.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Feature scaling
# Scaling statistics are learned from the training part only and then
# applied unchanged to both parts, so nothing from the test set leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE
# Oversampling touches the training data only; the test set stays as drawn.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

# Print resampled distribution
# Count samples per encoded class after oversampling, then decode the class
# ids back to stage names in a single call (same order as the counts).
resampled_counts = pd.Series(y_resampled).value_counts()
encoded_ids = resampled_counts.index.to_numpy()
resampled_labels = list(label_encoder.inverse_transform(encoded_ids))

print("Resampled class distribution:")
for label, count in zip(resampled_labels, resampled_counts):
    print(f"{label}: {count}")
print()

# Train model
# RBF-kernel SVM fitted on the oversampled training data; probability=True
# is required for predict_proba below.
svm = SVC(kernel='rbf', probability=True, random_state=42).fit(X_resampled, y_resampled)

# Predict
# Hard labels feed the threshold-based metrics; class probabilities feed AUC.
y_pred = svm.predict(X_test_scaled)
y_proba = svm.predict_proba(X_test_scaled)

# Evaluation metrics
# Scores computed from the hard predictions.
acc = accuracy_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# The three macro-averaged scores share the same averaging options.
macro_options = {'average': 'macro', 'zero_division': 0}
precision = precision_score(y_test, y_pred, **macro_options)
recall = recall_score(y_test, y_pred, **macro_options)
f1 = f1_score(y_test, y_pred, **macro_options)

# Handle AUC for binary or multi-class
# Classes are the sorted union of the labels seen in training and in the test set.
all_labels = np.concatenate((y_resampled, y_test))
unique_classes = np.unique(all_labels)
n_classes = len(unique_classes)

if n_classes <= 2:
    # Binary: score with the probability column at index 1.
    auc = roc_auc_score(y_test, y_proba[:, 1])
else:
    # Multi-class: binarize the true labels per class and macro-average.
    y_test_bin = label_binarize(y_test, classes=unique_classes)
    auc = roc_auc_score(y_test_bin, y_proba, average='macro', multi_class='ovr')

# Confusion matrix & specificity
# scikit-learn convention: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Margins of the matrix, computed once and reused for every class.
grand_total = conf_matrix.sum()
row_totals = conf_matrix.sum(axis=1)
col_totals = conf_matrix.sum(axis=0)
diagonal = conf_matrix.diagonal()

# One-vs-rest specificity per class: TN / (TN + FP), with 0 when the class
# has no negatives at all.
specificities = []
for row_total, col_total, hits in zip(row_totals, col_totals, diagonal):
    false_pos = col_total - hits
    true_neg = grand_total - (row_total + col_total - hits)
    negatives = true_neg + false_pos
    specificities.append(true_neg / negatives if negatives > 0 else 0)
specificity = np.mean(specificities)

# Print metrics
print("Confusion Matrix:")
print(conf_matrix)

# Scalar metrics, formatted to four decimals and emitted one per line
# (the leading newline separates them from the matrix above).
report_lines = [
    f"\nAccuracy: {acc:.4f}",
    f"AUC: {auc:.4f}",
    f"MCC: {mcc:.4f}",
    f"Precision (macro): {precision:.4f}",
    f"Recall/Sensitivity (macro): {recall:.4f}",
    f"Specificity (macro): {specificity:.4f}",
    f"F1 Score (macro): {f1:.4f}",
]
print("\n".join(report_lines))

Original class distribution:
 Stage
Stage I     327
Stage IV     28
Name: count, dtype: int64 

Resampled class distribution:
Stage IV: 262
Stage I: 262

Confusion Matrix:
[[65  0]
 [ 6  0]]

Accuracy: 0.9155
AUC: 0.5256
MCC: 0.0000
Precision (macro): 0.4577
Recall/Sensitivity (macro): 0.5000
Specificity (macro): 0.5000
F1 Score (macro): 0.4779
