In [None]:
# Mount Google Drive inside the Colab runtime so files stored on Drive are
# reachable under the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split

# Load preprocessed data
data_path = '/content/drive/My Drive/Colab Notebooks/TCGA_Data/preprocessed_data.csv'
preprocessed_data = pd.read_csv(data_path)

# Load trained model from Step 4
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model artifacts you trust.
model_path = '/content/drive/My Drive/Colab Notebooks/TCGA_Data/cancer_classifier_step4.pkl'
model = joblib.load(model_path)

# Separate features and labels: 'sample' and 'primary_disease' are dropped
# from the feature matrix; 'primary_disease' is the target.
y = preprocessed_data['primary_disease']
X = preprocessed_data.drop(columns=['sample', 'primary_disease'])

# Split (recreate to match Step 4): 80/20, stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Verify shapes — expected (1848, 100) for training and (463, 100) for testing.
for caption, split in (("Training Set Shape:", X_train), ("Testing Set Shape:", X_test)):
    print(caption, split.shape)
print("Training Labels Sample:", y_train.head())

# Evaluate Model Performance

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Display names for the report rows and the confusion-matrix axes.
# NOTE(review): assumes the entries of model.classes_ correspond, in order,
# to BRCA, LUAD, PRAD — confirm against the label encoding used in Step 4.
class_names = ['BRCA', 'LUAD', 'PRAD']

# Pin the row/column order to the model's own class order instead of relying
# on whichever labels happen to appear in y_test / y_pred.
class_labels = list(model.classes_)
if len(class_labels) != len(class_names):
    raise ValueError(
        f"Model has {len(class_labels)} classes but {len(class_names)} display names were given"
    )

# Predict on test set
y_pred = model.predict(X_test)

# Classification report (precision, recall, F1-score)
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))

# Confusion matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix for Cancer Classification (Step 5)")
plt.show()

# Plot ROC Curves
**Assess class-specific performance.** ROC curves show the trade-off between the true positive rate and the false positive rate for each class, treating that class as positive and all others as negative (one-vs-rest).

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Get prediction probabilities; column i holds the probability of model.classes_[i].
y_pred_proba = model.predict_proba(X_test)

plt.figure(figsize=(10, 8))
# One-vs-rest ROC curve per class. Iterating over model.classes_ (rather than a
# hard-coded count) keeps the plot correct if the model has more or fewer classes.
for i, class_label in enumerate(model.classes_):
    fpr, tpr, _ = roc_curve(y_test == class_label, y_pred_proba[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{class_label} (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line (chance level)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Cancer Classification')
plt.legend()
plt.show()

# Explore Feature Importance

In [None]:
import pandas as pd
from sklearn.inspection import permutation_importance

# Feature importance via permutation (robust for OneVsRestClassifier):
# each feature is shuffled 10 times on the test split and the mean score drop is kept.
perm_importance = permutation_importance(
    model,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
)
feature_importance = pd.DataFrame(
    {'Gene': X_test.columns, 'Importance': perm_importance.importances_mean}
)
ranked_importance = feature_importance.sort_values(by='Importance', ascending=False)
print("Top 10 Most Important Features Across Classes:")
print(ranked_importance.head(10))

# Coefficients from the first per-class classifier.
# NOTE(review): the heading below assumes estimators_[0] is the BRCA-vs-rest
# classifier — confirm that model.classes_[0] is BRCA.
first_estimator = model.estimators_[0]
coef = first_estimator.coef_[0]
feature_coef = pd.DataFrame({'Gene': X_train.columns, 'Coefficient': coef})
# Rank by absolute coefficient so strong negative weights are not buried.
ranked_coef = feature_coef.sort_values(by='Coefficient', key=abs, ascending=False)
print("\nTop 10 Most Important Features for BRCA (Coefficients):")
print(ranked_coef.head(10))

**Note on the permutation results.** The importance of a feature is calculated by shuffling that feature's values and measuring the decrease in model performance (e.g., accuracy). Because the model is already perfectly accurate on the test set, shuffling any single feature does not decrease the accuracy—it remains 100%. Thus, the importance is calculated as 0 for all features.

# Validate with External Data

In [None]:
import os

from sklearn.metrics import accuracy_score

# Load external data (example path)
external_path = '/path/to/external_data.csv'

# Genes the model expects, in training column order.
# NOTE(review): taken from the feature columns of the preprocessed data used
# above; this is presumed to be Step 4's top-100 gene list — confirm.
top_genes = list(X_train.columns)

if not os.path.exists(external_path):
    # The path above is a placeholder; skip instead of raising so that
    # Restart & Run All still reaches the cells below.
    print(f"External data not found at {external_path!r}; skipping external validation.")
elif 'scaler' not in globals():
    # The fitted Step 4 scaler is not created anywhere in this notebook; it
    # must be loaded into `scaler` before external data can be scaled.
    print("No fitted `scaler` from Step 4 is loaded; skipping external validation.")
else:
    external_data = pd.read_csv(external_path)
    X_external = external_data.drop(columns=['sample', 'primary_disease'])
    y_external = external_data['primary_disease']

    # Ensure same genes, in the same order as the training features
    # (raises KeyError if the external cohort lacks any of them).
    X_external = X_external[top_genes]
    X_external_scaled = scaler.transform(X_external)  # Scale with Step 4 scaler

    # Predict
    y_pred_external = model.predict(X_external_scaled)
    accuracy_external = accuracy_score(y_external, y_pred_external)
    print(f"Model Accuracy on External Data: {accuracy_external:.2f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split (fixed seed) and rank the genes
# by the forest's built-in feature_importances_.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_importance = pd.DataFrame(
    {'Gene': X_train.columns, 'Importance': rf_model.feature_importances_}
)
rf_top10 = rf_importance.sort_values(by='Importance', ascending=False).head(10)
print("Random Forest Feature Importance:")
print(rf_top10)