In [7]:
import pandas as pd

# --- 1. Load reports ---
reports_path = "C:/Users/Medha/Desktop/Medical-Text-Classification-LLMs/TCGA_Reports.csv"
reports_df = pd.read_csv(reports_path)

print("Reports shape:", reports_df.shape)
print(reports_df.columns)

# --- 2. Extract patient ID from patient_filename ---
# e.g. "TCGA-BP-5195.25c0b4..." -> "TCGA-BP-5195"
reports_df["patient_id"] = reports_df["patient_filename"].str.split(".").str[0]

# --- 3. Load cancer-type metadata ---
meta_path = "C:/Users/Medha/Desktop/Medical-Text-Classification-LLMs/tcga_patient_to_cancer_type.csv"
meta_df = pd.read_csv(meta_path)

print("Metadata shape:", meta_df.shape)
print(meta_df.columns)


Reports shape: (9523, 2)
Index(['patient_filename', 'text'], dtype='object')
Metadata shape: (11160, 2)
Index(['patient_id', 'cancer_type'], dtype='object')


In [13]:
# Merge reports with metadata on patient_id
reports_df["patient_id"] = reports_df["patient_filename"].str.split(".").str[0]

tcga_df = reports_df.merge(
    meta_df,
    how="inner",
    on="patient_id"
)

print("Merged shape:", tcga_df.shape)
print("Merged df schema: ",tcga_df.columns)
print("\n",tcga_df[["patient_id", "cancer_type"]].head())

# Basic label distribution
print("\nCancer type distribution:")
print(tcga_df["cancer_type"].value_counts())


Merged shape: (9523, 4)
Merged df schema:  Index(['patient_filename', 'text', 'patient_id', 'cancer_type'], dtype='object')

      patient_id cancer_type
0  TCGA-BP-5195        KIRC
1  TCGA-D7-8573        STAD
2  TCGA-EI-7004        READ
3  TCGA-EB-A82B        SKCM
4  TCGA-A6-3808        COAD

Cancer type distribution:
cancer_type
BRCA    1034
UCEC     546
KIRC     525
HNSC     520
LUAD     488
THCA     487
LGG      469
LUSC     468
PRAD     446
COAD     418
GBM      399
BLCA     379
OV       371
STAD     361
LIHC     341
CESC     289
KIRP     280
SARC     249
PAAD     176
PCPG     174
READ     162
ESCA     146
THYM     114
KICH     112
SKCM     102
ACC       90
TGCT      87
MESO      79
UVM       65
UCS       56
DLBC      47
CHOL      43
Name: count, dtype: int64


In [9]:
# Drop missing / empty text
tcga_df.dropna(subset=["text"], inplace=True)
tcga_df["text"] = tcga_df["text"].astype(str).str.strip()
tcga_df = tcga_df[tcga_df["text"] != ""]

print("After cleaning shape:", tcga_df.shape)


After cleaning shape: (9523, 4)


In [10]:
# Quick summary by cancer_type
print(tcga_df["cancer_type"].value_counts())
print("\nNumber of unique cancer types:", tcga_df["cancer_type"].nunique())


cancer_type
BRCA    1034
UCEC     546
KIRC     525
HNSC     520
LUAD     488
THCA     487
LGG      469
LUSC     468
PRAD     446
COAD     418
GBM      399
BLCA     379
OV       371
STAD     361
LIHC     341
CESC     289
KIRP     280
SARC     249
PAAD     176
PCPG     174
READ     162
ESCA     146
THYM     114
KICH     112
SKCM     102
ACC       90
TGCT      87
MESO      79
UVM       65
UCS       56
DLBC      47
CHOL      43
Name: count, dtype: int64

Number of unique cancer types: 32


In [15]:
# =============================================================================
# STEP 3: Train/Test Split + Text to Numbers (TF-IDF)
# =============================================================================

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Define inputs/outputs
# X = all pathology report texts (input to predict from)
# y = cancer type labels (what we want to predict)
X = tcga_df["text"]
y = tcga_df["cancer_type"]
print("Raw data shapes - X:", X.shape, "y:", y.shape)

# Split data: 80% train (to learn from), 20% test (to evaluate on)
# stratify=y ensures each cancer type appears proportionally in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,           # 20% for testing, 80% for training
    random_state=42,         # Fixed seed for reproducible splits
    stratify=y               # Keep same cancer type ratios in train/test
)

print(f"‚úÖ Train size: {X_train.shape[0]} reports")
print(f"‚úÖ Test size: {X_test.shape[0]} reports")
print(f"‚úÖ Train cancer types: {y_train.nunique()} (same as full: {y.nunique()})")

# Create text-to-number converter
# TF-IDF = "Term Frequency-Inverse Document Frequency" 
# Scores words by importance: rare words in doc get higher scores
vectorizer = TfidfVectorizer(
    stop_words="english",    # Ignore common words like "the", "and", "of"
    max_features=50000       # Limit to top 50K most important words (speed)
)

# Convert text to numbers
# fit_transform: Learns vocabulary from TRAIN texts AND converts them
X_train_tfidf = vectorizer.fit_transform(X_train)

# transform: Converts TEST texts using SAME vocabulary (no cheating!)
X_test_tfidf = vectorizer.transform(X_test)

print(f"‚úÖ TF-IDF Train shape: {X_train_tfidf.shape} (rows=reports, cols=words)")
print(f"‚úÖ TF-IDF Test shape: {X_test_tfidf.shape}")


Raw data shapes - X: (9523,) y: (9523,)
‚úÖ Train size: 7618 reports
‚úÖ Test size: 1905 reports
‚úÖ Train cancer types: 32 (same as full: 32)
‚úÖ TF-IDF Train shape: (7618, 38382) (rows=reports, cols=words)
‚úÖ TF-IDF Test shape: (1905, 38382)


In [17]:
# =============================================================================
# STEP 4: Logistic Regression Baseline (Multi-Class)
# =============================================================================

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Train simple Logistic Regression (handles multi-class automatically)
print("üöÄ Training Logistic Regression...")
lr_model = LogisticRegression(
    max_iter=1000,           # Allow more iterations for convergence
    random_state=42,         # Reproducible results
)

# üöÄ Train the model: adjusts 32x38K word weights to minimize prediction errors
# Each weight shows "how important this word is for this cancer type"
lr_model.fit(X_train_tfidf, y_train)

# Predict on test set
y_pred_lr = lr_model.predict(X_test_tfidf)

# Evaluate
accuracy = accuracy_score(y_test, y_pred_lr)
print(f"‚úÖ Logistic Regression Accuracy: {accuracy:.4f} ({accuracy*100:.1f}%)")

print("\nüìä Detailed Report (top classes):")
print(classification_report(y_test, y_pred_lr))


üöÄ Training Logistic Regression...
‚úÖ Logistic Regression Accuracy: 0.9423 (94.2%)

üìä Detailed Report (top classes):
              precision    recall  f1-score   support

         ACC       1.00      0.89      0.94        18
        BLCA       0.97      1.00      0.99        76
        BRCA       1.00      1.00      1.00       207
        CESC       1.00      0.91      0.95        58
        CHOL       1.00      0.44      0.62         9
        COAD       0.87      0.99      0.93        84
        DLBC       1.00      0.89      0.94         9
        ESCA       1.00      0.90      0.95        29
         GBM       0.95      0.96      0.96        80
        HNSC       0.99      1.00      1.00       104
        KICH       0.90      0.41      0.56        22
        KIRC       0.79      0.99      0.88       105
        KIRP       1.00      0.73      0.85        56
         LGG       0.97      0.96      0.96        94
        LIHC       0.91      1.00      0.95        68
        LUAD

In [18]:
# =============================================================================
# STEP 5: Random Forest (should improve on Logistic Regression)
# =============================================================================

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

print("üå≥ Training Random Forest...")
rf_model = RandomForestClassifier(
    n_estimators=200,        # 200 decision trees
    random_state=42,
    n_jobs=-1                # Use all CPU cores
)

# üöÄ Train: builds 200 decision trees that vote on cancer type
rf_model.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred_rf = rf_model.predict(X_test_tfidf)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print(f"‚úÖ Random Forest Accuracy: {rf_accuracy:.4f} ({rf_accuracy*100:.1f}%)")
print(f"üìà Improvement: {rf_accuracy - accuracy:.1%}")
print("\nüìä Random Forest Report:")
print(classification_report(y_test, y_pred_rf))


üå≥ Training Random Forest...
‚úÖ Random Forest Accuracy: 0.9318 (93.2%)
üìà Improvement: -1.0%

üìä Random Forest Report:
              precision    recall  f1-score   support

         ACC       1.00      0.67      0.80        18
        BLCA       0.97      0.99      0.98        76
        BRCA       0.98      1.00      0.99       207
        CESC       0.96      0.91      0.94        58
        CHOL       1.00      0.22      0.36         9
        COAD       0.79      0.99      0.88        84
        DLBC       1.00      0.89      0.94         9
        ESCA       1.00      0.86      0.93        29
         GBM       0.93      0.97      0.95        80
        HNSC       0.99      0.98      0.99       104
        KICH       1.00      0.27      0.43        22
        KIRC       0.74      0.99      0.85       105
        KIRP       0.97      0.70      0.81        56
         LGG       0.97      0.94      0.95        94
        LIHC       0.89      1.00      0.94        68
        L

In [19]:
print("""
üî¨ KEY INSIGHT: Logistic Regression (94.2%) beat Random Forest (93.2%) 
because TF-IDF text data is sparse + linear. Ensemble methods struggle 
with 38K sparse features. This justifies Logistic Regression as primary model.
""")



üî¨ KEY INSIGHT: Logistic Regression (94.2%) beat Random Forest (93.2%) 
because TF-IDF text data is sparse + linear. Ensemble methods struggle 
with 38K sparse features. This justifies Logistic Regression as primary model.



In [21]:
# =============================================================================
# STEP 7: Model Interpretability WITH WEIGHTS
# =============================================================================

import numpy as np

# Get BRCA row from coefficient matrix (32 cancers x 38K words)
cancer_to_idx = {cancer: i for i, cancer in enumerate(sorted(y_train.unique()))}
brca_idx = cancer_to_idx['BRCA']

# Top 10 POSITIVE weights (words that BOOST BRCA prediction)
top10_positive = np.argsort(lr_model.coef_[brca_idx])[-10:][::-1]
print("üîç Top 10 words BOOSTING BRCA prediction (with weights):")
for i, word_idx in enumerate(top10_positive):
    word = vectorizer.get_feature_names_out()[word_idx]
    weight = lr_model.coef_[brca_idx, word_idx]
    print(f"  {i+1:2d}. '{word}' ‚Üí weight: {weight:+.3f}")

print("\nüìà Higher positive weights = stronger BRCA predictors")


üîç Top 10 words BOOSTING BRCA prediction (with weights):
   1. 'breast' ‚Üí weight: +9.090
   2. 'ductal' ‚Üí weight: +4.169
   3. 'sentinel' ‚Üí weight: +3.346
   4. 'axillary' ‚Üí weight: +3.317
   5. 'invasive' ‚Üí weight: +2.772
   6. 'score' ‚Üí weight: +2.374
   7. 'carcinoma' ‚Üí weight: +2.244
   8. 'quadrant' ‚Üí weight: +2.149
   9. 'mastectomy' ‚Üí weight: +2.116
  10. 'skin' ‚Üí weight: +1.962

üìà Higher positive weights = stronger BRCA predictors


In [22]:
# =============================================================================
# STEP 8: Cross-Validation (prove 94% isn't overfitting)
# =============================================================================

from sklearn.model_selection import cross_val_score, StratifiedKFold

print("üî¨ 5-Fold Cross-Validation...")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(lr_model, X_train_tfidf, y_train, cv=cv, scoring='accuracy')

print(f"‚úÖ CV Scores: {cv_scores}")
print(f"‚úÖ CV Mean: {cv_scores.mean():.4f} ¬± {cv_scores.std():.4f}")
print(f"‚úÖ Test Accuracy: {accuracy:.4f}")
print(f"‚úÖ ‚úÖ GENERALIZES WELL!" if abs(cv_scores.mean() - accuracy) < 0.02 else "‚ö†Ô∏è Check for overfitting")


üî¨ 5-Fold Cross-Validation...
‚úÖ CV Scores: [0.9343832  0.92454068 0.93569554 0.92252134 0.93565332]
‚úÖ CV Mean: 0.9306 ¬± 0.0058
‚úÖ Test Accuracy: 0.9423
‚úÖ ‚úÖ GENERALIZES WELL!
