# Dominant Cancer Type Prediction with MLP (Teacher Model)

In [22]:
# ==================== 1. LOAD AND CLEAN ====================
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv("normalized_data.csv")
df_incidence = df[df["measure"] == "Incidence"].copy()

# Get dominant cancer per country/year
dominant_incidence = df_incidence.loc[
    df_incidence.groupby(["country_name", "year"])["new_cases/deaths"].idxmax()
].reset_index(drop=True)

# Drop duplicates and rare classes
dominant_incidence = dominant_incidence.loc[:, ~dominant_incidence.columns.duplicated()]
valid_classes = dominant_incidence["cancer_name"].value_counts()
dominant_incidence = dominant_incidence[dominant_incidence["cancer_name"].isin(valid_classes[valid_classes >= 3].index)]

# Encode categorical
le_country = LabelEncoder()
le_cancer = LabelEncoder()
dominant_incidence["country_name"] = le_country.fit_transform(dominant_incidence["country_name"])
dominant_incidence["cancer_name"] = le_cancer.fit_transform(dominant_incidence["cancer_name"])

# ==================== 2. TEACHER MODEL ====================
teacher_features = [
    "country_name", "year", "population", "new_cases/deaths", "total_cases", "cumulative_risk",
    "air_pollution", "alcohol_use", "gdp_per_capita", "uhc_index", "obesity_rate", "tobacco_use"
]
X_teacher = dominant_incidence[teacher_features]
y_teacher = dominant_incidence["cancer_name"]

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, test_idx in sss.split(X_teacher, y_teacher):
    X_train_t = X_teacher.iloc[train_idx]
    X_test_t = X_teacher.iloc[test_idx]
    y_train_t = y_teacher.iloc[train_idx]
    y_test_t = y_teacher.iloc[test_idx]

scaler_t = StandardScaler()
X_train_t_scaled = scaler_t.fit_transform(X_train_t)
X_test_t_scaled = scaler_t.transform(X_test_t)

mlp_teacher = MLPClassifier(hidden_layer_sizes=(256, 128, 64), max_iter=1500, random_state=42)
mlp_teacher.fit(X_train_t_scaled, y_train_t)
y_pred_t = mlp_teacher.predict(X_test_t_scaled)

print("=== Teacher Model Report ===")
print(classification_report(y_test_t, y_pred_t, zero_division=0))

# ==================== 3. STUDENT PREP ====================
# Enhanced feature set for student
student_features = [
    "country_name", "year", "population", "new_cases/deaths", "total_cases",
    "cumulative_risk", "tobacco_use", "alcohol_use"
]
X_student = dominant_incidence[student_features]
y_student = dominant_incidence["cancer_name"]

X_train_s = X_student.iloc[train_idx]
X_test_s = X_student.iloc[test_idx]
y_train_s = y_student.iloc[train_idx]
y_test_s = y_student.iloc[test_idx]

scaler_s = StandardScaler()
X_train_s_scaled = scaler_s.fit_transform(X_train_s)
X_test_s_scaled = scaler_s.transform(X_test_s)

# Soft labels from teacher
X_train_teacher_input = scaler_t.transform(X_teacher.iloc[train_idx])
soft_labels = mlp_teacher.predict_proba(X_train_teacher_input)

# ==================== 4. DISTILLED STUDENT (PYTORCH) ====================
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

X_tensor = torch.tensor(X_train_s_scaled, dtype=torch.float32)
soft_targets_tensor = torch.tensor(soft_labels, dtype=torch.float32)

train_dataset = TensorDataset(X_tensor, soft_targets_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Final student model with dropout and layernorm
class SmarterStudent(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SmarterStudent, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, output_dim)
        )

    def forward(self, x):
        return self.net(x)

input_dim = X_train_s_scaled.shape[1]
output_dim = soft_labels.shape[1]
student_model = SmarterStudent(input_dim, output_dim)

temperature = 4.0
soft_loss_fn = nn.KLDivLoss(reduction="batchmean")
optimizer = optim.Adam(student_model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.7)

# Train with only soft loss
student_model.train()
for epoch in range(100):
    total_loss = 0
    for batch_X, batch_soft in train_loader:
        optimizer.zero_grad()
        logits = student_model(batch_X)
        log_probs = nn.functional.log_softmax(logits / temperature, dim=1)
        soft_loss = soft_loss_fn(log_probs, batch_soft)
        soft_loss.backward()
        optimizer.step()
        total_loss += soft_loss.item()
    scheduler.step()
    if epoch % 10 == 0 or epoch == 99:
        print(f"Epoch {epoch+1}, Soft Loss: {total_loss:.4f}")

# ==================== 5. EVALUATION ====================
student_model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test_s_scaled, dtype=torch.float32)
    logits = student_model(X_test_tensor)
    y_pred_student = torch.argmax(logits, dim=1).numpy()

print("=== Distilled Student Model Report ===")
print(classification_report(y_test_s, y_pred_student, zero_division=0))

=== Teacher Model Report ===
              precision    recall  f1-score   support

           0       0.99      0.96      0.97        80
           1       1.00      1.00      1.00         1
           2       0.50      0.33      0.40         3
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00         4
           5       0.85      1.00      0.92        22
           6       1.00      1.00      1.00        34
           7       1.00      0.80      0.89         5
           8       1.00      1.00      1.00         3

    accuracy                           0.96       155
   macro avg       0.93      0.90      0.91       155
weighted avg       0.96      0.96      0.96       155

Epoch 1, Soft Loss: 40.5216
Epoch 11, Soft Loss: 15.6919
Epoch 21, Soft Loss: 8.8323
Epoch 31, Soft Loss: 6.4773
Epoch 41, Soft Loss: 5.3046
Epoch 51, Soft Loss: 4.4690
Epoch 61, Soft Loss: 3.9904
Epoch 71, Soft Loss: 3.3055
Epoch 81, Soft Loss: 3.0052
Epoch 91, Sof