In [44]:
import pandas as pd
import scipy.stats as stats
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [45]:
train_data = pd.read_csv('Data/train.csv')
test_data = pd.read_csv('Data/test.csv')
data_dict = pd.read_csv('Data/data_dictionary.csv')
sample_submission = pd.read_csv('Data/sample_submission.csv')

In [46]:
for column in train_data.columns:
    if column == 'ID':
        pass
    else:
        print(f"{train_data[column].value_counts(dropna=False)}")
        print()

dri_score
Intermediate                                         10436
N/A - pediatric                                       4779
High                                                  4701
N/A - non-malignant indication                        2427
TBD cytogenetics                                      2003
Low                                                   1926
High - TED AML case <missing cytogenetics             1414
Intermediate - TED AML case <missing cytogenetics      481
N/A - disease not classifiable                         272
Very high                                              198
NaN                                                    154
Missing disease status                                   9
Name: count, dtype: int64

psych_disturb
No          23005
Yes          3587
NaN          2062
Not done      146
Name: count, dtype: int64

cyto_score
Poor            8802
NaN             8068
Intermediate    6376
Favorable       3011
TBD             1341
Normal           643
Other

In [47]:
train_data.columns

Index(['ID', 'dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
       'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
       'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue',
       'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status',
       'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6',
       'hla_match_c_low', 'rituximab', 'hla_match_drb1_low',
       'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail',
       'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct',
       'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe',
       'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer',
       'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue',
       'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score',
       'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related',
       'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_hi

In [48]:
data_dict

Unnamed: 0,variable,description,type,values
0,dri_score,Refined disease risk index,Categorical,['Intermediate' 'High' 'N/A - non-malignant in...
1,psych_disturb,Psychiatric disturbance,Categorical,['Yes' 'No' nan 'Not done']
2,cyto_score,Cytogenetic score,Categorical,['Intermediate' 'Favorable' 'Poor' 'TBD' nan '...
3,diabetes,Diabetes,Categorical,['No' 'Yes' nan 'Not done']
4,hla_match_c_high,Recipient / 1st donor allele level (high resol...,Numerical,
5,hla_high_res_8,Recipient / 1st donor allele-level (high resol...,Numerical,
6,tbi_status,TBI,Categorical,"['No TBI' 'TBI + Cy +- Other' 'TBI +- Other, <..."
7,arrhythmia,Arrhythmia,Categorical,['No' nan 'Yes' 'Not done']
8,hla_low_res_6,Recipient / 1st donor antigen-level (low resol...,Numerical,
9,graft_type,Graft type,Categorical,['Peripheral blood' 'Bone marrow']


In [49]:
# Check correlation for numerical columns
correlations = train_data.corr(numeric_only=True)['efs'].sort_values(ascending=False)
correlations

efs                    1.000000
age_at_hct             0.227866
comorbidity_score      0.145723
hla_match_drb1_high    0.074446
hla_match_drb1_low     0.067485
hla_high_res_10        0.048431
donor_age              0.047566
hla_low_res_6          0.047007
hla_high_res_6         0.045261
hla_high_res_8         0.044224
hla_low_res_10         0.041972
hla_match_b_low        0.040965
hla_low_res_8          0.040612
hla_nmdp_6             0.040145
hla_match_dqb1_high    0.039736
hla_match_dqb1_low     0.032440
hla_match_c_high       0.031850
hla_match_b_high       0.026293
hla_match_c_low        0.016335
hla_match_a_high       0.015881
hla_match_a_low        0.015086
ID                    -0.003517
karnofsky_score       -0.091156
year_hct              -0.106419
efs_time              -0.741042
Name: efs, dtype: float64

In [50]:
# Statistical Test for categorical variables
# low p-value suggest a relationship between the categorical column and efs
for col in train_data.columns:
    if col != 'efs' and train_data[col].dtype == 'object':
        contingency = pd.crosstab(train_data[col], train_data['efs'])
        chi2, p, dof, expected = stats.chi2_contingency(contingency)
        print(f"{col}: p-value={p}")

dri_score: p-value=0.0
psych_disturb: p-value=1.2979884206277078e-26
cyto_score: p-value=0.0017695925410587405
diabetes: p-value=9.473517746062939e-31
tbi_status: p-value=3.872202121618774e-17
arrhythmia: p-value=1.4455281691857336e-13
graft_type: p-value=0.0
vent_hist: p-value=0.5894941475545321
renal_issue: p-value=0.0008044609682586662
pulm_severe: p-value=1.1740716320072957e-38
prim_disease_hct: p-value=0.0
cmv_status: p-value=2.996206556381759e-63
tce_imm_match: p-value=0.003445012073378599
rituximab: p-value=0.38833316629745207
prod_type: p-value=0.0
cyto_score_detail: p-value=2.2303455266813332e-39
conditioning_intensity: p-value=8.244812565600892e-75
ethnicity: p-value=1.3456203538403933e-12
obesity: p-value=3.083035318962666e-11
mrd_hct: p-value=0.00293750887501947
in_vivo_tcd: p-value=2.2959109320915436e-169
tce_match: p-value=7.401409152266478e-06
hepatic_severe: p-value=1.5564351873948503e-43
prior_tumor: p-value=1.6392138057148511e-59
peptic_ulcer: p-value=7.37114508919722

# Handling NaN values

In [51]:
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier



def handle_missing_values(data):
    # Separating categorical and numeric columns
    cat_cols = [col for col in data.columns if data[col].dtype == 'object']
    num_cols = [col for col in data.columns if col not in cat_cols]

    # Handling missing numerical data with KNNImputer
    knn_imputer = KNNImputer(n_neighbors=5)
    data[num_cols] = knn_imputer.fit_transform(data[num_cols])

    # Imputing categorical data with KNeighborsClassifier
    for col in cat_cols:
        missing_mask = data[col].isna()
        if missing_mask.sum() > 0:
            # Separate training and rpediction sets
            X_train = data.loc[~missing_mask, num_cols]
            y_train = data.loc[~missing_mask, col]
            X_missing = data.loc[missing_mask, num_cols]

            # Train a KNeighborsClassifier
            knn = KNeighborsClassifier(n_neighbors=5)
            knn.fit(X_train, y_train)
            imputed_values = knn.predict(X_missing)

            # Fill missing values
            data.loc[missing_mask, col] = imputed_values

    return data

train_data = handle_missing_values(train_data)
train_data.head()

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,vent_hist,renal_issue,pulm_severe,prim_disease_hct,hla_high_res_6,cmv_status,hla_high_res_10,hla_match_dqb1_high,tce_imm_match,hla_nmdp_6,hla_match_c_low,rituximab,hla_match_drb1_low,hla_match_dqb1_low,prod_type,cyto_score_detail,conditioning_intensity,ethnicity,year_hct,obesity,mrd_hct,in_vivo_tcd,tce_match,hla_match_a_high,hepatic_severe,donor_age,prior_tumor,hla_match_b_low,peptic_ulcer,age_at_hct,hla_match_a_low,gvhd_proph,rheum_issue,sex_match,hla_match_b_high,race_group,comorbidity_score,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,0.0,N/A - non-malignant indication,No,Poor,No,2.0,7.0,No TBI,No,6.0,Bone marrow,No,No,No,IEA,6.0,+/+,8.8,2.0,P/P,6.0,2.0,No,2.0,2.0,BM,Intermediate,MAC,Not Hispanic or Latino,2016.0,No,Negative,Yes,Permissive,2.0,No,38.3438,No,2.0,No,9.942,2.0,FKalone,No,M-F,2.0,More than one race,0.0,90.0,No,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,1.0,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,Peripheral blood,No,No,No,AML,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,PB,Intermediate,MAC,Not Hispanic or Latino,2008.0,No,Positive,No,Permissive,2.0,No,72.29,No,2.0,No,43.705,2.0,Other GVHD Prophylaxis,No,F-F,2.0,Asian,3.0,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,2.0,N/A - non-malignant indication,No,Poor,No,2.0,8.0,No TBI,No,6.0,Bone marrow,No,No,No,HIS,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,BM,Intermediate,MAC,Not Hispanic or Latino,2019.0,No,Negative,Yes,Permissive,2.0,No,34.4604,No,2.0,No,33.997,2.0,Cyclophosphamide alone,No,F-M,2.0,More than one race,0.0,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793
3,3.0,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,Bone marrow,No,No,No,ALL,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,BM,Intermediate,MAC,Not Hispanic or Latino,2009.0,No,Positive,No,Permissive,2.0,No,29.23,No,2.0,No,43.245,2.0,FK+ MMF +- others,No,M-M,2.0,White,0.0,90.0,Yes,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349
4,4.0,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,Peripheral blood,No,No,No,MPN,6.0,+/+,10.0,2.0,P/P,5.0,2.0,No,2.0,2.0,PB,Intermediate,MAC,Hispanic or Latino,2018.0,No,Negative,Yes,Permissive,2.0,No,56.81,No,2.0,No,29.74,2.0,TDEPLETION +- other,No,M-F,2.0,American Indian or Alaska Native,1.0,90.0,No,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.0,16.223


In [52]:
for column in train_data.columns:
    if column == 'ID':
        pass
    else:
        print(f"{train_data[column].value_counts(dropna=False)}")
        print()

dri_score
Intermediate                                         10505
N/A - pediatric                                       4808
High                                                  4737
N/A - non-malignant indication                        2435
TBD cytogenetics                                      2007
Low                                                   1929
High - TED AML case <missing cytogenetics             1417
Intermediate - TED AML case <missing cytogenetics      483
N/A - disease not classifiable                         272
Very high                                              198
Missing disease status                                   9
Name: count, dtype: int64

psych_disturb
No          25017
Yes          3637
Not done      146
Name: count, dtype: int64

cyto_score
Poor            12878
Intermediate     9179
Favorable        3910
TBD              1511
Normal            716
Other             551
Not tested         55
Name: count, dtype: int64

diabetes
No          24212


# Simple NN

In [53]:
# Separate target and drop unneeded columns
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Encode categorical variables
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric featuers
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Ensure all numerica
X = X.astype(np.float32)


# Convert to NumPy arrays
X_array = X.values.astype(np.float32)
y_array = y.values.astype(np.float32)

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size=0.2, random_state=42
)

# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

# Create Datasets and Dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [54]:
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau

# Define a simple feedforward model
class SimpleNet(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

In [55]:
# Initialize the model, loss, and optimizer
model = SimpleNet(input_dim=X_train.shape[1])
criterion = nn.BCELoss() # Binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Learning Rate scheduler
# StepLR
# Decrease the learning rate every 10 epochs by a factor of 0.1
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

# ReduceLROnPlateaus
# scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)


# Training loop 
epochs = 100
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.view(-1) # Flatten for loss calculation
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs).view(-1)
            loss = criterion(outputs, targets)
            val_loss += loss.item() * inputs.size(0)

            preds = (outputs >= 0.5).float()
            val_correct += (preds == targets).sum().item()
            val_total += targets.size(0)
    val_loss /= len(val_loader.dataset)
    val_accuracy = val_correct / val_total

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

    # Update the learning rate using scheduler
    scheduler.step()

Epoch 1/100, Train Loss: 0.6094, Val Loss: 0.5907, Val Acc: 0.6821
Epoch 2/100, Train Loss: 0.5914, Val Loss: 0.5920, Val Acc: 0.6826
Epoch 3/100, Train Loss: 0.5877, Val Loss: 0.5913, Val Acc: 0.6844
Epoch 4/100, Train Loss: 0.5846, Val Loss: 0.5914, Val Acc: 0.6832
Epoch 5/100, Train Loss: 0.5800, Val Loss: 0.5950, Val Acc: 0.6859
Epoch 6/100, Train Loss: 0.5758, Val Loss: 0.5927, Val Acc: 0.6847
Epoch 7/100, Train Loss: 0.5706, Val Loss: 0.5948, Val Acc: 0.6818
Epoch 8/100, Train Loss: 0.5650, Val Loss: 0.6003, Val Acc: 0.6762
Epoch 9/100, Train Loss: 0.5569, Val Loss: 0.6060, Val Acc: 0.6832
Epoch 10/100, Train Loss: 0.5493, Val Loss: 0.6134, Val Acc: 0.6767
Epoch 11/100, Train Loss: 0.5273, Val Loss: 0.6099, Val Acc: 0.6740
Epoch 12/100, Train Loss: 0.5223, Val Loss: 0.6136, Val Acc: 0.6747
Epoch 13/100, Train Loss: 0.5192, Val Loss: 0.6141, Val Acc: 0.6705
Epoch 14/100, Train Loss: 0.5168, Val Loss: 0.6189, Val Acc: 0.6729
Epoch 15/100, Train Loss: 0.5145, Val Loss: 0.6216, Val A

In [56]:
class ImprovedNet(nn.Module):
    def __init__(self, input_dim):
        super(ImprovedNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32,1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.dropout(self.relu(self.bn1(self.fc1(x))))
        x = self.dropout(self.relu(self.bn2(self.fc2(x))))
        x = self.relu(self.fc3(x))
        x = self.sigmoid(self.fc4(x))
        return x

def training(model, criterion, optimizer, epochs, train_loader, val_loader, device):
    # Move model to the specified device
    model = model.to(device)
    
    # Learning Rate Scheduler
    scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
    history = {
        'train_loss': [],
        'val_loss': [],
        'val_accuracy': [],
        'val_auc': []
    }
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs).view(-1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / len(train_loader.dataset)

        # Validation
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        all_targets = []
        all_outputs = []
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)

                outputs = model(inputs).view(-1)
                loss = criterion(outputs, targets)
                val_loss += loss.item() * inputs.size(0)

                # Collect outputs for metrics
                preds = (outputs >= 0.5).float()
                val_correct += (preds == targets).sum().item()
                val_total += targets.size(0)

                all_targets.extend(targets.cpu().numpy())
                all_outputs.extend(outputs.cpu().numpy())

        epoch_val_loss = val_loss / len(val_loader.dataset)
        epoch_val_accuracy = val_correct / val_total
        epoch_val_auc = roc_auc_score(all_targets, all_outputs)

        # Log metrics
        history['train_loss'].append(epoch_loss)
        history['val_loss'].append(epoch_val_loss)
        history['val_accuracy'].append(epoch_val_accuracy)
        history['val_auc'].append(epoch_val_auc)

        print(
            f"Epoch {epoch + 1}/{epochs}, "
            f"Train Loss: {epoch_loss:.4f}, "
            f"Val Loss: {epoch_val_loss:.4f}, "
            f"Val Acc: {epoch_val_accuracy:.4f}, "
            f"Val AUC: {epoch_val_auc:.4f}"
        )

        # Update the learning rate using scheduler
        scheduler.step()
    return history


In [57]:
# Initialize the model, loss, and optimizer
model = ImprovedNet(input_dim=X_train.shape[1])
criterion = nn.BCELoss() # Binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

training(model, criterion, optimizer, 100, train_loader, val_loader, 'mps')

Epoch 1/100, Train Loss: 0.6156, Val Loss: 0.5959, Val Acc: 0.6830, Val AUC: 0.7459
Epoch 2/100, Train Loss: 0.5989, Val Loss: 0.5929, Val Acc: 0.6814, Val AUC: 0.7470
Epoch 3/100, Train Loss: 0.5933, Val Loss: 0.5912, Val Acc: 0.6826, Val AUC: 0.7523
Epoch 4/100, Train Loss: 0.5917, Val Loss: 0.5906, Val Acc: 0.6847, Val AUC: 0.7503
Epoch 5/100, Train Loss: 0.5876, Val Loss: 0.5897, Val Acc: 0.6856, Val AUC: 0.7507
Epoch 6/100, Train Loss: 0.5848, Val Loss: 0.5902, Val Acc: 0.6818, Val AUC: 0.7499
Epoch 7/100, Train Loss: 0.5799, Val Loss: 0.5941, Val Acc: 0.6818, Val AUC: 0.7465
Epoch 8/100, Train Loss: 0.5775, Val Loss: 0.5973, Val Acc: 0.6793, Val AUC: 0.7444
Epoch 9/100, Train Loss: 0.5757, Val Loss: 0.5988, Val Acc: 0.6786, Val AUC: 0.7447
Epoch 10/100, Train Loss: 0.5712, Val Loss: 0.5923, Val Acc: 0.6832, Val AUC: 0.7475
Epoch 11/100, Train Loss: 0.5619, Val Loss: 0.5943, Val Acc: 0.6845, Val AUC: 0.7477
Epoch 12/100, Train Loss: 0.5549, Val Loss: 0.5965, Val Acc: 0.6849, Val A

{'train_loss': [0.6156109659622113,
  0.5989260077890423,
  0.5932925513221158,
  0.5916935698025756,
  0.5875923407988416,
  0.5847792810036077,
  0.5799034841772582,
  0.5775483250617981,
  0.5757449288335111,
  0.5711507110546032,
  0.5618560186276833,
  0.5549414319710599,
  0.5567457901520861,
  0.5542651701304647,
  0.5539981724901332,
  0.5516702938824892,
  0.5508494520352947,
  0.5489254092176755,
  0.5482638561477263,
  0.5493227368427648,
  0.545513176628285,
  0.5472696130888329,
  0.5462312327076991,
  0.5450683237777816,
  0.5453108797884649,
  0.5451272029843596,
  0.5454674000127449,
  0.5435827443169223,
  0.5464668213907216,
  0.5418995574530628,
  0.5444395083934068,
  0.5396172458926837,
  0.5450145694116751,
  0.5435142395810948,
  0.5430690756688515,
  0.5427789232383172,
  0.5456837864385711,
  0.5429650818722116,
  0.5444932873050372,
  0.5449116095900536,
  0.5438728771275945,
  0.5427821226004097,
  0.543816267988748,
  0.5435190946277645,
  0.5442279798703061