In [1]:
import pandas as pd
import scipy.stats as stats
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [2]:
train_data = pd.read_csv('Data/train.csv')
test_data = pd.read_csv('Data/test.csv')
data_dict = pd.read_csv('Data/data_dictionary.csv')
sample_submission = pd.read_csv('Data/sample_submission.csv')

In [3]:
for column in train_data.columns:
    if column == 'ID':
        pass
    else:
        print(f"{train_data[column].value_counts(dropna=False)}")
        print()

dri_score
Intermediate                                         10436
N/A - pediatric                                       4779
High                                                  4701
N/A - non-malignant indication                        2427
TBD cytogenetics                                      2003
Low                                                   1926
High - TED AML case <missing cytogenetics             1414
Intermediate - TED AML case <missing cytogenetics      481
N/A - disease not classifiable                         272
Very high                                              198
NaN                                                    154
Missing disease status                                   9
Name: count, dtype: int64

psych_disturb
No          23005
Yes          3587
NaN          2062
Not done      146
Name: count, dtype: int64

cyto_score
Poor            8802
NaN             8068
Intermediate    6376
Favorable       3011
TBD             1341
Normal           643
Other

In [4]:
train_data.columns

Index(['ID', 'dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
       'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
       'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue',
       'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status',
       'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6',
       'hla_match_c_low', 'rituximab', 'hla_match_drb1_low',
       'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail',
       'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct',
       'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe',
       'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer',
       'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue',
       'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score',
       'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related',
       'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_hi

In [5]:
data_dict

Unnamed: 0,variable,description,type,values
0,dri_score,Refined disease risk index,Categorical,['Intermediate' 'High' 'N/A - non-malignant in...
1,psych_disturb,Psychiatric disturbance,Categorical,['Yes' 'No' nan 'Not done']
2,cyto_score,Cytogenetic score,Categorical,['Intermediate' 'Favorable' 'Poor' 'TBD' nan '...
3,diabetes,Diabetes,Categorical,['No' 'Yes' nan 'Not done']
4,hla_match_c_high,Recipient / 1st donor allele level (high resol...,Numerical,
5,hla_high_res_8,Recipient / 1st donor allele-level (high resol...,Numerical,
6,tbi_status,TBI,Categorical,"['No TBI' 'TBI + Cy +- Other' 'TBI +- Other, <..."
7,arrhythmia,Arrhythmia,Categorical,['No' nan 'Yes' 'Not done']
8,hla_low_res_6,Recipient / 1st donor antigen-level (low resol...,Numerical,
9,graft_type,Graft type,Categorical,['Peripheral blood' 'Bone marrow']


In [6]:
# Check correlation for numerical columns
correlations = train_data.corr(numeric_only=True)['efs'].sort_values(ascending=False)
correlations

efs                    1.000000
age_at_hct             0.227866
comorbidity_score      0.145723
hla_match_drb1_high    0.074446
hla_match_drb1_low     0.067485
hla_high_res_10        0.048431
donor_age              0.047566
hla_low_res_6          0.047007
hla_high_res_6         0.045261
hla_high_res_8         0.044224
hla_low_res_10         0.041972
hla_match_b_low        0.040965
hla_low_res_8          0.040612
hla_nmdp_6             0.040145
hla_match_dqb1_high    0.039736
hla_match_dqb1_low     0.032440
hla_match_c_high       0.031850
hla_match_b_high       0.026293
hla_match_c_low        0.016335
hla_match_a_high       0.015881
hla_match_a_low        0.015086
ID                    -0.003517
karnofsky_score       -0.091156
year_hct              -0.106419
efs_time              -0.741042
Name: efs, dtype: float64

In [7]:
# Statistical Test for categorical variables
# low p-value suggest a relationship between the categorical column and efs
for col in train_data.columns:
    if col != 'efs' and train_data[col].dtype == 'object':
        contingency = pd.crosstab(train_data[col], train_data['efs'])
        chi2, p, dof, expected = stats.chi2_contingency(contingency)
        print(f"{col}: p-value={p}")

dri_score: p-value=0.0
psych_disturb: p-value=1.2979884206277078e-26
cyto_score: p-value=0.0017695925410587405
diabetes: p-value=9.473517746062939e-31
tbi_status: p-value=3.872202121618774e-17
arrhythmia: p-value=1.4455281691857336e-13
graft_type: p-value=0.0
vent_hist: p-value=0.5894941475545321
renal_issue: p-value=0.0008044609682586662
pulm_severe: p-value=1.1740716320072957e-38
prim_disease_hct: p-value=0.0
cmv_status: p-value=2.996206556381759e-63
tce_imm_match: p-value=0.003445012073378599
rituximab: p-value=0.38833316629745207
prod_type: p-value=0.0
cyto_score_detail: p-value=2.2303455266813332e-39
conditioning_intensity: p-value=8.244812565600892e-75
ethnicity: p-value=1.3456203538403933e-12
obesity: p-value=3.083035318962666e-11
mrd_hct: p-value=0.00293750887501947
in_vivo_tcd: p-value=2.2959109320915436e-169
tce_match: p-value=7.401409152266478e-06
hepatic_severe: p-value=1.5564351873948503e-43
prior_tumor: p-value=1.6392138057148511e-59
peptic_ulcer: p-value=7.37114508919722

# Handling NaN values

In [8]:
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier



def handle_missing_values(data):
    # Separating categorical and numeric columns
    cat_cols = [col for col in data.columns if data[col].dtype == 'object']
    num_cols = [col for col in data.columns if col not in cat_cols]

    # Handling missing numerical data with KNNImputer
    knn_imputer = KNNImputer(n_neighbors=5)
    data[num_cols] = knn_imputer.fit_transform(data[num_cols])

    # Imputing categorical data with KNeighborsClassifier
    for col in cat_cols:
        missing_mask = data[col].isna()
        if missing_mask.sum() > 0:
            # Separate training and rpediction sets
            X_train = data.loc[~missing_mask, num_cols]
            y_train = data.loc[~missing_mask, col]
            X_missing = data.loc[missing_mask, num_cols]

            # Train a KNeighborsClassifier
            knn = KNeighborsClassifier(n_neighbors=5)
            knn.fit(X_train, y_train)
            imputed_values = knn.predict(X_missing)

            # Fill missing values
            data.loc[missing_mask, col] = imputed_values

    return data

train_data = handle_missing_values(train_data)
train_data.head()

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,vent_hist,renal_issue,pulm_severe,prim_disease_hct,hla_high_res_6,cmv_status,hla_high_res_10,hla_match_dqb1_high,tce_imm_match,hla_nmdp_6,hla_match_c_low,rituximab,hla_match_drb1_low,hla_match_dqb1_low,prod_type,cyto_score_detail,conditioning_intensity,ethnicity,year_hct,obesity,mrd_hct,in_vivo_tcd,tce_match,hla_match_a_high,hepatic_severe,donor_age,prior_tumor,hla_match_b_low,peptic_ulcer,age_at_hct,hla_match_a_low,gvhd_proph,rheum_issue,sex_match,hla_match_b_high,race_group,comorbidity_score,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,0.0,N/A - non-malignant indication,No,Poor,No,2.0,7.0,No TBI,No,6.0,Bone marrow,No,No,No,IEA,6.0,+/+,8.8,2.0,P/P,6.0,2.0,No,2.0,2.0,BM,Intermediate,MAC,Not Hispanic or Latino,2016.0,No,Negative,Yes,Permissive,2.0,No,38.3438,No,2.0,No,9.942,2.0,FKalone,No,M-F,2.0,More than one race,0.0,90.0,No,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,1.0,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,Peripheral blood,No,No,No,AML,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,PB,Intermediate,MAC,Not Hispanic or Latino,2008.0,No,Positive,No,Permissive,2.0,No,72.29,No,2.0,No,43.705,2.0,Other GVHD Prophylaxis,No,F-F,2.0,Asian,3.0,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,2.0,N/A - non-malignant indication,No,Poor,No,2.0,8.0,No TBI,No,6.0,Bone marrow,No,No,No,HIS,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,BM,Intermediate,MAC,Not Hispanic or Latino,2019.0,No,Negative,Yes,Permissive,2.0,No,34.4604,No,2.0,No,33.997,2.0,Cyclophosphamide alone,No,F-M,2.0,More than one race,0.0,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793
3,3.0,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,Bone marrow,No,No,No,ALL,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,BM,Intermediate,MAC,Not Hispanic or Latino,2009.0,No,Positive,No,Permissive,2.0,No,29.23,No,2.0,No,43.245,2.0,FK+ MMF +- others,No,M-M,2.0,White,0.0,90.0,Yes,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349
4,4.0,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,Peripheral blood,No,No,No,MPN,6.0,+/+,10.0,2.0,P/P,5.0,2.0,No,2.0,2.0,PB,Intermediate,MAC,Hispanic or Latino,2018.0,No,Negative,Yes,Permissive,2.0,No,56.81,No,2.0,No,29.74,2.0,TDEPLETION +- other,No,M-F,2.0,American Indian or Alaska Native,1.0,90.0,No,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.0,16.223


In [9]:
for column in train_data.columns:
    if column == 'ID':
        pass
    else:
        print(f"{train_data[column].value_counts(dropna=False)}")
        print()

dri_score
Intermediate                                         10505
N/A - pediatric                                       4808
High                                                  4737
N/A - non-malignant indication                        2435
TBD cytogenetics                                      2007
Low                                                   1929
High - TED AML case <missing cytogenetics             1417
Intermediate - TED AML case <missing cytogenetics      483
N/A - disease not classifiable                         272
Very high                                              198
Missing disease status                                   9
Name: count, dtype: int64

psych_disturb
No          25017
Yes          3637
Not done      146
Name: count, dtype: int64

cyto_score
Poor            12878
Intermediate     9179
Favorable        3910
TBD              1511
Normal            716
Other             551
Not tested         55
Name: count, dtype: int64

diabetes
No          24212


# Simple NN

In [13]:
# Separate target and drop unneeded columns
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Encode categorical variables
cat_cols = [col for col in X.columns if X[col].dtype == 'object']
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric featuers
num_cols = [col for col in X.columns if col not in cat_cols]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Ensure all numerica
X = X.astype(np.float32)


# Convert to NumPy arrays
X_array = X.values.astype(np.float32)
y_array = y.values.astype(np.float32)

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size=0.2, random_state=42
)

# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

# Create Datasets and Dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [14]:
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau

# Define a simple feedforward model
class SimpleNet(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

In [15]:
# Initialize the model, loss, and optimizer
model = SimpleNet(input_dim=X_train.shape[1])
criterion = nn.BCELoss() # Binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Learning Rate scheduler
# StepLR
# Decrease the learning rate every 10 epochs by a factor of 0.1
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

# ReduceLROnPlateaus
# scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)


# Training loop 
epochs = 100
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.view(-1) # Flatten for loss calculation
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs).view(-1)
            loss = criterion(outputs, targets)
            val_loss += loss.item() * inputs.size(0)

            preds = (outputs >= 0.5).float()
            val_correct += (preds == targets).sum().item()
            val_total += targets.size(0)
    val_loss /= len(val_loader.dataset)
    val_accuracy = val_correct / val_total

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

    # Update the learning rate using scheduler
    scheduler.step()

Epoch 1/100, Train Loss: 0.6061, Val Loss: 0.5920, Val Acc: 0.6809
Epoch 2/100, Train Loss: 0.5842, Val Loss: 0.5936, Val Acc: 0.6821
Epoch 3/100, Train Loss: 0.5761, Val Loss: 0.5969, Val Acc: 0.6802
Epoch 4/100, Train Loss: 0.5681, Val Loss: 0.6000, Val Acc: 0.6783
Epoch 5/100, Train Loss: 0.5570, Val Loss: 0.6068, Val Acc: 0.6786
Epoch 6/100, Train Loss: 0.5455, Val Loss: 0.6148, Val Acc: 0.6722
Epoch 7/100, Train Loss: 0.5310, Val Loss: 0.6189, Val Acc: 0.6628
Epoch 8/100, Train Loss: 0.5145, Val Loss: 0.6405, Val Acc: 0.6564
Epoch 9/100, Train Loss: 0.4969, Val Loss: 0.6599, Val Acc: 0.6660
Epoch 10/100, Train Loss: 0.4821, Val Loss: 0.6781, Val Acc: 0.6569
Epoch 11/100, Train Loss: 0.4389, Val Loss: 0.6827, Val Acc: 0.6609
Epoch 12/100, Train Loss: 0.4285, Val Loss: 0.6926, Val Acc: 0.6545
Epoch 13/100, Train Loss: 0.4226, Val Loss: 0.7004, Val Acc: 0.6528
Epoch 14/100, Train Loss: 0.4186, Val Loss: 0.7082, Val Acc: 0.6550
Epoch 15/100, Train Loss: 0.4146, Val Loss: 0.7157, Val A

In [16]:
class ImprovedNet(nn.Module):
    def __init__(self, input_dim):
        super(ImprovedNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32,1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.dropout(self.relu(self.bn1(self.fc1(x))))
        x = self.dropout(self.relu(self.bn2(self.fc2(x))))
        x = self.relu(self.fc3(x))
        x = self.sigmoid(self.fc4(x))
        return x

def training(model, criterion, optimizer, epochs, train_loader, val_loader, device):
    """
    Train and evaluate the model.

    Args:
        model: PyTorch model to train.
        train_loader = DataLoader for the training dataset.
        val_loader = DataLoader for the validation dataset.
        criterion: Loss function.
        optimizer: Optimizer for training the model.
        scheduler: Learning rate scheduler (optional).
        epochs: Number of epochs to train.
        device: Device to use for training ('cpu' or 'mps')

    Returns:
        history: Dictionary containing training and validation metrics
    """
    
    # Move model to the specified device
    model = model.to(device)
    
    # Learning Rate Scheduler
    scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
    history = {
        'train_loss': [],
        'val_loss': [],
        'val_accuracy': [],
        'val_auc': []
    }
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs).view(-1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / len(train_loader.dataset)

        # Validation
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        all_targets = []
        all_outputs = []
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)

                outputs = model(inputs).view(-1)
                loss = criterion(outputs, targets)
                val_loss += loss.item() * inputs.size(0)

                # Collect outputs for metrics
                preds = (outputs >= 0.5).float()
                val_correct += (preds == targets).sum().item()
                val_total += targets.size(0)

                all_targets.extend(targets.cpu().numpy())
                all_outputs.extend(outputs.cpu().numpy())

        epoch_val_loss = val_loss / len(val_loader.dataset)
        epoch_val_accuracy = val_correct / val_total
        epoch_val_auc = roc_auc_score(all_targets, all_outputs)

        # Log metrics
        history['train_loss'].append(epoch_loss)
        history['val_loss'].append(epoch_val_loss)
        history['val_accuracy'].append(epoch_val_accuracy)
        history['val_auc'].append(epoch_val_auc)

        print(
            f"Epoch {epoch + 1}/{epochs}, "
            f"Train Loss: {epoch_loss:.4f}, "
            f"Val Loss: {epoch_val_loss:.4f}, "
            f"Val Acc: {epoch_val_accuracy:.4f}, "
            f"Val AUC: {epoch_val_auc:.4f}"
        )

        # Update the learning rate using scheduler
        scheduler.step()
    return history


In [17]:
# Initialize the model, loss, and optimizer
model = ImprovedNet(input_dim=X_train.shape[1])
criterion = nn.BCELoss() # Binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

training(model, criterion, optimizer, 100, train_loader, val_loader, 'mps')

Epoch 1/100, Train Loss: 0.6180, Val Loss: 0.5935, Val Acc: 0.6835, Val AUC: 0.7467
Epoch 2/100, Train Loss: 0.5980, Val Loss: 0.5912, Val Acc: 0.6776, Val AUC: 0.7497
Epoch 3/100, Train Loss: 0.5900, Val Loss: 0.5933, Val Acc: 0.6839, Val AUC: 0.7472
Epoch 4/100, Train Loss: 0.5867, Val Loss: 0.5919, Val Acc: 0.6861, Val AUC: 0.7482
Epoch 5/100, Train Loss: 0.5853, Val Loss: 0.5938, Val Acc: 0.6839, Val AUC: 0.7457
Epoch 6/100, Train Loss: 0.5826, Val Loss: 0.5958, Val Acc: 0.6833, Val AUC: 0.7441
Epoch 7/100, Train Loss: 0.5766, Val Loss: 0.5961, Val Acc: 0.6800, Val AUC: 0.7456
Epoch 8/100, Train Loss: 0.5734, Val Loss: 0.5946, Val Acc: 0.6809, Val AUC: 0.7442
Epoch 9/100, Train Loss: 0.5696, Val Loss: 0.5979, Val Acc: 0.6826, Val AUC: 0.7412
Epoch 10/100, Train Loss: 0.5672, Val Loss: 0.5999, Val Acc: 0.6790, Val AUC: 0.7422
Epoch 11/100, Train Loss: 0.5507, Val Loss: 0.6002, Val Acc: 0.6800, Val AUC: 0.7432
Epoch 12/100, Train Loss: 0.5503, Val Loss: 0.5995, Val Acc: 0.6813, Val A

{'train_loss': [0.6179516276965539,
  0.5980288926098082,
  0.5900237810280587,
  0.5867442074335284,
  0.5852958448645141,
  0.5825939693384701,
  0.5766428709857994,
  0.573411158969005,
  0.5695665892627504,
  0.5671504690001409,
  0.5506664777381552,
  0.5502710496385892,
  0.5443323351442814,
  0.5472971537460883,
  0.5464630154685841,
  0.5428280375070043,
  0.5402270932578378,
  0.5392507989787393,
  0.5394355360418558,
  0.5354655121349626,
  0.5351498833133115,
  0.5340063131931755,
  0.5317423617260324,
  0.5342924041880501,
  0.5319779591427909,
  0.534930585945646,
  0.5358179390430451,
  0.5343284432258871,
  0.5313075054436922,
  0.5336618578268422,
  0.5320641698108779,
  0.5319516191879908,
  0.5337891175515121,
  0.53324379461507,
  0.5336062758333153,
  0.5349020741052097,
  0.5307842476086484,
  0.531680108482639,
  0.5338254898786545,
  0.5339516227444013,
  0.5336005765944719,
  0.5298053786158562,
  0.5329290395809545,
  0.5339242584589455,
  0.5326352327234215,
 

# Non traditional method

## XGBoost

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
import xgboost as xgb

train_data = pd.read_csv('Data/train.csv')

# Handle missing values 
train_data = handle_missing_values(train_data)

# Separate target and features
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Encode categorical variables
cat_cols = [col for col in X.columns if X[col].dtype=='object']
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric features
num_cols = [col for col in X.columns if X[col].dtype != 'uint8']
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Convert to NumPy arrays
X_array = X.values
y_array = y.values

# Split into train and test sets
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size = 0.2, random_state=42
)

# Convert to DMatrix for XGBoost
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
val_dmatrix = xgb.DMatrix(X_val, label=y_val)

# Define XGBoost paramters
params = {
    "objective" : "binary:logistic", # Binary classification
    "eval_metric" : "auc",           # Evaluation metric
    "eta" : 0.1,                     # Learning rate
    "max_depth" : 6,                 # Max depth of tress
    "subsample" : 0.8,               # Row sampling
    "colsample_bytree" : 0.8,        # Feature Sampling
    "lambda" : 1,                    # L2 regularization
    "alpha" : 0                      # L1 regularization
}

# Trian the model
evals = [(train_dmatrix, 'train'), (val_dmatrix, 'eval')]
num_boost_round = 200
early_stopping_rounds = 10

xgb_model = xgb.train(
    params=params,
    dtrain=train_dmatrix,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=10
)

# Make predictions on validation set
val_preds = xgb_model.predict(val_dmatrix)
val_preds_binary = (val_preds >= 0.5).astype(int)

# Evaluate model
auc = roc_auc_score(y_val, val_preds)
acc = accuracy_score(y_val, val_preds_binary)

print(f"Validation AUC: {auc:.4f}")
print(f"Validation ACC: {acc:.4f}")

[0]	train-auc:0.69296	eval-auc:0.66999
[10]	train-auc:0.75606	eval-auc:0.72814
[20]	train-auc:0.77368	eval-auc:0.73655
[30]	train-auc:0.78955	eval-auc:0.74152
[40]	train-auc:0.80433	eval-auc:0.74609
[50]	train-auc:0.81473	eval-auc:0.74908
[60]	train-auc:0.82327	eval-auc:0.75004
[69]	train-auc:0.83192	eval-auc:0.75000
Validation AUC: 0.7500
Validation ACC: 0.6839


## LightGBM

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
import lightgbm as lgb

# load the data
train_data = pd.read_csv('Data/train.csv')

# Handle missing values
train_data = handle_missing_values(train_data)

# Separate target and features
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Encode categorical variables
cat_cols = [col for col in X.columns if X[col].dtype=='object']
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric features
num_cols = [col for col in X.columns if X[col].dtype!='uint8']
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Convert to NumPy arrays
X_array = X.values
y_array = y.values

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size=0.2, random_state=42
)

# Convert categorical column indices 
categorical_features = [X.columns.get_loc(col) for col in cat_cols if col in X.columns]

# LightGBM dataset
train_dataset = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
val_dataset = lgb.Dataset(X_val, label=y_val, categorical_feature=categorical_features, reference=train_dataset)

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'auc', # Evaluation metric
    'boosting_type': 'gbdt',
    'num_leaves': 31, # Controls complexity of the tree,
    'learning_rate': 0.1,
    'feature_fraction': 0.8, # Randomly select a fraction of features for each iteration
    'bagging_fraction': 0.8, # Randomly select a fraction of data for each iteration
    'bagging_freq': 5, # Frequency of bagging
    'verbose': -1
}

# Train the model
num_round = 200
early_stopping_rounds = 10
lgb_model = lgb.train(
    params=params,
    train_set=train_dataset,
    num_boost_round=num_round,
    valid_sets=[train_dataset, val_dataset],
    valid_names=['train', 'eval'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=early_stopping_rounds),
        lgb.log_evaluation(10)
    ]
)
# Make predictions on validation set
val_preds = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
val_preds_binary = (val_preds >= 0.5).astype(int)

# Evaluate model
auc = roc_auc_score(y_val, val_preds)
accuracy = accuracy_score(y_val, val_preds_binary)

print(f"Validation AUC: {auc:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

Training until validation scores don't improve for 10 rounds
[10]	train's auc: 0.742947	eval's auc: 0.726273
[20]	train's auc: 0.759245	eval's auc: 0.735371
[30]	train's auc: 0.774242	eval's auc: 0.741467
[40]	train's auc: 0.785755	eval's auc: 0.746016
[50]	train's auc: 0.797085	eval's auc: 0.748616
[60]	train's auc: 0.805966	eval's auc: 0.750482
[70]	train's auc: 0.813671	eval's auc: 0.752309
[80]	train's auc: 0.820852	eval's auc: 0.753117
[90]	train's auc: 0.828277	eval's auc: 0.75346
Early stopping, best iteration is:
[88]	train's auc: 0.826883	eval's auc: 0.753681
Validation AUC: 0.7537
Validation Accuracy: 0.6891


## Random Forest Classifier

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

# Load the data
train_data = pd.read_csv('Data/train.csv')

# Hanlde missing values
train_data = handle_missing_values(train_data)

# Separate target and features 
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Encode categorical variables
cat_cols = [col for col in X.columns if X[col].dtype=='object']
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric ceatures
num_cols = [col for col in X.columns if col not in cat_cols]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Convert to NumPy arrays
X_array = X.values
y_array = y.values

# Split into train and validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size=0.2, random_state=42
)

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(
    n_estimators=100,         # Number of trees in the forest
    max_depth=10,            # Maximum depth of the tree
    min_samples_split=5,     # Maximum samples to split a node
    min_samples_leaf=2,      # Minimum samples at leaf node
    max_features='sqrt',     # Number of features to consider at each split
    random_state=42,         # For reproducibility
    n_jobs=-1,               # Use all available cores for training
)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on validation set
val_preds = rf_model.predict_proba(X_val)[:,1] # Probability of positive class
val_preds_binary = (val_preds >= 0.5).astype(int)

# Evaluate the model
auc = roc_auc_score(y_val, val_preds)
accuracy = accuracy_score(y_val, val_preds_binary)

print(f"Validation AUC: {auc:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

Validation AUC: 0.7294
Validation Accuracy: 0.6648
