In [62]:
import pandas as pd
import scipy.stats as stats
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [63]:
train_data = pd.read_csv('Data/train.csv')
test_data = pd.read_csv('Data/test.csv')
data_dict = pd.read_csv('Data/data_dictionary.csv')
sample_submission = pd.read_csv('Data/sample_submission.csv')

In [64]:
for column in train_data.columns:
    if column == 'ID':
        pass
    else:
        print(f"{train_data[column].value_counts(dropna=False)}")
        print()

dri_score
Intermediate                                         10436
N/A - pediatric                                       4779
High                                                  4701
N/A - non-malignant indication                        2427
TBD cytogenetics                                      2003
Low                                                   1926
High - TED AML case <missing cytogenetics             1414
Intermediate - TED AML case <missing cytogenetics      481
N/A - disease not classifiable                         272
Very high                                              198
NaN                                                    154
Missing disease status                                   9
Name: count, dtype: int64

psych_disturb
No          23005
Yes          3587
NaN          2062
Not done      146
Name: count, dtype: int64

cyto_score
Poor            8802
NaN             8068
Intermediate    6376
Favorable       3011
TBD             1341
Normal           643
Other

In [65]:
train_data.columns

Index(['ID', 'dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
       'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
       'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue',
       'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status',
       'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6',
       'hla_match_c_low', 'rituximab', 'hla_match_drb1_low',
       'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail',
       'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct',
       'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe',
       'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer',
       'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue',
       'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score',
       'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related',
       'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_hi

In [66]:
data_dict

Unnamed: 0,variable,description,type,values
0,dri_score,Refined disease risk index,Categorical,['Intermediate' 'High' 'N/A - non-malignant in...
1,psych_disturb,Psychiatric disturbance,Categorical,['Yes' 'No' nan 'Not done']
2,cyto_score,Cytogenetic score,Categorical,['Intermediate' 'Favorable' 'Poor' 'TBD' nan '...
3,diabetes,Diabetes,Categorical,['No' 'Yes' nan 'Not done']
4,hla_match_c_high,Recipient / 1st donor allele level (high resol...,Numerical,
5,hla_high_res_8,Recipient / 1st donor allele-level (high resol...,Numerical,
6,tbi_status,TBI,Categorical,"['No TBI' 'TBI + Cy +- Other' 'TBI +- Other, <..."
7,arrhythmia,Arrhythmia,Categorical,['No' nan 'Yes' 'Not done']
8,hla_low_res_6,Recipient / 1st donor antigen-level (low resol...,Numerical,
9,graft_type,Graft type,Categorical,['Peripheral blood' 'Bone marrow']


In [67]:
# Check correlation for numerical columns
correlations = train_data.corr(numeric_only=True)['efs'].sort_values(ascending=False)
correlations

efs                    1.000000
age_at_hct             0.227866
comorbidity_score      0.145723
hla_match_drb1_high    0.074446
hla_match_drb1_low     0.067485
hla_high_res_10        0.048431
donor_age              0.047566
hla_low_res_6          0.047007
hla_high_res_6         0.045261
hla_high_res_8         0.044224
hla_low_res_10         0.041972
hla_match_b_low        0.040965
hla_low_res_8          0.040612
hla_nmdp_6             0.040145
hla_match_dqb1_high    0.039736
hla_match_dqb1_low     0.032440
hla_match_c_high       0.031850
hla_match_b_high       0.026293
hla_match_c_low        0.016335
hla_match_a_high       0.015881
hla_match_a_low        0.015086
ID                    -0.003517
karnofsky_score       -0.091156
year_hct              -0.106419
efs_time              -0.741042
Name: efs, dtype: float64

In [68]:
# Statistical Test for categorical variables
# low p-value suggest a relationship between the categorical column and efs
for col in train_data.columns:
    if col != 'efs' and train_data[col].dtype == 'object':
        contingency = pd.crosstab(train_data[col], train_data['efs'])
        chi2, p, dof, expected = stats.chi2_contingency(contingency)
        print(f"{col}: p-value={p}")

dri_score: p-value=0.0
psych_disturb: p-value=1.2979884206277078e-26
cyto_score: p-value=0.0017695925410587407
diabetes: p-value=9.473517746062939e-31
tbi_status: p-value=3.872202121618803e-17
arrhythmia: p-value=1.4455281691857336e-13
graft_type: p-value=0.0
vent_hist: p-value=0.5894941475545321
renal_issue: p-value=0.0008044609682586662
pulm_severe: p-value=1.1740716320072957e-38
prim_disease_hct: p-value=0.0
cmv_status: p-value=2.9962065563817596e-63
tce_imm_match: p-value=0.0034450120733785987
rituximab: p-value=0.38833316629745207
prod_type: p-value=0.0
cyto_score_detail: p-value=2.2303455266813332e-39
conditioning_intensity: p-value=8.244812565600893e-75
ethnicity: p-value=1.3456203538403933e-12
obesity: p-value=3.0830353189626647e-11
mrd_hct: p-value=0.002937508875019472
in_vivo_tcd: p-value=2.2959109320915427e-169
tce_match: p-value=7.401409152266494e-06
hepatic_severe: p-value=1.5564351873948503e-43
prior_tumor: p-value=1.6392138057148511e-59
peptic_ulcer: p-value=7.3711450891

# Handling NaN values

In [69]:
for col in train_data.columns:
    if col == 'ID':
        continue
    elif col != 'efs' and train_data[col].dtype == 'object':
        # Fill missing value with 'Missing'
        train_data[col].fillna('Missing', inplace=True)
    elif pd.api.types.is_numeric_dtype(train_data[col]):
        # Calculate the mode of the column
        mode_value = train_data[col].mode().iloc[0]
        train_data[col].fillna(mode_value, inplace=True)

In [70]:
for column in train_data.columns:
    if column == 'ID':
        pass
    else:
        print(f"{train_data[column].value_counts(dropna=False)}")
        print()

dri_score
Intermediate                                         10436
N/A - pediatric                                       4779
High                                                  4701
N/A - non-malignant indication                        2427
TBD cytogenetics                                      2003
Low                                                   1926
High - TED AML case <missing cytogenetics             1414
Intermediate - TED AML case <missing cytogenetics      481
N/A - disease not classifiable                         272
Very high                                              198
Missing                                                154
Missing disease status                                   9
Name: count, dtype: int64

psych_disturb
No          23005
Yes          3587
Missing      2062
Not done      146
Name: count, dtype: int64

cyto_score
Poor            8802
Missing         8068
Intermediate    6376
Favorable       3011
TBD             1341
Normal           643
Other

# Simple NN

In [76]:
# Separate target and drop unneeded columns
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Identify categorical and numerical columns
cat_cols = [col for col in X.columns if X[col].dtype=='object']
num_cols = [col for col in X.columns if pd.api.types.is_numeric_dtype(X[col])]

# For categorical: fill with "Missing" (if not already done)
for col in cat_cols:
    X[col].fillna('Missing', inplace=True)

# For numeric: fill with mode
for col in num_cols:
    if X[col].isna().any():
        X[col].fillna(X[col].median(), inplace=True)

# Encode categorical variables
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric featuers
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Ensure all numerica
X = X.astype(np.float32)


# Convert to NumPy arrays
X_array = X.values.astype(np.float32)
y_array = y.values.astype(np.float32)

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size=0.2, random_state=42
)

# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

# Create Datasets and Dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [80]:
# Define a simple feedforward model
class SimpleNet(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

In [84]:
# Initialize the model, loss, and optimizer
model = SimpleNet(input_dim=X_train.shape[1])
criterion = nn.BCELoss() # Binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop 
epochs = 100
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.view(-1) # Flatten for loss calculation
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)

    # Validation
    model.eval()
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs).view(-1)
            preds = (outputs >= 0.5).float()
            val_correct += (preds==targets).sum().item()
            val_total += targets.size(0)
    val_accuracy = val_correct / val_total

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {epoch_loss:.4f}, Val Acc: {val_accuracy:.4f}")

Epoch 1/100, Train Loss: 0.6072, Val Acc: 0.6807
Epoch 2/100, Train Loss: 0.5911, Val Acc: 0.6773
Epoch 3/100, Train Loss: 0.5866, Val Acc: 0.6813
Epoch 4/100, Train Loss: 0.5826, Val Acc: 0.6844
Epoch 5/100, Train Loss: 0.5763, Val Acc: 0.6806
Epoch 6/100, Train Loss: 0.5699, Val Acc: 0.6802
Epoch 7/100, Train Loss: 0.5609, Val Acc: 0.6717
Epoch 8/100, Train Loss: 0.5527, Val Acc: 0.6701
Epoch 9/100, Train Loss: 0.5417, Val Acc: 0.6729
Epoch 10/100, Train Loss: 0.5321, Val Acc: 0.6701
Epoch 11/100, Train Loss: 0.5212, Val Acc: 0.6693
Epoch 12/100, Train Loss: 0.5112, Val Acc: 0.6674
Epoch 13/100, Train Loss: 0.4983, Val Acc: 0.6517
Epoch 14/100, Train Loss: 0.4872, Val Acc: 0.6559
Epoch 15/100, Train Loss: 0.4775, Val Acc: 0.6611
Epoch 16/100, Train Loss: 0.4661, Val Acc: 0.6556
Epoch 17/100, Train Loss: 0.4544, Val Acc: 0.6536
Epoch 18/100, Train Loss: 0.4458, Val Acc: 0.6524
Epoch 19/100, Train Loss: 0.4368, Val Acc: 0.6528
Epoch 20/100, Train Loss: 0.4247, Val Acc: 0.6483
Epoch 21/