In [22]:
import pandas as pd
import scipy.stats as stats
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [23]:
train_data = pd.read_csv('Data/train.csv')
test_data = pd.read_csv('Data/test.csv')
data_dict = pd.read_csv('Data/data_dictionary.csv')
sample_submission = pd.read_csv('Data/sample_submission.csv')

# Handling NaN values

In [24]:
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier



def handle_missing_values(data):
    # Separating categorical and numeric columns
    cat_cols = [col for col in data.columns if data[col].dtype == 'object']
    num_cols = [col for col in data.columns if col not in cat_cols]

    # Handling missing numerical data with KNNImputer
    knn_imputer = KNNImputer(n_neighbors=5)
    data[num_cols] = knn_imputer.fit_transform(data[num_cols])

    # Imputing categorical data with KNeighborsClassifier
    for col in cat_cols:
        missing_mask = data[col].isna()
        if missing_mask.sum() > 0:
            # Separate training and rpediction sets
            X_train = data.loc[~missing_mask, num_cols]
            y_train = data.loc[~missing_mask, col]
            X_missing = data.loc[missing_mask, num_cols]

            # Train a KNeighborsClassifier
            knn = KNeighborsClassifier(n_neighbors=5)
            knn.fit(X_train, y_train)
            imputed_values = knn.predict(X_missing)

            # Fill missing values
            data.loc[missing_mask, col] = imputed_values

    return data

train_data = handle_missing_values(train_data)
train_data.head()

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,0.0,N/A - non-malignant indication,No,Poor,No,2.0,7.0,No TBI,No,6.0,...,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,1.0,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,2.0,N/A - non-malignant indication,No,Poor,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793
3,3.0,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349
4,4.0,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.0,16.223


In [25]:
for column in train_data.columns:
    if column == 'ID':
        pass
    else:
        print(f"{train_data[column].value_counts(dropna=False)}")
        print()

dri_score
Intermediate                                         10505
N/A - pediatric                                       4808
High                                                  4737
N/A - non-malignant indication                        2435
TBD cytogenetics                                      2007
Low                                                   1929
High - TED AML case <missing cytogenetics             1417
Intermediate - TED AML case <missing cytogenetics      483
N/A - disease not classifiable                         272
Very high                                              198
Missing disease status                                   9
Name: count, dtype: int64

psych_disturb
No          25017
Yes          3637
Not done      146
Name: count, dtype: int64

cyto_score
Poor            12878
Intermediate     9179
Favorable        3910
TBD              1511
Normal            716
Other             551
Not tested         55
Name: count, dtype: int64

diabetes
No          24212


# Simple NN

In [26]:
# Separate target and drop unneeded columns
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Encode categorical variables
cat_cols = [col for col in X.columns if X[col].dtype == 'object']
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric featuers
num_cols = [col for col in X.columns if col not in cat_cols]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Ensure all numerica
X = X.astype(np.float32)


# Convert to NumPy arrays
X_array = X.values.astype(np.float32)
y_array = y.values.astype(np.float32)

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size=0.2, random_state=42
)

# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

# Create Datasets and Dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [27]:
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau

# Define a simple feedforward model
class SimpleNet(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

In [28]:
# Initialize the model, loss, and optimizer
model = SimpleNet(input_dim=X_train.shape[1])
criterion = nn.BCELoss() # Binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Learning Rate scheduler
# StepLR
# Decrease the learning rate every 10 epochs by a factor of 0.1
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

# ReduceLROnPlateaus
# scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)


# Training loop 
epochs = 100
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.view(-1) # Flatten for loss calculation
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs).view(-1)
            loss = criterion(outputs, targets)
            val_loss += loss.item() * inputs.size(0)

            preds = (outputs >= 0.5).float()
            val_correct += (preds == targets).sum().item()
            val_total += targets.size(0)
    val_loss /= len(val_loader.dataset)
    val_accuracy = val_correct / val_total

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

    # Update the learning rate using scheduler
    scheduler.step()

Epoch 1/100, Train Loss: 0.6054, Val Loss: 0.5911, Val Acc: 0.6826
Epoch 2/100, Train Loss: 0.5848, Val Loss: 0.5944, Val Acc: 0.6813
Epoch 3/100, Train Loss: 0.5768, Val Loss: 0.5926, Val Acc: 0.6868
Epoch 4/100, Train Loss: 0.5690, Val Loss: 0.5961, Val Acc: 0.6786
Epoch 5/100, Train Loss: 0.5583, Val Loss: 0.6019, Val Acc: 0.6786
Epoch 6/100, Train Loss: 0.5446, Val Loss: 0.6137, Val Acc: 0.6760
Epoch 7/100, Train Loss: 0.5318, Val Loss: 0.6224, Val Acc: 0.6641
Epoch 8/100, Train Loss: 0.5168, Val Loss: 0.6337, Val Acc: 0.6608
Epoch 9/100, Train Loss: 0.5008, Val Loss: 0.6441, Val Acc: 0.6663
Epoch 10/100, Train Loss: 0.4860, Val Loss: 0.6697, Val Acc: 0.6615
Epoch 11/100, Train Loss: 0.4419, Val Loss: 0.6734, Val Acc: 0.6602
Epoch 12/100, Train Loss: 0.4322, Val Loss: 0.6808, Val Acc: 0.6589
Epoch 13/100, Train Loss: 0.4267, Val Loss: 0.6883, Val Acc: 0.6582
Epoch 14/100, Train Loss: 0.4223, Val Loss: 0.6925, Val Acc: 0.6528
Epoch 15/100, Train Loss: 0.4185, Val Loss: 0.6985, Val A

In [29]:
class ImprovedNet(nn.Module):
    def __init__(self, input_dim):
        super(ImprovedNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32,1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.dropout(self.relu(self.bn1(self.fc1(x))))
        x = self.dropout(self.relu(self.bn2(self.fc2(x))))
        x = self.relu(self.fc3(x))
        x = self.sigmoid(self.fc4(x))
        return x

def training(model, criterion, optimizer, epochs, train_loader, val_loader, device):
    """
    Train and evaluate the model.

    Args:
        model: PyTorch model to train.
        train_loader = DataLoader for the training dataset.
        val_loader = DataLoader for the validation dataset.
        criterion: Loss function.
        optimizer: Optimizer for training the model.
        scheduler: Learning rate scheduler (optional).
        epochs: Number of epochs to train.
        device: Device to use for training ('cpu' or 'mps')

    Returns:
        history: Dictionary containing training and validation metrics
    """
    
    # Move model to the specified device
    model = model.to(device)
    
    # Learning Rate Scheduler
    scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
    history = {
        'train_loss': [],
        'val_loss': [],
        'val_accuracy': [],
        'val_auc': []
    }
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs).view(-1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / len(train_loader.dataset)

        # Validation
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        all_targets = []
        all_outputs = []
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)

                outputs = model(inputs).view(-1)
                loss = criterion(outputs, targets)
                val_loss += loss.item() * inputs.size(0)

                # Collect outputs for metrics
                preds = (outputs >= 0.5).float()
                val_correct += (preds == targets).sum().item()
                val_total += targets.size(0)

                all_targets.extend(targets.cpu().numpy())
                all_outputs.extend(outputs.cpu().numpy())

        epoch_val_loss = val_loss / len(val_loader.dataset)
        epoch_val_accuracy = val_correct / val_total
        epoch_val_auc = roc_auc_score(all_targets, all_outputs)

        # Log metrics
        history['train_loss'].append(epoch_loss)
        history['val_loss'].append(epoch_val_loss)
        history['val_accuracy'].append(epoch_val_accuracy)
        history['val_auc'].append(epoch_val_auc)

        print(
            f"Epoch {epoch + 1}/{epochs}, "
            f"Train Loss: {epoch_loss:.4f}, "
            f"Val Loss: {epoch_val_loss:.4f}, "
            f"Val Acc: {epoch_val_accuracy:.4f}, "
            f"Val AUC: {epoch_val_auc:.4f}"
        )

        # Update the learning rate using scheduler
        scheduler.step()
    return history


In [30]:
# Initialize the model, loss, and optimizer
model = ImprovedNet(input_dim=X_train.shape[1])
criterion = nn.BCELoss() # Binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

training(model, criterion, optimizer, 100, train_loader, val_loader, 'mps')

Epoch 1/100, Train Loss: 0.6163, Val Loss: 0.5936, Val Acc: 0.6828, Val AUC: 0.7457
Epoch 2/100, Train Loss: 0.5976, Val Loss: 0.5925, Val Acc: 0.6802, Val AUC: 0.7481
Epoch 3/100, Train Loss: 0.5912, Val Loss: 0.5950, Val Acc: 0.6804, Val AUC: 0.7436
Epoch 4/100, Train Loss: 0.5898, Val Loss: 0.5917, Val Acc: 0.6833, Val AUC: 0.7479
Epoch 5/100, Train Loss: 0.5853, Val Loss: 0.5930, Val Acc: 0.6835, Val AUC: 0.7461
Epoch 6/100, Train Loss: 0.5818, Val Loss: 0.5923, Val Acc: 0.6799, Val AUC: 0.7472
Epoch 7/100, Train Loss: 0.5777, Val Loss: 0.5952, Val Acc: 0.6786, Val AUC: 0.7451
Epoch 8/100, Train Loss: 0.5759, Val Loss: 0.5994, Val Acc: 0.6792, Val AUC: 0.7438
Epoch 9/100, Train Loss: 0.5690, Val Loss: 0.5951, Val Acc: 0.6823, Val AUC: 0.7445
Epoch 10/100, Train Loss: 0.5645, Val Loss: 0.5989, Val Acc: 0.6774, Val AUC: 0.7413
Epoch 11/100, Train Loss: 0.5522, Val Loss: 0.5984, Val Acc: 0.6778, Val AUC: 0.7422
Epoch 12/100, Train Loss: 0.5487, Val Loss: 0.6000, Val Acc: 0.6780, Val A

{'train_loss': [0.6163133018960555,
  0.5976323392656114,
  0.5911967740704616,
  0.5898128296352095,
  0.5853117661343681,
  0.5817532922244735,
  0.5776746368656556,
  0.5759394769039419,
  0.5690029778828224,
  0.5645119466715389,
  0.5521917016969786,
  0.5487025638421377,
  0.5467156305909157,
  0.5414758675628238,
  0.5416504581355386,
  0.540240859405862,
  0.5378663677308294,
  0.5355301716675361,
  0.5350934070431523,
  0.5357806621326341,
  0.5335787710630231,
  0.5345678663088216,
  0.5305696334275934,
  0.5308158396846718,
  0.5340645468069447,
  0.5306105573972066,
  0.5324103205154339,
  0.5343653436750173,
  0.5328534767859512,
  0.5331697201563252,
  0.5311056833714247,
  0.5315784826460812,
  0.531610753097468,
  0.5290778868314293,
  0.5301838826388121,
  0.5312238537602955,
  0.5297626044601202,
  0.5306709908362892,
  0.5321547002014186,
  0.5302957510782613,
  0.5301965557038784,
  0.5322521960155832,
  0.5330247537543377,
  0.5312880797932545,
  0.5297436240232654

# Non traditional method

## XGBoost

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
import xgboost as xgb

train_data = pd.read_csv('Data/train.csv')

# Handle missing values 
train_data = handle_missing_values(train_data)

# Separate target and features
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Encode categorical variables
cat_cols = [col for col in X.columns if X[col].dtype=='object']
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric features
num_cols = [col for col in X.columns if X[col].dtype != 'uint8']
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Convert to NumPy arrays
X_array = X.values
y_array = y.values

# Split into train and test sets
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size = 0.2, random_state=42
)

# Convert to DMatrix for XGBoost
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
val_dmatrix = xgb.DMatrix(X_val, label=y_val)

# Define XGBoost paramters
params = {
    "objective" : "binary:logistic", # Binary classification
    "eval_metric" : "auc",           # Evaluation metric
    "eta" : 0.1,                     # Learning rate
    "max_depth" : 6,                 # Max depth of tress
    "subsample" : 0.8,               # Row sampling
    "colsample_bytree" : 0.8,        # Feature Sampling
    "lambda" : 1,                    # L2 regularization
    "alpha" : 0                      # L1 regularization
}

# Trian the model
evals = [(train_dmatrix, 'train'), (val_dmatrix, 'eval')]
num_boost_round = 200
early_stopping_rounds = 10

xgb_model = xgb.train(
    params=params,
    dtrain=train_dmatrix,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=10
)

# Make predictions on validation set
val_preds = xgb_model.predict(val_dmatrix)
val_preds_binary = (val_preds >= 0.5).astype(int)

# Evaluate model
auc = roc_auc_score(y_val, val_preds)
acc = accuracy_score(y_val, val_preds_binary)

print(f"Validation AUC: {auc:.4f}")
print(f"Validation ACC: {acc:.4f}")

[0]	train-auc:0.69296	eval-auc:0.66999
[10]	train-auc:0.75606	eval-auc:0.72814
[20]	train-auc:0.77368	eval-auc:0.73655
[30]	train-auc:0.78955	eval-auc:0.74152
[40]	train-auc:0.80433	eval-auc:0.74609
[50]	train-auc:0.81473	eval-auc:0.74908
[60]	train-auc:0.82327	eval-auc:0.75004
[68]	train-auc:0.83090	eval-auc:0.74995
Validation AUC: 0.7500
Validation ACC: 0.6839


## LightGBM

In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
import lightgbm as lgb

# load the data
train_data = pd.read_csv('Data/train.csv')

# Handle missing values
train_data = handle_missing_values(train_data)

# Separate target and features
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Encode categorical variables
cat_cols = [col for col in X.columns if X[col].dtype=='object']
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric features
num_cols = [col for col in X.columns if X[col].dtype!='uint8']
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Convert to NumPy arrays
X_array = X.values
y_array = y.values

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size=0.2, random_state=42
)

# Convert categorical column indices 
categorical_features = [X.columns.get_loc(col) for col in cat_cols if col in X.columns]

# LightGBM dataset
train_dataset = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
val_dataset = lgb.Dataset(X_val, label=y_val, categorical_feature=categorical_features, reference=train_dataset)

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'auc', # Evaluation metric
    'boosting_type': 'gbdt',
    'num_leaves': 31, # Controls complexity of the tree,
    'learning_rate': 0.1,
    'feature_fraction': 0.8, # Randomly select a fraction of features for each iteration
    'bagging_fraction': 0.8, # Randomly select a fraction of data for each iteration
    'bagging_freq': 5, # Frequency of bagging
    'verbose': -1
}

# Train the model
num_round = 200
early_stopping_rounds = 10
lgb_model = lgb.train(
    params=params,
    train_set=train_dataset,
    num_boost_round=num_round,
    valid_sets=[train_dataset, val_dataset],
    valid_names=['train', 'eval'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=early_stopping_rounds),
        lgb.log_evaluation(10)
    ]
)
# Make predictions on validation set
val_preds = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
val_preds_binary = (val_preds >= 0.5).astype(int)

# Evaluate model
auc = roc_auc_score(y_val, val_preds)
accuracy = accuracy_score(y_val, val_preds_binary)

print(f"Validation AUC: {auc:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

Training until validation scores don't improve for 10 rounds
[10]	train's auc: 0.742947	eval's auc: 0.726273
[20]	train's auc: 0.759245	eval's auc: 0.735371
[30]	train's auc: 0.774242	eval's auc: 0.741467
[40]	train's auc: 0.785755	eval's auc: 0.746016
[50]	train's auc: 0.797085	eval's auc: 0.748616
[60]	train's auc: 0.805966	eval's auc: 0.750482
[70]	train's auc: 0.813671	eval's auc: 0.752309
[80]	train's auc: 0.820852	eval's auc: 0.753117
[90]	train's auc: 0.828277	eval's auc: 0.75346
Early stopping, best iteration is:
[88]	train's auc: 0.826883	eval's auc: 0.753681
Validation AUC: 0.7537
Validation Accuracy: 0.6891


## Random Forest Classifier

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

# Load the data
train_data = pd.read_csv('Data/train.csv')

# Hanlde missing values
train_data = handle_missing_values(train_data)

# Separate target and features 
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Encode categorical variables
cat_cols = [col for col in X.columns if X[col].dtype=='object']
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric ceatures
num_cols = [col for col in X.columns if col not in cat_cols]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Convert to NumPy arrays
X_array = X.values
y_array = y.values

# Split into train and validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size=0.2, random_state=42
)

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(
    n_estimators=100,         # Number of trees in the forest
    max_depth=10,            # Maximum depth of the tree
    min_samples_split=5,     # Maximum samples to split a node
    min_samples_leaf=2,      # Minimum samples at leaf node
    max_features='sqrt',     # Number of features to consider at each split
    random_state=42,         # For reproducibility
    n_jobs=-1,               # Use all available cores for training
)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on validation set
val_preds = rf_model.predict_proba(X_val)[:,1] # Probability of positive class
val_preds_binary = (val_preds >= 0.5).astype(int)

# Evaluate the model
auc = roc_auc_score(y_val, val_preds)
accuracy = accuracy_score(y_val, val_preds_binary)

print(f"Validation AUC: {auc:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

Validation AUC: 0.7297
Validation Accuracy: 0.6667


## Simple Ensemble (RandomForestClassifier, XGBClassifier, LGBMClassifier)   

## Voting Classifier

In [34]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

# Initialize individual models
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

xgb_model = XGBClassifier(
    n_estimators=200, 
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
)

lgbm_model = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.1,
    num_leaves=31,
    random_state=42
)

# Combine models in voting classifier
voting_ensemble = VotingClassifier(
    estimators=[('rf', rf_model), ('xgb', xgb_model),('lgbm', lgbm_model)],
    voting='soft'
)

# Train ensemble model
voting_ensemble.fit(X_train, y_train)

# Make predictions
val_preds = voting_ensemble.predict_proba(X_val)[:,1]
val_preds_binary = (val_preds >= 0.5).astype(int)

# Evaluate model
auc = roc_auc_score(y_val, val_preds)
accuracy = accuracy_score(y_val, val_preds_binary)

print(f"Validation AUC: {auc:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

Parameters: { "use_label_encoder" } are not used.



Validation AUC: 0.7531
Validation Accuracy: 0.6875


## Stacking Classifier

In [35]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Initialize base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100,
                                  max_depth=10,
                                  random_state=42,
                                  n_jobs=-1)),
    ('xgb', XGBClassifier(n_estimators=200,
                          max_depth=6,
                          learning_rate=0.1,
                          random_state=42,
                          use_label_encoder=False,
                          eval_metric='logloss'
                          )),
    ('lgbm', LGBMClassifier(n_estimators=200,
                            learning_rate=0.1,
                            num_leaves=31,
                            random_state=42))
]

# Meta-modeling 
meta_model = LogisticRegression(random_state=42)

# Create stacking ensemble
stacking_ensemble = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5 # Use cross-validation
)

# Train stacking ensemble
stacking_ensemble.fit(X_train, y_train)

# Make predictions
val_preds = stacking_ensemble.predict_proba(X_val)[:,1]
val_preds_binary = (val_preds >= 0.5).astype(int)

# Evaluate model
auc = roc_auc_score(y_val, val_preds)
accuracy_score = accuracy_score(y_val, val_preds_binary)

print(f"Validation AUC: {auc:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Validation AUC: 0.7532
Validation Accuracy: 0.6875
