In [1]:
import pandas as pd
import scipy.stats as stats
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [2]:
train_data = pd.read_csv('Data/train.csv')
test_data = pd.read_csv('Data/test.csv')
data_dict = pd.read_csv('Data/data_dictionary.csv')
sample_submission = pd.read_csv('Data/sample_submission.csv')

# Handling NaN values

In [3]:
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier



def handle_missing_values(data):
    # Separating categorical and numeric columns
    cat_cols = [col for col in data.columns if data[col].dtype == 'object']
    num_cols = [col for col in data.columns if col not in cat_cols]

    # Handling missing numerical data with KNNImputer
    knn_imputer = KNNImputer(n_neighbors=5)
    data[num_cols] = knn_imputer.fit_transform(data[num_cols])

    # Imputing categorical data with KNeighborsClassifier
    for col in cat_cols:
        missing_mask = data[col].isna()
        if missing_mask.sum() > 0:
            # Separate training and rpediction sets
            X_train = data.loc[~missing_mask, num_cols]
            y_train = data.loc[~missing_mask, col]
            X_missing = data.loc[missing_mask, num_cols]

            # Train a KNeighborsClassifier
            knn = KNeighborsClassifier(n_neighbors=5)
            knn.fit(X_train, y_train)
            imputed_values = knn.predict(X_missing)

            # Fill missing values
            data.loc[missing_mask, col] = imputed_values

    return data

train_data = handle_missing_values(train_data)
train_data.head()

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,0.0,N/A - non-malignant indication,No,Poor,No,2.0,7.0,No TBI,No,6.0,...,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,1.0,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,2.0,N/A - non-malignant indication,No,Poor,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793
3,3.0,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349
4,4.0,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.0,16.223


# Simple NN

In [4]:
# Separate target and drop unneeded columns
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Encode categorical variables
cat_cols = [col for col in X.columns if X[col].dtype == 'object']
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric featuers
num_cols = [col for col in X.columns if col not in cat_cols]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Ensure all numerica
X = X.astype(np.float32)


# Convert to NumPy arrays
X_array = X.values.astype(np.float32)
y_array = y.values.astype(np.float32)

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size=0.2, random_state=42
)

# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

# Create Datasets and Dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [5]:
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau

# Define a simple feedforward model
class SimpleNet(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

In [6]:
# Initialize the model, loss, and optimizer
model = SimpleNet(input_dim=X_train.shape[1])
criterion = nn.BCELoss() # Binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Learning Rate scheduler
# StepLR
# Decrease the learning rate every 10 epochs by a factor of 0.1
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

# ReduceLROnPlateaus
# scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)


# Training loop 
epochs = 100
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.view(-1) # Flatten for loss calculation
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs).view(-1)
            loss = criterion(outputs, targets)
            val_loss += loss.item() * inputs.size(0)

            preds = (outputs >= 0.5).float()
            val_correct += (preds == targets).sum().item()
            val_total += targets.size(0)
    val_loss /= len(val_loader.dataset)
    val_accuracy = val_correct / val_total

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

    # Update the learning rate using scheduler
    scheduler.step()

Epoch 1/100, Train Loss: 0.6057, Val Loss: 0.5917, Val Acc: 0.6875
Epoch 2/100, Train Loss: 0.5837, Val Loss: 0.5982, Val Acc: 0.6759
Epoch 3/100, Train Loss: 0.5769, Val Loss: 0.6012, Val Acc: 0.6788
Epoch 4/100, Train Loss: 0.5696, Val Loss: 0.6024, Val Acc: 0.6771
Epoch 5/100, Train Loss: 0.5593, Val Loss: 0.6073, Val Acc: 0.6813
Epoch 6/100, Train Loss: 0.5470, Val Loss: 0.6186, Val Acc: 0.6682
Epoch 7/100, Train Loss: 0.5327, Val Loss: 0.6261, Val Acc: 0.6644
Epoch 8/100, Train Loss: 0.5171, Val Loss: 0.6455, Val Acc: 0.6576
Epoch 9/100, Train Loss: 0.5029, Val Loss: 0.6555, Val Acc: 0.6535
Epoch 10/100, Train Loss: 0.4844, Val Loss: 0.6769, Val Acc: 0.6531
Epoch 11/100, Train Loss: 0.4429, Val Loss: 0.6839, Val Acc: 0.6517
Epoch 12/100, Train Loss: 0.4327, Val Loss: 0.6939, Val Acc: 0.6503
Epoch 13/100, Train Loss: 0.4269, Val Loss: 0.6989, Val Acc: 0.6483
Epoch 14/100, Train Loss: 0.4224, Val Loss: 0.7058, Val Acc: 0.6453
Epoch 15/100, Train Loss: 0.4186, Val Loss: 0.7141, Val A

In [7]:
class ImprovedNet(nn.Module):
    def __init__(self, input_dim):
        super(ImprovedNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32,1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.dropout(self.relu(self.bn1(self.fc1(x))))
        x = self.dropout(self.relu(self.bn2(self.fc2(x))))
        x = self.relu(self.fc3(x))
        x = self.sigmoid(self.fc4(x))
        return x

def training(model, criterion, optimizer, epochs, train_loader, val_loader, device):
    """
    Train and evaluate the model.

    Args:
        model: PyTorch model to train.
        train_loader = DataLoader for the training dataset.
        val_loader = DataLoader for the validation dataset.
        criterion: Loss function.
        optimizer: Optimizer for training the model.
        scheduler: Learning rate scheduler (optional).
        epochs: Number of epochs to train.
        device: Device to use for training ('cpu' or 'mps')

    Returns:
        history: Dictionary containing training and validation metrics
    """
    
    # Move model to the specified device
    model = model.to(device)
    
    # Learning Rate Scheduler
    scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
    history = {
        'train_loss': [],
        'val_loss': [],
        'val_accuracy': [],
        'val_auc': []
    }
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs).view(-1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / len(train_loader.dataset)

        # Validation
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        all_targets = []
        all_outputs = []
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)

                outputs = model(inputs).view(-1)
                loss = criterion(outputs, targets)
                val_loss += loss.item() * inputs.size(0)

                # Collect outputs for metrics
                preds = (outputs >= 0.5).float()
                val_correct += (preds == targets).sum().item()
                val_total += targets.size(0)

                all_targets.extend(targets.cpu().numpy())
                all_outputs.extend(outputs.cpu().numpy())

        epoch_val_loss = val_loss / len(val_loader.dataset)
        epoch_val_accuracy = val_correct / val_total
        epoch_val_auc = roc_auc_score(all_targets, all_outputs)

        # Log metrics
        history['train_loss'].append(epoch_loss)
        history['val_loss'].append(epoch_val_loss)
        history['val_accuracy'].append(epoch_val_accuracy)
        history['val_auc'].append(epoch_val_auc)

        print(
            f"Epoch {epoch + 1}/{epochs}, "
            f"Train Loss: {epoch_loss:.4f}, "
            f"Val Loss: {epoch_val_loss:.4f}, "
            f"Val Acc: {epoch_val_accuracy:.4f}, "
            f"Val AUC: {epoch_val_auc:.4f}"
        )

        # Update the learning rate using scheduler
        scheduler.step()
    return history


In [8]:
# Initialize the model, loss, and optimizer
model = ImprovedNet(input_dim=X_train.shape[1])
criterion = nn.BCELoss() # Binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

training(model, criterion, optimizer, 100, train_loader, val_loader, 'mps')

Epoch 1/100, Train Loss: 0.6173, Val Loss: 0.5941, Val Acc: 0.6833, Val AUC: 0.7474
Epoch 2/100, Train Loss: 0.5967, Val Loss: 0.5918, Val Acc: 0.6833, Val AUC: 0.7490
Epoch 3/100, Train Loss: 0.5933, Val Loss: 0.5909, Val Acc: 0.6858, Val AUC: 0.7495
Epoch 4/100, Train Loss: 0.5891, Val Loss: 0.5935, Val Acc: 0.6797, Val AUC: 0.7467
Epoch 5/100, Train Loss: 0.5840, Val Loss: 0.5892, Val Acc: 0.6842, Val AUC: 0.7502
Epoch 6/100, Train Loss: 0.5795, Val Loss: 0.5921, Val Acc: 0.6795, Val AUC: 0.7462
Epoch 7/100, Train Loss: 0.5763, Val Loss: 0.5954, Val Acc: 0.6821, Val AUC: 0.7449
Epoch 8/100, Train Loss: 0.5749, Val Loss: 0.5988, Val Acc: 0.6790, Val AUC: 0.7456
Epoch 9/100, Train Loss: 0.5696, Val Loss: 0.5974, Val Acc: 0.6806, Val AUC: 0.7425
Epoch 10/100, Train Loss: 0.5640, Val Loss: 0.5953, Val Acc: 0.6844, Val AUC: 0.7439
Epoch 11/100, Train Loss: 0.5511, Val Loss: 0.6001, Val Acc: 0.6832, Val AUC: 0.7434
Epoch 12/100, Train Loss: 0.5450, Val Loss: 0.6014, Val Acc: 0.6825, Val A

{'train_loss': [0.6172730491807064,
  0.5966577172279358,
  0.5933147323628266,
  0.5891298026260402,
  0.5839842617511749,
  0.5794732965942886,
  0.5762583831532134,
  0.5748873870819807,
  0.5696068338635895,
  0.5640071781145202,
  0.5510744279043542,
  0.5449926531149282,
  0.5455320627325111,
  0.5430314879450533,
  0.543470657699638,
  0.5381491415202617,
  0.5357132205118735,
  0.5377162483003405,
  0.5372837848961354,
  0.5344616406079795,
  0.5300472452822659,
  0.532050444641047,
  0.5320482001536422,
  0.5349948260519239,
  0.5300473605593046,
  0.5319718196988106,
  0.5311547040939331,
  0.5315814144909382,
  0.5299713527990713,
  0.5335982034189833,
  0.5270945094939735,
  0.5311316789024406,
  0.5320541260557042,
  0.5296011115527816,
  0.5309965340627565,
  0.5322035948435465,
  0.5323977562702364,
  0.5322577784458796,
  0.5311930317017767,
  0.5307804639140765,
  0.5289709633423223,
  0.530569516080949,
  0.5291243636359771,
  0.5303948570870691,
  0.5311666465467877,

# Non traditional method

## XGBoost

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
import xgboost as xgb

# Separate target and features
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Encode categorical variables
cat_cols = [col for col in X.columns if X[col].dtype=='object']
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric features
num_cols = [col for col in X.columns if X[col].dtype != 'uint8']
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Convert to NumPy arrays
X_array = X.values
y_array = y.values

# Split into train and test sets
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size = 0.2, random_state=42
)

# Convert to DMatrix for XGBoost
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
val_dmatrix = xgb.DMatrix(X_val, label=y_val)

# Define XGBoost paramters
params = {
    "objective" : "binary:logistic", # Binary classification
    "eval_metric" : "auc",           # Evaluation metric
    "eta" : 0.1,                     # Learning rate
    "max_depth" : 6,                 # Max depth of tress
    "subsample" : 0.8,               # Row sampling
    "colsample_bytree" : 0.8,        # Feature Sampling
    "lambda" : 1,                    # L2 regularization
    "alpha" : 0                      # L1 regularization
}

# Trian the model
evals = [(train_dmatrix, 'train'), (val_dmatrix, 'eval')]
num_boost_round = 200
early_stopping_rounds = 10

xgb_model = xgb.train(
    params=params,
    dtrain=train_dmatrix,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=10
)

# Make predictions on validation set
val_preds = xgb_model.predict(val_dmatrix)
val_preds_binary = (val_preds >= 0.5).astype(int)

# Evaluate model
auc = roc_auc_score(y_val, val_preds)
acc = accuracy_score(y_val, val_preds_binary)

print(f"Validation AUC: {auc:.4f}")
print(f"Validation ACC: {acc:.4f}")

[0]	train-auc:0.69296	eval-auc:0.66999
[10]	train-auc:0.75606	eval-auc:0.72814
[20]	train-auc:0.77368	eval-auc:0.73655
[30]	train-auc:0.78955	eval-auc:0.74152
[40]	train-auc:0.80433	eval-auc:0.74609
[50]	train-auc:0.81473	eval-auc:0.74908
[60]	train-auc:0.82327	eval-auc:0.75004
[69]	train-auc:0.83192	eval-auc:0.75000
Validation AUC: 0.7500
Validation ACC: 0.6839


## LightGBM

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
import lightgbm as lgb

# Separate target and features
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Encode categorical variables
cat_cols = [col for col in X.columns if X[col].dtype=='object']
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric features
num_cols = [col for col in X.columns if X[col].dtype!='uint8']
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Convert to NumPy arrays
X_array = X.values
y_array = y.values

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size=0.2, random_state=42
)

# Convert categorical column indices 
categorical_features = [X.columns.get_loc(col) for col in cat_cols if col in X.columns]

# LightGBM dataset
train_dataset = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
val_dataset = lgb.Dataset(X_val, label=y_val, categorical_feature=categorical_features, reference=train_dataset)

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'auc', # Evaluation metric
    'boosting_type': 'gbdt',
    'num_leaves': 31, # Controls complexity of the tree,
    'learning_rate': 0.1,
    'feature_fraction': 0.8, # Randomly select a fraction of features for each iteration
    'bagging_fraction': 0.8, # Randomly select a fraction of data for each iteration
    'bagging_freq': 5, # Frequency of bagging
    'verbose': -1
}

# Train the model
num_round = 200
early_stopping_rounds = 10
lgb_model = lgb.train(
    params=params,
    train_set=train_dataset,
    num_boost_round=num_round,
    valid_sets=[train_dataset, val_dataset],
    valid_names=['train', 'eval'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=early_stopping_rounds),
        lgb.log_evaluation(10)
    ]
)
# Make predictions on validation set
val_preds = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
val_preds_binary = (val_preds >= 0.5).astype(int)

# Evaluate model
auc = roc_auc_score(y_val, val_preds)
accuracy = accuracy_score(y_val, val_preds_binary)

print(f"Validation AUC: {auc:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

Training until validation scores don't improve for 10 rounds
[10]	train's auc: 0.742947	eval's auc: 0.726273
[20]	train's auc: 0.759245	eval's auc: 0.735371
[30]	train's auc: 0.774242	eval's auc: 0.741467
[40]	train's auc: 0.785755	eval's auc: 0.746016
[50]	train's auc: 0.797085	eval's auc: 0.748616
[60]	train's auc: 0.805966	eval's auc: 0.750482
[70]	train's auc: 0.813671	eval's auc: 0.752309
[80]	train's auc: 0.820852	eval's auc: 0.753117
[90]	train's auc: 0.828277	eval's auc: 0.75346
Early stopping, best iteration is:
[88]	train's auc: 0.826883	eval's auc: 0.753681
Validation AUC: 0.7537
Validation Accuracy: 0.6891


## Random Forest Classifier

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

# Separate target and features 
y = train_data['efs']
X = train_data.drop(columns=['efs', 'efs_time', 'ID'], errors='ignore')

# Encode categorical variables
cat_cols = [col for col in X.columns if X[col].dtype=='object']
X = pd.get_dummies(X, columns=cat_cols, dummy_na=False)

# Scale numeric ceatures
num_cols = [col for col in X.columns if col not in cat_cols]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Convert to NumPy arrays
X_array = X.values
y_array = y.values

# Split into train and validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_array, y_array, test_size=0.2, random_state=42
)

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(
    n_estimators=100,         # Number of trees in the forest
    max_depth=10,            # Maximum depth of the tree
    min_samples_split=5,     # Maximum samples to split a node
    min_samples_leaf=2,      # Minimum samples at leaf node
    max_features='sqrt',     # Number of features to consider at each split
    random_state=42,         # For reproducibility
    n_jobs=-1,               # Use all available cores for training
)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on validation set
val_preds = rf_model.predict_proba(X_val)[:,1] # Probability of positive class
val_preds_binary = (val_preds >= 0.5).astype(int)

# Evaluate the model
auc = roc_auc_score(y_val, val_preds)
accuracy = accuracy_score(y_val, val_preds_binary)

print(f"Validation AUC: {auc:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

Validation AUC: 0.7297
Validation Accuracy: 0.6667


## Simple Ensemble (RandomForestClassifier, XGBClassifier, LGBMClassifier)   

## Voting Classifier

In [12]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

# Initialize individual models
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

xgb_model = XGBClassifier(
    n_estimators=200, 
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
)

lgbm_model = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.1,
    num_leaves=31,
    random_state=42
)

# Combine models in voting classifier
voting_ensemble = VotingClassifier(
    estimators=[('rf', rf_model), ('xgb', xgb_model),('lgbm', lgbm_model)],
    voting='soft'
)

# Train ensemble model
voting_ensemble.fit(X_train, y_train)

# Make predictions
val_preds = voting_ensemble.predict_proba(X_val)[:,1]
val_preds_binary = (val_preds >= 0.5).astype(int)

# Evaluate model
auc = roc_auc_score(y_val, val_preds)
accuracy = accuracy_score(y_val, val_preds_binary)

print(f"Validation AUC: {auc:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

Parameters: { "use_label_encoder" } are not used.



Validation AUC: 0.7531
Validation Accuracy: 0.6875


## Stacking Classifier

In [13]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Initialize base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100,
                                  max_depth=10,
                                  random_state=42,
                                  n_jobs=-1)),
    ('xgb', XGBClassifier(n_estimators=200,
                          max_depth=6,
                          learning_rate=0.1,
                          random_state=42,
                          use_label_encoder=False,
                          eval_metric='logloss'
                          )),
    ('lgbm', LGBMClassifier(n_estimators=200,
                            learning_rate=0.1,
                            num_leaves=31,
                            random_state=42))
]

# Meta-modeling 
meta_model = LogisticRegression(random_state=42)

# Create stacking ensemble
stacking_ensemble = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5 # Use cross-validation
)

# Train stacking ensemble
stacking_ensemble.fit(X_train, y_train)

# Make predictions
val_preds = stacking_ensemble.predict_proba(X_val)[:,1]
val_preds_binary = (val_preds >= 0.5).astype(int)

# Evaluate model
auc = roc_auc_score(y_val, val_preds)
accuracy_score = accuracy_score(y_val, val_preds_binary)

print(f"Validation AUC: {auc:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Validation AUC: 0.7532
Validation Accuracy: 0.6875
