1. Use the same dataset from the previous task
2. Reuse validation strategy and preprocessing without changes
3. Train xgboost model
4. Train lightgbm model
5. Train catboost model
6. Compare performance on local validation and on test set on kaggle

In [143]:
# load train data
# reuse the preprocessing approach from the previous homework
import numpy as np
import pandas as pd
import xgboost as xgb
from collections import Counter
from sklearn.base import clone
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

train_df = pd.read_csv('C:\\Users\\FILMINVASION\\Downloads\\ML2024\\train.csv')
test_df = pd.read_csv('C:\\Users\\FILMINVASION\\Downloads\\ML2024\\test.csv')

passenger_ids = test_df['PassengerId'].copy()

def preprocess_data(train_df, test_df):
    
    # Зберігаємо статистичні значення з тренувального набору даних
    age_mean = train_df['Age'].mean()
    embarked_mode = train_df['Embarked'].mode()[0]

    # Заповнюємо пропуски у тренувальному та тестовому наборі однаковими значеннями
    train_df['Age'] = train_df['Age'].fillna(age_mean)
    test_df['Age'] = test_df['Age'].fillna(age_mean)

    train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode)
    test_df['Embarked'] = test_df['Embarked'].fillna(embarked_mode)

    # Drop unnecessary columns that are not useful for modeling
    train_df = train_df.drop(['Name', 'Ticket', 'Cabin'], axis=1)
    test_df = test_df.drop([ 'Name', 'Ticket', 'Cabin'], axis=1)

    # Застосовуємо one-hot encoding до обох наборів даних
    train_df = pd.get_dummies(train_df, columns=['Sex', 'Embarked'], drop_first=True)
    test_df = pd.get_dummies(test_df, columns=['Sex', 'Embarked'], drop_first=True)

    return train_df, test_df

# Call this function before splitting data
train_df, test_df = preprocess_data(train_df, test_df)

X = train_df.drop('Survived', axis=1)  # Features
y = train_df['Survived']  # Target

n_splits = 5
stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for train_index, val_index in stratified_kfold.split(X, y):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]



# XGBoost hyperparameters (initially set)
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',  # You can also use 'error' or other metrics
    'use_label_encoder': False,
    'n_estimators': 1000,  # Set high and allow early stopping to adjust
    'learning_rate': 0.1,
    'max_depth': 3,  # Can tune further
    'subsample': 0.8,  # Can tune further
    'colsample_bytree': 0.8,  # Can tune further
    'random_state': 42
}

# Loop over each fold
best_iteration_per_fold = []
accuracy_per_fold = []

for fold, (train_index, val_index) in enumerate(stratified_kfold.split(X, y)):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Create an XGBoost classifier with the defined hyperparameters
    xgb_model = xgb.XGBClassifier(**xgb_params)
    
    # Fit the model with early stopping
    xgb_model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        early_stopping_rounds=10,  # Stop after 10 rounds without improvement
        verbose=False
    )
    
    # Store the best iteration (number of rounds)
    best_iteration_per_fold.append(xgb_model.best_iteration)
    
    # Predict on validation set and compute accuracy
    y_pred_val = xgb_model.predict(X_val_fold)
    accuracy = accuracy_score(y_val_fold, y_pred_val)
    accuracy_per_fold.append(accuracy)
    
    print(f"Fold {fold+1} - Best Iteration: {xgb_model.best_iteration}, Accuracy: {accuracy:.4f}")

# Calculate the average best number of boosting rounds and accuracy
average_best_iteration = int(np.mean(best_iteration_per_fold))
average_accuracy = np.mean(accuracy_per_fold)

print(f"Average Best Iteration: {average_best_iteration}")
print(f"Average Accuracy: {average_accuracy:.4f}")

# Retrain the model on the full training data using the best number of boosting rounds
xgb_final_model = xgb.XGBClassifier(
    **xgb_params,
    n_estimators=average_best_iteration  # Use the average best boosting rounds
)

xgb_final_model.fit(X, y)

In [144]:
# define the xgboost model (from xgboost package)
# You can now use xgb_final_model for making predictions or further evaluations
# define the hyperparameters
# train the model
# try to improve the model by changing the hyperparameters on local validation (remember that using gridsearch is a bad idea, because it can't use the early stopping)
# retrain the model on the whole train dataset
# don't forget to specify the number of boosting rounds you found optimal


xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'learning_rate': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 32
}

# Stratified K-Fold
n_splits = 5
stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

best_iteration_per_fold = []
accuracy_per_fold = []

for fold, (train_index, val_index) in enumerate(stratified_kfold.split(X, y)):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Create an XGBoost classifier with the defined hyperparameters
    xgb_model = xgb.XGBClassifier(**xgb_params, early_stopping_rounds=10)
    
    # Fit the model with early stopping
    xgb_model.fit(
    X_train_fold, y_train_fold,
    eval_set=[(X_val_fold, y_val_fold)],
    verbose=False
)


    
    # Store the best iteration (number of boosting rounds)
    best_iteration_per_fold.append(xgb_model.best_iteration)
    
    # Predict on validation set and compute accuracy
    y_pred_val = xgb_model.predict(X_val_fold)
    accuracy = accuracy_score(y_val_fold, y_pred_val)
    accuracy_per_fold.append(accuracy)
    
    print(f"Fold {fold+1} - Best Iteration: {xgb_model.best_iteration}, Accuracy: {accuracy:.4f}")

# Calculate the average best number of boosting rounds and accuracy
average_best_iteration_xgb = int(np.mean(best_iteration_per_fold))
average_accuracy_xgb = np.mean(accuracy_per_fold)

print(f"Average Best Iteration: {average_best_iteration_xgb}")
print(f"Average Accuracy: {average_accuracy_xgb:.4f}")

# Retrain the model on the full training data using the best number of boosting rounds
xgb_final_model = xgb.XGBClassifier(
    **xgb_params,
    n_estimators=average_best_iteration_xgb
)

# Fit the final model on the entire dataset
xgb_final_model.fit(X, y)


Fold 1 - Best Iteration: 89, Accuracy: 0.8603
Fold 2 - Best Iteration: 33, Accuracy: 0.8202
Fold 3 - Best Iteration: 37, Accuracy: 0.8034
Fold 4 - Best Iteration: 39, Accuracy: 0.7921
Fold 5 - Best Iteration: 57, Accuracy: 0.8315
Average Best Iteration: 51
Average Accuracy: 0.8215


In [145]:
#Перероблений варіант

xgb_params = {
      #default
    "objective": "binary:logistic",
    "eta": 0.1,
    "verbosity": 0,
    "nthread": 10,
    "random_seed": 1,
    "eval_metric": "logloss",

    # regularization parameters
    "max_depth": 5,
    "subsample": 0.7,
    "colsample_bytree": 0.7
}

# Stratified K-Fold
n_splits = 5
stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Convert data to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)

# Perform cross-validation using xgb.cv
cv_results = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=1000,
    nfold=n_splits,
    stratified=True,
    folds=stratified_kfold,
    early_stopping_rounds=10,
    verbose_eval=10
)

# Extract the best number of boosting rounds
best_iteration_xgb = cv_results['test-logloss-mean'].idxmin()

print(f"Best Iteration (from CV): {best_iteration_xgb}")

# Retrain the model on the full training data using the best number of boosting rounds
xgb_final_model = xgb.XGBClassifier(
    **xgb_params,
    n_estimators=best_iteration_xgb
)

# Fit the final model on the entire dataset
xgb_final_model.fit(X, y)

# Now you can make predictions or further evaluate the model
y_pred = xgb_final_model.predict(X)
accuracy_xgb = accuracy_score(y, y_pred)

print(f"Final Model Accuracy on Test Data: {accuracy_xgb:.4f}")

[0]	train-logloss:0.61716+0.00165	test-logloss:0.62181+0.00447
[10]	train-logloss:0.44317+0.00669	test-logloss:0.48967+0.02211
[20]	train-logloss:0.36486+0.00743	test-logloss:0.44336+0.03109
[30]	train-logloss:0.31951+0.00784	test-logloss:0.42889+0.04027
[40]	train-logloss:0.28866+0.00775	test-logloss:0.42836+0.04577
[44]	train-logloss:0.27856+0.00723	test-logloss:0.43074+0.04731
Best Iteration (from CV): 34
Final Model Accuracy on Test Data: 0.8799


In [146]:
import lightgbm as lgb
# define the lightgbm model (from lightgbm package)
# define the hyperparameters
# train the model
# try to improve the model by changing the hyperparameters on local validation (remember that using gridsearch is a bad idea, because it can't use the early stopping)
# retrain the model on the whole train dataset
# don't forget to specify the number of boosting rounds you found optimal
# Set up hyperparameters
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.1,
    'max_depth': 3,
    'num_leaves': 31,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    "early_stopping_rounds": 10,
    'verbosity' : -1
    }

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_iteration_per_fold = []
accuracy_per_fold = []

for fold, (train_index, val_index) in enumerate(stratified_kfold.split(X, y)):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Create LightGBM dataset
    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)

    # Train model with early stopping
    lgb_model = lgb.train(lgb_params, train_data, num_boost_round=1000, valid_sets=[train_data, val_data])

    # Store best iteration and accuracy
    best_iteration_per_fold.append(lgb_model.best_iteration)
    y_pred_val = (lgb_model.predict(X_val_fold, num_iteration=lgb_model.best_iteration) > 0.5).astype(int)
    accuracy = accuracy_score(y_val_fold, y_pred_val)
    accuracy_per_fold.append(accuracy)

    print(f"Fold {fold+1} - Best Iteration: {lgb_model.best_iteration}, Accuracy: {accuracy:.4f}")

# Calculate the average best boosting rounds and accuracy
average_best_iteration_lgb = int(np.mean(best_iteration_per_fold))
average_accuracy_lgb = np.mean(accuracy_per_fold)

print(f"Average Best Iteration: {average_best_iteration_lgb}")
print(f"Average Accuracy: {average_accuracy_lgb:.4f}")

# Retrain the model on the full training data using the best number of boosting rounds
final_train_data = lgb.Dataset(X, label=y)
lgb_final_model = lgb.train({**lgb_params, 'early_stopping_rounds': None},
                            final_train_data, num_boost_round=average_best_iteration_lgb, )



Fold 1 - Best Iteration: 60, Accuracy: 0.8380
Fold 2 - Best Iteration: 103, Accuracy: 0.8202
Fold 3 - Best Iteration: 44, Accuracy: 0.8146
Fold 4 - Best Iteration: 46, Accuracy: 0.8146
Fold 5 - Best Iteration: 33, Accuracy: 0.8315
Average Best Iteration: 57
Average Accuracy: 0.8238


In [147]:
import lightgbm as lgb
from lightgbm import early_stopping
# Define the parameters
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.1,
    'max_depth': 3,
    'num_leaves': 31,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'verbosity': -1
}

# Create LightGBM dataset
train_data = lgb.Dataset(X, label=y)

# Perform cross-validation using lgb.cv
cv_results = lgb.cv(
    lgb_params,
    train_data,
    num_boost_round=1000,
    nfold=5,
    stratified=True,
    shuffle=True,
    callbacks=[early_stopping(stopping_rounds=10)],
    seed=42
)

# Best number of boosting rounds
best_num_boost_rounds = len(cv_results['valid binary_logloss-mean'])
print(f"Best number of boosting rounds from CV: {best_num_boost_rounds}")

# Train the final model with the optimal number of boosting rounds
final_train_data = lgb.Dataset(X, label=y)
lgb_final_model = lgb.train(lgb_params, final_train_data, num_boost_round=best_num_boost_rounds)


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[48]	cv_agg's valid binary_logloss: 0.407485 + 0.0177201
Best number of boosting rounds from CV: 48


In [148]:
print(cv_results.keys())


dict_keys(['valid binary_logloss-mean', 'valid binary_logloss-stdv'])


In [149]:
import numpy as np
from catboost import CatBoostClassifier, Pool
# define the catboost model (from catboost package)
# define the hyperparameters
# train the model
# try to improve the model by changing the hyperparameters on local validation (remember that using gridsearch is a bad idea, because it can't use the early stopping)
# retrain the model on the whole train dataset
# don't forget to specify the number of boosting rounds you found optimal

catboost_params = {
    'iterations': 1000, 
    'depth': 6,
    'learning_rate': 0.1,
    'eval_metric': 'Logloss',
    'random_seed': 42,
    'logging_level': 'Silent',
    'early_stopping_rounds': 10
}

# Initialize stratified k-fold cross-validation
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_iteration_per_fold = []
accuracy_per_fold = []

# Cross-validation loop
for fold, (train_index, val_index) in enumerate(stratified_kfold.split(X, y)):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Create CatBoost Pool objects
    train_data = Pool(data=X_train_fold, label=y_train_fold)
    val_data = Pool(data=X_val_fold, label=y_val_fold)

    # Train the CatBoost model
    catboost_model = CatBoostClassifier(**catboost_params)
    catboost_model.fit(train_data, eval_set=val_data, use_best_model=True)

    # Store best iteration and accuracy
    best_iteration_per_fold.append(catboost_model.best_iteration_)
    y_pred_val = catboost_model.predict(X_val_fold)
    accuracy = accuracy_score(y_val_fold, y_pred_val)
    accuracy_per_fold.append(accuracy)

    print(f"Fold {fold+1} - Best Iteration: {catboost_model.best_iteration_}, Accuracy: {accuracy:.4f}")

# Calculate the average best boosting rounds and accuracy
average_best_iteration_cat = int(np.mean(best_iteration_per_fold))
average_accuracy_cat = np.mean(accuracy_per_fold)

print(f"Average Best Iteration: {average_best_iteration_cat}")
print(f"Average Accuracy: {average_accuracy_cat:.4f}")

# Retrain the model on the full training data using the best number of boosting rounds
final_train_data = Pool(data=X, label=y)
final_catboost_model = CatBoostClassifier(**{**catboost_params, 'iterations': average_best_iteration_cat})
final_catboost_model.fit(final_train_data)





Fold 1 - Best Iteration: 84, Accuracy: 0.8603
Fold 2 - Best Iteration: 46, Accuracy: 0.8258
Fold 3 - Best Iteration: 34, Accuracy: 0.8034
Fold 4 - Best Iteration: 28, Accuracy: 0.8258
Fold 5 - Best Iteration: 28, Accuracy: 0.8315
Average Best Iteration: 44
Average Accuracy: 0.8294


<catboost.core.CatBoostClassifier at 0x2a9b9c52030>

In [150]:
# compare the results of the three models from this homework and with models from the previous homework
# make a conclusion on which model is better and why
# if your boosting is worse than the RF, try to improve it
print("XGBoost:\nAverage best iteration:", average_best_iteration_xgb,"Average accuracy:", average_accuracy_xgb)
print("Lightgbm:\nAverage best iteration:", average_best_iteration_lgb, "Average accuracy:", average_accuracy_lgb)
print("Catboost:\nAverage best iteration:", average_best_iteration_cat, "Average accuracy:", average_accuracy_cat)


print("Random Forest result from previous homework:\nBest Cross-Validation Accuracy: 0.8451007469713137")




XGBoost:
Average best iteration: 51 Average accuracy: 0.8215052413533362
Lightgbm:
Average best iteration: 57 Average accuracy: 0.8237775406440273
Catboost:
Average best iteration: 44 Average accuracy: 0.8293704098926622
Random Forest result from previous homework:
Best Cross-Validation Accuracy: 0.8451007469713137


In [154]:
# load test data
# do the same preprocessing as for train data

# using retrained models make predictions on the test data for all new three models
# save the predictions to a file
# upload the predictions to Kaggle and make a submission
# report the score you got and compare it with the score you got on the validation data
# make a conclusion on how well the models generalizes



# XGBoost
y_pred_xgb = (xgb_final_model.predict(test_df) > 0.5).astype(int)


# LightGBM
y_pred_lgb = (lgb_final_model.predict(test_df) > 0.5).astype(int)


# CatBoost
y_pred_catboost = final_catboost_model.predict(test_df).astype(int)


# After making predictions, create the submission DataFrame
submission_xgb = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': y_pred_xgb 
})

submission_lgb = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': y_pred_lgb
})

submission_cat = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': y_pred_catboost
})
# Save the submission file
submission_xgb.to_csv('submission_xgb.csv', index=False)
submission_lgb.to_csv('submission_lgb.csv', index=False)
submission_cat.to_csv('submission_cat.csv', index=False)


In [155]:
print("Submission to Kaggle for XGBoost score is: 0.75837")
print("Submission to Kaggle for LightGBM score is: 0.77990")
print("Submission to Kaggle for CatBoost score is: 0.77272")



Submission to Kaggle for XGBoost score is: 0.75837
Submission to Kaggle for LightGBM score is: 0.77990
Submission to Kaggle for CatBoost score is: 0.77272
