1. Use the same dataset from the previous task
2. Reuse validation strategy and preprocessing without changes
3. Train xgboost model
4. Train lightgbm model
5. Train catboost model
6. Compare performance on the local validation split and on the Kaggle test set

In [15]:
# load train data
# reuse the preprocessing approach from the previous homework
import os
from collections import Counter

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# Directory that holds train.csv and test.csv. Set the ML2024_DATA_DIR
# environment variable to run the notebook on another machine; the fallback is
# the location the notebook was originally written against.
DATA_DIR = os.environ.get('ML2024_DATA_DIR', 'C:\\Users\\FILMINVASION\\Downloads\\ML2024')

train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))


def preprocess_data(train_df, test_df):
    """Impute, prune and one-hot encode the train and test frames consistently.

    The imputation values (mean ``Age``, most frequent ``Embarked``) and the
    category levels used for one-hot encoding are learned from ``train_df``
    only and then applied to both frames. As a result the returned test frame
    always has the same encoded feature columns as the training frame, even
    when some category level does not occur in the test data.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data. Must contain ``Age``, ``Embarked``, ``Sex``,
        ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin``.
    test_df : pandas.DataFrame
        Test data with the same columns.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(train_df, test_df)`` after preprocessing. The input frames are not
        modified.
    """
    # Work on copies so the caller's frames stay untouched and the call is
    # safe to repeat on the same raw data.
    train_df = train_df.copy()
    test_df = test_df.copy()

    # Keep the statistics computed on the training set
    age_mean = train_df['Age'].mean()
    embarked_mode = train_df['Embarked'].mode()[0]

    # Fill the gaps in both the training and the test set with the same values
    train_df['Age'] = train_df['Age'].fillna(age_mean)
    test_df['Age'] = test_df['Age'].fillna(age_mean)

    train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode)
    test_df['Embarked'] = test_df['Embarked'].fillna(embarked_mode)

    # Drop unnecessary columns that are not useful for modeling
    unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
    train_df = train_df.drop(unused_columns, axis=1)
    test_df = test_df.drop(unused_columns, axis=1)

    # Pin the category levels to those seen in training so that both frames
    # produce identical dummy columns (values unseen in training become all-zero rows).
    categorical_columns = ['Sex', 'Embarked']
    for column in categorical_columns:
        levels = sorted(train_df[column].dropna().unique())
        train_df[column] = pd.Categorical(train_df[column], categories=levels)
        test_df[column] = pd.Categorical(test_df[column], categories=levels)

    # Apply one-hot encoding to both data sets
    train_df = pd.get_dummies(train_df, columns=categorical_columns, drop_first=True)
    test_df = pd.get_dummies(test_df, columns=categorical_columns, drop_first=True)

    return train_df, test_df

# Preprocess before splitting; imputation statistics come from the training frame
train_df, test_df = preprocess_data(train_df, test_df)

# Separate features from the target, then hold out 20% as a local test split.
# Note: X_test / y_test are this local hold-out, not the submission test set in test_df.
X_train = train_df.drop('Survived', axis=1)  # Features
y_train = train_df['Survived']  # Target
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Shared stratified 5-fold splitter used for local validation of the boosting models.
# (A previous loop here iterated over the folds of train_df, target column included,
# without using the result; it has been removed.)
n_splits = 5
stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)



# XGBoost baseline: cross-validate with early stopping to find a good number of
# boosting rounds, then refit on the training split with that number of rounds.

# XGBoost hyperparameters (initially set)
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',  # You can also use 'error' or other metrics
    'n_estimators': 1000,  # Upper bound only; early stopping picks the actual number
    'learning_rate': 0.1,
    'max_depth': 3,  # Can tune further
    'subsample': 0.8,  # Can tune further
    'colsample_bytree': 0.8,  # Can tune further
    'random_state': 42
}

# Loop over each fold
best_iteration_per_fold = []
accuracy_per_fold = []

# Split the training portion (X_train / y_train); the names X / y that were used
# here before are not defined anywhere in the notebook.
for fold, (train_index, val_index) in enumerate(stratified_kfold.split(X_train, y_train)):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Early stopping is configured on the estimator (same form as the next cell),
    # not passed to fit().
    xgb_model = xgb.XGBClassifier(**xgb_params, early_stopping_rounds=10)

    # Fit the model; training stops after 10 rounds without improvement on the fold
    xgb_model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        verbose=False
    )

    # Store the best iteration (0-based index of the best boosting round)
    best_iteration_per_fold.append(xgb_model.best_iteration)

    # Predict on validation set and compute accuracy
    y_pred_val = xgb_model.predict(X_val_fold)
    accuracy = accuracy_score(y_val_fold, y_pred_val)
    accuracy_per_fold.append(accuracy)

    print(f"Fold {fold+1} - Best Iteration: {xgb_model.best_iteration}, Accuracy: {accuracy:.4f}")

# Calculate the average best iteration and accuracy
average_best_iteration = int(np.mean(best_iteration_per_fold))
average_accuracy = np.mean(accuracy_per_fold)

print(f"Average Best Iteration: {average_best_iteration}")
print(f"Average Accuracy: {average_accuracy:.4f}")

# Retrain using the number of boosting rounds found above.
# best_iteration is a 0-based index, so the matching number of trees is index + 1.
# 'n_estimators' is overridden inside the dict: passing it as a separate keyword
# next to **xgb_params would supply the argument twice.
xgb_final_model = xgb.XGBClassifier(
    **{**xgb_params, 'n_estimators': average_best_iteration + 1}
)

xgb_final_model.fit(X_train, y_train)

In [16]:
# define the xgboost model (from xgboost package)
# You can now use xgb_final_model for making predictions or further evaluations
# define the hyperparameters
# train the model
# try to improve the model by changing the hyperparameters on local validation (remember that using gridsearch is a bad idea, because it can't use the early stopping)
# retrain the model on the whole train dataset
# don't forget to specify the number of boosting rounds you found optimal


# XGBoost: stratified 5-fold CV with early stopping on the training split,
# followed by a refit with the number of rounds the folds agreed on.
# NOTE(review): no 'n_estimators' is set here, so the library default caps the
# number of rounds during CV; random_state is 32 while the splitter uses 42 - confirm
# both are intentional.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'learning_rate': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 32
}

# Stratified K-Fold
n_splits = 5
stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

best_iteration_per_fold = []
accuracy_per_fold = []

for fold, (train_index, val_index) in enumerate(stratified_kfold.split(X_train, y_train)):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Create an XGBoost classifier that stops after 10 rounds without improvement
    xgb_model = xgb.XGBClassifier(**xgb_params, early_stopping_rounds=10)

    # Fit the model, monitoring the validation fold for early stopping
    xgb_model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        verbose=False
    )

    # Store the best iteration (0-based index of the best boosting round)
    best_iteration_per_fold.append(xgb_model.best_iteration)

    # Predict on validation set and compute accuracy
    y_pred_val = xgb_model.predict(X_val_fold)
    accuracy = accuracy_score(y_val_fold, y_pred_val)
    accuracy_per_fold.append(accuracy)

    print(f"Fold {fold+1} - Best Iteration: {xgb_model.best_iteration}, Accuracy: {accuracy:.4f}")

# Calculate the average best iteration and accuracy
average_best_iteration_xgb = int(np.mean(best_iteration_per_fold))
average_accuracy_xgb = np.mean(accuracy_per_fold)

print(f"Average Best Iteration: {average_best_iteration_xgb}")
print(f"Average Accuracy: {average_accuracy_xgb:.4f}")

# Retrain with the number of boosting rounds found by CV.
# best_iteration is a 0-based index, so the number of trees to build is index + 1
# (using the index itself would train one round too few).
xgb_final_model = xgb.XGBClassifier(
    **xgb_params,
    n_estimators=average_best_iteration_xgb + 1
)

# Fit the final model on the whole training split
xgb_final_model.fit(X_train, y_train)


Fold 1 - Best Iteration: 46, Accuracy: 0.7972
Fold 2 - Best Iteration: 68, Accuracy: 0.8741
Fold 3 - Best Iteration: 63, Accuracy: 0.8521
Fold 4 - Best Iteration: 23, Accuracy: 0.8169
Fold 5 - Best Iteration: 48, Accuracy: 0.8451
Average Best Iteration: 49
Average Accuracy: 0.8371


In [17]:
import lightgbm as lgb
# define the lightgbm model (from lightgbm package)
# define the hyperparameters
# train the model
# try to improve the model by changing the hyperparameters on local validation (remember that using gridsearch is a bad idea, because it can't use the early stopping)
# retrain the model on the whole train dataset
# don't forget to specify the number of boosting rounds you found optimal
# Set up hyperparameters
# LightGBM: stratified 5-fold CV with early stopping, then a refit on the
# training split using the averaged best number of boosting rounds.
# NOTE(review): 'subsample' is set without a bagging frequency; LightGBM's docs
# suggest bagging only takes effect when a frequency is also given - confirm.
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.1,
    'max_depth': 3,
    'num_leaves': 31,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'early_stopping_rounds': 10,
    'verbosity': -1,
}

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results collected by the loop below
best_iteration_per_fold = []
accuracy_per_fold = []

fold_splits = stratified_kfold.split(X_train, y_train)
for fold, (train_index, val_index) in enumerate(fold_splits):
    # Slice features and target for this fold
    X_train_fold = X_train.iloc[train_index]
    X_val_fold = X_train.iloc[val_index]
    y_train_fold = y_train.iloc[train_index]
    y_val_fold = y_train.iloc[val_index]

    # Wrap the fold in LightGBM datasets; the validation set references the training one
    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)

    # Up to 1000 rounds; early stopping comes from lgb_params
    lgb_model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=1000,
        valid_sets=[train_data, val_data],
    )

    # Turn predicted probabilities into class labels with a 0.5 threshold
    best_round = lgb_model.best_iteration
    val_probabilities = lgb_model.predict(X_val_fold, num_iteration=best_round)
    y_pred_val = (val_probabilities > 0.5).astype(int)
    accuracy = accuracy_score(y_val_fold, y_pred_val)

    best_iteration_per_fold.append(best_round)
    accuracy_per_fold.append(accuracy)

    print(f"Fold {fold+1} - Best Iteration: {lgb_model.best_iteration}, Accuracy: {accuracy:.4f}")

# Average the best round count (truncated to an int) and the accuracy over the folds
average_accuracy_lgb = np.mean(accuracy_per_fold)
average_best_iteration_lgb = int(np.mean(best_iteration_per_fold))

print(f"Average Best Iteration: {average_best_iteration_lgb}")
print(f"Average Accuracy: {average_accuracy_lgb:.4f}")

# Refit on the whole training split with early stopping switched off and the
# round count fixed to the CV average
final_lgb_params = dict(lgb_params)
final_lgb_params['early_stopping_rounds'] = None
final_train_data = lgb.Dataset(X_train, label=y_train)
lgb_final_model = lgb.train(
    final_lgb_params,
    final_train_data,
    num_boost_round=average_best_iteration_lgb,
)



Fold 1 - Best Iteration: 28, Accuracy: 0.7832
Fold 2 - Best Iteration: 67, Accuracy: 0.8601
Fold 3 - Best Iteration: 39, Accuracy: 0.8380
Fold 4 - Best Iteration: 29, Accuracy: 0.8099
Fold 5 - Best Iteration: 63, Accuracy: 0.8380
Average Best Iteration: 45
Average Accuracy: 0.8259


In [18]:
import numpy as np
from catboost import CatBoostClassifier, Pool
# define the catboost model (from catboost package)
# define the hyperparameters
# train the model
# try to improve the model by changing the hyperparameters on local validation (remember that using gridsearch is a bad idea, because it can't use the early stopping)
# retrain the model on the whole train dataset
# don't forget to specify the number of boosting rounds you found optimal

# CatBoost: stratified 5-fold CV with early stopping, then a refit on the
# training split using the number of trees the folds agreed on.
catboost_params = {
    'iterations': 1000,  # Upper bound only; early stopping picks the actual number
    'depth': 6,
    'learning_rate': 0.1,
    'eval_metric': 'Logloss',
    'random_seed': 42,
    'logging_level': 'Silent',
    'early_stopping_rounds': 10
}

# Initialize stratified k-fold cross-validation
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_iteration_per_fold = []
accuracy_per_fold = []

# Cross-validation loop
for fold, (train_index, val_index) in enumerate(stratified_kfold.split(X_train, y_train)):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Create CatBoost Pool objects
    train_data = Pool(data=X_train_fold, label=y_train_fold)
    val_data = Pool(data=X_val_fold, label=y_val_fold)

    # Train the CatBoost model, keeping only the trees up to the best iteration
    catboost_model = CatBoostClassifier(**catboost_params)
    catboost_model.fit(train_data, eval_set=val_data, use_best_model=True)

    # Store best iteration (0-based index of the best round) and accuracy
    best_iteration_per_fold.append(catboost_model.best_iteration_)
    y_pred_val = catboost_model.predict(X_val_fold)
    accuracy = accuracy_score(y_val_fold, y_pred_val)
    accuracy_per_fold.append(accuracy)

    print(f"Fold {fold+1} - Best Iteration: {catboost_model.best_iteration_}, Accuracy: {accuracy:.4f}")

# Calculate the average best iteration and accuracy
average_best_iteration_cat = int(np.mean(best_iteration_per_fold))
average_accuracy_cat = np.mean(accuracy_per_fold)

print(f"Average Best Iteration: {average_best_iteration_cat}")
print(f"Average Accuracy: {average_accuracy_cat:.4f}")

# Retrain on the whole training split with a fixed number of trees.
# best_iteration_ is a 0-based index, so the matching tree count is index + 1.
# Early stopping is dropped because this fit has no eval set to monitor.
final_catboost_params = {
    name: value for name, value in catboost_params.items()
    if name != 'early_stopping_rounds'
}
final_catboost_params['iterations'] = average_best_iteration_cat + 1

final_train_data = Pool(data=X_train, label=y_train)
final_catboost_model = CatBoostClassifier(**final_catboost_params)
final_catboost_model.fit(final_train_data)





Fold 1 - Best Iteration: 21, Accuracy: 0.7902
Fold 2 - Best Iteration: 33, Accuracy: 0.8741
Fold 3 - Best Iteration: 36, Accuracy: 0.8380
Fold 4 - Best Iteration: 25, Accuracy: 0.8028
Fold 5 - Best Iteration: 37, Accuracy: 0.8310
Average Best Iteration: 30
Average Accuracy: 0.8272


<catboost.core.CatBoostClassifier at 0x1a66f823ad0>

In [22]:
# compare the results of the three models from this homework and with models from the previous homework
# make a conclusion on which model is better and why
# if your boosting is worse than the RF, try to improve it
# Side-by-side mean cross-validation accuracy of the three boosting models
cv_accuracy_by_model = {
    "XGBoost": average_accuracy_xgb,
    "Lightgbm": average_accuracy_lgb,
    "Catboost": average_accuracy_cat,
}
for model_label, mean_cv_accuracy in cv_accuracy_by_model.items():
    print(f"Average accuracy for {model_label} model:", mean_cv_accuracy)




Average accuracy for XGBoost model: 0.837082635674185
Average accuracy for Lightgbm model: 0.8258544272628778
Average accuracy for Catboost model: 0.8272333300502316


In [20]:
# load test data
# do the same preprocessing as for train data

# using the retrained models, make predictions on the test data with all three new models
# save the predictions to a file
# upload the predictions to Kaggle and make a submission
# report the score you got and compare it with the score you got on the validation data
# draw a conclusion on how well the models generalize