# Q5 Boosting Algorithm Practice

## (a)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

# read data from csv file
data = pd.read_csv("Titanic.csv")
data = data.dropna(axis='columns')
data.loc[data['Sex'] == 'female', 'Sex'] = 1
data.loc[data['Sex'] == 'male', 'Sex'] = 0
data['Sex'] = pd.to_numeric(data['Sex'])

# split train and test dataset
x = data.drop("Survived", axis=1)
y = data['Survived']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, train_size=0.8, random_state=42, shuffle=True, stratify=y)

## (b)

In [2]:
# sklearn's Adaboost
# tune parameters of the Adaboost using cv
adaboost_hyperparam = {
    'base_estimator': [DecisionTreeClassifier(max_depth=2), DecisionTreeClassifier(max_depth=3)],
    'n_estimators': [50, 75, 100],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
    'algorithm': ['SAMME', 'SAMME.R']
}
adaboost_model = AdaBoostClassifier()
adaboost_grid = GridSearchCV(adaboost_model, adaboost_hyperparam, cv=5, scoring='accuracy')
adaboost_grid.fit(x_train, y_train)

best_adaboost_model = adaboost_grid.best_estimator_
# print(adaboost_grid.best_score_)
# print(adaboost_grid.best_params_)
# print(best_adaboost_model)

In [3]:
# XGBoost
# tune parameters of the XGBoost using cv
xgboost_hyperparam = {
    'booster': ['gbtree'],
    'tree_method': ['auto', 'exact', 'approx'],
    'n_estimators': [30, 50, 75, 100],
    'eta': [0.01, 0.1, 0.5, 1],
    'max_depth': [2, 3, 4],
    'gamma': [0, 0.1, 0.2]
}
xgboost_model = XGBClassifier()
xgboost_grid = GridSearchCV(xgboost_model, xgboost_hyperparam, cv=5, scoring='accuracy')
xgboost_grid.fit(x_train, y_train)

best_xgboost_model = xgboost_grid.best_estimator_
# print(xgboost_grid.best_score_)
# print(xgboost_grid.best_params_)
# print(best_xgboost_model)

In [4]:
# performance on the test set
y_pred_ada = best_adaboost_model.predict(x_test)
y_pred_xgb = best_xgboost_model.predict(x_test)

table_data = [
    [f1_score(y_test, y_pred_ada), roc_auc_score(y_test, y_pred_ada), accuracy_score(y_test, y_pred_ada)],
    [f1_score(y_test, y_pred_xgb), roc_auc_score(y_test, y_pred_xgb), accuracy_score(y_test, y_pred_xgb)]
]
table = pd.DataFrame(table_data, index=['Adaboost', 'XGBoost'], columns=['f1', 'roc_auc', 'accuracy'])
print(table)

                f1   roc_auc  accuracy
Adaboost  0.671642  0.735178  0.754190
XGBoost   0.715328  0.768709  0.782123


### Comment on performance difference
When random_state of train_test_split is 42, XGBoost outperforms Adaboost in f1 score, auc, and accuracy.