# Models

## Train and Test Data

In [1]:
# load packages
import pandas as pd

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# read dataframe

from datetime import datetime
import os

# Collect all preprocessed CSV snapshots in the "data" folder and sort them in
# descending lexicographic order, so the first entry is taken as the latest.
# NOTE(review): presumably the file names embed a sortable timestamp after
# "preprocessed_2" - confirm against the preprocessing notebook.
files = sorted(
    [f for f in os.listdir("data") if f.startswith("preprocessed_2") and f.endswith(".csv")],
    reverse=True,
)

# Fail with an explicit message instead of a bare IndexError on files[0]
# when no matching file is present.
if not files:
    raise FileNotFoundError("no file matching 'preprocessed_2*.csv' found in directory 'data'")

latest = files[0]
df = pd.read_csv(f"data/{latest}")

# drop the newly generated index column (first column of the CSV);
# a new frame is assigned rather than mutating in place
df = df.drop(columns=df.columns[0])
df.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Age_true,AgeGroup,FareGroup,CabinLvl,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Mr,Title_Mrs,Title_Ms,Title_Noble
0,0,3,1,0,1,2,0,0,0,0,1,0,1,0,0,0
1,1,1,1,0,1,4,4,5,1,0,0,0,0,1,0,0
2,1,3,0,0,1,3,1,0,0,0,1,0,0,0,1,0
3,1,1,1,0,1,4,4,5,0,0,1,0,0,1,0,0
4,0,3,0,0,1,4,1,0,0,0,1,0,1,0,0,0


In [3]:
# train-test-split

from sklearn.model_selection import train_test_split

# separate the feature matrix from the target column
X = df.drop(columns="Survived")
y = df["Survived"]

# hold out 30% of the rows for testing; the rows are shuffled with a fixed
# seed so that the split is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

## Baseline Model

We build a model which predicts "Survival" (Class 1) for first-class passengers and "No Survival" (Class 0) if a passenger has ticket class 2 or 3.

In [4]:
# re-attach the target column to the training features (joined on the row
# index) so that survival can be inspected per feature on the training rows
df_train = X_train.join(other=y_train)
display(df_train)

Unnamed: 0,Pclass,SibSp,Parch,Age_true,AgeGroup,FareGroup,CabinLvl,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Mr,Title_Mrs,Title_Ms,Title_Noble,Survived
445,1,0,2,1,0,4,7,0,0,1,1,0,0,0,0,1
650,3,0,0,0,3,1,0,0,0,1,0,1,0,0,0,0
172,3,1,1,1,0,2,0,0,0,1,0,0,0,1,0,1
450,2,1,2,1,4,3,0,0,0,1,0,1,0,0,0,0
314,2,1,1,1,4,3,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,3,0,0,1,2,0,0,0,0,1,0,0,0,1,0,1
270,1,0,0,0,3,3,0,0,0,1,0,1,0,0,0,0
860,3,2,0,1,4,2,0,0,0,1,0,1,0,0,0,0
435,1,1,2,1,1,5,6,0,0,1,0,0,0,1,0,1


In [5]:
# count the training passengers for every ('Pclass', 'Survived') combination;
# the baseline rule is read off these counts:
# 'Pclass' == 1 -> predict 'Survived' = 1, otherwise predict 'Survived' = 0
df_train.groupby(by=['Pclass', 'Survived']).size()

Pclass  Survived
1       0            56
        1            83
2       0            69
        1            63
3       0           267
        1            85
dtype: int64

In [6]:
# Baseline rule: predict survival (1) for first-class passengers and 0 for
# everyone else. The prediction is computed directly from 'Pclass' without
# writing to X_test, so the feature matrix used by the later models is never
# modified. The result is an int64 Series named 'baseline_pred' that shares
# the index of X_test.
baseline_pred = (X_test['Pclass'] == 1).astype('int64').rename('baseline_pred')

In [7]:
# accuracy of the baseline rule on the held-out test set
baseline_acc = accuracy_score(y_true=y_test, y_pred=baseline_pred)
print(baseline_acc)

# per-class precision/recall/f1 followed by the raw confusion matrix
for heading, metric in (
    ("Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_test, baseline_pred))

0.6940298507462687
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.85      0.76       157
           1       0.69      0.48      0.56       111

    accuracy                           0.69       268
   macro avg       0.69      0.66      0.66       268
weighted avg       0.69      0.69      0.68       268

Confusion Matrix:
[[133  24]
 [ 58  53]]


## XGBoost

https://www.datacamp.com/tutorial/xgboost-in-python  
https://thinkingneuron.com/how-to-create-a-classification-model-using-xgboost-in-python/  
https://towardsdatascience.com/a-guide-to-xgboost-hyperparameters-87980c7f44a9 (Hyperparameter Cheatsheet)

In [8]:
from xgboost import XGBClassifier

### Simple XGB-Classifier with default parameters

In [9]:
# XGB-Classifier with the library's default parameters, used as a reference
# point for the tuned models

xgb_simple = XGBClassifier()
xgb_simple.fit(X_train, y_train)

# score the default model on the held-out test set
xgb_simple_pred = xgb_simple.predict(X_test)
xgb_simple_acc = accuracy_score(y_true=y_test, y_pred=xgb_simple_pred)
print(xgb_simple_acc)

# per-class precision/recall/f1 followed by the raw confusion matrix
for heading, metric in (
    ("Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_test, xgb_simple_pred))

0.8097014925373134
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.89      0.84       157
           1       0.81      0.70      0.75       111

    accuracy                           0.81       268
   macro avg       0.81      0.79      0.80       268
weighted avg       0.81      0.81      0.81       268

Confusion Matrix:
[[139  18]
 [ 33  78]]


### Hyperparameter-Tuning for best parameter setting

In [10]:
# Grid Search

from sklearn.model_selection import GridSearchCV

# base estimator whose hyper-parameters are tuned
xgb = XGBClassifier()

# candidate values per hyper-parameter (2 x 2 x 2 x 2 = 16 combinations);
# 'gamma' (e.g. [0.5, 1, 3]) is a possible further dimension, currently unused
xgb_parameters = dict(
    learning_rate=[0.3, 0.7],
    max_depth=[3, 5],
    colsample_bytree=[0.3, 0.8],
    n_estimators=[50, 100],
)

# exhaustive search scored by accuracy with 10-fold cross validation;
# an explicit StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# could be passed as cv instead of the integer
xgb_grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_parameters,
    scoring='accuracy',
    cv=10,
)

# run the search on the training data only
xgb_grid_search.fit(X_train, y_train)

# table with the cross-validation results of every parameter combination
xgb_grid_search_results = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results)

# report the winning configuration; the recorded run gave
# best score 0.8410906298003071 with params
# {'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 50}
best_summary = "best score is {} with params {}"
print(best_summary.format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.068931,0.03971,0.009811,0.010169,0.3,0.3,3,50,"{'colsample_bytree': 0.3, 'learning_rate': 0.3...",0.793651,...,0.903226,0.822581,0.774194,0.83871,0.83871,0.806452,0.935484,0.839555,0.050559,2
1,0.068498,0.003103,0.006383,0.000508,0.3,0.3,3,100,"{'colsample_bytree': 0.3, 'learning_rate': 0.3...",0.793651,...,0.903226,0.806452,0.758065,0.822581,0.854839,0.822581,0.903226,0.831541,0.048016,9
2,0.044174,0.001858,0.006423,0.000457,0.3,0.3,5,50,"{'colsample_bytree': 0.3, 'learning_rate': 0.3...",0.825397,...,0.887097,0.790323,0.758065,0.83871,0.822581,0.806452,0.919355,0.831464,0.046744,10
3,0.077895,0.003375,0.006317,0.000459,0.3,0.3,5,100,"{'colsample_bytree': 0.3, 'learning_rate': 0.3...",0.825397,...,0.854839,0.806452,0.741935,0.822581,0.822581,0.822581,0.919355,0.826651,0.046436,13
4,0.03705,0.000859,0.006081,0.000477,0.3,0.7,3,50,"{'colsample_bytree': 0.3, 'learning_rate': 0.7...",0.793651,...,0.887097,0.806452,0.774194,0.822581,0.822581,0.806452,0.935484,0.829928,0.049618,11
5,0.06813,0.005492,0.006394,0.001207,0.3,0.7,3,100,"{'colsample_bytree': 0.3, 'learning_rate': 0.7...",0.793651,...,0.887097,0.806452,0.741935,0.822581,0.822581,0.806452,0.919355,0.82509,0.050869,15
6,0.046167,0.00374,0.006199,0.000589,0.3,0.7,5,50,"{'colsample_bytree': 0.3, 'learning_rate': 0.7...",0.809524,...,0.870968,0.806452,0.725806,0.83871,0.870968,0.822581,0.935484,0.836303,0.055531,5
7,0.094865,0.012035,0.007302,0.001066,0.3,0.7,5,100,"{'colsample_bytree': 0.3, 'learning_rate': 0.7...",0.809524,...,0.870968,0.83871,0.741935,0.854839,0.870968,0.806452,0.919355,0.837942,0.048274,3
8,0.048056,0.005365,0.007493,0.001447,0.8,0.3,3,50,"{'colsample_bytree': 0.8, 'learning_rate': 0.3...",0.84127,...,0.903226,0.822581,0.741935,0.854839,0.790323,0.806452,0.935484,0.837865,0.059051,4
9,0.07634,0.009925,0.007461,0.000548,0.8,0.3,3,100,"{'colsample_bytree': 0.8, 'learning_rate': 0.3...",0.809524,...,0.887097,0.822581,0.741935,0.83871,0.806452,0.822581,0.919355,0.834665,0.051819,7


best score is 0.8410906298003071 with params {'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 50}


In [11]:
# Fit and evaluate best model

# Take the winning configuration straight from the grid search object so this
# cell cannot drift out of sync with the search when the grid or the data
# changes. In the recorded run best_params_ is
# {'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 50}.
xgb_best = XGBClassifier(**xgb_grid_search.best_params_)
xgb_best.fit(X_train, y_train)

# score the tuned model on the held-out test set
xgb_best_pred = xgb_best.predict(X_test)
xgb_best_acc = accuracy_score(y_test, xgb_best_pred)
print(xgb_best_acc)

print("Classification Report:")
print(classification_report(y_test, xgb_best_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, xgb_best_pred))

0.8022388059701493
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       157
           1       0.80      0.69      0.74       111

    accuracy                           0.80       268
   macro avg       0.80      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268

Confusion Matrix:
[[138  19]
 [ 34  77]]


### cross_val_score and cross_val_predict
`cross_val_score` only tells us how well the estimator performs on each fold; it does not return a fitted model.

In [12]:
# cross_val_score

from sklearn.model_selection import cross_val_score

# 10-fold cross validation of a default XGB-Classifier on the training data,
# scored by accuracy (other options would be e.g. 'f1_macro' or 'f1_micro')
xgb_cv = XGBClassifier()
xgb_cv_score = cross_val_score(xgb_cv, X_train, y_train, scoring='accuracy', cv=10)

# one line per fold, then the mean over all folds
fold_line = "Fold {}: Accuracy = {}%"
for fold_idx, fold_acc in enumerate(xgb_cv_score):
    print(fold_line.format(fold_idx, fold_acc*100.0))
print("Average Accuracy = {}%".format(xgb_cv_score.mean()*100.0))

Fold 0: Accuracy = 84.12698412698413%
Fold 1: Accuracy = 80.95238095238095%
Fold 2: Accuracy = 87.3015873015873%
Fold 3: Accuracy = 87.09677419354838%
Fold 4: Accuracy = 82.25806451612904%
Fold 5: Accuracy = 72.58064516129032%
Fold 6: Accuracy = 82.25806451612904%
Fold 7: Accuracy = 83.87096774193549%
Fold 8: Accuracy = 82.25806451612904%
Fold 9: Accuracy = 90.32258064516128%
Average Accuracy = 83.3026113671275%


In [13]:
# cross_val_predict

from sklearn.model_selection import cross_val_predict

# out-of-fold predictions for the training rows from 10-fold cross validation
xgb_cv_pred = cross_val_predict(estimator=xgb_cv, X=X_train, y=y_train, cv=10)

# accuracy of the pooled out-of-fold predictions
xgb_cv_acc = accuracy_score(y_true=y_train, y_pred=xgb_cv_pred)
xgb_cv_acc

0.8330658105939005

In [14]:
# Nested CV

from sklearn.model_selection import cross_val_score

# base estimator whose hyper-parameters are tuned in the inner loop
xgb_nested = XGBClassifier()

# candidate values per hyper-parameter (2 x 2 x 2 x 2 = 16 combinations);
# 'gamma' (e.g. [0.5, 1, 3]) is a possible further dimension, currently unused
parameters = dict(
    learning_rate=[0.3, 0.7],
    max_depth=[3, 5],
    colsample_bytree=[0.3, 0.8],
    n_estimators=[50, 100],
)

# inner loop: 5-fold grid search scored by accuracy;
# an explicit StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# could be passed as cv instead of the integer
xgb_nested_grid_search = GridSearchCV(
    estimator=xgb_nested,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)

# outer loop: 5-fold cross validation of the whole grid search;
# this yields scores only, no fitted model
xgb_nested_cv_score = cross_val_score(
    xgb_nested_grid_search,
    X_train,
    y_train,
    scoring='accuracy',
    cv=5,
)
display(xgb_nested_cv_score.mean())

# run the grid search once more on the full training data to obtain a model
xgb_nested_grid_search.fit(X_train, y_train)

# table with the cross-validation results of every parameter combination
xgb_nested_grid_search_results = pd.DataFrame(xgb_nested_grid_search.cv_results_)
display(xgb_nested_grid_search_results)

# report the winning configuration; the recorded run gave
# best score 0.8298838709677421 with params
# {'colsample_bytree': 0.3, 'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 100}
best_summary = "best score is {} with params {}"
print(best_summary.format(xgb_nested_grid_search.best_score_, xgb_nested_grid_search.best_params_))

0.8235096774193549

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.092668,0.068974,0.010241,0.004584,0.3,0.3,3,50,"{'colsample_bytree': 0.3, 'learning_rate': 0.3...",0.8,0.856,0.8,0.798387,0.854839,0.821845,0.027422,11
1,0.288282,0.145202,0.013407,0.008197,0.3,0.3,3,100,"{'colsample_bytree': 0.3, 'learning_rate': 0.3...",0.792,0.888,0.792,0.814516,0.862903,0.829884,0.038921,1
2,0.120098,0.070939,0.015562,0.011004,0.3,0.3,5,50,"{'colsample_bytree': 0.3, 'learning_rate': 0.3...",0.8,0.864,0.784,0.814516,0.854839,0.823471,0.031034,8
3,0.114842,0.010915,0.009151,0.001635,0.3,0.3,5,100,"{'colsample_bytree': 0.3, 'learning_rate': 0.3...",0.784,0.848,0.808,0.814516,0.862903,0.823484,0.028404,7
4,0.052624,0.002154,0.00751,0.000662,0.3,0.7,3,50,"{'colsample_bytree': 0.3, 'learning_rate': 0.7...",0.792,0.848,0.792,0.814516,0.870968,0.823497,0.031364,6
5,0.090101,0.019783,0.008794,0.001738,0.3,0.7,3,100,"{'colsample_bytree': 0.3, 'learning_rate': 0.7...",0.768,0.848,0.784,0.846774,0.879032,0.825161,0.042077,5
6,0.059273,0.005971,0.007694,0.001034,0.3,0.7,5,50,"{'colsample_bytree': 0.3, 'learning_rate': 0.7...",0.792,0.848,0.784,0.830645,0.854839,0.821897,0.02889,10
7,0.082045,0.010339,0.007185,0.000484,0.3,0.7,5,100,"{'colsample_bytree': 0.3, 'learning_rate': 0.7...",0.8,0.832,0.776,0.846774,0.854839,0.821923,0.029638,9
8,0.04089,0.003696,0.007092,0.000434,0.8,0.3,3,50,"{'colsample_bytree': 0.8, 'learning_rate': 0.3...",0.808,0.888,0.776,0.814516,0.854839,0.828271,0.039002,3
9,0.085391,0.00237,0.008212,0.00073,0.8,0.3,3,100,"{'colsample_bytree': 0.8, 'learning_rate': 0.3...",0.808,0.872,0.784,0.814516,0.862903,0.828284,0.03368,2


best score is 0.8298838709677421 with params {'colsample_bytree': 0.3, 'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 100}
