# Modeling

In [1]:
cd ../src

D:\Projekty\roche-recruitment-task\src


In [2]:
import pandas as pd

from preprocess import impute_missing_values
from build_features import convert_features, encode_features, add_new_features

In [3]:
# Build the modeling frame in one pipeline: read the raw training data, drop
# the columns that are not used as features, then run the project's
# preprocessing and feature-engineering steps in order.
df = (
    pd.read_csv("../data/train.csv", sep=";")
    .drop(columns=["PassengerId", "Name", "Ticket", "Fare", "Cabin"])
    .pipe(impute_missing_values, cat_columns=["Embarked"], num_columns=["Age"])
    .pipe(convert_features)
    .pipe(encode_features, columns=["Pclass", "Age", "Embarked"])
    .pipe(add_new_features)
)

# Preview the first rows of the prepared frame.
df.head(10)

Unnamed: 0,Survived,Sex,2,3,Medium,Old,Q,S,IsAlone
0,0,0,0,1,0,0,0,1,1
1,0,0,0,1,1,0,0,1,1
2,0,0,0,1,0,0,0,1,1
3,0,0,0,0,0,1,0,1,0
4,0,0,0,1,1,0,0,1,0
5,0,0,0,1,0,1,0,1,1
6,0,0,0,1,1,0,0,1,1
7,0,0,0,0,0,1,0,0,1
8,1,0,0,0,0,0,0,0,0
9,1,1,1,0,0,1,0,1,0


## Comparing models

In [15]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import KFold, cross_validate

from sklearn.metrics import accuracy_score, f1_score

# Split the prepared frame into the feature matrix and the target column.
X = df.drop(["Survived"], axis=1)
y = df["Survived"]

# Candidate models. The decision tree and the random forest use randomness
# while fitting, so they are seeded; without a seed their scores (and
# possibly the ranking) change on every run.
classifiers = [
    SVC(kernel="linear"),
    GaussianNB(),
    DecisionTreeClassifier(max_depth=5, random_state=1),
    LogisticRegression(),
    RandomForestClassifier(max_depth=5, n_estimators=10, random_state=1),
    KNeighborsClassifier()]

# Display names, in the same order as `classifiers`.
clf_names = [
    "Linear SVM",
    "Naive Bayes",
    "Decision Tree",
    "Logistic Regression",
    "Random Forest",
    "kNN"]

# Names are matched to classifiers by position, so the two lists must stay in
# sync; fail loudly instead of silently mislabelling a row.
assert len(clf_names) == len(classifiers), "clf_names and classifiers differ in length"

# A fixed random_state makes the shuffled folds identical for every call to
# cross_validate, so all models are scored on exactly the same splits.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
scoring = {"accuracy": "accuracy",
           "f1": "f1_macro"}

# Collect one row per model: mean accuracy and mean macro F1 over the folds.
clf_scores = []
for name, clf in zip(clf_names, classifiers):
    cross_val_scores = cross_validate(clf, X, y, cv=kfold, scoring=scoring)
    clf_scores.append({
        "test_accuracy": cross_val_scores["test_accuracy"].mean(),
        "test_f1": cross_val_scores["test_f1"].mean(),
        "model": name,
    })

# Rank the models by mean accuracy, best first.
results = pd.DataFrame(clf_scores, columns=["test_accuracy", "test_f1", "model"])
results = results.sort_values(by="test_accuracy", ascending=False)
results

Unnamed: 0,test_accuracy,test_f1,model
4,0.800448,0.770686,Random Forest
2,0.797948,0.769387,Decision Tree
5,0.786806,0.759317,kNN
0,0.78412,0.762701,Linear SVM
3,0.783086,0.763414,Logistic Regression
1,0.761898,0.741353,Naive Bayes


The Random Forest classifier has the highest cross-validated accuracy and F1 score, so it is the model tuned in the next section.

## Hyperparameter tuning

In [21]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest; every combination of the values
# below is evaluated by the grid search.
param_grid = dict(
    max_depth=[1, 2, 5, 10],
    min_samples_leaf=[1, 5, 10, 25, 50],
    min_samples_split=[2, 4, 10, 12, 16, 18],
    n_estimators=[10, 50, 100, 200],
)

# Seeded base estimator, searched with GridSearchCV's default cv and scoring.
rf = RandomForestClassifier(n_jobs=-1, random_state=1)
clf = GridSearchCV(rf, param_grid, n_jobs=-1).fit(X, y)

# Best parameter combination found by the search.
clf.best_params_

{'max_depth': 5,
 'min_samples_leaf': 5,
 'min_samples_split': 12,
 'n_estimators': 10}

## Final classifier

In [23]:
# Final model, using the hyperparameters reported by the grid search above.
# random_state=1 matches the estimator that was used during that search, so
# the forest evaluated here is the same one that was tuned and its results do
# not vary between runs because of the forest's own randomness.
clf = RandomForestClassifier(max_depth=5,
                             min_samples_leaf=5,
                             min_samples_split=12,
                             n_estimators=10,
                             random_state=1)

# Fit on the full training set; `clf` is the final fitted model from here on.
clf.fit(X, y)

# Cross-validated estimate of the final configuration, using the same folds
# object (`kfold`) and metrics (`scoring`) as the model comparison.
scores = cross_validate(clf, X, y, cv=kfold, scoring=scoring)
scores = {key: score.mean() for key, score in scores.items()}

# Report the mean scores as percentages.
print("Accuracy: {}".format(scores["test_accuracy"] * 100))
print("F1 score: {}".format(scores["test_f1"] * 100))

Accuracy: 80.30401234567901
F1 score: 77.24400772026289
