Import libraries

In [15]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

from numpy.random import RandomState
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, roc_curve, precision_recall_curve, confusion_matrix, average_precision_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from catboost import CatBoostClassifier , cv, Pool
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Exploratory Analysis

ETL

In [16]:
def preprocess_data(data):
    data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
    data['Fare'].fillna(data['Fare'].median(), inplace=True)
    data['Sex'] = data['Sex'].map({'female': 1, 'male': 0})
    data = pd.get_dummies(data, columns=['Embarked'])
    return data

In [17]:
train = pd.read_csv("train.csv", index_col=0)
train = preprocess_data(train)

y_train = train['Survived']
X_train = train.drop('Survived', axis=1)

len(X_train), len(y_train)

(891, 891)

In [18]:
test = pd.read_csv('test_with_survived.csv', index_col=0)
test = preprocess_data(test)

y_test = test['Survived']
X_test = test.drop(["Survived"], axis=1)

len(X_test), len(y_test)

(418, 418)

Create a Model

Due the nature of the problem (lots of categorical variables) we will include CatBoost into the comparison of models.

In [19]:
# rename all the columns to lowercase
train.columns = train.columns.str.lower()
test.columns = test.columns.str.lower()

In [20]:
features = X_train
target = y_train

In [21]:
train_for_cat = train.copy()
test_for_cat = test.copy()
train_features_for_cat = train_for_cat.drop('survived', axis=1)
train_target_for_cat = train_for_cat['survived']
test_features_for_cat = test_for_cat.drop('survived', axis=1)
test_target_for_cat = test_for_cat['survived']

In [22]:
# get the index of the features that are not float or int
features_index = np.where(train_features_for_cat.dtypes != float)[0]

In [23]:
cat = CatBoostClassifier(loss_function='Logloss',
                         eval_metric='Accuracy',
                         random_seed=42,
                         verbose=False)

cat.fit(train_features_for_cat, train_target_for_cat, cat_features=features_index)

<catboost.core.CatBoostClassifier at 0x1f8202324d0>

In [24]:
x_train_for_cat = train_features_for_cat
x_test_for_cat = test_features_for_cat
y_train_for_cat = train_target_for_cat
y_test_for_cat = test_target_for_cat

#x_train_for_cat, x_test_for_cat, y_train_for_cat, y_test_for_cat = train_test_split(features_for_cat, target_for_cat, test_size=0.15, random_state=42)

In [25]:
cat_features_index = np.where(train_features_for_cat.dtypes != float)[0]

In [26]:
train_features_for_cat.dtypes != float

pclass         True
sex            True
age           False
sibsp          True
parch          True
fare          False
embarked_c     True
embarked_q     True
embarked_s     True
dtype: bool

In [27]:
cat_features_index

array([0, 1, 3, 4, 6, 7, 8], dtype=int64)

In [32]:
train_features_for_cat

Unnamed: 0_level_0,pclass,sex,age,sibsp,parch,fare,embarked_c,embarked_q,embarked_s
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,0,22.0,1,0,7.2500,0,0,1
2,1,1,38.0,1,0,71.2833,1,0,0
3,3,1,26.0,0,0,7.9250,0,0,1
4,1,1,35.0,1,0,53.1000,0,0,1
5,3,0,35.0,0,0,8.0500,0,0,1
...,...,...,...,...,...,...,...,...,...
887,2,0,27.0,0,0,13.0000,0,0,1
888,1,1,19.0,0,0,30.0000,0,0,1
889,3,1,28.0,1,2,23.4500,0,0,1
890,1,0,26.0,0,0,30.0000,1,0,0


Comparison of models with default parameters

In [28]:
dct_with_models = {}

In [29]:
for label_model, model in {'RF': [RandomForestClassifier(random_state=42), 'no_scaler'],
                           'DT': [DecisionTreeClassifier(random_state=42), 'no_scaler'],
                           'LR': [LogisticRegression(random_state=42), 'need_scaler'],
                           'KNB': [KNeighborsClassifier(), 'need_scaler'],
                           'SVC': [SVC(random_state=42, probability=True), 'need_scaler'],
                           'CAT': [CatBoostClassifier(loss_function='Logloss', eval_metric='Accuracy', verbose=False, use_best_model=True, random_seed=42), 'cat']}.items():
    
    
    if model[1] == 'need_scaler':
        scaled_features = StandardScaler().fit_transform(features)
        scores = cross_val_score(model[0], scaled_features, target, cv=9, scoring='accuracy')
        scaler = StandardScaler()
        scaled_train = scaler.fit_transform(X_train)
        scaled_test = scaler.transform(X_test)
        model[0].fit(scaled_train, y_train)
        dct_with_models[f'{label_model}_overall_accuracy_for_model_for_dataset'] = np.mean(scores)
        dct_with_models[f'{label_model}_accuracy_for_x_test_with_default_params'] = accuracy_score(y_test, model[0].predict(scaled_test))    
        dct_with_models[f'{label_model}_roc_auc_for_x_test_with_default_params'] = roc_auc_score(y_test, model[0].predict_proba(scaled_test)[:,1])
        
    elif model[1] == 'no_scaler':
        scores = cross_val_score(model[0], features, target, cv=9, scoring='accuracy')
        model[0].fit(X_train, y_train)
        dct_with_models[f'{label_model}_overall_accuracy_for_model_for_dataset'] = np.mean(scores)
        dct_with_models[f'{label_model}_accuracy_for_x_test_with_default_params'] = accuracy_score(y_test, model[0].predict(X_test))    
        dct_with_models[f'{label_model}_roc_auc_for_x_test_with_default_params'] = roc_auc_score(y_test, model[0].predict_proba(X_test)[:,1])
    
    elif model[1] == 'cat':
        scores = cv(Pool(train_features_for_cat, train_target_for_cat, cat_features=cat_features_index),
                    {"loss_function": "Logloss",
                     "eval_metric": "Accuracy",
                     "verbose": False,
                     "random_seed": 42},
                    fold_count=5)       
        model[0].fit(x_train_for_cat, y_train_for_cat,
                     cat_features=cat_features_index,
                     eval_set=(x_test_for_cat, y_test_for_cat),
                     verbose=True,
                     plot=False)
        dct_with_models[f'{label_model}_overall_accuracy_for_model_for_dataset'] = scores['test-Accuracy-mean'].mean()
        dct_with_models[f'{label_model}_accuracy_for_x_test_with_default_params'] = accuracy_score(y_test_for_cat, model[0].predict(x_test_for_cat))    
        dct_with_models[f'{label_model}_roc_auc_for_x_test_with_default_params'] = roc_auc_score(y_test_for_cat, model[0].predict_proba(x_test_for_cat)[:,1])  

Training on fold [0/5]

bestTest = 0.8100558659
bestIteration = 13

Training on fold [1/5]

bestTest = 0.8882681564
bestIteration = 60

Training on fold [2/5]

bestTest = 0.8595505618
bestIteration = 1

Training on fold [3/5]

bestTest = 0.7865168539
bestIteration = 130

Training on fold [4/5]

bestTest = 0.8192090395
bestIteration = 188

Learning rate set to 0.030798
0:	learn: 0.8226712	test: 0.7775120	best: 0.7775120 (0)	total: 33.8ms	remaining: 33.7s
1:	learn: 0.8193042	test: 0.7751196	best: 0.7775120 (0)	total: 82.2ms	remaining: 41s
2:	learn: 0.8204265	test: 0.7727273	best: 0.7775120 (0)	total: 115ms	remaining: 38.3s
3:	learn: 0.8193042	test: 0.7751196	best: 0.7775120 (0)	total: 157ms	remaining: 39.2s
4:	learn: 0.8170595	test: 0.7751196	best: 0.7775120 (0)	total: 187ms	remaining: 37.3s
5:	learn: 0.8260382	test: 0.7751196	best: 0.7775120 (0)	total: 229ms	remaining: 37.9s
6:	learn: 0.8226712	test: 0.7727273	best: 0.7775120 (0)	total: 292ms	remaining: 41.4s
7:	learn: 0.8249158	test: 0

In [30]:
model = []
overall_accuracy_for_dataset = []
accuracy_for_x_test_with_default_params = []
roc_auc_for_x_test_with_default_params = []

for name_model in ['DT', 'RF', 'LR', 'KNB', 'SVC', 'CAT']:
    model.append(name_model)
    overall_accuracy_for_dataset.append(dct_with_models[f'{name_model}_overall_accuracy_for_model_for_dataset'])
    accuracy_for_x_test_with_default_params.append(dct_with_models[f'{name_model}_accuracy_for_x_test_with_default_params'])
    roc_auc_for_x_test_with_default_params.append(dct_with_models[f'{name_model}_roc_auc_for_x_test_with_default_params'])

In [31]:
results = {
    'Model' : model,
    'OVERALL ACCURACY FOR TRAIN DATASET' : pd.Series(overall_accuracy_for_dataset),
    'ROC_AUC FOR X_TEST WITH DEFAULT PARAMS' : pd.Series(roc_auc_for_x_test_with_default_params),
    'ACCURACY FOR X_TEST WITH DEFAULT PARAMS' : pd.Series(accuracy_for_x_test_with_default_params)}

display(pd.DataFrame(results).style.highlight_max(color='green'))

Unnamed: 0,Model,OVERALL ACCURACY FOR TRAIN DATASET,ROC_AUC FOR X_TEST WITH DEFAULT PARAMS,ACCURACY FOR X_TEST WITH DEFAULT PARAMS
0,DT,0.790123,0.710723,0.712919
1,RF,0.808081,0.799769,0.758373
2,LR,0.791246,0.811916,0.76555
3,KNB,0.800224,0.796775,0.717703
4,SVC,0.823793,0.810236,0.779904
5,CAT,0.814097,0.828201,0.799043
