Import libraries

In [17]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

from numpy.random import RandomState
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, roc_curve, precision_recall_curve, confusion_matrix, average_precision_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from catboost import CatBoostClassifier , cv, Pool
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Exploratory Analysis

ETL

In [18]:
def preprocess_data(data):

    # Extract titles from the 'Name' column
    data['Title'] = data['Name'].apply(lambda x: x.split(",")[1].split(".")[0].strip())

    # Replace titles with general categories
    data['Title'] = data['Title'].replace({
        "Capt": "man", "Don": "man", "Major": "man", "Col": "man", 
        "Rev": "man", "Dr": "man", "Sir": "man", "Mr": "man", "Jonkheer": "man",
        "Dona": "woman", "the Countess": "woman", "Mme": "woman", 
        "Mlle": "woman", "Ms": "woman", "Miss": "woman", "Lady": "woman", "Mrs": "woman",
        "Master": "boy"
    })

    # Extract surnames from the 'Name' column
    data['Surname'] = data['Name'].apply(lambda x: x.split(",")[0])

    # Group 'man' titles under 'noGroup'
    data.loc[data['Title'] == 'man', 'Surname'] = 'noGroup'

    # Calculate the frequency of surnames
    data['SurnameFreq'] = data.groupby('Surname')['Surname'].transform('count')

    # Group surnames that appear only once under 'noGroup'
    data.loc[data['SurnameFreq'] <= 1, 'Surname'] = 'noGroup'

    # Calculate the survival rates for 'woman-child-groups'
    data['SurnameSurvival'] = data.groupby('Surname')['Survived'].transform('mean')

    # Adjust survival rates for use on the training set
    data['AdjustedSurvival'] = (data['SurnameSurvival'] * data['SurnameFreq'] - data['Survived']) / (data['SurnameFreq'] - 1)

    # if the adjust survival rate is -inf or inf
    data['AdjustedSurvival'] = data['AdjustedSurvival'].replace(np.inf, 1) # this means that all the people with the same surname survived
    data['AdjustedSurvival'] = data['AdjustedSurvival'].replace(-np.inf, 0) # this means that all the people with the same surname died

    data.drop(['Name', 'Ticket', 'Cabin', 'Surname', 'SurnameFreq', 'Title'], axis=1, inplace=True)
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
    data['Fare'].fillna(data['Fare'].median(), inplace=True)
    data['Sex'] = data['Sex'].map({'female': 1, 'male': 0})
    data = pd.get_dummies(data, columns=['Embarked'])
    return data

In [19]:
train = pd.read_csv("train.csv", index_col=0)
train = preprocess_data(train)

y_train = train['Survived']
X_train = train.drop('Survived', axis=1)

len(X_train), len(y_train)

(891, 891)

In [20]:
test = pd.read_csv('test_with_survived.csv', index_col=0)
test = preprocess_data(test)

y_test = test['Survived']
X_test = test.drop(["Survived"], axis=1)

len(X_test), len(y_test)

(418, 418)

Create a Model

Due the nature of the problem (lots of categorical variables) we will include CatBoost into the comparison of models.

In [21]:
# rename all the columns to lowercase
train.columns = train.columns.str.lower()
test.columns = test.columns.str.lower()

In [22]:
features = X_train
target = y_train

In [23]:
train_for_cat = train.copy()
test_for_cat = test.copy()
train_features_for_cat = train_for_cat.drop('survived', axis=1)
train_target_for_cat = train_for_cat['survived']
test_features_for_cat = test_for_cat.drop('survived', axis=1)
test_target_for_cat = test_for_cat['survived']

In [24]:
# get the index of the features that are not float or int
features_index = np.where(train_features_for_cat.dtypes != float)[0]

In [25]:
cat = CatBoostClassifier(loss_function='Logloss',
                         eval_metric='Accuracy',
                         random_seed=42,
                         verbose=False)

cat.fit(train_features_for_cat, train_target_for_cat, cat_features=features_index)

<catboost.core.CatBoostClassifier at 0x19e362a59c0>

In [26]:
x_train_for_cat = train_features_for_cat
x_test_for_cat = test_features_for_cat
y_train_for_cat = train_target_for_cat
y_test_for_cat = test_target_for_cat

#x_train_for_cat, x_test_for_cat, y_train_for_cat, y_test_for_cat = train_test_split(features_for_cat, target_for_cat, test_size=0.15, random_state=42)

In [27]:
cat_features_index = np.where(train_features_for_cat.dtypes != float)[0]

In [28]:
train_features_for_cat.dtypes != float

pclass               True
sex                  True
age                 False
sibsp                True
parch                True
fare                False
surnamesurvival     False
adjustedsurvival    False
embarked_c           True
embarked_q           True
embarked_s           True
dtype: bool

In [29]:
cat_features_index

array([ 0,  1,  3,  4,  8,  9, 10], dtype=int64)

In [30]:
train_features_for_cat

Unnamed: 0_level_0,pclass,sex,age,sibsp,parch,fare,surnamesurvival,adjustedsurvival,embarked_c,embarked_q,embarked_s
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,0,22.0,1,0,7.2500,0.34713,0.347776,0,0,1
2,1,1,38.0,1,0,71.2833,0.34713,0.000000,1,0,0
3,3,1,26.0,0,0,7.9250,0.34713,0.000000,0,0,1
4,1,1,35.0,1,0,53.1000,0.34713,0.000000,0,0,1
5,3,0,35.0,0,0,8.0500,0.34713,0.347776,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
887,2,0,27.0,0,0,13.0000,0.34713,0.347776,0,0,1
888,1,1,19.0,0,0,30.0000,1.00000,1.000000,0,0,1
889,3,1,28.0,1,2,23.4500,0.34713,1.000000,0,0,1
890,1,0,26.0,0,0,30.0000,0.34713,0.345914,1,0,0


Comparison of models with default parameters

In [31]:
dct_with_models = {}

In [32]:
for label_model, model in {'RF': [RandomForestClassifier(random_state=42), 'no_scaler'],
                           'DT': [DecisionTreeClassifier(random_state=42), 'no_scaler'],
                           'LR': [LogisticRegression(random_state=42), 'need_scaler'],
                           'KNB': [KNeighborsClassifier(), 'need_scaler'],
                           'SVC': [SVC(random_state=42, probability=True), 'need_scaler'],
                           'CAT': [CatBoostClassifier(loss_function='Logloss', eval_metric='Accuracy', verbose=False, use_best_model=True, random_seed=42), 'cat']}.items():
    
    
    if model[1] == 'need_scaler':
        scaled_features = StandardScaler().fit_transform(features)
        scores = cross_val_score(model[0], scaled_features, target, cv=9, scoring='accuracy')
        scaler = StandardScaler()
        scaled_train = scaler.fit_transform(X_train)
        scaled_test = scaler.transform(X_test)
        model[0].fit(scaled_train, y_train)
        dct_with_models[f'{label_model}_overall_accuracy_for_model_for_dataset'] = np.mean(scores)
        dct_with_models[f'{label_model}_accuracy_for_x_test_with_default_params'] = accuracy_score(y_test, model[0].predict(scaled_test))    
        dct_with_models[f'{label_model}_roc_auc_for_x_test_with_default_params'] = roc_auc_score(y_test, model[0].predict_proba(scaled_test)[:,1])
        
    elif model[1] == 'no_scaler':
        scores = cross_val_score(model[0], features, target, cv=9, scoring='accuracy')
        model[0].fit(X_train, y_train)
        dct_with_models[f'{label_model}_overall_accuracy_for_model_for_dataset'] = np.mean(scores)
        dct_with_models[f'{label_model}_accuracy_for_x_test_with_default_params'] = accuracy_score(y_test, model[0].predict(X_test))    
        dct_with_models[f'{label_model}_roc_auc_for_x_test_with_default_params'] = roc_auc_score(y_test, model[0].predict_proba(X_test)[:,1])
    
    elif model[1] == 'cat':
        scores = cv(Pool(train_features_for_cat, train_target_for_cat, cat_features=cat_features_index),
                    {"loss_function": "Logloss",
                     "eval_metric": "Accuracy",
                     "verbose": False,
                     "random_seed": 42},
                    fold_count=5)       
        model[0].fit(x_train_for_cat, y_train_for_cat,
                     cat_features=cat_features_index,
                     eval_set=(x_test_for_cat, y_test_for_cat),
                     verbose=True,
                     plot=False)
        dct_with_models[f'{label_model}_overall_accuracy_for_model_for_dataset'] = scores['test-Accuracy-mean'].mean()
        dct_with_models[f'{label_model}_accuracy_for_x_test_with_default_params'] = accuracy_score(y_test_for_cat, model[0].predict(x_test_for_cat))    
        dct_with_models[f'{label_model}_roc_auc_for_x_test_with_default_params'] = roc_auc_score(y_test_for_cat, model[0].predict_proba(x_test_for_cat)[:,1])  

Training on fold [0/5]

bestTest = 1
bestIteration = 0

Training on fold [1/5]

bestTest = 0.9944134078
bestIteration = 0

Training on fold [2/5]

bestTest = 1
bestIteration = 2

Training on fold [3/5]

bestTest = 1
bestIteration = 19

Training on fold [4/5]

bestTest = 1
bestIteration = 14

Learning rate set to 0.030798
0:	learn: 0.9966330	test: 0.8947368	best: 0.8947368 (0)	total: 40.2ms	remaining: 40.2s
1:	learn: 0.9932660	test: 0.8947368	best: 0.8947368 (0)	total: 98.6ms	remaining: 49.2s
2:	learn: 0.9966330	test: 0.8947368	best: 0.8947368 (0)	total: 151ms	remaining: 50s
3:	learn: 0.9966330	test: 0.8947368	best: 0.8947368 (0)	total: 197ms	remaining: 49s
4:	learn: 0.9955107	test: 0.8947368	best: 0.8947368 (0)	total: 231ms	remaining: 46s
5:	learn: 0.9966330	test: 0.8947368	best: 0.8947368 (0)	total: 257ms	remaining: 42.6s
6:	learn: 0.9966330	test: 0.8947368	best: 0.8947368 (0)	total: 292ms	remaining: 41.4s
7:	learn: 0.9966330	test: 0.8947368	best: 0.8947368 (0)	total: 331ms	remaining:

In [33]:
model = []
overall_accuracy_for_dataset = []
accuracy_for_x_test_with_default_params = []
roc_auc_for_x_test_with_default_params = []

for name_model in ['DT', 'RF', 'LR', 'KNB', 'SVC', 'CAT']:
    model.append(name_model)
    overall_accuracy_for_dataset.append(dct_with_models[f'{name_model}_overall_accuracy_for_model_for_dataset'])
    accuracy_for_x_test_with_default_params.append(dct_with_models[f'{name_model}_accuracy_for_x_test_with_default_params'])
    roc_auc_for_x_test_with_default_params.append(dct_with_models[f'{name_model}_roc_auc_for_x_test_with_default_params'])

In [34]:
results = {
    'Model' : model,
    'OVERALL ACCURACY FOR TRAIN DATASET' : pd.Series(overall_accuracy_for_dataset),
    'ROC_AUC FOR X_TEST WITH DEFAULT PARAMS' : pd.Series(roc_auc_for_x_test_with_default_params),
    'ACCURACY FOR X_TEST WITH DEFAULT PARAMS' : pd.Series(accuracy_for_x_test_with_default_params)}

display(pd.DataFrame(results).style.highlight_max(color='green'))

Unnamed: 0,Model,OVERALL ACCURACY FOR TRAIN DATASET,ROC_AUC FOR X_TEST WITH DEFAULT PARAMS,ACCURACY FOR X_TEST WITH DEFAULT PARAMS
0,DT,0.995511,0.860759,0.894737
1,RF,0.996633,0.84473,0.892344
2,LR,0.896745,0.908179,0.889952
3,KNB,0.876543,0.87797,0.830144
4,SVC,0.896745,0.902313,0.892344
5,CAT,0.996564,0.832376,0.894737
