# DATA

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

# Read the training data from train.csv and preview its first rows.
titanic = pd.read_csv('train.csv')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Summary statistics for the numeric columns.
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [3]:
# Column dtypes and non-null counts; reveals which columns contain gaps.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Number of missing values per column.
titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# CLEAN DATA

## Continuous variables

In [5]:
# Impute missing ages with the mean age of the passengers whose age is known.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: under pandas Copy-on-Write that chained in-place call only
# modifies a temporary object and leaves the NaNs in the frame.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())

In [6]:
# Remove PassengerId so it does not end up among the model features.
titanic = titanic.drop(columns=['PassengerId'])

## Categorical data

In [7]:
# Survival rate split by whether the Cabin value is missing (True) or present (False).
cabin_missing = titanic['Cabin'].isnull()
titanic.groupby(cabin_missing)['Survived'].mean()

Cabin
False    0.666667
True     0.299854
Name: Survived, dtype: float64

In [8]:
# Binary indicator: 1 when a Cabin value is recorded, 0 when it is missing.
titanic['Cabin_ind'] = titanic['Cabin'].notnull().astype(int)

In [9]:
# Numeric encoding for Sex: male -> 0, female -> 1.
# Values absent from the mapping become NaN, so this cell must run only once
# on a freshly loaded frame.
gender_num = dict(male=0, female=1)

titanic['Sex'] = titanic['Sex'].map(gender_num)

In [10]:
# Drop the remaining text columns; Cabin is already summarised by Cabin_ind.
unused_columns = ['Cabin', 'Embarked', 'Name', 'Ticket']
titanic = titanic.drop(columns=unused_columns)

In [11]:
# Preview the cleaned, all-numeric training frame.
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_ind
0,0,3,0,22.0,1,0,7.25,0
1,1,1,1,38.0,1,0,71.2833,1
2,1,3,1,26.0,0,0,7.925,0
3,1,1,1,35.0,1,0,53.1,1
4,0,3,0,35.0,0,0,8.05,0


# TRAINING AND VALIDATION

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including any convergence or deprecation warnings raised by the cells below.
warnings.filterwarnings('ignore')
import joblib

## Split Data

In [13]:
# Separate the target from the predictors.
labels = titanic['Survived']
features = titanic.drop(columns='Survived')

# Hold out 25% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.25, random_state=42
)

# Sanity check: fraction of all rows that landed in each split.
for split_labels in (y_train, y_val):
    print(round(len(split_labels) / len(labels), 2))

0.75
0.25


In [14]:
def print_results(results):
    """Print the best parameters and the per-candidate CV scores of a search.

    Args:
        results: Object exposing ``best_params_`` and a ``cv_results_`` mapping
            with the keys ``'mean_test_score'``, ``'std_test_score'`` and
            ``'params'`` (for example a fitted ``GridSearchCV``).

    Prints one header line, then one line per candidate showing the mean score
    and twice its standard deviation, both rounded to three decimals.
    """
    print(f'BEST PARAMS: {results.best_params_}\n')

    scores = results.cv_results_
    rows = zip(scores['mean_test_score'], scores['std_test_score'], scores['params'])
    for avg, spread, combo in rows:
        print(f'{round(avg, 3)} (+/-{round(spread * 2, 3)}) for {combo}')

## Algorithms

### Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

#?LogisticRegression

#dir(LogisticRegression)

In [16]:
# Tune logistic regression's C parameter with 5-fold cross-validation.
lr = LogisticRegression()
parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}

cv = GridSearchCV(estimator=lr, param_grid=parameters, cv=5)
cv.fit(X_train, y_train.to_numpy().ravel())

print_results(cv)

# Last expression: display the estimator selected by the search.
cv.best_estimator_

BEST PARAMS: {'C': 1000}

0.666 (+/-0.045) for {'C': 0.001}
0.717 (+/-0.084) for {'C': 0.01}
0.79 (+/-0.085) for {'C': 0.1}
0.792 (+/-0.072) for {'C': 1}
0.79 (+/-0.07) for {'C': 10}
0.792 (+/-0.064) for {'C': 100}
0.793 (+/-0.067) for {'C': 1000}
0.792 (+/-0.064) for {'C': 10000}


LogisticRegression(C=1000)

In [17]:
# Save the best logistic-regression estimator so the validation section can reload it.
joblib.dump(cv.best_estimator_, 'LR_model.pkl')

['LR_model.pkl']

### Support Vector Machines

In [18]:
from sklearn.svm import SVC

#?SVC

#dir(SVC)

In [19]:
# Compare the linear and RBF kernels over several C values with 5-fold CV.
svc = SVC()
parameters = dict(
    kernel=['linear', 'rbf'],
    C=[0.01, 0.1, 1, 10],
)

cv = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
cv.fit(X_train, y_train.to_numpy().ravel())

print_results(cv)

# Last expression: display the estimator selected by the search.
cv.best_estimator_

BEST PARAMS: {'C': 0.1, 'kernel': 'linear'}

0.729 (+/-0.071) for {'C': 0.01, 'kernel': 'linear'}
0.621 (+/-0.005) for {'C': 0.01, 'kernel': 'rbf'}
0.787 (+/-0.083) for {'C': 0.1, 'kernel': 'linear'}
0.663 (+/-0.062) for {'C': 0.1, 'kernel': 'rbf'}
0.787 (+/-0.083) for {'C': 1, 'kernel': 'linear'}
0.665 (+/-0.061) for {'C': 1, 'kernel': 'rbf'}
0.786 (+/-0.078) for {'C': 10, 'kernel': 'linear'}
0.693 (+/-0.064) for {'C': 10, 'kernel': 'rbf'}


SVC(C=0.1, kernel='linear')

In [20]:
# Save the best SVM estimator so the validation section can reload it.
joblib.dump(cv.best_estimator_, 'SVM_model.pkl')

['SVM_model.pkl']

### Multilayer Perceptron Algorithm

In [21]:
from sklearn.neural_network import MLPClassifier

In [22]:
# Grid-search the hidden-layer width, activation function and learning-rate
# schedule of a single-hidden-layer perceptron with 5-fold cross-validation.
#
# random_state is fixed because the network's weight initialisation is random:
# without a seed the CV scores and the selected parameters differ between runs,
# so the notebook could not be reproduced with Restart & Run All.
mlp = MLPClassifier(random_state=42)
parameters = {
    'hidden_layer_sizes': [(10,), (50,), (100,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}

cv = GridSearchCV(mlp, parameters, cv=5)
cv.fit(X_train, y_train.values.ravel())

print_results(cv)

# Last expression: display the estimator selected by the search.
cv.best_estimator_

BEST PARAMS: {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'invscaling'}

0.736 (+/-0.068) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'constant'}
0.72 (+/-0.108) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'invscaling'}
0.723 (+/-0.102) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'adaptive'}
0.784 (+/-0.074) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'constant'}
0.796 (+/-0.089) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'invscaling'}
0.781 (+/-0.099) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive'}
0.793 (+/-0.085) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
0.79 (+/-0.065) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'invscaling'}
0.792 (+/-0.062) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learni

MLPClassifier(hidden_layer_sizes=(50,), learning_rate='invscaling')

In [23]:
# Save the best MLP estimator so the validation section can reload it.
joblib.dump(cv.best_estimator_, 'MLP_model.pkl')

['MLP_model.pkl']

### Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Grid-search the number of trees and the maximum tree depth of a random
# forest with 5-fold cross-validation.
#
# random_state is fixed because the forest's bootstrap sampling and feature
# selection are random: without a seed the scores and the chosen parameters
# change from run to run.
rf = RandomForestClassifier(random_state=42)
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(X_train, y_train.values.ravel())

print_results(cv)

BEST PARAMS: {'max_depth': 8, 'n_estimators': 50}

0.781 (+/-0.063) for {'max_depth': 2, 'n_estimators': 5}
0.789 (+/-0.055) for {'max_depth': 2, 'n_estimators': 50}
0.786 (+/-0.075) for {'max_depth': 2, 'n_estimators': 250}
0.793 (+/-0.051) for {'max_depth': 4, 'n_estimators': 5}
0.811 (+/-0.062) for {'max_depth': 4, 'n_estimators': 50}
0.802 (+/-0.073) for {'max_depth': 4, 'n_estimators': 250}
0.82 (+/-0.063) for {'max_depth': 8, 'n_estimators': 5}
0.822 (+/-0.077) for {'max_depth': 8, 'n_estimators': 50}
0.817 (+/-0.075) for {'max_depth': 8, 'n_estimators': 250}
0.807 (+/-0.052) for {'max_depth': 16, 'n_estimators': 5}
0.817 (+/-0.044) for {'max_depth': 16, 'n_estimators': 50}
0.808 (+/-0.072) for {'max_depth': 16, 'n_estimators': 250}
0.792 (+/-0.071) for {'max_depth': 32, 'n_estimators': 5}
0.805 (+/-0.048) for {'max_depth': 32, 'n_estimators': 50}
0.799 (+/-0.071) for {'max_depth': 32, 'n_estimators': 250}
0.792 (+/-0.076) for {'max_depth': None, 'n_estimators': 5}
0.805 (+/-0.04

In [26]:
# Save the best random-forest estimator so the validation section can reload it.
joblib.dump(cv.best_estimator_, 'RF_model.pkl')

['RF_model.pkl']

### Boosted Trees

In [27]:
from sklearn.ensemble import GradientBoostingClassifier

In [28]:
# Grid-search the number of boosting stages, tree depth and learning rate of a
# gradient-boosting classifier with 5-fold cross-validation (100 candidates).
#
# random_state is fixed so repeated runs select the same trees and therefore
# report the same scores.
# NOTE(review): learning rates of 10 and 100 are far outside the usual range
# and probably only add search time -- confirm before trimming the grid.
gb = GradientBoostingClassifier(random_state=42)
parameters = {
    'n_estimators': [5, 50, 250, 500],
    'max_depth': [1, 3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 1, 10, 100]
}

cv = GridSearchCV(gb, parameters, cv=5)
cv.fit(X_train, y_train.values.ravel())

print_results(cv)

BEST PARAMS: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 250}

0.621 (+/-0.005) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 5}
0.787 (+/-0.083) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50}
0.787 (+/-0.083) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 250}
0.795 (+/-0.085) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 500}
0.621 (+/-0.005) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5}
0.799 (+/-0.044) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
0.817 (+/-0.07) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 250}
0.814 (+/-0.057) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}
0.621 (+/-0.005) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 5}
0.787 (+/-0.069) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}
0.804 (+/-0.069) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 250}
0.808 (+/-0.064) for {'learning_r

In [29]:
# Save the best gradient-boosting estimator so the validation section can reload it.
joblib.dump(cv.best_estimator_, 'GB_model.pkl')

['GB_model.pkl']

## Validation

In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from time import time

In [31]:
# Reload each saved estimator, keyed by its short name.
models = {
    mdl: joblib.load('{}_model.pkl'.format(mdl))
    for mdl in ['LR', 'SVM', 'MLP', 'RF', 'GB']
}

In [32]:
def evaluate_model(name, model, features, labels):
    """Print accuracy, precision, recall and prediction latency for one model.

    Args:
        name: Label printed at the start of the report line.
        model: Fitted estimator exposing ``predict(features)``.
        features: Feature matrix passed to ``model.predict``.
        labels: True labels the predictions are scored against.

    Returns:
        None. A single report line is written to stdout; the three scores are
        rounded to three decimals and the latency to 0.1 ms.
    """
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; time() is wall-clock time, which can be too coarse (or
    # be adjusted mid-measurement) for a call that takes about a millisecond.
    from time import perf_counter

    start = perf_counter()
    pred = model.predict(features)
    latency_ms = (perf_counter() - start) * 1000

    accuracy = round(accuracy_score(labels, pred), 3)
    precision = round(precision_score(labels, pred), 3)
    recall = round(recall_score(labels, pred), 3)
    print('{} -- Accuracy: {} / Precision: {} / Recall: {} / Latency: {}ms'.format(
        name, accuracy, precision, recall, round(latency_ms, 1)))

In [33]:
# Score every reloaded model on the held-out validation split.
for model_name, fitted_model in models.items():
    evaluate_model(model_name, fitted_model, X_val, y_val)

LR -- Accuracy: 0.821 / Precision: 0.795 / Recall: 0.742 / Latency: 1.0ms
SVM -- Accuracy: 0.785 / Precision: 0.741 / Recall: 0.708 / Latency: 2.0ms
MLP -- Accuracy: 0.807 / Precision: 0.774 / Recall: 0.73 / Latency: 0.9ms
RF -- Accuracy: 0.812 / Precision: 0.822 / Recall: 0.674 / Latency: 3.9ms
GB -- Accuracy: 0.812 / Precision: 0.805 / Recall: 0.697 / Latency: 1.3ms


# TEST

In [34]:
# Read the test data from test.csv (it has no Survived column) and preview it.
titanic_test = pd.read_csv('test.csv')
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Cleaning data in the same way as before

In [35]:
# Number of missing values per column in the raw test frame.
titanic_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [36]:
# Apply to the test frame the same preprocessing that the training frame received.
#
# Missing values are filled with statistics taken from the TRAINING data, so the
# test features are prepared exactly like the features the models were fitted on.
# (Training 'Age' was mean-imputed, which leaves its mean unchanged, so
# titanic['Age'].mean() is still the original training mean.)
#
# The results are assigned back instead of using fillna(inplace=True) on a column
# selection: under pandas Copy-on-Write that chained in-place call modifies a
# temporary object and would leave NaNs in the frame passed to predict().
titanic_test['Age'] = titanic_test['Age'].fillna(titanic['Age'].mean())
titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic['Fare'].median())
titanic_test['Cabin_ind'] = np.where(titanic_test['Cabin'].isnull(), 0, 1)
titanic_test['Sex'] = titanic_test['Sex'].map(gender_num)
# Column order after the drop matches the training features:
# Pclass, Sex, Age, SibSp, Parch, Fare, Cabin_ind.
titanic_test = titanic_test.drop(
    columns=['PassengerId', 'Cabin', 'Embarked', 'Name', 'Ticket']
)
titanic_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_ind
0,3,0,34.5,0,0,7.8292,0
1,3,1,47.0,1,0,7.0,0
2,2,0,62.0,0,0,9.6875,0
3,3,0,27.0,0,0,8.6625,0
4,3,1,22.0,1,1,12.2875,0


In [37]:
# Confirm that no missing values remain after cleaning.
titanic_test.isna().sum()

Pclass       0
Sex          0
Age          0
SibSp        0
Parch        0
Fare         0
Cabin_ind    0
dtype: int64

## Generating prediction

In [38]:
# Logistic regression had the highest validation accuracy above, so it is the
# model used to generate the submission predictions.
best_model = joblib.load('LR_model.pkl')

y_pred = best_model.predict(titanic_test)


In [39]:
# Reload the raw test file: PassengerId was dropped from titanic_test during
# cleaning and is needed again to build the submission. This overwrites the
# cleaned frame, so the prediction cell above must have run first.
titanic_test = pd.read_csv('test.csv')

In [40]:
# Pair each test PassengerId with its predicted Survived label.
prediction = titanic_test[["PassengerId"]].assign(Survived=y_pred)


In [41]:
# Preview the submission frame.
prediction.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [42]:
# Write the submission without the index, leaving only PassengerId and Survived.
prediction.to_csv("submit_prediction.csv", index=False)

## Check correct format

In [43]:
# Load the reference file gender_submission.csv to see the expected column layout.
check = pd.read_csv('gender_submission.csv')
check.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [44]:
# Read back the file written above to compare its layout with the reference.
check = pd.read_csv('submit_prediction.csv')
check.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
