# The Titanic Project

The Goal of this project is to apply the tools of machine learning to predict which passengers survived the Titanic tragedy.

Let's import the needed libraries and import the dataset

In [1]:
import numpy
from pandas import read_csv,get_dummies,to_numeric
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

"""Importing the Classification Algorithms"""

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

"""Importing Ensemble Algorithms"""

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

filename_train = 'train.csv'
Dataset = read_csv(filename_train)

#### Let's Understand the Data first

The goal is to predict if a person survives based on some features
The features are:
    
- Survived: Survived (1) or died (0)
- Pclass: Passenger’s class
- Name: Passenger’s name
- Sex: Passenger’s sex
- Age: Passenger’s age
- SibSp: Number of siblings/spouses aboard
- Parch: Number of parents/children aboard
- Ticket: Ticket number
- Fare: Fare
- Cabin: Cabin
- Embarked: Port of embarkation

We can start with some expectations
- I expect Females have a more chance to survive than men, so sex has an impact
- I expect Children have more chance to survive than adults, so Age matters
- I expect wealthy people to have more chance to survive, Pclass matters

#### Let's Explore the Data

Let's look at the missing data

In [2]:
Missing_Values_Table = Dataset.isnull().sum()

Lets first focus on the Age, based on my expectation I do not really care about the exact age, all I need to know is if the individual is a kid or an adult

We can create a new column based on age and splitit in 2 categories child, adult
- child 0-18
- adult 19-

In [3]:
Dataset['Age'] = to_numeric(Dataset['Age'])

In [4]:
for item in Dataset['Age']:
    if item<19:
        Dataset['Age'] = Dataset['Age'].replace(item,'Child')
    elif item>=19:
        Dataset['Age'] = Dataset['Age'].replace(item,'Adult')

In [5]:
print(Dataset['Age'].value_counts(ascending=True))

Child    139
Adult    575
Name: Age, dtype: int64


Let's look at the missing values

In [6]:
Missing_Values_Table = Dataset.isnull().sum()
print(Missing_Values_Table)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


And then to fill the NaN values of Age, if there is Mr or Mrs in the name it's an adult othewise it's a child

In [7]:
dataMr = Dataset[Dataset['Name'].str.contains("Mr.")]
Dataset['Age'] = dataMr['Age'].fillna('Adult') 

dataMrs = Dataset[Dataset['Name'].str.contains("Mrs.")] 
Dataset['Age'] = dataMrs['Age'].fillna('Adult') 

Dataset['Age'] = Dataset['Age'].fillna('Child')        

I would Argue that there are some columns that can be dropped because they have no impact whatso ever on the outcome

In [8]:
Dataset = Dataset.drop(['PassengerId'], axis=1)

Dataset = Dataset.drop(['Name'], axis=1)

Dataset = Dataset.drop(['Ticket'], axis=1)

Dataset = Dataset.drop(['Cabin'], axis=1)

## The Encoding Part

In [9]:
encoder = LabelEncoder()

Dataset['Sex']=encoder.fit_transform(Dataset['Sex'])

Dataset['Age']=encoder.fit_transform(Dataset['Age'])

Dataset = get_dummies(Dataset, columns=['Embarked'])

## The Machine Learning Part

In [11]:
array = Dataset.values
X = array[:,1:]
Y = array[:,0]

validation_size = 0.2
seed = 7

X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y,
        test_size=validation_size, random_state=seed)

Let's try a bunch of algorithms and measure the performances

In [12]:
#Standardize the dataset
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR',
LogisticRegression(solver = 'liblinear'))])))
pipelines.append(('ScaledNB', Pipeline([('Scaler', StandardScaler()),('NB',
GaussianNB())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN',
KNeighborsClassifier())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART',
DecisionTreeClassifier())])))
pipelines.append(('ScaledSVC', Pipeline([('Scaler', StandardScaler()),('SVC', SVC())])))
results = []
names = []
for name, model in pipelines:
    kfold = KFold(n_splits=10, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print(f'{name}: {cv_results.mean()} {cv_results.std()}')

ScaledLR: 0.8062793427230048 0.041563591676032616
ScaledNB: 0.7879499217527387 0.055264119093866436
ScaledKNN: 0.8091353677621284 0.056965809921960216
ScaledCART: 0.7781690140845071 0.049851857703379195
ScaledSVC: 0.823180751173709 0.054511198348989924


Looks like the SVM algorithm has the best performance, Let's apply some tuning.

In [13]:
# svm Algorithm tuning
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
C_values = numpy.array([0.1,0.3,0.5,0.7,0.9,1.0,1.3,1.5,1.7,2.0])
kernel_values = ['linear','poly','rbf','sigmoid']
param_grid = dict(C=C_values, kernel=kernel_values)
model = SVC(gamma='auto')
kfold = KFold(n_splits=10, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=kfold)
grid_result = grid.fit(rescaledX, Y_train)
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean} ({stdev}) with: {param}")

Best: 0.8258426966292135 using {'C': 0.5, 'kernel': 'rbf'}
0.8019662921348315 (0.043292918282728514) with: {'C': 0.1, 'kernel': 'linear'}
0.7752808988764045 (0.05567158960480092) with: {'C': 0.1, 'kernel': 'poly'}
0.8132022471910112 (0.04493118019579273) with: {'C': 0.1, 'kernel': 'rbf'}
0.797752808988764 (0.04401759072585399) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.8019662921348315 (0.043292918282728514) with: {'C': 0.3, 'kernel': 'linear'}
0.7837078651685393 (0.04633845605512466) with: {'C': 0.3, 'kernel': 'poly'}
0.8230337078651685 (0.05035188452116672) with: {'C': 0.3, 'kernel': 'rbf'}
0.7401685393258427 (0.05053517104299552) with: {'C': 0.3, 'kernel': 'sigmoid'}
0.8019662921348315 (0.043292918282728514) with: {'C': 0.5, 'kernel': 'linear'}
0.7921348314606742 (0.04117316762672349) with: {'C': 0.5, 'kernel': 'poly'}
0.8258426966292135 (0.0499296578863745) with: {'C': 0.5, 'kernel': 'rbf'}
0.6980337078651685 (0.05800423665434465) with: {'C': 0.5, 'kernel': 'sigmoid'}
0.80196629213483

## The Prediction Part

In [14]:
# prepare the model
model = SVC(gamma='auto',C=0.5, kernel = 'rbf')
model.fit(X_train, Y_train)
# estimate accuracy on validation dataset
predictions = model.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

0.7541899441340782
[[89 21]
 [23 46]]
              precision    recall  f1-score   support

         0.0       0.79      0.81      0.80       110
         1.0       0.69      0.67      0.68        69

   micro avg       0.75      0.75      0.75       179
   macro avg       0.74      0.74      0.74       179
weighted avg       0.75      0.75      0.75       179

