# Predicting Survival on the Titanic
By Luis Alberto Denis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Load data

In [None]:
raw_train = pd.read_csv('../input/train.csv')
raw_test = pd.read_csv('../input/test.csv')

In [None]:
raw_train.head()

In [None]:
raw_train.info()
print('-'*40)
raw_test.info()

In [None]:
raw_train.describe()

In [None]:
# Correlate numeric columns only: the raw frame still contains text columns
# (Name, Sex, Ticket, ...), and recent pandas releases raise on them instead of
# silently skipping them.
correlation = raw_train.select_dtypes(include=[np.number]).corr()['Survived']

In [None]:
correlation.sort_values(ascending=False)

## Survival Analysis

In [None]:
raw_train['Survived'].value_counts()

## Feature Name Analysis

In [None]:
raw_train['Name'].unique().size

In [None]:
raw_train['Name'].head()

## Feature Sex Analysis

In [None]:
raw_train[['Sex', 'PassengerId']].groupby(['Sex'], as_index=True).count()

In [None]:
raw_train[['Sex', 'Survived']].groupby(['Sex'], as_index=True).mean()

In [None]:
data_sex = raw_train[['Sex', 'Pclass', 'Survived']].groupby(['Sex', 'Pclass'], as_index=True).mean()

data_sex.loc['female'].plot(kind='bar', ylim=[0, 1], title='Female Survival Rate', legend=False)
data_sex.loc['male'].plot(kind='bar', ylim=[0, 1], title='Male Survival Rate', legend=False)

## Feature Pclass Analysis

In [None]:
data_pclass = raw_train[['Pclass', 'PassengerId']].groupby(['Pclass'], as_index=True).count()
data_pclass

In [None]:
data_pclass.plot(kind='pie', y='PassengerId', legend=False, title='Pclass percentage of total', figsize=(5, 5))

In [None]:
raw_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()

## Feature Fare Analysis

In [None]:
raw_train['Fare'].hist(bins=100, color='red')

In [None]:
# Normalised histogram so it shares a y-axis scale with the KDE overlay.
# `density=True` replaces the `normed=True` keyword that matplotlib removed.
raw_train['Fare'].hist(bins=100, density=True, alpha=0.5, color='red')
raw_train['Fare'].plot(kind='kde', style='k--')

In [None]:
raw_train['Fare'].skew()

In [None]:
plt.boxplot(raw_train['Fare'], vert=False)

In [None]:
raw_train.loc[raw_train['Fare'] > 100, 'Pclass'].value_counts()

## Feature SibSp Analysis

In [None]:
raw_train['SibSp'].value_counts()

In [None]:
raw_train['SibSp'].hist(bins=20, color='orange')

In [None]:
# Normalised histogram so it shares a y-axis scale with the KDE overlay.
# `density=True` replaces the `normed=True` keyword that matplotlib removed.
raw_train['SibSp'].hist(bins=20, color='orange', density=True, alpha=0.5)
raw_train['SibSp'].plot(kind='kde', color='black')

In [None]:
data_sibsp = raw_train[['SibSp', 'Survived']].groupby('SibSp', as_index=True).mean()
data_sibsp.plot(kind='bar', color='orange', ylim=[0, 1])

In [None]:
data_sibsp = raw_train[['SibSp', 'Sex', 'Survived']].groupby(['Sex', 'SibSp'], as_index=True).mean()
data_sibsp.loc['male'].plot(kind='bar', color='orange', title='Male SibSp', ylim=[0, 1])
data_sibsp.loc['female'].plot(kind='bar', color='orange', title='Female SibSp', ylim=[0, 1])

## Feature Parch Analysis

In [None]:
raw_train['Parch'].value_counts()

In [None]:
raw_train['Parch'].hist(bins=20, color='magenta')

In [None]:
# Normalised histogram so it shares a y-axis scale with the KDE overlay.
# `density=True` replaces the `normed=True` keyword that matplotlib removed.
raw_train['Parch'].hist(bins=20, color='magenta', density=True, alpha=0.5)
raw_train['Parch'].plot(kind='kde', color='black')

In [None]:
data_parch = raw_train[['Parch', 'Survived']].groupby('Parch', as_index=True).mean()
data_parch.plot(kind='bar', color='magenta', ylim=[0, 1])

In [None]:
data_parch = raw_train[['Parch', 'Sex', 'Survived']].groupby(['Sex', 'Parch'], as_index=True).mean()
data_parch.loc['male'].plot(kind='bar', color='magenta', ylim=[0, 1])
data_parch.loc['female'].plot(kind='bar', color='magenta', ylim=[0, 1])

## Feature Age Analysis

In [None]:
raw_train['Age'].isnull().sum()

In [None]:
raw_train['Age'].hist(bins=100, grid=True, alpha=0.8, color='gray')

In [None]:
# Normalised age histogram of all passengers, overlaid with one KDE curve per
# outcome. `density=True` replaces the `normed=True` keyword that matplotlib removed.
raw_train['Age'].hist(bins=100, grid=True, alpha=0.8, density=True, color='gray')
raw_train.loc[raw_train['Survived']==0, 'Age'].plot(kind='kde', color='red', label='Not Survived', legend=True)
raw_train.loc[raw_train['Survived']==1, 'Age'].plot(kind='kde', color='blue', label='Survived', legend=True)

## Feature Embarked Analysis

In [None]:
raw_train['Embarked'].value_counts().plot(kind='bar', title='Amount of passenger per port', color='g')

In [None]:
data_embarked = raw_train[['Embarked', 'Survived']].groupby('Embarked').mean()
data_embarked.plot(kind='bar', ylim=[0, 1], legend=False, title='Survival Rate per Port', color='g')

In [None]:
pd.crosstab(raw_train['Embarked'], raw_train['Pclass']).plot(kind='bar', title='Amount of Passengers per Port')

## Feature Ticket Analysis

In [None]:
raw_train['Ticket'].unique().size

In [None]:
raw_train['Ticket'].head()

## Feature Cabin Analysis

In [None]:
raw_train['Cabin'].unique()

In [None]:
raw_train.loc[raw_train['Cabin'].isnull(), 'Cabin'].size

## Filling of Missing Values in Fare

In [None]:
# `sklearn.preprocessing.Imputer` was removed from scikit-learn; its replacement
# `SimpleImputer` accepts the same `strategy` argument, so it is imported under
# the old name. The legacy import is kept only as a fallback for old installs.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

In [None]:
# Learn the median fare from the training set only.
fare_imputer = Imputer(strategy='median')
fare_imputer.fit(raw_train['Fare'].values.reshape(-1, 1))

# Fill missing test-set fares with the statistic fitted on the training set.
raw_test['Fare'] = fare_imputer.transform(raw_test['Fare'].values.reshape(-1, 1))

## Feature LogFare

In [None]:
def _log_or_zero(fare):
    """Natural log of a strictly positive fare; any other value maps to 0."""
    return np.log(fare) if fare > 0 else 0


# Same transformation for both frames.
for frame in (raw_train, raw_test):
    frame['LogFare'] = frame['Fare'].apply(_log_or_zero)

In [None]:
# Normalised histogram so it shares a y-axis scale with the KDE overlay.
# `density=True` replaces the `normed=True` keyword that matplotlib removed.
raw_train['LogFare'].hist(bins=100, density=True, alpha=0.5, color='yellow')
raw_train['LogFare'].plot(kind='kde', style='k--')

In [None]:
raw_train['LogFare'].skew()

## Filling of Missing Values in Cabin

In [None]:
raw_train['Cabin'] = raw_train['Cabin'].fillna('U') #Unknown

raw_test['Cabin'] = raw_test['Cabin'].fillna('U')

## Feature Deck

In [None]:
raw_train['Deck'] = raw_train['Cabin'].apply(lambda x: x[0])

raw_test['Deck'] = raw_test['Cabin'].apply(lambda x: x[0])

In [None]:
raw_train['Deck'].value_counts().plot(kind='bar', color='brown')

In [None]:
raw_train[['Deck', 'Survived']].groupby('Deck').mean().plot(kind='bar', color='brown')

## Feature Title

In [None]:
def extract_title_from_name(name):
    """Return the honorific embedded in a passenger name, without its dot.

    The name is split on whitespace and the first token that ends with '.'
    and is longer than two characters wins; shorter dotted tokens such as
    a single initial ("L.") are skipped. Returns None when no token qualifies.
    """
    dotted_tokens = (
        token[:-1]
        for token in name.split()
        if len(token) > 2 and token.endswith('.')
    )
    return next(dotted_tokens, None)

# The extractor takes a single name, so it can be handed to `apply` as-is.
raw_train['Title'] = raw_train['Name'].apply(extract_title_from_name)

raw_test['Title'] = raw_test['Name'].apply(extract_title_from_name)

# Distinct titles found in the training names.
raw_train['Title'].unique()

In [None]:
raw_test['Title'].unique()

In [None]:
pd.crosstab(raw_train['Title'], raw_train['Sex'])

In [None]:
raw_train.loc[(raw_train['Title']=='Dr') & (raw_train['Sex']=='female')]

In [None]:
# Collapse every extracted honorific into one of four groups:
# 'Mr', 'Mrs', 'Miss' or 'Master'.
title_mapping = {'Capt':'Mr', 'Col':'Mr','Don':'Mr','Dona':'Mrs',
                 'Dr':'Mr','Jonkheer':'Mr','Lady':'Mrs','Major':'Mr',
                 'Master':'Master','Miss':'Miss','Mlle':'Miss','Mme':'Mrs',
                 'Mr':'Mr','Mrs':'Mrs','Ms':'Miss','Rev':'Mr','Sir':'Mr',
                 'Countess':'Mrs'}

# The mapping sends 'Dr' to 'Mr', so a female 'Dr' is relabelled 'Mrs' before
# the mapping is applied. The correction runs on both frames so that train and
# test are encoded by exactly the same rule.
for frame in (raw_train, raw_test):
    frame.loc[(frame['Title'] == 'Dr') & (frame['Sex'] == 'female'), 'Title'] = 'Mrs'
    frame['Title'] = frame['Title'].map(title_mapping)

## Filling of Missing Values in Age

In [None]:
raw_train[raw_train['Age'].isnull()]['Title'].value_counts()

In [None]:
# Age distribution before the missing values are filled.
# `density=True` replaces the `normed=True` keyword that matplotlib removed.
raw_train['Age'].hist(bins=100, density=True, alpha=0.5, color='gray')
raw_train['Age'].plot(kind='kde', style='b--')

In [None]:
# Median training-set age for every (title, passenger class) pair, stored as
# ages[title][pclass].
ages = {
    title: {
        pclass: raw_train.loc[
            (raw_train['Title'] == title) & (raw_train['Pclass'] == pclass), 'Age'
        ].median()
        for pclass in raw_train['Pclass'].unique()
    }
    for title in raw_train['Title'].unique()
}

ages

In [None]:
def _fill_missing_age(frame, age_lookup):
    """Fill missing ``Age`` values of ``frame`` in place.

    Each missing age is replaced by ``age_lookup[title][pclass]`` for that
    row's Title and Pclass. Rows are selected with a null mask, so no sentinel
    value is written into the column and the frame is not walked row by row.
    """
    missing = frame['Age'].isnull()
    if not missing.any():
        return
    frame.loc[missing, 'Age'] = [
        age_lookup[title][pclass]
        for title, pclass in zip(frame.loc[missing, 'Title'], frame.loc[missing, 'Pclass'])
    ]


# Both frames are filled from the medians computed on the training set.
_fill_missing_age(raw_train, ages)
_fill_missing_age(raw_test, ages)

In [None]:
# Age distribution after the missing values have been filled.
# `density=True` replaces the `normed=True` keyword that matplotlib removed.
raw_train['Age'].hist(bins=100, density=True, alpha=0.5, color='gray')
raw_train['Age'].plot(kind='kde', style='r--')

## Filling of Missing Values in Embarked

In [None]:
raw_train.loc[raw_train['Embarked'].isnull()]

In [None]:
filling_data = raw_train.loc[raw_train['Pclass'] == 1, ['Fare', 'Embarked']].groupby('Embarked', as_index=True)
filling_data.boxplot(subplots=False)

In [None]:
raw_train['Embarked'] = raw_train['Embarked'].fillna('C')

In [None]:
raw_train['Embarked'].count()

## Feature Male

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
bin_sex = LabelEncoder()
raw_train['Male'] = bin_sex.fit_transform(raw_train['Sex'])

raw_test['Male'] = bin_sex.transform(raw_test['Sex'])

In [None]:
raw_train['Male'].value_counts()

## Feature FamilyMembers

In [None]:
raw_train['FamilyMembers'] = raw_train['SibSp'] + raw_train['Parch']

raw_test['FamilyMembers'] = raw_test['SibSp'] + raw_test['Parch']

In [None]:
raw_train['FamilyMembers'].value_counts().sort_index().plot(kind='bar',  legend=False, color='c')

In [None]:
raw_train[['FamilyMembers', 'Survived']].groupby('FamilyMembers').mean().plot(kind='bar', legend=False, color='c')

## Binning Feature FamilyMembers into FamilySize

In [None]:
def binning_family(x):
    """Bucket a family-member count into a size label.

    0 -> 'Alone', above 0 and up to 3 -> 'Small', above 3 -> 'Large'.
    A value that matches none of these (for example a negative count)
    yields None.
    """
    if x == 0:
        return 'Alone'
    if 0 < x <= 3:
        return 'Small'
    if x > 3:
        return 'Large'
    return None

# `binning_family` takes a single count, so it is passed to `apply` directly.
raw_train['FamilySize'] = raw_train['FamilyMembers'].apply(binning_family)

raw_test['FamilySize'] = raw_test['FamilyMembers'].apply(binning_family)

In [None]:
raw_train['FamilySize'].value_counts()

## Categorical Embarked into Dummies

In [None]:
from sklearn.preprocessing import LabelBinarizer

In [None]:
bin_embarked = LabelBinarizer()

# Fit the binarizer on the training ports and reuse it for the test set, so
# both frames receive the same Port_* columns in the same order.
ports = bin_embarked.fit_transform(raw_train['Embarked'])
port_columns = ['Port_' + p for p in bin_embarked.classes_.tolist()]
# `join` aligns on index labels, so the dummy frame is given the index of the
# frame it was derived from instead of a fresh 0..n-1 range.
ports_df = pd.DataFrame(ports, columns=port_columns, index=raw_train.index)
raw_train = raw_train.join(ports_df)

ports = bin_embarked.transform(raw_test['Embarked'])
ports_df = pd.DataFrame(ports, columns=port_columns, index=raw_test.index)
raw_test = raw_test.join(ports_df)

## Categorical FamilySize into Dummies

In [None]:
bin_fsize = LabelBinarizer()

# Fit the binarizer on the training labels and reuse it for the test set, so
# both frames receive the same FamilySize_* columns in the same order.
fsize = bin_fsize.fit_transform(raw_train['FamilySize'])
fsize_columns = ['FamilySize_' + f for f in bin_fsize.classes_.tolist()]
# `join` aligns on index labels, so the dummy frame is given the index of the
# frame it was derived from instead of a fresh 0..n-1 range.
fsize_df = pd.DataFrame(fsize, columns=fsize_columns, index=raw_train.index)
raw_train = raw_train.join(fsize_df)

fsize = bin_fsize.transform(raw_test['FamilySize'])
fsize_df = pd.DataFrame(fsize, columns=fsize_columns, index=raw_test.index)
raw_test = raw_test.join(fsize_df)

## Categorical Deck into Dummies

In [None]:
bin_deck = LabelBinarizer()

# Fit the binarizer on the training decks and reuse it for the test set, so
# both frames receive the same Deck_* columns in the same order.
decks = bin_deck.fit_transform(raw_train['Deck'])
deck_columns = ['Deck_' + d for d in bin_deck.classes_.tolist()]
# `join` aligns on index labels, so the dummy frame is given the index of the
# frame it was derived from instead of a fresh 0..n-1 range.
decks_df = pd.DataFrame(decks, columns=deck_columns, index=raw_train.index)
raw_train = raw_train.join(decks_df)

decks = bin_deck.transform(raw_test['Deck'])
decks_df = pd.DataFrame(decks, columns=deck_columns, index=raw_test.index)
raw_test = raw_test.join(decks_df)

## Scaling of Feature Age

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
age_scaler = MinMaxScaler()
raw_train['Age'] = age_scaler.fit_transform(raw_train['Age'].values.reshape(-1, 1))

raw_test['Age'] = age_scaler.transform(raw_test['Age'].values.reshape(-1, 1))

In [None]:
# Distribution of Age after min-max scaling.
# `density=True` replaces the `normed=True` keyword that matplotlib removed.
raw_train['Age'].hist(bins=100, density=True, alpha=0.5)
raw_train['Age'].plot(kind='kde')

## Removing Features

In [None]:
# The same set of columns is removed from both frames.
dropped_columns = [
    'PassengerId', 'Name', 'SibSp', 'Parch', 'Title', 'Ticket', 'Cabin',
    'Sex', 'Embarked', 'FamilyMembers', 'FamilySize', 'Deck', 'Fare',
]
train = raw_train.drop(columns=dropped_columns)
test = raw_test.drop(columns=dropped_columns)

In [None]:
train.head()

In [None]:
test.tail()

## Correlation with Survived

In [None]:
corr_with_survived = train.corrwith(train['Survived'])
corr_with_survived

## Model Selection

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

In [None]:
y = train['Survived']
X = train.drop('Survived', axis=1)

In [None]:
seed = 4

In [None]:
# NOTE: despite its name, `scores_mlp` holds the gradient-boosting scores. The
# summary-table cell reads it under this name, so the name is left unchanged.
gradientb = GradientBoostingClassifier(random_state=seed)
scores_mlp = cross_val_score(gradientb, X, y, cv=10)  # 10-fold cross-validation

In [None]:
rforest = RandomForestClassifier(random_state=seed)
scores_rf = cross_val_score(rforest, X, y, cv=10)

In [None]:
logistic_r = LogisticRegression(random_state=seed)
scores_lr = cross_val_score(logistic_r, X, y, cv=10)

In [None]:
support_v = SVC(random_state=seed)
scores_sv = cross_val_score(support_v, X, y, cv=10)

In [None]:
knn = KNeighborsClassifier()
scores_knn = cross_val_score(knn, X, y, cv=10)

In [None]:
# Keep each model label next to its cross-validation scores, so a label can
# never drift out of step with the numbers reported under it.
cv_results = [
    ('GradientBoosting', scores_mlp),  # gradient-boosting scores, despite the name
    ('RandomForest', scores_rf),
    ('LogisticRegression', scores_lr),
    ('SupportVector', scores_sv),
    ('KNearestNeighbors', scores_knn),
]

columns = [model_name for model_name, _ in cv_results]
list_scores = [fold_scores.mean() for _, fold_scores in cv_results]
list_std = [fold_scores.std() for _, fold_scores in cv_results]

# Summary table: one column per model, one row for the mean score and one for
# its standard deviation. It is built in a single step rather than by adding
# rows to an empty frame.
scores = pd.DataFrame([list_scores, list_std], index=['scores', 'std'], columns=columns)
scores

## Fine-Tune the Hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for gradient boosting.
# 'loss' is deliberately not searched: the only candidate was "deviance", which
# was the estimator's default and has since been renamed "log_loss" in
# scikit-learn. Leaving the default in place selects the same loss under
# either name.
param_gb = {
    'n_estimators' : [75, 100, 200],
    'learning_rate': [0.1, 0.7, 0.05, 0.03, 0.01],
    'max_depth': [5, 8, None],
    'min_samples_leaf': [25, 50, 75, 100],
    'max_features': [1.0, 0.3, 0.1] 
}

# Exhaustive 10-fold search over the grid, using all available cores.
grid_search_gb = GridSearchCV(GradientBoostingClassifier(random_state=seed), param_gb, cv=10, n_jobs=-1)
grid_search_gb.fit(X, y)
grid_search_gb.best_params_

In [None]:
param_rf = {
    "max_depth": [5, 8, None],
    "min_samples_split": [2, 5, 10, 15, 100],
    "min_samples_leaf": [5, 10, 25, 50],
    "max_features": ['log2', 'sqrt', None],
    "n_estimators": [75, 100, 200]
}

grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=seed), param_rf, cv=10, n_jobs=-1)
grid_search_rf.fit(X, y)
grid_search_rf.best_params_

In [None]:
param_lr = {
    "penalty": ["l2"],
    "C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

grid_search_lr = GridSearchCV(LogisticRegression(random_state=seed), param_lr, cv=10, n_jobs=-1)
grid_search_lr.fit(X, y)
grid_search_lr.best_params_

In [None]:
param_sv = {
    "kernel": ['rbf'],
    "C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "gamma": [0.001, 0.01, 0.1, 1]
}

grid_search_sv = GridSearchCV(SVC(probability=True, random_state=seed), param_sv, cv=10, n_jobs=-1)
grid_search_sv.fit(X, y)
grid_search_sv.best_params_

In [None]:
param_knn = {
    "n_neighbors": [2, 3, 4, 5, 6, 7, 8, 9, 10],
    "p": [2, 3],
    "weights": ['uniform', 'distance']
}

grid_search_knn = GridSearchCV(KNeighborsClassifier(), param_knn, cv=10, n_jobs=-1)
grid_search_knn.fit(X, y)
grid_search_knn.best_params_

In [None]:
list_scores_tuning = [grid_search_gb.best_score_, grid_search_rf.best_score_, grid_search_lr.best_score_, grid_search_sv.best_score_, grid_search_knn.best_score_]
index_tuning = ['GradientBoosting', 'RandomForest', 'LogisticRegression', 'SupportVector', 'KNearestNeighbors']
scores_tuning = pd.Series(list_scores_tuning, index=index_tuning)
scores_tuning.sort_values(ascending=False)

## Evaluating the Model

In [None]:
from sklearn.ensemble import VotingClassifier

estimators = [
    ('gb', grid_search_gb.best_estimator_),
    ('rf', grid_search_rf.best_estimator_),
    ('lr', grid_search_lr.best_estimator_),
    ('sv', grid_search_sv.best_estimator_),
    ('knn', grid_search_knn.best_estimator_)
]

voting_classifier = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)
cross_val_score(voting_classifier, X, y, cv=10).mean()

## Making predictions

In [None]:
# Refit the soft-voting ensemble on the full training set.
voting_classifier = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)
voting_classifier.fit(X, y)
# Select the test columns by the training feature names, so `predict` receives
# exactly the columns used in `fit`, in the same order. A missing column then
# fails loudly instead of being silently misaligned.
predictions = voting_classifier.predict(test[X.columns])

In [None]:
submission = pd.DataFrame({
        "PassengerId": raw_test["PassengerId"],
        "Survived": predictions
    })
submission.to_csv('predictions.csv', index=False)