# Titanic: Machine Learning from Disaster

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline
sns.set(font_scale=1.56)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, learning_curve, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_predict
import warnings
warnings.filterwarnings("ignore")

# 1. Data Overview

In [None]:
# Load the training and test splits from the Kaggle input directory,
# then display the training frame.
train = pd.read_csv("../input/titanic/train.csv")
test = pd.read_csv("../input/titanic/test.csv")
train

There are 10 predictors, of which:
* 'Pclass', 'Title' (extracted from 'Name' later in the notebook), 'Sex', 'Embarked' are categorical.
* 'Age', 'Fare' are continuous.
* 'SibSp', 'Parch' are discrete.
* 'Ticket', 'Cabin' are alphanumeric.

In [None]:
# (rows, columns) of the training split.
train.shape

In [None]:
# (rows, columns) of the test split.
test.shape

# 2. Data Cleaning and Analysis

In [None]:
# Number of missing values in each column of the training split.
train.isnull().sum()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns
# and the Survived label.
numeric_cols = ['PassengerId', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived']
corr_matrix = train[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(8, 5))
g = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="seismic", ax=ax)

The heatmap shows the pairwise correlation between the numeric features; a larger absolute value means a stronger correlation. From the graph we can see that Fare is an important feature for Survived.



In [None]:
# Survival rate (in percent) for every level of each categorical predictor,
# one bar chart per predictor.

var = ['Pclass', 'Sex', 'Embarked']
num_cols = len(var)

fig, axes = plt.subplots(ncols=num_cols)
for position, column in enumerate(var):
    # Passenger counts per (level, outcome) pair.
    counts = train.pivot_table('PassengerId', column, 'Survived', 'count')
    # Normalise each row so the outcome columns add up to 100.
    percentages = counts.apply(lambda row: row.div(row.sum()).mul(100), axis=1)
    # Column 1 holds the share of passengers with Survived == 1.
    panel = percentages[1].plot(ax=axes[position], kind='bar', stacked=True,
                                title=column, figsize=(20, 10), color='C1')
    panel.set_ylim([0, 100])
    

In [None]:
# Relationship between Sex and survival: stacked counts of survivors and
# casualties for each sex.
train['Died'] = 1 - train['Survived']
# Select the two count columns *before* aggregating, so that the text columns
# of the frame (Name, Ticket, ...) are never summed.
train.groupby('Sex')[['Survived', 'Died']].sum().plot(kind='bar', figsize=(25, 7),
                                                      stacked=True);

From the plot above, we can see that the chance of surviving is higher if:
* The person came from a higher class (lower value of Pclass)
* The person was young (Master or Miss) or female (Miss or Mrs)
* The person was female
* The person embarked at C

In [None]:
# Survival rate by passenger class, coloured by sex, with one panel per
# embarkation port.
sns.catplot(
    data=train,
    x='Pclass',
    y='Survived',
    hue='Sex',
    col='Embarked',
    kind='point',
)

# 3. Feature Engineering 

In [None]:
# Encode Sex as an integer (female -> 0, male -> 1) in both splits.
SEX_CODES = {'female': 0, 'male': 1}

all_data = [train, test]
for dataset in all_data:
    # Values that are already encoded pass through unchanged, so re-running
    # this cell no longer maps them to NaN and crashes in astype(int).
    dataset['Sex'] = dataset['Sex'].map(lambda value: SEX_CODES.get(value, value)).astype(int)

train.head(n=10)

**Fare**

In [None]:
# Fare: impute missing values, then discretise into five quantile bins.
all_data = [train, test]

# The imputation value and the bin edges are derived from the training split
# and reused for the test split. Binning each split with its own quantiles
# would give the same FareCutTran code a different fare range in train and
# test, which makes the feature inconsistent at prediction time.
fare_median = train['Fare'].median()
_, fare_bins = pd.qcut(train['Fare'].fillna(fare_median), 5, retbins=True)
# Open the outer edges so that test fares outside the training range still
# fall into the first or last bin instead of becoming NaN.
fare_bins[0], fare_bins[-1] = -np.inf, np.inf

for dataset in all_data:
    # Filling missing values
    dataset['Fare'] = dataset['Fare'].fillna(fare_median)
    # Making bins
    dataset['FareCut'] = pd.cut(dataset['Fare'], bins=fare_bins)
    # Ordinal bin code: 0 for the lowest fare bin up to 4 for the highest.
    dataset['FareCutTran'] = dataset['FareCut'].cat.codes.astype(int)
    


In [None]:
# Distinct fare-bin codes in the test split (the frame the preceding loop
# variable `dataset` was left pointing at).
test['FareCutTran'].unique()

**FamilyCount**

In [None]:
# Family size: SibSp + Parch + 1 (the +1 presumably counts the passenger).
all_data = [train, test]
for dataset in all_data:
    dataset['FamilyCount'] = dataset['SibSp'] + dataset['Parch'] + 1

# Mean survival rate for each family size in the training split.
survival_by_family_size = train.groupby('FamilyCount', as_index=False)['Survived'].mean()

fig, axis1 = plt.subplots(1, 1, figsize=(18, 4))
sns.barplot(x='FamilyCount', y='Survived', data=survival_by_family_size, ax=axis1)

In [None]:
# Five randomly chosen non-null ticket values.
train['Ticket'].dropna().sample(5)

In [None]:
# FamilySurvival: for passengers travelling on a shared ticket (family or
# friends), record whether any *other* member of the group is known to have
# survived.
#
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
# ignore_index=True gives every row a unique label; with the duplicated labels
# that stacking the two splits would otherwise produce, `drop(ind)` could
# silently remove a second passenger from the group.
data = pd.concat([train, test], ignore_index=True)

DEFAULT_SURVIVAL_VALUE = 0.5
data['FamilySurvival'] = DEFAULT_SURVIVAL_VALUE  # default: nothing known

for _, df_grp in data.groupby('Ticket'):
    if len(df_grp) < 2:
        # Nobody else on this ticket.
        continue
    for ind in df_grp.index:
        # Outcomes of everyone else on the ticket. Test rows carry NaN in
        # Survived and are skipped by max()/min().
        others = df_grp['Survived'].drop(ind)
        if others.max() == 1.0:
            data.loc[ind, 'FamilySurvival'] = 1
        elif others.min() == 0.0:
            data.loc[ind, 'FamilySurvival'] = 0

print("Number of passengers with family survival information: %.0f"
      % (data[data['FamilySurvival'] != DEFAULT_SURVIVAL_VALUE].shape[0]))
data.groupby('FamilySurvival')[['Survived']].mean().round(3)

In [None]:
# Inspect the first 153 training rows. FamilySurvival was added to `data`,
# not to `train`, so it does not appear in this view.
train.head(153)

**Title**

In [None]:
# Split the combined frame back into its training and test parts.
# The boundary is computed once, before `train` is reassigned, and each part
# is an independent copy so that the column assignments in later cells do not
# write into a slice of `data`.
n_train = len(train)
train = data.iloc[:n_train].copy()
test = data.iloc[n_train:].copy()

In [None]:
all_data = [train, test]

# Collapse the extracted titles into 'Mr', 'Miss', 'Mrs', 'Master' and 'Rare'.
RARE_TITLES = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
TITLE_ALIASES = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for dataset in all_data:
    # The title is the run of letters between a space and a full stop in Name.
    # The pattern is a raw string so that `\.` is not treated as an (invalid)
    # string escape sequence.
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Replace uncommon titles with 'Rare' and spelling variants with their
    # common equivalent.
    dataset['Title'] = dataset['Title'].replace(RARE_TITLES, 'Rare').replace(TITLE_ALIASES)

# Survival rate (in percent) per title in the training split.
title_counts = train.pivot_table('PassengerId', 'Title', 'Survived', 'count')
title_survival_pct = title_counts.apply(lambda row: row.div(row.sum()).mul(100), axis=1)[1]
title_survival_pct\
.plot(kind='bar', stacked=True, title='Title', figsize=(20, 10), color='C1')\
.set_ylim([0, 100])

In [None]:
# Turn the title strings into numeric codes.
all_data = [train, test]
titles = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Rare": 4}


def _encode_title(value):
    """Return the numeric code for a title string.

    Strings that are not in ``titles`` become 0. Non-string values (codes
    produced by an earlier run of this cell, or NaN) are returned unchanged,
    so re-running the cell does not reset every title to 0.
    """
    if isinstance(value, str):
        return titles.get(value, 0)
    return value


for dataset in all_data:
    # Missing titles stay NaN after the mapping and are then set to 0.
    dataset['Title'] = dataset['Title'].map(_encode_title).fillna(0)

train.head(n=476)

In [None]:
# Stack the two splits again so `data` picks up the new Title column.
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
data = pd.concat([train, test], ignore_index=True)

**Age**

In [None]:
# Age distribution of the training passengers, one panel per outcome.
g = sns.FacetGrid(data=train, col='Survived', height=4, aspect=1.5)
g.map(sns.distplot, "Age")

# Two-panel figure. Every call targets an explicit axes object instead of the
# implicit "current axes", and the GridSpec that was created but never used
# (together with its import) has been dropped.
fig = plt.figure(constrained_layout=True, figsize=(18, 6))

# Left panel: age density of casualties and of survivors.
ax = fig.add_subplot(121)
known_age = train["Age"].notnull()
sns.kdeplot(train["Age"][(train["Survived"] == 0) & known_age], ax=ax, color="Yellow", shade=True)
sns.kdeplot(train["Age"][(train["Survived"] == 1) & known_age], ax=ax, color="Green", shade=True)
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
ax.legend(["Not Survived", "Survived"])

# Right panel: age density for each title code (0..4), over both splits.
ax2 = fig.add_subplot(122)
title_labels = ('Mr', 'Miss', 'Mrs', 'Master', 'Rare')
for code in range(len(title_labels)):
    data.Age[data.Title == code].plot(kind='kde', ax=ax2)
ax2.legend(title_labels, loc='best')
ax2.set_xlabel("Age")
ax2.set_title("Age Distribution within Titles")

In [None]:
# Fill missing ages with the median age of the passengers sharing the title.
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
data = pd.concat([train, test], ignore_index=True)

title_median_age = data.groupby('Title')['Age'].median()
# Look the median up by title value rather than by array position, so the
# result does not depend on the group keys being exactly 0..4 in order.
data['AgeTitle'] = data['Age'].fillna(data['Title'].map(title_median_age))
data['AgeTitle'] = data['AgeTitle'].astype('int')
# 1 for passengers younger than 16, otherwise 0.
data['AgeDivide'] = (data['AgeTitle'] < 16.0) * 1

# 4. Training

In [None]:
# Cut the combined frame back into the training rows (the first len(train)
# rows) and the test rows (everything after them), then show the training part.
n_train = len(train)
train, test = data.iloc[:n_train], data.iloc[n_train:]
train

In [None]:
# Labels and input frames for the models.

Y_train = train['Survived']
X_train = train.drop(columns=['Survived', 'PassengerId'])
X_test = test.drop(columns=['PassengerId'])

**XGBoost**

In [None]:
from xgboost.sklearn import XGBClassifier

# Features shared by all models in this notebook.
subset = ['Sex', 'Pclass', 'FareCutTran', 'FamilySurvival', 'AgeDivide']

# Hyper-parameters for the gradient-boosted classifier.
xgb_params = dict(
    learning_rate=0.003,
    n_estimators=1600,
    max_depth=5,
    min_child_weight=2,
    gamma=9,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=35,
    reg_alpha=0.4,
)
model = XGBClassifier(**xgb_params)
model.fit(X_train[subset], Y_train)

# Test-set predictions; this frame is only built here, not written to disk.
y_pred = model.predict(X_test[subset])
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': y_pred})

**Random Forest**

In [None]:
subset = ['Sex', 'Pclass', 'FareCutTran', 'FamilySurvival', 'AgeDivide']

# Random forest with out-of-bag scoring enabled.
rf_params = {
    'random_state': 35,
    'n_estimators': 100,
    'min_samples_split': 20,
    'oob_score': True,
}
Model = RandomForestClassifier(**rf_params)
Model.fit(X_train[subset], Y_train)
print('oob score :%.5f' % Model.oob_score_)

In [None]:
# Predict the test split with the random forest and write the submission file.

pred = Model.predict(X_test[subset])

submit = test[['PassengerId']].assign(Survived=pred.astype(int))
submit.to_csv("submission.csv", index=False)

**KNN**

In [None]:
from sklearn.preprocessing import StandardScaler

subset = ['Sex', 'Pclass', 'FareCutTran', 'FamilySurvival', 'AgeDivide']

# Fit the scaler on the training features only, then apply the same
# transformation to both splits.
std_scaler = StandardScaler().fit(X_train[subset])
Xk_train = std_scaler.transform(X_train[subset])
Xk_test = std_scaler.transform(X_test[subset])

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier  # KNN

# Search space for the KNN classifier.
n_neighbors = list(range(10, 20))
algorithm = ['auto']
weights = ['uniform', 'distance']
leaf_size = list(range(1, 50, 5))
hyperparams = dict(algorithm=algorithm, weights=weights,
                   leaf_size=leaf_size, n_neighbors=n_neighbors)

# 10-fold grid search on the standardised training features, ranked by ROC AUC.
gd = GridSearchCV(KNeighborsClassifier(), hyperparams,
                  scoring="roc_auc", cv=10, verbose=True)
gd.fit(Xk_train, Y_train)

print(gd.best_score_)
print(gd.best_estimator_)

In [None]:
# Fit the selected KNN estimator on the full standardised training set.
# NOTE(review): GridSearchCV refits the best estimator by default, so this
# call looks redundant — confirm before removing.
gd.best_estimator_.fit(Xk_train, Y_train)

In [None]:
# KNN predictions for the test split; the frame is built but not written to disk.

kpred = gd.best_estimator_.predict(Xk_test)

ksubmit = test[['PassengerId']].assign(Survived=kpred.astype(int))


# 5. Model Evaluation

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Models to compare. The KNN hyper-parameters were tuned on standardised
# features (Xk_train), but the comparison below cross-validates every model on
# the unscaled X_train[subset]. Wrapping KNN in a scaler pipeline gives it
# standardised inputs again, with the scaler fitted inside each fold.
models = {'KNN': make_pipeline(StandardScaler(), gd.best_estimator_),
          'RF': Model,
          'XGB': model}

In [None]:
def calculate_cross_validation_scores(models, X_train, Y_train, kfold, scoring="accuracy", n_jobs=4):
    """Cross-validate each model and collect its scores and predictions.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator.
    X_train, Y_train
        Features and labels handed to ``cross_val_score`` and
        ``cross_val_predict``.
    kfold
        Passed as ``cv`` to both helpers.
    scoring : str, default "accuracy"
        Metric passed to ``cross_val_score``.
    n_jobs : int, default 4
        Parallelism passed to both helpers.

    Returns
    -------
    CrossValScores : pandas.DataFrame
        One row per model with the columns 'Algorithm', 'Method', 'CVMean'
        and 'CVSTD' (mean and standard deviation of the fold scores).
    CrossValPredictions : pandas.DataFrame
        One column per model holding its cross-validated predictions.
    """
    score_columns = ['Algorithm', 'Method', 'CVMean', 'CVSTD']
    # Rows are collected in a list and turned into a frame once at the end;
    # DataFrame.append no longer exists in pandas 2.x.
    score_rows = []
    CrossValPredictions = pd.DataFrame()

    for model_name, model in models.items():
        cv_results = cross_val_score(model, X_train, y=Y_train, scoring=scoring,
                                     cv=kfold, n_jobs=n_jobs)
        cv_predictions = cross_val_predict(model, X_train, y=Y_train,
                                           cv=kfold, n_jobs=n_jobs)
        score_rows.append({'Algorithm': model_name,
                           'Method': 'Single Classifier',
                           'CVMean': cv_results.mean(),
                           'CVSTD': cv_results.std()})
        CrossValPredictions[model_name] = cv_predictions

    CrossValScores = pd.DataFrame(score_rows, columns=score_columns)
    return CrossValScores, CrossValPredictions

In [None]:
# Score every model with 10-fold stratified cross-validation on the shared
# feature subset.
kfold = StratifiedKFold(n_splits=10)
cv_features = X_train[subset]
CrossValScores, CrossValPredictions = calculate_cross_validation_scores(
    models, cv_features, Y_train, kfold
)

In [None]:
# Rank the models by mean cross-validated accuracy and plot the means with
# their standard deviation as error bars.
CrossValScores = CrossValScores.sort_values('CVMean', ascending=False)
# x and y are passed by keyword: current seaborn releases no longer accept
# them as positional arguments.
g = sns.barplot(x='CVMean', y='Algorithm', data=CrossValScores, color='purple',
                xerr=CrossValScores['CVSTD']*1)
g.set_xlabel("Mean Accuracy")
g.set_title("Cross validation scores")
print(CrossValScores)

The results show that all three models perform well, with random forest slightly better, so we use its predictions as the submission.