# Titanic: Machine Learning from Disaster

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

np.random.seed(42)

In [2]:
dtypes = {
    "PassengerId" : "int64",
    "Name": "object",
    "Survived": "int64",
    "Pclass" : "object",
    "Sex" : "object",
    "Age" : "float64",
    "SibSp" : "int64",
    "Parch" : "int64",
    "Fare" : "float64",
    "Cabin" : "object",
    "Embarked" : "object"
}

## Importing data

In [38]:
train_df = pd.read_csv('train.csv',sep=',',header=0,usecols=["PassengerId","Name","Survived",
            "Pclass","Sex","Age","SibSp","Parch","Fare","Cabin","Embarked"],dtype=dtypes,na_values=[' '])
test_df = pd.read_csv('test.csv',sep=',',header=0,usecols=["PassengerId","Name","Pclass","Sex","Age",
                                "SibSp","Parch","Fare","Cabin","Embarked"],dtype=dtypes,na_values=[' '])

df = pd.concat([train_df,test_df])
df.set_index("PassengerId",inplace=True)
df.head(3)

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1.0


In [39]:
df.dropna(subset=['Embarked'],inplace=True)
df.Age.fillna(df.Age.mean(),inplace=True)
df.Cabin.fillna("Without",inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1307 entries, 1 to 1309
Data columns (total 10 columns):
Age         1307 non-null float64
Cabin       1307 non-null object
Embarked    1307 non-null object
Fare        1306 non-null float64
Name        1307 non-null object
Parch       1307 non-null int64
Pclass      1307 non-null object
Sex         1307 non-null object
SibSp       1307 non-null int64
Survived    889 non-null float64
dtypes: float64(3), int64(2), object(5)
memory usage: 112.3+ KB


## Some Feature Engineering

In [40]:
df['Cabin'] = [(x.Cabin)[0] for index,x in df.iterrows()]
df['Child'] = [1 if x.Age < 15 else 0 for index,x in df.iterrows()]
df['FSize'] = [x.SibSp+x.Parch+1 for index,x in df.iterrows()]
df['Alone'] = [0 if x.FSize != 1 else 1 for index,x in df.iterrows()]
df['MFam'] = [1 if x.FSize <= 4 and x.FSize >= 2 else 0 for index,x in df.iterrows()]
df['LFam'] = [1 if x.FSize > 4 else 0 for index,x in df.iterrows()]
df['Name'] = [x.Name.split()[1] if (x.Name.split()[1])[-1] == '.' else 'Common' for index,x in df.iterrows()]
df.head(3)

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Child,FSize,Alone,MFam,LFam
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,22.0,W,S,7.25,Mr.,0,3,male,1,0.0,0,2,0,1,0
2,38.0,C,C,71.2833,Mrs.,0,1,female,1,1.0,0,2,0,1,0
3,26.0,W,S,7.925,Miss.,0,3,female,0,1.0,0,1,1,0,0


In [41]:
num_feat = df.select_dtypes('number').columns.values
cat_feat = df.select_dtypes('object').columns.values 
df_num = df[num_feat].drop('Survived',axis=1)
df_cat = df[cat_feat]

#df_num = (df_num-df_num.mean())/df_num.std()

df_cat = pd.get_dummies(df_cat)

new_df = pd.concat([df_num,df_cat],axis=1)

new_df = (new_df - new_df.mean())/new_df.std()

final = pd.concat([new_df,df.Survived],axis=1)

train = final[final.Survived.notna()]
test = final[final.Survived.isna()].drop('Survived',axis=1)

train.head(3)

Unnamed: 0_level_0,Age,Fare,Parch,SibSp,Child,FSize,Alone,MFam,LFam,Cabin_A,...,Name_Mr.,Name_Mrs.,Name_Ms.,Name_Rev.,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.609826,-0.501757,-0.445237,0.480088,-0.301522,0.072432,-1.231723,1.410434,-0.258626,-0.130796,...,0.880467,-0.412272,-0.039133,-0.078447,-0.570358,-0.518388,0.918039,-0.741616,0.741616,0.0
2,0.634305,0.735219,-0.445237,0.480088,-0.301522,0.072432,-1.231723,1.410434,-0.258626,-0.130796,...,-1.134892,2.423726,-0.039133,-0.078447,1.751942,-0.518388,-1.088445,1.347375,-1.347375,1.0
3,-0.298793,-0.488718,-0.445237,-0.479354,-0.301522,-0.558693,0.811249,-0.708459,-0.258626,-0.130796,...,-1.134892,-0.412272,-0.039133,-0.078447,-0.570358,-0.518388,0.918039,1.347375,-1.347375,1.0


In [42]:
X = train.drop('Survived',axis=1)
y = train.Survived

## Train Classifiers

In [43]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier,RandomForestClassifier
from sklearn.svm import SVC


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [44]:
param_grid = [
  {'learning_rate': np.linspace(0.1,0.01,5), 'loss': ['deviance','exponential'],
  'n_estimators': np.arange(50,100,5) }
 ]

search = GridSearchCV(GradientBoostingClassifier(),param_grid,cv=5,scoring='accuracy',n_jobs=4).fit(X_train,y_train)
means = search.cv_results_['mean_test_score']
print(search.best_estimator_.score(X_test,y_test))
print(means)

0.8044943820224719
[0.82432432 0.82207207 0.81756757 0.81756757 0.81531532 0.81981982
 0.82207207 0.81981982 0.81306306 0.81756757 0.83333333 0.83108108
 0.83333333 0.82657658 0.83558559 0.83558559 0.82882883 0.83333333
 0.82207207 0.82882883 0.82207207 0.82432432 0.82432432 0.82432432
 0.82207207 0.81981982 0.81981982 0.82207207 0.81981982 0.82432432
 0.83558559 0.83558559 0.83558559 0.83108108 0.83558559 0.83558559
 0.83558559 0.82882883 0.83333333 0.82882883 0.82657658 0.83333333
 0.82882883 0.82657658 0.82657658 0.83333333 0.83558559 0.83783784
 0.83333333 0.83333333 0.83783784 0.83783784 0.83783784 0.83333333
 0.82882883 0.83108108 0.82882883 0.83558559 0.83108108 0.83108108
 0.83333333 0.83558559 0.83333333 0.82882883 0.82882883 0.83108108
 0.83108108 0.83108108 0.82882883 0.82657658 0.83108108 0.83558559
 0.83333333 0.83108108 0.83558559 0.83558559 0.83333333 0.83783784
 0.83783784 0.83783784 0.84234234 0.85135135 0.85135135 0.8490991
 0.85135135 0.85135135 0.84459459 0.84234234



In [45]:
param_grid = [
  {'learning_rate': np.linspace(0.1,0.01,5), 'algorithm' : ['SAMME', 'SAMME.R'],
  'n_estimators': np.arange(50,100,5) }
 ]

search = GridSearchCV(AdaBoostClassifier(),param_grid,cv=5,scoring='accuracy',n_jobs=4).fit(X_train,y_train)
means = search.cv_results_['mean_test_score']
print(search.best_estimator_.score(X_test,y_test))
print(means)

0.8202247191011236
[0.78153153 0.78153153 0.78153153 0.78378378 0.79054054 0.79279279
 0.7972973  0.79504505 0.7972973  0.7972973  0.78153153 0.78153153
 0.78153153 0.78153153 0.78153153 0.78153153 0.78153153 0.78378378
 0.78153153 0.78828829 0.78153153 0.78153153 0.78603604 0.78153153
 0.78153153 0.78153153 0.78153153 0.78153153 0.78153153 0.78153153
 0.78153153 0.78153153 0.78153153 0.78603604 0.78153153 0.78603604
 0.78153153 0.78603604 0.78153153 0.78153153 0.78828829 0.78828829
 0.78828829 0.78828829 0.78828829 0.78603604 0.78603604 0.78153153
 0.78153153 0.78153153 0.82207207 0.82882883 0.83108108 0.82882883
 0.83108108 0.83783784 0.83333333 0.82432432 0.82657658 0.82432432
 0.81756757 0.82207207 0.82432432 0.82207207 0.82657658 0.82882883
 0.83333333 0.83333333 0.83558559 0.83783784 0.79504505 0.81081081
 0.81306306 0.81531532 0.81756757 0.81981982 0.81981982 0.82432432
 0.82432432 0.82432432 0.78828829 0.78828829 0.78828829 0.78603604
 0.78603604 0.79279279 0.79954955 0.8130630

In [22]:
param_grid = [
  {'n_estimators': np.arange(10,100,10) }
 ]

search = GridSearchCV(RandomForestClassifier(),param_grid,cv=5,scoring='accuracy',n_jobs=4).fit(X_train,y_train)
means = search.cv_results_['mean_test_score']
print(search.best_estimator_.score(X_test,y_test))
print(means)

0.8157303370786517
[0.8018018  0.81981982 0.79954955 0.80855856 0.81306306 0.81081081
 0.81306306 0.80855856 0.80630631]




In [23]:
param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

search = GridSearchCV(SVC(),param_grid,cv=5,scoring='accuracy',n_jobs=4).fit(X_train,y_train)
means = search.cv_results_['mean_test_score']
stds = search.cv_results_['std_test_score']
print(search.best_estimator_.score(X_test,y_test))
print(means)

0.8134831460674158
[0.63513514 0.61711712 0.84009009 0.63288288 0.84234234 0.83558559
 0.83558559 0.84234234 0.82882883 0.82432432 0.82432432 0.82432432]


## Select the best classifier

In [None]:
param_grid = [
  {'learning_rate': np.linspace(0.1,0.01,10), 'loss': ['deviance','exponential'],
  'n_estimators': np.arange(10,500,10) }
 ]

model = GridSearchCV(GradientBoostingClassifier(),param_grid,cv=10,scoring='accuracy',n_jobs=4).fit(X,y)
means = model.cv_results_['mean_test_score']
print(means)

## Write Results

In [451]:
test.Fare.fillna(test.Fare.mean(),inplace=True)

y_predict = rf_best_model.predict(test)
result_df = pd.DataFrame(columns=['PassengerId','Survived'],
                         dtype=np.int64)
result_df['PassengerId'] = test.index
result_df['Survived'] = y_predict.astype(np.int64)
result_df.to_csv('result.csv',index=False)
result_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0
