# Titanic: Machine Learning from Disaster

In [125]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

np.random.seed(42)

## Importing data

In [126]:
# Column -> dtype mapping handed to `pd.read_csv`.
# `Pclass` is deliberately read as "object" so it is treated as a category
# (and one-hot encoded later) rather than as an ordinal number.
dtypes = dict(
    PassengerId="int64",
    Name="object",
    Survived="int64",
    Pclass="object",
    Ticket="object",
    Sex="object",
    Age="float64",
    SibSp="int64",
    Parch="int64",
    Fare="float64",
    Cabin="object",
    Embarked="object",
)

In [160]:
# Columns present in both CSV files; the training file additionally carries
# the `Survived` label.
csv_columns = ["PassengerId", "Name", "Pclass", "Sex", "Age", "Ticket",
               "SibSp", "Parch", "Fare", "Cabin", "Embarked"]
# Parser options shared by both reads; a lone space is treated as missing.
csv_options = dict(sep=',', header=0, dtype=dtypes, na_values=[' '])

train_df = pd.read_csv('train.csv', usecols=csv_columns + ["Survived"], **csv_options)
test_df = pd.read_csv('test.csv', usecols=csv_columns, **csv_options)

# Stack train and test so cleaning and feature engineering are applied to
# both at once; rows coming from test.csv have a missing `Survived`.
df = pd.concat([train_df, test_df]).set_index("PassengerId")
df.head(3)

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282


In [161]:
# Remove the passengers whose port of embarkation is unknown.
df = df.dropna(subset=['Embarked'])

# Impute the remaining gaps: mean age for missing ages and the sentinel
# "Without" for passengers with no recorded cabin.
# This is done with a frame-level `fillna` instead of
# `df.Age.fillna(..., inplace=True)`: the latter mutates a temporary Series
# and, under pandas Copy-on-Write, leaves `df` unchanged.
df = df.fillna({'Age': df['Age'].mean(), 'Cabin': 'Without'})
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1307 entries, 1 to 1309
Data columns (total 11 columns):
Age         1307 non-null float64
Cabin       1307 non-null object
Embarked    1307 non-null object
Fare        1306 non-null float64
Name        1307 non-null object
Parch       1307 non-null int64
Pclass      1307 non-null object
Sex         1307 non-null object
SibSp       1307 non-null int64
Survived    889 non-null float64
Ticket      1307 non-null object
dtypes: float64(3), int64(2), object(6)
memory usage: 122.5+ KB


## Some Feature Engineering

Adding some new features that may carry more information:
- Fill null ages with information from the person's title
- Check whether the passenger was a kid (kids may have had a better chance of surviving, as they had priority during evacuation)
- Calculate the family size (bigger families may have had trouble evacuating)
- Get the title from the person's name (to differentiate crew members)
- Get what I think was the deck of the ship's cabins (people on lower decks may have had less chance of surviving)


In [162]:
# Number of relatives travelling with each passenger.
family_size = df['SibSp'] + df['Parch']

# Family-size indicators: alone (0 relatives), medium (2-4), large (>4).
df['Alone'] = (family_size <= 0).astype('int64')
df['MFam'] = family_size.between(2, 4).astype('int64')
df['LFam'] = (family_size > 4).astype('int64')

# First character of the cabin string, used as a deck indicator.
df['Deck'] = df['Cabin'].str[0]

# Passengers younger than 10 are flagged as kids.
df['Kid'] = (df['Age'] < 10).astype('int64')

# Names look like "Surname, Title. Given names": take the first word after
# the comma and keep it only when it ends with a period; anything else
# (including names that do not follow the pattern) becomes 'Common'.
first_word = df['Name'].str.split(',').str[1].str.split(' ').str[1]
df['Title'] = first_word.where(first_word.str.endswith('.', na=False), 'Common')

df.head(3)

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Alone,MFam,LFam,Deck,Kid,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,22.0,Without,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171,0,0,0,W,0,Mr.
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599,0,0,0,C,0,Mrs.
3,26.0,Without,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282,1,0,0,W,0,Miss.


In [166]:
# Split the columns by kind. Selecting "everything that is not numeric"
# (rather than asking for 'object') also picks up string-typed columns.
num_feat = df.select_dtypes('number').columns.values
cat_feat = df.select_dtypes(exclude='number').columns.values
df_num = df[num_feat].drop('Survived', axis=1)
# Free-text / near-unique columns are not useful as one-hot features.
df_cat = df[cat_feat].drop(['Cabin', 'Ticket', 'Name'], axis=1)

# One-hot encode the categorical columns. A float dtype is requested
# explicitly: recent pandas versions return boolean indicator columns, and
# boolean arrays do not support the subtraction used for scaling below.
df_cat = pd.get_dummies(df_cat, dtype=float)

new_df = pd.concat([df_num, df_cat], axis=1)

# Min-max scale every column to [0, 1]. A constant column has a zero range;
# dividing by 1 instead maps it to 0 rather than producing NaN.
col_min = new_df.min()
col_range = (new_df.max() - col_min).replace(0, 1)
new_df = (new_df - col_min) / col_range

# Re-attach the (unscaled) label; it is missing for rows from test.csv.
final = pd.concat([new_df, df.Survived], axis=1)

final.head(3)

Unnamed: 0_level_0,Age,Fare,Parch,SibSp,Alone,MFam,LFam,Kid,Embarked_C,Embarked_Q,...,Title_Master.,Title_Miss.,Title_Mlle.,Title_Mme.,Title_Mr.,Title_Mrs.,Title_Ms.,Title_Rev.,Title_Sir.,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.273456,0.014151,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.473882,0.139136,0.0,0.125,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.323563,0.015469,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [167]:
# Pairwise Pearson correlation between every feature and the label,
# rendered as a colour-graded table.
corr = final.corr(method='pearson')
corr.style.background_gradient()

Unnamed: 0,Age,Fare,Parch,SibSp,Alone,MFam,LFam,Kid,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_W,Title_Capt.,Title_Col.,Title_Common,Title_Don.,Title_Dona.,Title_Dr.,Title_Jonkheer.,Title_Lady.,Title_Major.,Title_Master.,Title_Miss.,Title_Mlle.,Title_Mme.,Title_Mr.,Title_Mrs.,Title_Ms.,Title_Rev.,Title_Sir.,Survived
Age,1.0,0.170424,-0.130058,-0.190107,0.114893,-0.155738,-0.157159,-0.519568,0.0775894,-0.0123586,-0.060633,0.3603,-0.0127671,-0.300828,-0.060777,0.060777,0.125836,0.104922,0.169194,0.13375,0.107374,-0.0725015,-0.0860102,0.0326261,-0.269071,0.0864384,0.104117,0.00679628,0.0218637,0.0197112,0.0733322,0.0175587,0.0390836,0.0568164,-0.364285,-0.254198,-0.0177921,-0.0125761,0.16877,0.197303,-0.00280558,0.069637,0.0412361,-0.0750737
Fare,0.170424,1.0,0.222327,0.16103,-0.276328,0.122838,0.125679,-0.0195771,0.287211,-0.129705,-0.170991,0.599391,-0.120768,-0.41889,0.184034,-0.184034,0.0202882,0.393756,0.402041,0.0730493,0.0742466,-0.0374143,-0.0227853,0.00121758,-0.50634,0.0202085,0.0495882,0.0285002,-0.00294394,0.0404832,0.0303897,-0.0177733,0.00341089,-0.00355631,0.0119099,0.0895503,0.0198124,0.019299,-0.190948,0.139565,-0.0172928,-0.0221726,0.0126812,0.25529
Parch,-0.130058,0.222327,1.0,0.373383,-0.548829,0.484552,0.573713,0.300225,-0.00898541,-0.101186,0.0723617,-0.0118725,-0.0104135,0.0188003,0.214371,-0.214371,-0.0308008,0.0771805,0.00941397,-0.0275202,0.000961988,0.0203976,0.0582923,-0.012325,-0.0381744,0.0196368,-0.0246783,-0.012325,-0.012325,-0.012325,-0.0236235,-0.012325,-0.012325,-0.0174368,0.25338,0.0690062,-0.0174368,-0.012325,-0.305945,0.218859,-0.0174368,-0.0122929,-0.012325,0.0831508
SibSp,-0.190107,0.16103,0.373383,1.0,-0.590884,0.253499,0.694851,0.284762,-0.048788,-0.0489262,0.074227,-0.0330586,-0.0528184,0.0718911,0.110768,-0.110768,-0.0399115,-0.00848862,0.0484236,-0.0158706,-0.0273175,-0.00871392,0.00597066,-0.0132694,0.00772663,0.0132897,-0.0132745,-0.0132694,-0.0132694,-0.0132694,0.00944414,-0.0132694,0.0132897,-0.0187729,0.329079,0.0803215,-0.0187729,-0.0132694,-0.244262,0.065973,-0.0187729,-0.0188018,0.0132897,-0.03404
Alone,0.114893,-0.276328,-0.548829,-0.590884,1.0,-0.560388,-0.270285,-0.312352,-0.107317,0.127688,0.0134445,-0.129084,-0.0344565,0.139795,-0.286747,0.286747,0.0454127,-0.0947827,-0.13723,-0.0741127,-0.042334,0.00421528,-0.0763588,0.0224569,0.178672,-0.0340963,0.0166563,0.0224569,0.0224569,0.0224569,0.00354336,0.0224569,-0.0340963,0.031771,-0.265224,-0.02806,0.031771,0.0224569,0.388317,-0.364258,0.031771,0.00354336,-0.0340963,-0.206207
MFam,-0.155738,0.122838,0.484552,0.253499,-0.560388,1.0,-0.099759,0.326085,0.0437609,-0.0979089,0.0237064,-0.00950214,0.0920386,-0.0672914,0.180228,-0.180228,-0.0121599,0.0777529,-0.00872439,-0.0427905,0.0229829,0.0710641,0.103372,-0.0125846,-0.0573702,0.0608441,-0.0251981,-0.0125846,-0.0125846,-0.0125846,0.0424021,-0.0125846,-0.0125846,-0.0178041,0.236258,0.0642344,-0.0178041,-0.0125846,-0.278575,0.178608,-0.0178041,-0.00965948,-0.0125846,0.169694
LFam,-0.157159,0.125679,0.573713,0.694851,-0.270285,-0.099759,1.0,0.244772,-0.111927,0.00442593,0.096,-0.0741977,-0.104808,0.150083,0.0435428,-0.0435428,-0.0287013,-0.0493631,0.0238404,-0.0418952,-0.0394746,-0.0280305,-0.0135932,-0.00606975,0.0653143,-0.00606975,-0.0121535,-0.00606975,-0.00606975,-0.00606975,-0.017214,-0.00606975,-0.00606975,-0.00858721,0.298101,0.0560389,-0.00858721,-0.00606975,-0.161073,2.35032e-05,-0.00858721,-0.017214,-0.00606975,-0.113523
Kid,-0.519568,-0.0195771,0.300225,0.284762,-0.312352,0.326085,0.244772,1.0,-0.0229093,-0.0401656,0.045801,-0.118302,0.0356782,0.0729463,0.0652053,-0.0652053,-0.0093265,-0.0582236,-0.0475981,-0.0494152,-0.010359,0.0673142,0.137298,-0.00715925,0.0482884,-0.00715925,-0.014335,-0.00715925,-0.00715925,-0.00715925,-0.0203039,-0.00715925,-0.00715925,-0.0101286,0.585953,0.180079,-0.0101286,-0.00715925,-0.303533,-0.10867,-0.0101286,-0.0203039,-0.00715925,0.129837
Embarked_C,0.0775894,0.287211,-0.00898541,-0.048788,-0.107317,0.0437609,-0.111927,-0.0229093,1.0,-0.164463,-0.778161,0.327916,-0.135134,-0.172468,0.06772,-0.06772,0.0948318,0.167512,0.157867,0.107658,0.0274318,-0.0201135,-0.0316207,-0.0141196,-0.26044,-0.0141196,0.07437,-0.0141196,0.0542295,0.0542295,0.00841657,-0.0141196,0.0542295,-0.0199757,-0.0143474,-0.0213546,0.0767215,0.0542295,-0.066535,0.0979791,-0.0199757,-0.0400436,0.0542295,0.169966
Embarked_Q,-0.0123586,-0.129705,-0.101186,-0.0489262,0.127688,-0.0979089,0.00442593,-0.0401656,-0.164463,1.0,-0.491534,-0.165641,-0.122263,0.24341,0.089445,-0.089445,-0.0421732,-0.0725332,-0.0592963,-0.0615599,-0.0429704,-0.0203466,-0.0199736,-0.00891877,0.14184,-0.00891877,-0.0178581,-0.00891877,-0.00891877,-0.00891877,0.0083028,-0.00891877,-0.00891877,-0.0126179,-0.00920075,0.201328,-0.0126179,-0.00891877,-0.0808928,-0.10602,0.054421,0.0083028,-0.00891877,0.00453573


It seems that the classifier performs best with only these features (which does not surprise me at all, since classifying women as survivors and men as non-survivors gives 0.7+ accuracy, and predicting that everyone dies gives ~0.67).

In [None]:
# Select the features I'm going to use.
# `final` holds one-hot encoded columns (e.g. `Sex_female`, `Embarked_C`,
# `Pclass_1`), so the raw names 'Sex', 'Embarked' and 'Pclass' are not
# present; pick the indicator columns by prefix instead.
selected_prefixes = ('Sex_', 'Embarked_', 'Pclass_')
selected_cols = [col for col in final.columns if col.startswith(selected_prefixes)]
final = final[selected_cols + ['Survived']]

# Rows with a label form the training set; rows without one are the
# competition test set that has to be predicted.
train = final[final.Survived.notna()]
test = final[final.Survived.isna()].drop('Survived', axis=1)

train.head(3)

In [88]:
# Separate the label from the feature matrix.
y = train['Survived']
X = train.drop(columns='Survived')

## Train Classifiers

In [90]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier,RandomForestClassifier
from sklearn.svm import SVC

# Hold out half of the labelled rows for a final check of each tuned model;
# the fixed random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

In [91]:
# Grid for gradient boosting: learning rate, loss function and number of trees.
# NOTE(review): newer scikit-learn releases renamed the 'deviance' loss to
# 'log_loss' - confirm against the installed version before upgrading.
param_grid = [
    {
        'learning_rate': np.linspace(0.1, 0.01, 5),
        'loss': ['deviance', 'exponential'],
        'n_estimators': np.arange(50, 100, 5),
    }
]

# The estimator is seeded explicitly: the grid search runs in worker
# processes (n_jobs=4), so a global `np.random.seed` call in the notebook
# does not make the fits reproducible.
search = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=4,
).fit(X_train, y_train)

# Average CV accuracy over the whole grid, as a rough sensitivity indicator.
mean = search.cv_results_['mean_test_score'].mean()
print(search.best_score_)                             # best mean CV accuracy
print(mean)                                           # grid-wide mean CV accuracy
print(search.best_estimator_.score(X_test, y_test))   # accuracy on the hold-out half

0.8243243243243243
0.8240315315315317
0.797752808988764


In [92]:
# Grid for AdaBoost: learning rate, boosting algorithm and number of estimators.
# NOTE(review): 'SAMME.R' has been deprecated in newer scikit-learn
# releases - confirm against the installed version before upgrading.
param_grid = [
    {
        'learning_rate': np.linspace(0.1, 0.01, 5),
        'algorithm': ['SAMME', 'SAMME.R'],
        'n_estimators': np.arange(50, 100, 5),
    }
]

# The estimator is seeded explicitly: the grid search runs in worker
# processes (n_jobs=4), so a global `np.random.seed` call in the notebook
# does not make the fits reproducible.
search = GridSearchCV(
    AdaBoostClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=4,
).fit(X_train, y_train)

# Average CV accuracy over the whole grid, as a rough sensitivity indicator.
mean = search.cv_results_['mean_test_score'].mean()
print(search.best_score_)                             # best mean CV accuracy
print(mean)                                           # grid-wide mean CV accuracy
print(search.best_estimator_.score(X_test, y_test))   # accuracy on the hold-out half

0.7882882882882883
0.7873423423423422
0.7842696629213484


In [100]:
# Grid for the random forest: only the number of trees is tuned.
param_grid = [
    {'n_estimators': np.arange(50, 100, 10)}
]

# A random forest is inherently stochastic (bootstrap samples, feature
# sub-sampling). Seed the estimator itself: the grid search runs in worker
# processes (n_jobs=4), so a global `np.random.seed` call in the notebook
# does not make the fits reproducible.
search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=4,
).fit(X_train, y_train)

# Average CV accuracy over the whole grid, as a rough sensitivity indicator.
mean = search.cv_results_['mean_test_score'].mean()
print(search.best_score_)                             # best mean CV accuracy
print(mean)                                           # grid-wide mean CV accuracy
print(search.best_estimator_.score(X_test, y_test))   # accuracy on the hold-out half

0.8243243243243243
0.8243243243243243
0.797752808988764


In [94]:
# Two sub-grids: an RBF kernel tuned over gamma and C, and a linear kernel
# tuned over C only.
param_grid = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=4)
search.fit(X_train, y_train)

# Average CV accuracy over the whole grid, next to the best configuration.
mean = search.cv_results_['mean_test_score'].mean()
print(search.best_score_)
print(mean)
print(search.best_estimator_.score(X_test, y_test))

0.8198198198198198
0.7719594594594595
0.797752808988764


## Select the best classifier

It seems that RandomForest performed the best. Train the final classifier with all the data.

In [95]:
# Final model: re-tune the number of trees on every labelled row, using
# 10-fold cross-validation.
param_grid = [
    {'n_estimators': np.arange(10, 100, 10)}
]

# Seed the forest so the submitted predictions can be regenerated exactly;
# the search runs in worker processes (n_jobs=4), which a global
# `np.random.seed` call in the notebook does not reach.
model = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=4,
).fit(X, y)

mean = model.cv_results_['mean_test_score'].mean()
print(model.best_score_)   # best mean CV accuracy
print(mean)                # grid-wide mean CV accuracy

0.8198198198198198
0.8110236220472441


## Write Results

In [98]:
# Predict the unlabelled passengers with the tuned model (GridSearchCV
# delegates `predict` to its refitted best estimator).
y_predict = model.predict(test)

# Submission frame: one row per test passenger, integer 0/1 prediction.
# `test` is indexed by PassengerId, so the index supplies the id column.
result_df = pd.DataFrame(
    {
        'PassengerId': test.index,
        'Survived': y_predict.astype(np.int64),
    },
    columns=['PassengerId', 'Survived'],
)
result_df.to_csv('result.csv', index=False)

# Show only a preview; the complete table is in result.csv.
result_df.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
