In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib as plt
%matplotlib inline

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.naive_bayes import *
import sklearn.neural_network as nk

In [2]:
# Load the data
train = pd.read_csv('./data/Titanic/train.csv')
test = pd.read_csv('./data/Titanic/test.csv')

In [3]:
# Lets have a peek of train data
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Lets check number of rows and number features in dataset
train.shape

(891, 12)

In [5]:
# Lets check the number of missing values in dataset
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# We will drop the Cabin feature as it has 687 missing values and Ticket PassengerId as values are all unique
train = train.drop(['Cabin', 'Ticket', 'PassengerId'], axis=1)
train.shape

(891, 9)

In [7]:
train.Sex = train.Sex.replace(to_replace=['male', 'female'],value=[0,1])

In [8]:
male = train[train.Sex == 0]
female = train[train.Sex == 1]

In [9]:
train.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [10]:
male_child = male[~male.Name.str.contains('Mr.')]
male_adult = male[male.Name.str.contains('Mr.')]

In [11]:
male_child.Age = male_child.Age.fillna(male_child.Age.mean())
male_adult.Age = male_adult.Age.fillna(male_adult.Age.mean())
male = male_adult + male_child

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [12]:
female_married = female[female.Name.str.contains('Mrs.')]
female_unmarried = female[~female.Name.str.contains('Mrs.')]

In [13]:
female_married.Age = female_married.Age.fillna(female_married.Age.mean())
female_unmarried.Age = female_unmarried.Age.fillna(female_unmarried.Age.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [14]:
frames = [male_adult,male_child,female_married,female_unmarried]

In [15]:
frames1 = pd.concat(frames)

In [16]:
frames1.corr(method='pearson')

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
Survived,1.0,-0.338481,0.543351,-0.087157,-0.035322,0.081629,0.257307
Pclass,-0.338481,1.0,-0.1319,-0.338059,0.083081,0.018443,-0.5495
Sex,0.543351,-0.1319,1.0,-0.119585,0.114631,0.245489,0.182333
Age,-0.087157,-0.338059,-0.119585,1.0,-0.256608,-0.192345,0.090901
SibSp,-0.035322,0.083081,0.114631,-0.256608,1.0,0.414838,0.159651
Parch,0.081629,0.018443,0.245489,-0.192345,0.414838,1.0,0.216225
Fare,0.257307,-0.5495,0.182333,0.090901,0.159651,0.216225,1.0


In [17]:
X = frames1.ix[:,1:]
Y = frames1.ix[:,0]

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)

In [19]:
del X_train['Name']
del X_test['Name']
del X_train['Embarked']
del X_test['Embarked']

In [20]:
logreg = LogisticRegression()
y_pred_logreg = logreg.fit(X_train, Y_train).predict(X_test)
print ("Model Accuracy:",round((metrics.accuracy_score(Y_test, y_pred_logreg)*100),2),"%")

Model Accuracy: 78.77 %


In [21]:
randomforrest = RandomForestClassifier(n_estimators=30)
y_pred_rand = randomforrest.fit(X_train,Y_train).predict(X_test)
print ("Model Accuracy:",round((metrics.accuracy_score(Y_test, y_pred_rand)*100),2),"%")

Model Accuracy: 83.8 %


In [22]:
svc_class = SVC(kernel='linear')
y_pred_svm = svc_class.fit(X_train,Y_train).predict(X_test)
print ("Model Accuracy:",round((metrics.accuracy_score(Y_test, y_pred_svm)*100),2),"%")

Model Accuracy: 78.21 %


In [23]:
linear_svc = LinearSVC()
y_pred_lsvc = linear_svc.fit(X_train,Y_train).predict(X_test)
print ("Model Accuracy:",round((metrics.accuracy_score(Y_test, y_pred_lsvc)*100),2),"%")

Model Accuracy: 63.69 %


In [24]:
nb = GaussianNB()
y_pred_nb = nb.fit(X_train,Y_train).predict(X_test)
print ("Model Accuracy:",round((metrics.accuracy_score(Y_test, y_pred_nb)*100),2),"%")

Model Accuracy: 77.09 %


In [25]:
neigh = KNeighborsClassifier()
y_pred_neigh = neigh.fit(X_train,Y_train).predict(X_test)
print ("Model Accuracy:",round((metrics.accuracy_score(Y_test, y_pred_neigh)*100),2),"%")

Model Accuracy: 72.63 %


In [26]:
neural = nk.MLPClassifier(max_iter=1000, hidden_layer_sizes=(50))
y_pred_neural = neural.fit(X_train,Y_train).predict(X_test)
print ("Model Accuracy:",round((metrics.accuracy_score(Y_test, y_pred_neural)*100),2),"%")

Model Accuracy: 68.16 %


In [27]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()
y_pred_gb = gb.fit(X_train,Y_train).predict(X_test)
print ("Model Accuracy:",round((metrics.accuracy_score(Y_test, y_pred_gb)*100),2),"%")

Model Accuracy: 81.01 %


In [28]:
print ("\tLogistic classification report:\n%s\n" %
       metrics.classification_report(Y_test,y_pred_logreg,target_names=["Not Survived", "Survived"]))

print ("\n\tRandom Forrest classification report:\n%s\n" %
       metrics.classification_report(Y_test,y_pred_rand,target_names=["Not Survived", "Survived"]))

print ("\n\tSupport Vector Machine classification report:\n%s\n" %
       metrics.classification_report(Y_test,y_pred_svm,target_names=["Not Survived", "Survived"]))

print ("\n\tLinear Support Vector Classifier classification report:\n%s\n" %
       metrics.classification_report(Y_test,y_pred_lsvc,target_names=["Not Survived", "Survived"]))

print ("\n\tGradient Boosted classification report:\n%s\n" %
       metrics.classification_report(Y_test,y_pred_gb,target_names=["Not Survived", "Survived"]))

print ("\n\tNaive Bayes classification report:\n%s\n" %
       metrics.classification_report(Y_test,y_pred_nb,target_names=["Not Survived", "Survived"]))

print ("\n\tK-nearest Neighbour Classifier classification report:\n%s\n" %
       metrics.classification_report(Y_test,y_pred_neigh,target_names=["Not Survived", "Survived"]))

print ("\n\tMulti Layer Perception Neural network classification report:\n%s\n" %
       metrics.classification_report(Y_test,y_pred_neural,target_names=["Not Survived", "Survived"]))

	Logistic classification report:
              precision    recall  f1-score   support

Not Survived       0.81      0.86      0.83       112
    Survived       0.74      0.67      0.70        67

 avg / total       0.79      0.79      0.79       179



	Random Forrest classification report:
              precision    recall  f1-score   support

Not Survived       0.85      0.89      0.87       112
    Survived       0.81      0.75      0.78        67

 avg / total       0.84      0.84      0.84       179



	Support Vector Machine classification report:
              precision    recall  f1-score   support

Not Survived       0.81      0.86      0.83       112
    Survived       0.73      0.66      0.69        67

 avg / total       0.78      0.78      0.78       179



	Linear Support Vector Classifier classification report:
              precision    recall  f1-score   support

Not Survived       0.63      0.99      0.77       112
    Survived       0.75      0.04      0.08        6