In [68]:
#Kaggle Titanic dataset

import pandas as pd
import numpy as np
import sklearn
import quandl
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')

from sklearn import preprocessing, cross_validation, neighbors, svm, model_selection
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [52]:
#Importing the datset
#The dataset contains 892 observations, there are 11 features and one label (survived)
df = pd.read_csv("train.csv")
#Data legend:
#Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
#survival Survival (0 = No; 1 = Yes)
#name Name
#sex Sex
#age Age
#sibsp Number of Siblings/Spouses Aboard
#parch Number of Parents/Children Aboard
#ticket Ticket Number
#fare Passenger Fare (British pound)
#cabin Cabin
#embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

In [53]:
#Exploring and cleaning/pre-processing the data
df.head()
#We can see that our dataset contains non-numerical data which will have to be converted (Sex, Embarked,Cabin, Ticket, etc.)
#We can also see that there are NaN entries which will have to be dealt with

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [54]:
#Converting Sex
df['Sex'] = df['Sex'].apply(lambda x: 1 if x == "male" else 0)

#Converting Embarked
#1 = C, 2 = Q, 3 = S
#Converting Sex
df['Embarked'] = df['Embarked'].apply(lambda x: 1 if x == "C" else (2 if x == "Q"  else 3))

#Checking the number of Nan for the Cabin column
df.Cabin.isnull().sum()
#Cabin contains 687 NaN, there is too many missing data points and we will have to drop the feature
#We will also drop PassengerID, Name, and Ticket, as they do not provide meaningful information
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

#Checking final dataset for NaN values
df.isnull().sum()
#We can see that Age is the only variable containing NaN values
#We will fill these NaNs with the median age of the dataset which is 28
df.Age.median()
df.fillna(28, inplace=True)
#Thus our final dataset contains: Survived (label), Pclass, Sex, SibSP, Parch, Fare, Embarked
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,3
1,1,1,0,38.0,1,0,71.2833,1
2,1,3,0,26.0,0,0,7.925,3
3,1,1,0,35.0,1,0,53.1,3
4,0,3,1,35.0,0,0,8.05,3


In [55]:
#Training our ML classifier
#Train/Test split of 70/30
X = np.array(df.drop(['Survived'], 1))
y = np.array(df['Survived'])
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3)

In [None]:
#Classification accuracy is the number of correct predictions made as a ratio of all predictions made.

#The AUC represents a model’s ability to discriminate between positive and negative classes. 
#An area of 1.0 represents a model that made all predictions perfectly. 
#An area of 0.5 represents a model as good as random.

#The confusion matrix presents predictions on the x-axis and accuracy outcomes on the y-axis. 
#The cells of the table are the number of predictions made by a machine learning algorithm.

#Scikit-learn does provide a convenience report when working on classification problems to give you a quick idea of the accuracy of a model using a number of measures.
#The classification_report() function displays the precision, recall, f1-score and support for each class.

In [70]:
#Linear SVM
clf1 = svm.SVC(kernel='linear')
print("Beginning training of SVC")
clf1.fit(X_train, y_train)

accuracy1 = clf1.score(X_test, y_test)
print("Accuracy of SVC is :", accuracy1)
print("")

kfold = model_selection.KFold(n_splits=10, random_state=20)
model = svm.SVC(kernel='linear')
scoring = 'roc_auc'
results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
print("AUC mean: ", results.mean())
print("AUC STD: ", results.std())
print("")

predicted = clf1.predict(X_test)
y_actu = pd.Series(y_test, name='Actual')
y_pred = pd.Series(predicted, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)
print("CONFUSION MATRIX")
print(df_confusion)
print("")

predicted = clf1.predict(X_test)
report = classification_report(y_test, predicted)
print(report)


Beginning training of SVC
Accuracy of SVC is : 0.787313432836

AUC mean:  0.832385912145
AUC STD:  0.0353859160215

CONFUSION MATRIX
Predicted    0   1
Actual            
0          138  28
1           29  73

             precision    recall  f1-score   support

          0       0.83      0.83      0.83       166
          1       0.72      0.72      0.72       102

avg / total       0.79      0.79      0.79       268



In [71]:
#KNN
clf2 = neighbors.KNeighborsClassifier(n_jobs=-1, n_neighbors=5)
print("Beginning training of KNN")
clf2.fit(X_train, y_train)

accuracy2 = clf2.score(X_test, y_test)
print("Accuracy of KNN is :", accuracy2)

kfold = model_selection.KFold(n_splits=10, random_state=20)
model = neighbors.KNeighborsClassifier(n_jobs=-1, n_neighbors=5)
scoring = 'roc_auc'
results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
print("AUC mean: ", results.mean())
print("AUC STD: ", results.std())
print("")

predicted = clf2.predict(X_test)
y_actu = pd.Series(y_test, name='Actual')
y_pred = pd.Series(predicted, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)
print("CONFUSION MATRIX")
print(df_confusion)
print("")

predicted = clf2.predict(X_test)
report = classification_report(y_test, predicted)
print(report)

Beginning training of KNN
Accuracy of KNN is : 0.701492537313
AUC mean:  0.740835449445
AUC STD:  0.0505752485231

CONFUSION MATRIX
Predicted    0   1
Actual            
0          126  40
1           40  62

             precision    recall  f1-score   support

          0       0.76      0.76      0.76       166
          1       0.61      0.61      0.61       102

avg / total       0.70      0.70      0.70       268



In [72]:
#RF
clf3 = RandomForestClassifier(n_jobs=-1, random_state=20, n_estimators=1000, oob_score=True)
print("Beginning training of RF")
clf3.fit(X_train, y_train)
accuracy3 = clf3.score(X_test, y_test)

print("Accuracy of RF is: ", accuracy3)
print("Out-of-bag score estimate: ", clf3.oob_score_)
print("Feature importances: ", clf3.feature_importances_)

kfold = model_selection.KFold(n_splits=10, random_state=20)
model = RandomForestClassifier(n_jobs=-1, random_state=20, n_estimators=1000, oob_score=True)
scoring = 'roc_auc'
results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
print("AUC mean: ", results.mean())
print("AUC STD: ", results.std())
print("")

predicted = clf3.predict(X_test)
y_actu = pd.Series(y_test, name='Actual')
y_pred = pd.Series(predicted, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)
print("CONFUSION MATRIX")
print(df_confusion)
print("")

predicted = clf3.predict(X_test)
report = classification_report(y_test, predicted)
print(report)

Beginning training of RF
Accuracy of RF is:  0.813432835821
Out-of-bag score estimate:  0.810593900482
Feature importances:  [ 0.083853    0.25098183  0.25990209  0.05395578  0.04013897  0.27279139
  0.03837694]
AUC mean:  0.85798643294
AUC STD:  0.0398478724821

CONFUSION MATRIX
Predicted    0   1
Actual            
0          137  29
1           21  81

             precision    recall  f1-score   support

          0       0.87      0.83      0.85       166
          1       0.74      0.79      0.76       102

avg / total       0.82      0.81      0.81       268



In [74]:
#Voting hybrid
eclf = VotingClassifier(estimators=[('svc', clf1), ('knn', clf2), ('rf',clf3)], voting='hard')
eclf = eclf.fit(X_train, y_train)

accuracy = eclf.score(X_test, y_test)
print("Accuracy of the voting classifer is :", accuracy)

predicted = eclf.predict(X_test)
y_actu = pd.Series(y_test, name='Actual')
y_pred = pd.Series(predicted, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)
print("CONFUSION MATRIX")
print(df_confusion)
print("")

report = classification_report(y_test, predicted)
print(report)

Accuracy of the voting classifer is : 0.805970149254
CONFUSION MATRIX
Predicted    0   1
Actual            
0          138  28
1           24  78

             precision    recall  f1-score   support

          0       0.85      0.83      0.84       166
          1       0.74      0.76      0.75       102

avg / total       0.81      0.81      0.81       268



In [75]:
#We can see that the random forest classifier outperforms the other ML algorithms across all metrics
model = clf3

In [76]:
#Applying our RF model to the Kaggle testing dataset
df = pd.read_csv("test.csv")
#Converting Sex
df['Sex'] = df['Sex'].apply(lambda x: 1 if x == "male" else 0)

#Converting Embarked
#1 = C, 2 = Q, 3 = S
#Converting Sex
df['Embarked'] = df['Embarked'].apply(lambda x: 1 if x == "C" else (2 if x == "Q"  else 3))

#Cabin contains 687 NaN, there is too many missing data points and we will have to drop the feature
#We will also drop PassengerID, Name, and Ticket, as they do not provide meaningful information
df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

#We will fill these NaNs with the median age of the dataset which is 28
df.Age.median()
df.fillna(28, inplace=True)
#Thus our final dataset contains: Pclass, Sex, SibSP, Parch, Fare, Embarked
df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,2
1,893,3,0,47.0,1,0,7.0,3
2,894,2,1,62.0,0,0,9.6875,2
3,895,3,1,27.0,0,0,8.6625,3
4,896,3,0,22.0,1,1,12.2875,3


In [80]:
#Predicting and preparing the csv for Kaggle
X = np.array(df.drop(['PassengerId'], 1))
df['Survived'] = model.predict(X)
df = df.drop(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], axis=1)
df.head()
df.to_csv("KAGGLE_TITANIC_ENTRY.csv", index=False)

In [None]:
#The final accuracy, verified by Kaggle, is 76.067%