# Titanic

This example shows how to build and compare machine learning models that predict whether a passenger aboard the Titanic survived, based on the attributes in the [dataset](https://hbiostat.org/data/repo/titanic3.csv).


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Apply seaborn's "darkgrid" theme as the default style for figures drawn later in the session.
sns.set_theme(style="darkgrid")

In [2]:
# Load the passenger data from a local CSV; the path is relative to the
# notebook's working directory.
titanic_df = pd.read_csv('./data/titanic_data.csv')
# memory_usage='deep' makes info() measure the real memory held by the
# object (string) columns instead of estimating it.
titanic_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Name      1309 non-null   object 
 2   Sex       1309 non-null   object 
 3   Age       1046 non-null   float64
 4   SibSp     1309 non-null   int64  
 5   Parch     1309 non-null   int64  
 6   Ticket    1309 non-null   object 
 7   Fare      1308 non-null   float64
 8   Cabin     295 non-null    object 
 9   Embarked  1307 non-null   object 
 10  Survived  1309 non-null   int64  
dtypes: float64(2), int64(4), object(5)
memory usage: 452.7 KB


## Data Exploration

In [3]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
titanic_df.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
count,1309.0,1046.0,1309.0,1309.0,1308.0,1309.0
mean,2.294882,29.897706,0.498854,0.385027,33.296261,0.381971
std,0.837836,14.414973,1.041658,0.86556,51.758691,0.486055
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.9,0.0
50%,3.0,28.0,0.0,0.0,14.45,0.0
75%,3.0,39.0,1.0,0.0,31.28,1.0
max,3.0,80.0,8.0,9.0,512.33,1.0


In [4]:
# Preview the first five rows of the raw data.
titanic_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.34,B5,S,1
1,1,"Allison, Master. Hudson Trevor",male,1.0,1,2,113781,151.55,C22 C26,S,1
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,0
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,0
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,0


## Data Cleaning

In [None]:
# Impute missing ages with the median age.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on titanic_df['Age']: that form is chained assignment
# through an inplace method, which emits a warning in pandas 2.x and leaves
# titanic_df unchanged once Copy-on-Write is enabled (default in pandas 3).
age_median = titanic_df['Age'].median(skipna=True)
titanic_df['Age'] = titanic_df['Age'].fillna(age_median)

In [6]:
# Cabin is non-null for only 295 of the 1309 rows (see info() above);
# remove the column from the frame.
titanic_df.drop(columns='Cabin', inplace=True)

In [None]:
# Fill the two missing Embarked values with "S".
# NOTE(review): "S" is presumably the most frequent port - confirm with
# titanic_df['Embarked'].value_counts().
# Assign the result back rather than using fillna(inplace=True) on the
# selected column; the chained inplace form warns in pandas 2.x and is a
# no-op under Copy-on-Write (pandas 3).
titanic_df['Embarked'] = titanic_df['Embarked'].fillna("S")

In [None]:
# Impute the single missing Fare with the median fare.
# The result is assigned back to the column: fillna(inplace=True) on
# titanic_df['Fare'] is chained assignment through an inplace method, which
# warns in pandas 2.x and does not modify titanic_df under Copy-on-Write.
fare_median = titanic_df['Fare'].median(skipna=True)
titanic_df['Fare'] = titanic_df['Fare'].fillna(fare_median)

## Data Analysis

The two variables SibSp (*siblings or spouses aboard*) and Parch (*parents or children aboard*) carry similar information about each passenger: how many relatives they travelled with. We therefore add two new variables: *TravelGroup* (the number of accompanying relatives, SibSp + Parch) and *TravelAlone* (1 if the passenger has no accompanying relatives, 0 otherwise). This simplifies the analysis and removes redundancy in the data.


In [9]:
# Number of relatives travelling with the passenger (siblings/spouses + parents/children).
titanic_df['TravelGroup'] = titanic_df["SibSp"].add(titanic_df["Parch"])
# Flag passengers with no accompanying relatives: 1 = alone, 0 = in a group.
titanic_df['TravelAlone'] = np.where(titanic_df['TravelGroup'] <= 0, 1, 0)
titanic_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived,TravelGroup,TravelAlone
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.34,S,1,0,1
1,1,"Allison, Master. Hudson Trevor",male,1.0,1,2,113781,151.55,S,1,3,0
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,S,0,3,0
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,S,0,3,0
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,S,0,3,0


## Hypothesis

Does group size impact survival odds?

For example, individuals prioritizing the safety of others might spend extra time searching, potentially reducing their own survival chances by neglecting their escape.

In [10]:
# Total party size: the accompanying relatives plus the passenger themselves.
titanic_df['TravelTotal'] = 1 + titanic_df['TravelGroup']

In [11]:
# Remove the columns that are not used as model inputs: SibSp, Parch and
# TravelGroup (superseded by TravelAlone / TravelTotal), plus Ticket and Name.
titanic_df.drop(
    columns=['SibSp', 'Parch', 'TravelGroup', 'Ticket', 'Name'],
    inplace=True,
)

In [12]:
# Encode the categorical columns as integer codes.
# The same LabelEncoder instance is re-fitted for each column, so after this
# cell `le` only holds the classes of the last column fitted (Embarked).
le = preprocessing.LabelEncoder()
pclass_cat = le.fit_transform(titanic_df['Pclass'])
sex_cat = le.fit_transform(titanic_df['Sex'])
embarked_cat = le.fit_transform(titanic_df['Embarked'])

#Initialize the encoded categorical columns
titanic_df['pclass_cat'] = pclass_cat
titanic_df['sex_cat'] = sex_cat
titanic_df['embarked_cat'] = embarked_cat

#Drop old categorical fields from dataframe and put the columns in model order
dummy_fields = ['Pclass','Sex','Embarked']
model_columns = ['pclass_cat','sex_cat','Age','Fare','embarked_cat','TravelAlone', 'TravelTotal','Survived']
# Previously the dropped frame was assigned to `data` and then immediately
# overwritten by reindexing titanic_df, so the drop had no effect. Chain the
# two steps instead. Selecting with a column list (rather than reindex) raises
# a KeyError for a missing/misspelled column instead of silently adding an
# all-NaN column; .copy() gives `data` its own storage so later column
# assignments do not touch titanic_df.
data = titanic_df.drop(columns=dummy_fields)[model_columns].copy()

In [13]:
# Display the encoded model table (pandas elides the middle rows in the rendered output).
data

Unnamed: 0,pclass_cat,sex_cat,Age,Fare,embarked_cat,TravelAlone,TravelTotal,Survived
0,0,0,29.0,211.34,2,1,1,1
1,0,1,1.0,151.55,2,0,4,1
2,0,0,2.0,151.55,2,0,4,0
3,0,1,30.0,151.55,2,0,4,0
4,0,0,25.0,151.55,2,0,4,0
...,...,...,...,...,...,...,...,...
1304,2,0,15.0,14.45,0,0,2,0
1305,2,0,28.0,14.45,0,0,2,0
1306,2,1,27.0,7.23,0,1,1,0
1307,2,1,27.0,7.23,0,1,1,0


In [14]:
continuous = ['Age', 'Fare', 'TravelTotal']

scaler = StandardScaler()

# Standardise the continuous columns (zero mean, unit variance per column).
# StandardScaler scales each column independently, so a single fit over all
# three columns yields the same values as fitting column by column, while
# leaving `scaler` holding the statistics of every column (the per-column loop
# overwrote them, keeping only the last column's).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test rows contribute to the scaling statistics - consider fitting
# on the training split only.
data[continuous] = scaler.fit_transform(data[continuous].astype('float64'))

In [15]:
# Display the table again to inspect the standardised Age, Fare and TravelTotal columns.
data

Unnamed: 0,pclass_cat,sex_cat,Age,Fare,embarked_cat,TravelAlone,TravelTotal,Survived
0,0,0,-0.040027,3.442616,2,1,-0.558346,1
1,0,1,-2.210230,2.286623,2,0,1.336749,1
2,0,0,-2.132722,2.286623,2,0,1.336749,0
3,0,1,0.037481,2.286623,2,0,1.336749,0
4,0,0,-0.350056,2.286623,2,0,1.336749,0
...,...,...,...,...,...,...,...,...
1304,2,0,-1.125128,-0.364099,0,0,0.073352,0
1305,2,0,-0.117534,-0.364099,0,0,0.073352,0
1306,2,1,-0.195041,-0.503693,0,1,-0.558346,0
1307,2,1,-0.195041,-0.503693,0,1,-0.558346,0


In [16]:
# Sanity check: list (up to five) rows that still contain a missing value.
# An empty table means the data is clean.
data.loc[data.isna().any(axis=1)].head()

Unnamed: 0,pclass_cat,sex_cat,Age,Fare,embarked_cat,TravelAlone,TravelTotal,Survived


## Train and evaluate model

In [17]:
# Separate the seven feature columns from the target column (Survived,
# which is the last column of `data`).
X = data.drop(columns='Survived')
Y = data['Survived']

In [18]:
#Test/Train Split
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the split - and therefore every accuracy reported below - is reproducible
# across Restart & Run All. Exact accuracy figures may differ from those
# recorded with the earlier unseeded split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [19]:
# Fit a Gaussian Naive Bayes classifier on the training split
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predict the held-out passengers
y_pred = gnb.predict(X_test)

# Fraction of held-out passengers classified correctly
NB_all_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Naive Bayes Model Accuracy with all attributes: {0:.2f}'.format(NB_all_accuracy))

Naive Bayes Model Accuracy with all attributes: 0.81


In [20]:
# Fit an entropy-based decision tree on the training split; random_state
# fixes the tree's internal tie-breaking.
tree = DecisionTreeClassifier(
    criterion='entropy',
    min_samples_split=2,
    random_state=5,
)
tree.fit(X_train, y_train)

# Predict the held-out passengers
y_pred = tree.predict(X_test)

# Fraction of held-out passengers classified correctly
tree_all_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Decision Tree Accuracy with all attributes: {0:.2f}'.format(tree_all_accuracy))

Decision Tree Accuracy with all attributes: 0.77


In [21]:
# Fit a second GaussianNB on the same training split and score it on the
# same test split.
model = GaussianNB().fit(X_train, y_train)
predictions = model.predict(X_test)

# Fraction (0-1) of test rows predicted correctly
accuracy = metrics.accuracy_score(y_test, predictions)

# Despite its name this remains a 0-1 fraction; it is multiplied by 100 only
# where it is printed as a percentage in the comparison cell.
accuracy_percentage = accuracy

# Report the score to two decimals
print("Accuracy with all attributes: {0:.2f}".format(accuracy_percentage))


Accuracy with all attributes: 0.81


## Accuracy Score Comparison

In [22]:
# Print each accuracy (stored as a 0-1 fraction) as a whole-number percentage.
# "OVERALL" reports the score of the second GaussianNB fit (accuracy_percentage).
print(
    'NB accuracy: {:.0f}%'.format(round(100 * NB_all_accuracy)),
    "Decision Tree: {:.0f}%".format(round(100 * tree_all_accuracy)),
    "OVERALL: {:.0f}%".format(round(100 * accuracy_percentage)),
    sep='\n',
)
print()


NB accuracy: 81%
Decision Tree: 77%
OVERALL: 81%

