In [1]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB   #import Gaussian Bayes modeling function
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [17]:
#path to the Titanic spreadsheet, relative to the notebook
location = "datasets/titanic.xls"

#read the workbook into a DataFrame and preview its first five rows
df = pd.read_excel(io=location)
df.head(5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [18]:
#number of missing values in each column of the raw data
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [19]:
#impute missing ages with the mean age of the passenger's (survived, sex, pclass) group
#the result is assigned back to the column: calling fillna(..., inplace=True) on the
#selection df['age'] is deprecated chained assignment and silently does nothing
#when pandas copy-on-write is enabled
#NOTE(review): 'survived' is the prediction target, so grouping on it leaks label
#information into the 'age' feature; consider grouping on ['sex', 'pclass'] only
#(that would change the scores reported further down)
group_mean_age = df.groupby(['survived', 'sex', 'pclass'])['age'].transform('mean')
df['age'] = df['age'].fillna(group_mean_age)

In [20]:
#mean age for every (survived, sex, pclass) combination
(
    df
    .groupby(by=['survived', 'sex', 'pclass'])['age']
    .mean()
)

survived  sex     pclass
0         female  1         35.200000
                  2         34.090909
                  3         23.418750
          male    1         43.658163
                  2         33.092593
                  3         26.679598
1         female  1         37.109375
                  2         26.711051
                  3         20.814815
          male    1         36.168240
                  2         17.449274
                  3         22.436441
Name: age, dtype: float64

In [21]:
#missing values per column, to confirm 'age' no longer has gaps
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age             0
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [22]:
#rows where the 'embarked' value is missing
embarked = df[df['embarked'].isna()]
embarked

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
168,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,6,,
284,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,6,,"Cincinatti, OH"


In [23]:
#index labels of the rows with a missing 'embarked' value
#(same result as list(embarked.index), using the frame built in the previous cell)
embark = df[df['embarked'].isna()].index.tolist()

In [24]:
#how often each 'embarked' value occurs (missing values are not counted)
df.loc[:, 'embarked'].value_counts()

S    914
C    270
Q    123
Name: embarked, dtype: int64

In [25]:
#fill the missing 'embarked' values with 'S', the most frequent value in the column
#the result is assigned back to the column: calling fillna(..., inplace=True) on the
#selection df['embarked'] is deprecated chained assignment and silently does nothing
#when pandas copy-on-write is enabled
df['embarked'] = df['embarked'].fillna('S')

In [26]:
#show the rows whose 'embarked' value was just filled in
#'embark' holds index labels (it was built from .index), so select by label with .loc;
#positional .iloc only returns the same rows while the index is the default 0..n-1 range
df.loc[embark]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
168,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,S,6,,
284,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,S,6,,"Cincinatti, OH"


In [27]:
#missing values per column, to confirm 'embarked' no longer has gaps
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age             0
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        0
boat          823
body         1188
home.dest     564
dtype: int64

In [60]:
#build the modelling frame by removing the columns that are not used as features
model_df = df.drop(
    columns=['name', 'ticket', 'fare', 'cabin', 'boat', 'body', 'home.dest']
)

In [61]:
#columns that remain in the modelling frame after the drop
model_df.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'embarked'], dtype='object')

In [62]:
#one-hot encode the categorical 'pclass' and 'embarked' columns
#NOTE: this cell overwrites model_df, so it can only be run once per kernel session
model_df = pd.get_dummies(model_df, columns=['pclass', 'embarked'])
model_df.head(5)

Unnamed: 0,survived,sex,age,sibsp,parch,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S
0,1,female,29.0,0,0,1,0,0,0,0,1
1,1,male,0.9167,1,2,1,0,0,0,0,1
2,0,female,2.0,1,2,1,0,0,0,0,1
3,0,male,30.0,1,2,1,0,0,0,0,1
4,0,female,25.0,1,2,1,0,0,0,0,1


In [63]:
#encode 'sex' numerically: female -> 0, male -> 1
#NOTE: re-running this cell maps the already-encoded 0/1 values to NaN
model_df = model_df.assign(sex=model_df['sex'].map({'female': 0, 'male': 1}))
model_df.head(5)

Unnamed: 0,survived,sex,age,sibsp,parch,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S
0,1,0,29.0,0,0,1,0,0,0,0,1
1,1,1,0.9167,1,2,1,0,0,0,0,1
2,0,0,2.0,1,2,1,0,0,0,0,1
3,0,1,30.0,1,2,1,0,0,0,0,1
4,0,0,25.0,1,2,1,0,0,0,0,1


In [64]:
#family_num is the sum of 'sibsp' and 'parch'; the two source columns are then dropped
model_df['family_num'] = model_df['sibsp'].add(model_df['parch'])
model_df.drop(columns=['sibsp', 'parch'], inplace=True)
model_df.head(5)

Unnamed: 0,survived,sex,age,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S,family_num
0,1,0,29.0,1,0,0,0,0,1,0
1,1,1,0.9167,1,0,0,0,0,1,3
2,0,0,2.0,1,0,0,0,0,1,3
3,0,1,30.0,1,0,0,0,0,1,3
4,0,0,25.0,1,0,0,0,0,1,3


In [65]:
#TravelAlone is 0 when family_num is greater than 0, otherwise 1
model_df['TravelAlone'] = np.where(model_df['family_num'].gt(0), 0, 1)
model_df.head(5)

Unnamed: 0,survived,sex,age,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S,family_num,TravelAlone
0,1,0,29.0,1,0,0,0,0,1,0,1
1,1,1,0.9167,1,0,0,0,0,1,3,0
2,0,0,2.0,1,0,0,0,0,1,3,0
3,0,1,30.0,1,0,0,0,0,1,3,0
4,0,0,25.0,1,0,0,0,0,1,3,0


In [67]:
#target column the models learn to predict
y = model_df['survived']

#feature matrix: every remaining column except the target
X = model_df.drop(columns=['survived'])

In [68]:
#create training and test data
#test size is set explicitly to 35% (scikit-learn's default would be 25%);
#random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=109)

In [69]:
#create the Gaussian Naive Bayes classifier with its default settings spelled out
gnb = GaussianNB(priors=None, var_smoothing=1e-09)

In [70]:
#fit the Gaussian model on the training split
gnb.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [71]:
#accuracy of the Gaussian model on the data it was trained on
accuracy_score(y_train, gnb.predict(X_train))

0.7764705882352941

In [72]:
#predict labels for the unseen test split
#and store them for the evaluation cells below
y_pred = gnb.predict(X=X_test)

In [75]:
#Confusion matrix: counts of correct and incorrect predictions,
#with true classes as rows and predicted classes as columns

cm = pd.DataFrame(
    data=confusion_matrix(y_true=y_test, y_pred=y_pred),
    index=['True Not Survival', 'True Survival'],
    columns=['Predicted Not Survival', 'Predicted Survival'],
)

cm

Unnamed: 0,Predicted Not Survival,Predicted Survival
True Not Survival,232,51
True Survival,52,124


In [76]:
#frequency of each 'survived' value (0 and 1) in the test dataset
y_test.value_counts()

0    283
1    176
Name: survived, dtype: int64

In [77]:
#accuracy of the Gaussian model on the held-out test split
accuracy_score(y_test, gnb.predict(X_test))

0.775599128540305

In [78]:
#per-class precision, recall and f1-score for the predictions stored in y_pred
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82       283
           1       0.71      0.70      0.71       176

   micro avg       0.78      0.78      0.78       459
   macro avg       0.76      0.76      0.76       459
weighted avg       0.78      0.78      0.78       459



In [79]:
#import Bernoulli Naïve Bayes function from scikit-learn library
from sklearn.naive_bayes import BernoulliNB

In [80]:
#create the Bernoulli Naive Bayes classifier with its default settings spelled out
#NOTE(review): with binarize=0.0 the non-binary features ('age', 'family_num') are
#presumably thresholded at 0 before fitting - confirm this is intended
bnb = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [81]:
#fit the Bernoulli model on the training split
bnb.fit(X=X_train, y=y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [82]:
#accuracy of the Bernoulli model on the data it was trained on
accuracy_score(y_train, bnb.predict(X_train))

0.7470588235294118

In [83]:
#predict labels for the unseen test split with the Bernoulli model
#NOTE: this replaces the Gaussian model's predictions previously held in y_pred
y_pred = bnb.predict(X=X_test)

In [84]:
#Confusion matrix shows which values model predicted correctly vs incorrectly
#rows are the true classes and columns the predicted classes
#labels describe passenger survival (the target is 'survived'), matching the
#labels used for the Gaussian model's confusion matrix

cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    columns=['Predicted Not Survival', 'Predicted Survival'],
    index=['True Not Survival', 'True Survival']
)

cm

Unnamed: 0,Predicted Failed,Predicted Passed
True Failed,234,49
True Passed,58,118


In [85]:
#accuracy of the Bernoulli model on the held-out test split
accuracy_score(y_test, bnb.predict(X_test))

0.7668845315904139

In [86]:
#per-class precision, recall and f1-score for the predictions stored in y_pred
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.80      0.83      0.81       283
           1       0.71      0.67      0.69       176

   micro avg       0.77      0.77      0.77       459
   macro avg       0.75      0.75      0.75       459
weighted avg       0.77      0.77      0.77       459



# Compare the two models against each other. Did one model perform better than the other?
With a 65/35 train/test split, the Gaussian NB model has a slightly better test accuracy than the Bernoulli NB model (0.7755 vs 0.7668). Furthermore, the Gaussian NB model seems to be a stronger predictor of "not survival" status.

How does the performance of these two models compare to the other classification algorithms, logistic regression and decision trees?
With the same 65/35 split, the Logistic Regression model has better test (0.7908) and training (0.7964) accuracy than both the Gaussian NB and Bernoulli NB models, and seems to be better at predicting "survival" (0.82 vs 0.81) and "not survival" (0.75 vs 0.71) status. With the same train/test split, the Decision Tree model seems to be better at predicting "not survival" (0.87) status but worse at predicting "survival" (0.69) status than the Logistic Regression, Gaussian NB and Bernoulli NB models.