# Predicting Survivors of the Titanic using Gaussian Naive Bayes | Example of Gaussian Naive Bayes Sklearn in Python

Naive Bayes classifiers are built on Bayesian classification methods. These rely on Bayes's theorem, which is an equation describing the relationship of conditional probabilities of statistical quantities.Create a Model using Naive Bayes classifiers to predict whether a passenger on the titanic would have been survived or not.

Dataset can be downloaded from[ Kaggle.](https://https://www.kaggle.com/c/titanic/data)


---



In [15]:
import pandas as pd

In [19]:
df = pd.read_csv("titanic_data.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S
1,2,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C
2,3,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S
3,4,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S
4,5,0.0,3.0,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,,S


In [20]:
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0.0,3.0,male,22.0,7.25
1,1.0,1.0,female,38.0,71.2833
2,1.0,3.0,female,26.0,7.925
3,1.0,1.0,female,35.0,53.1
4,0.0,3.0,male,35.0,8.05


In [21]:
inputs = df.drop('Survived',axis='columns')
target = df.Survived

In [22]:

#inputs.Sex = inputs.Sex.map({'male': 1, 'female': 2})

dummies = pd.get_dummies(inputs.Sex)
dummies.head(10)

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
5,0,1
6,0,1
7,0,1
8,1,0
9,1,0


In [23]:
inputs = pd.concat([inputs,dummies],axis='columns')
inputs.head(10)

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3.0,male,22.0,7.25,0,1
1,1.0,female,38.0,71.2833,1,0
2,3.0,female,26.0,7.925,1,0
3,1.0,female,35.0,53.1,1,0
4,3.0,male,35.0,8.05,0,1
5,3.0,male,,8.4583,0,1
6,1.0,male,54.0,51.8625,0,1
7,3.0,male,2.0,21.075,0,1
8,3.0,female,27.0,11.1333,1,0
9,2.0,female,14.0,30.0708,1,0


**I am dropping male column as well because of dummy variable trap theory. One column is enough to repressent male vs female**

In [24]:
inputs.drop(['Sex','male'],axis='columns',inplace=True)
inputs.head(3)

Unnamed: 0,Pclass,Age,Fare,female
0,3.0,22.0,7.25,0
1,1.0,38.0,71.2833,1
2,3.0,26.0,7.925,1


In [25]:
inputs.columns[inputs.isna().any()]

Index(['Pclass', 'Age', 'Fare'], dtype='object')

In [26]:
inputs.Age[:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [27]:
inputs.Age = inputs.Age.fillna(inputs.Age.mean())
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female
0,3.0,22.0,7.25,0
1,1.0,38.0,71.2833,1
2,3.0,26.0,7.925,1
3,1.0,35.0,53.1,1
4,3.0,35.0,8.05,0


**Data splitting**

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs,target,test_size=0.3)

**Naive bayes classification**

In [29]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [None]:
model.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
model.score(X_test,y_test)

0.753731343283582

In [None]:

X_test[0:10]


Unnamed: 0,Pclass,Age,Fare,female
655,2,24.0,73.5,0
793,1,29.699118,30.6958,0
184,3,4.0,22.025,1
826,3,29.699118,56.4958,0
595,3,36.0,24.15,0
339,1,45.0,35.5,0
879,1,56.0,83.1583,1
323,2,22.0,29.0,1
856,1,45.0,164.8667,1
441,3,20.0,9.5,0


In [None]:

y_test[0:10]


655    0
793    0
184    1
826    0
595    0
339    0
879    1
323    1
856    1
441    0
Name: Survived, dtype: int64

In [None]:
model.predict(X_test[0:10])

array([0, 0, 1, 0, 0, 0, 1, 1, 1, 0])

In [None]:
model.predict_proba(X_test[:10])

array([[7.62591182e-01, 2.37408818e-01],
       [7.20387589e-01, 2.79612411e-01],
       [1.78593179e-01, 8.21406821e-01],
       [9.26911086e-01, 7.30889136e-02],
       [9.57802141e-01, 4.21978592e-02],
       [6.78590521e-01, 3.21409479e-01],
       [7.52353441e-03, 9.92476466e-01],
       [1.38914553e-01, 8.61085447e-01],
       [9.40139023e-05, 9.99905986e-01],
       [9.57126656e-01, 4.28733437e-02]])

**Calculate the score using cross validation**

In [None]:

from sklearn.model_selection import cross_val_score
cross_val_score(GaussianNB(),X_train, y_train, cv=5)


array([0.808     , 0.736     , 0.776     , 0.81451613, 0.78225806])