In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import random as r

In [6]:
df = pd.read_csv("../Data/titanic.csv")
y = df['Survived']
X = df[['Pclass', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']]
FamilyMembers = X.loc[:,"Siblings/Spouses Aboard"] + X.loc[:,'Parents/Children Aboard']
data_train = pd.DataFrame({
    "Pclass" : X["Pclass"],
    "Sex" : X["Sex"],
    "Age" : X["Age"],
    "FamilyMembers" : FamilyMembers,
    "Fare": X["Fare"]
})

## Bayes 
---
- With Laplace smoothing

In [20]:
class Titanic:
    def __init__(self, X_train, y_train):
        self.y_train = y_train
        self.X_train = X_train
        self.prioriSurvive = y_train.value_counts() / y_train.count()
        self.Pclass_table = self.Laplace_table(y_train, X_train['Pclass'])
        self.Sex_table = self.Laplace_table(y_train, X_train['Sex'])
        self.Family_table = self.Laplace_table(y_train, X_train['FamilyMembers'])
        #self.Sibling_tble = self.Laplace_table(y_train, X_train['Siblings/Spouses Aboard'])
        #self.Parent_table = self.Laplace_table(y_train, X_train['Parents/Children Aboard'])
        self.Age_marginal = {
            'survive_mean' : X_train[y_train==1]['Age'].mean(),
            'noSvive_mean' : X_train[y_train==0]['Age'].mean(),
            'survive_var' : X_train[y_train==1]['Age'].var(),
            'noSvive_var' : X_train[y_train==0]['Age'].var()
        }
        self.Fare_marginal = {
            'survive_mean' : X_train[y_train==1]['Fare'].mean(),
            'noSvive_mean' : X_train[y_train==0]['Fare'].mean(),
            'survive_var' : X_train[y_train==1]['Fare'].var(),
            'noSvive_var' : X_train[y_train==0]['Fare'].var()
        }
    
    @staticmethod
    def Laplace_table(y_train, X_Series):
        adf = pd.crosstab(y_train, X_Series)
        adf = adf + np.array([1]*(adf.shape[0]*adf.shape[1])).reshape(adf.shape)
        return adf.apply(lambda col: col/ adf.sum(axis=1), axis=0)
    
    @staticmethod
    def Lklhd_prb(x, _var, _mean):
        return (1 / np.sqrt(2*np.pi*_var) * np.exp(-(x - _mean)**2/(2*_var)))
    
    def decision(self, X_testSeries):
        Pclass = X_testSeries['Pclass']
        Sex = X_testSeries['Sex']
        Age = X_testSeries['Age']
        Family = X_testSeries['FamilyMembers']
        #Sibling = X_testSeries['Siblings/Spouses Aboard']
        #Parent = X_testSeries['Parents/Children Aboard']
        Fare = X_testSeries['Fare']
        
        postNo = [self.prioriSurvive[0], 
                  self.Pclass_table.loc[0, Pclass], 
                  self.Sex_table.loc[0, Sex], 
                  self.Lklhd_prb(Age, self.Age_marginal['noSvive_var'], self.Age_marginal['noSvive_mean']), 
                  self.Family_table.loc[0, Family],
                  #self.Sibling_tble.loc[0, Sibling], 
                  #self.Parent_table.loc[0, Parent], 
                  self.Lklhd_prb(Fare, self.Fare_marginal['noSvive_var'], self.Fare_marginal['noSvive_mean'])]
        
        postYes = [self.prioriSurvive[1], 
                   self.Pclass_table.loc[1, Pclass],
                   self.Sex_table.loc[1, Sex],
                   self.Lklhd_prb(Age, self.Age_marginal['survive_var'], self.Age_marginal['survive_mean']),
                   self.Family_table.loc[1, Family],
                  #self.Sibling_tble.loc[1, Sibling], 
                  #self.Parent_table.loc[1, Parent],
                   self.Lklhd_prb(Fare, self.Fare_marginal['survive_var'], self.Fare_marginal['survive_mean'])]
        return {'postNo':postNo, 'postYes':postYes}

In [19]:
X_train, X_test, y_train, y_test = train_test_split(data_train, y , test_size=0.2, random_state=r.randint(1,100))
ttnc = Titanic(X_train, y_train)
prob_List = X_test.apply(ttnc.decision, axis=1)
y_pred = []
for p in prob_List:
    raw = {
        "No": sum(np.log(p['postNo'])),
        "Yes":sum(np.log(p['postYes']))
    }
    result = 1 if raw['Yes'] >= raw['No'] else 0
    y_pred.append(result)
accuracy_score(y_test, y_pred)

0.8202247191011236

## accuracy

In [21]:
experiment = []
for i in range(11):
    X_train, X_test, y_train, y_test = train_test_split(data_train, y , test_size=0.2, random_state=r.randint(1,100))
    ttnc = Titanic(X_train, y_train)
    prob_List = X_test.apply(ttnc.decision, axis=1)
    y_pred = []
    for p in prob_List:
        raw = {
            "No": sum(np.log(p['postNo'])),
            "Yes":sum(np.log(p['postYes']))
        }
        result = 1 if raw['Yes'] >= raw['No'] else 0
        y_pred.append(result)
    experiment.append(accuracy_score(y_test, y_pred))

In [22]:
np.mean(experiment)

0.7717058222676201

# Using built-in function

In [1]:
from sklearn.naive_bayes import GaussianNB

In [2]:
gnb = GaussianNB()

In [25]:
data_train['Sex'] = data_train['Sex'].map({'male':1, 'female':0})

In [29]:
experiment = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(data_train, y , test_size=0.2, random_state=r.randint(1,100))
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    experiment.append(accuracy_score(y_test, y_pred))

In [30]:
np.mean(experiment)

0.7904494382022472