# Importing Libraries

In [76]:
import pickle

import numpy as np
import pandas as pd
from numpy import mean
from numpy import absolute
from numpy import sqrt

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score,mean_squared_error,r2_score

# Importing Datasets

In [77]:
# Read train_dataset.csv (path is relative to the notebook's working directory).
train = pd.read_csv("train_dataset.csv")

In [78]:
# Read test_dataset.csv (path is relative to the notebook's working directory).
test = pd.read_csv("test_dataset.csv")

# Extracting Data

In [79]:
# Use the 'Unnamed: 0' column as the row index. The guard keeps this cell
# re-runnable: after the first run that column no longer exists, and an
# unconditional set_index would raise KeyError.
if 'Unnamed: 0' in train.columns:
    train = train.set_index('Unnamed: 0')

# Features are the first nine columns (by position); the target is Survived.
X_train = train.iloc[:, 0:9]
y_train = train.Survived

In [80]:
# Use the 'Unnamed: 0' column as the row index. The guard keeps this cell
# re-runnable: after the first run that column no longer exists, and an
# unconditional set_index would raise KeyError.
if 'Unnamed: 0' in test.columns:
    test = test.set_index('Unnamed: 0')

# Features are the first nine columns (by position); the target is Survived.
X_test = test.iloc[:, 0:9]
y_test = test.Survived

# Taking a look at the data

In [81]:
# Show the shape of each feature/target object, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(534, 9)
(534,)
(179, 9)
(179,)


In [82]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,FamilyCount,Title,Has Cabin,IsAlone
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
711,1,0,29.699118,26.55,0,0,1,1,0
466,2,0,29.699118,0.0,0,0,1,0,0
213,2,0,30.0,13.0,0,0,1,0,0
496,1,1,54.0,78.2667,2,1,2,1,1
583,1,0,36.0,40.125,2,0,1,1,0


In [83]:
# Preview the first five training labels.
y_train.head(n=5)

Unnamed: 0
711    0
466    0
213    0
496    1
583    0
Name: Survived, dtype: int64

# Gaussian Naive Bayes


In [84]:
# Baseline: Gaussian Naive Bayes with default settings, fitted on the
# training data and then used to predict labels for the test features.
gaussian = GaussianNB().fit(X_train, y_train)
y_pred = gaussian.predict(X_test)

In [85]:
# Accuracy of the baseline predictions against the held-out test labels.
acc_gnb = accuracy_score(y_true=y_test, y_pred=y_pred)
acc_gnb

0.7597765363128491

# Hyperparameter Tuning to improve Accuracy

In [87]:
# Candidate var_smoothing values: 100 points, log-spaced from 1e0 down to 1e-9.
param_grid_nb = dict(var_smoothing=np.logspace(start=0, stop=-9, num=100))

In [88]:
# Exhaustive search over the var_smoothing candidates with 10-fold
# cross-validation on all available cores. No `scoring` is passed, so
# candidates are ranked by the estimator's default score.
nbModel_grid = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid_nb,
    cv=10,
    n_jobs=-1,
    verbose=1,
)
nbModel_grid.fit(X_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


GridSearchCV(cv=10, estimator=GaussianNB(), n_jobs=-1,
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225e-02, 5.33669923e-02, 4.32876128e-02,
       3.51119173e-02, 2.848035...
       1.23284674e-07, 1.00000000e-07, 8.11130831e-08, 6.57933225e-08,
       5.33669923e-08, 4.32876128e-08, 3.51119173e-08, 2.84803587e-08,
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])},
             verbose=1)

In [89]:
# Parameter setting chosen by the grid search (only var_smoothing was searched).
nbModel_grid.best_params_

{'var_smoothing': 4.328761281083053e-06}

In [90]:
# Mean cross-validated score of the best candidate; no `scoring` was passed to
# the search, so this is the estimator's default score.
nbModel_grid.best_score_

0.8219426974143955

# Cross validating the model

In [91]:
# 10-fold splitter; rows are shuffled with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

In [92]:
# Cross-validate the whole grid search (it is re-fitted inside every outer
# fold) and score each fold with negated mean absolute error.
scores = cross_val_score(
    nbModel_grid,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)

# Scores are negated errors, so take absolute values before averaging.
mean(absolute(scores))

0.17414395527603074

In [93]:
# 5-fold splitter (shuffled, fixed seed) used for the RMSE estimate.
cv = KFold(n_splits=5, shuffle=True, random_state=1)

# Evaluate the grid-searched model with 5-fold cross-validation (this is
# k-fold, not leave-one-out). Each fold's score is the negated MSE.
# NOTE: the fitted baseline `gaussian` is intentionally left untouched here;
# this cell only evaluates `nbModel_grid`.
scores = cross_val_score(
    nbModel_grid,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)

# RMSE: square root of the mean per-fold MSE (scores are negated, hence absolute).
sqrt(mean(absolute(scores)))

0.4370231613080266

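The cross-validated MAE is about 0.17 (10-fold) and the RMSE about 0.44 (5-fold). RMSE is the square root of MSE, so for errors below 1 it is always the larger number: an RMSE of 0.44 corresponds to an MSE of about 0.19. Assuming `Survived` is coded 0/1, both MAE and MSE equal the misclassification rate, so these figures mean roughly 17–19% of passengers are misclassified.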

# Pickle the model

In [94]:
# Save the fitted grid-search object to disk. The context manager closes the
# file handle even if pickling raises, so the handle is never leaked.
filename = "GNB model.pkl"
with open(filename, 'wb') as model_file:
    pickle.dump(nbModel_grid, model_file)