In [16]:
import pandas as pd
import numpy as np 

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

%run util.ipynb

# Naïve Bayes 

In [17]:
# References (scikit-learn documentation):
#   Naive Bayes user guide: https://scikit-learn.org/stable/modules/naive_bayes.html
#   GaussianNB API:         https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

## Load Data

In [18]:
# Load the dataset. `get_data` is not defined or imported in this notebook;
# presumably it comes from the `%run util.ipynb` in the first cell -- TODO confirm.
# It returns three values: X and Y are passed to train_test_split below as the
# features and labels; df is not used by any cell visible in this section.
X, Y, df = get_data()

In [19]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore every score below) reproducible across runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [20]:
# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted statistics are then applied to both splits, so nothing
# about the test set influences the transformation.
# NOTE: Xtrain / Xtest are overwritten here, so re-running this cell alone
# re-fits the scaler on already-scaled data.
s = StandardScaler().fit(Xtrain)
Xtrain = s.transform(Xtrain)
Xtest = s.transform(Xtest)

## Train Model

In [21]:
# Baseline classifier: Gaussian Naive Bayes with library-default settings
# (no hyperparameters are passed here; tuning happens further down).
clf = GaussianNB()

In [22]:
# Fit the baseline model on the standardised training split.
# fit() returns the estimator itself, which is what the cell displays.
clf.fit(X=Xtrain, y=Ytrain)

0,1,2
,priors,
,var_smoothing,1e-09


In [23]:
# Baseline performance on the held-out split: predict labels for the test
# rows, then report the fraction that match the true labels.
YtestPred = clf.predict(X=Xtest)
accuracy_score(y_true=Ytest, y_pred=YtestPred)

0.5520833333333334

In [24]:
# Per-class precision / recall / F1 for the baseline model on the test split.
# classification_report returns a plain string, so print() is needed to keep
# the table layout.
print(classification_report(y_true=Ytest, y_pred=YtestPred))

              precision    recall  f1-score   support

           0       0.54      1.00      0.70        50
           1       1.00      0.07      0.12        46

    accuracy                           0.55        96
   macro avg       0.77      0.53      0.41        96
weighted avg       0.76      0.55      0.42        96



### Hyperparameter Tuning

In [25]:
# List the hyperparameters of the fitted baseline model to see what can be
# tuned (displayed as a dict of parameter name -> current value).
clf.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [26]:
# Tune var_smoothing, the only numeric hyperparameter reported by get_params().
# Larger values add more variance to every feature, which acts as a form of
# regularisation. Candidates span 1e-10 .. 1e-3 in decade steps.
paramGridNB = dict(
    var_smoothing=[1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
)

# Exhaustive search: each candidate is scored by accuracy with 10-fold
# cross-validation; n_jobs=-1 runs the fits in parallel.
gridSearch = GridSearchCV(
    GaussianNB(),
    paramGridNB,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

In [27]:
# Run the search on the training split only; the test split stays untouched
# so it can still serve as an unbiased check of the selected model.
gridSearch.fit(X=Xtrain, y=Ytrain)

0,1,2
,estimator,GaussianNB()
,param_grid,"{'var_smoothing': [1e-10, 1e-09, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,priors,
,var_smoothing,0.0001


In [28]:
# Winning hyperparameter setting and its mean cross-validated score
# (accuracy, as configured via scoring='accuracy' on the search object).
gridSearch.best_params_, gridSearch.best_score_

({'var_smoothing': 0.0001}, np.float64(0.6235177865612648))

In [31]:
# Take the estimator selected by the grid search and score it on the
# held-out test split, for comparison with the baseline accuracy above.
model = gridSearch.best_estimator_
YtestPredGrid = model.predict(X=Xtest)
accuracy_score(y_true=Ytest, y_pred=YtestPredGrid)

0.65625

In [33]:
# Per-class precision / recall / F1 for the tuned model on the test split.
print(classification_report(y_true=Ytest, y_pred=YtestPredGrid))

              precision    recall  f1-score   support

           0       0.60      0.98      0.75        50
           1       0.93      0.30      0.46        46

    accuracy                           0.66        96
   macro avg       0.77      0.64      0.60        96
weighted avg       0.76      0.66      0.61        96



In [34]:
# Overfitting check: score the tuned model on the data it was trained on.
# A training accuracy far above the test accuracy would indicate overfitting.
YtrainPred = model.predict(X=Xtrain)
accuracy_score(y_true=Ytrain, y_pred=YtrainPred)

0.6591928251121076

In [36]:
# Per-class precision / recall / F1 for the tuned model on the training split.
print(classification_report(y_true=Ytrain, y_pred=YtrainPred))

              precision    recall  f1-score   support

           0       0.60      0.93      0.73       111
           1       0.85      0.39      0.54       112

    accuracy                           0.66       223
   macro avg       0.72      0.66      0.63       223
weighted avg       0.72      0.66      0.63       223

