# 10. Naive Bayes

In this lab, run the code below to understand how to use Naive Bayes classification with scikit-learn. Then complete the exercise at the bottom of the notebook.

In [None]:
%matplotlib inline

from sklearn import datasets
import numpy as np
# Fix the seed so that the shuffle below (and therefore the train/test split)
# is the same on every run.
np.random.seed(1345)

# Load the wine dataset (see the "Wine recognition dataset" section of the
# scikit-learn toy-datasets documentation) and work on private copies.
wine = datasets.load_wine()
data, target = wine.data.copy(), wine.target.copy()

# Hold out roughly one third of the samples, picked at random, for testing.
total_samples = wine.target.shape[0]
exclude = round(total_samples / 3)
indices = np.arange(total_samples)
np.random.shuffle(indices)

# The last `exclude` shuffled indices are the test set, the rest the training set.
idx_train, idx_test = indices[:-exclude], indices[-exclude:]

# Sanity check: no sample may end up in both splits.
assert np.intersect1d(idx_test, idx_train).size == 0

# Split the features and the targets using the same index sets.
X_train, X_test = data[idx_train], data[idx_test]
y_train, y_test = target[idx_train], target[idx_test]

print(y_test)


### 10.1 Naive Bayes with scikit-learn
Go through the code below, then run the cell to perform Naive Bayes classification with scikit-learn.

In [None]:
import matplotlib.pyplot as plt
from sklearn import naive_bayes, metrics

# Create a naive Bayes model with Gaussian observations and train it on the
# training split.
nb = naive_bayes.GaussianNB()
nb.fit(X_train, y_train)

# Predict using test set
y_pred = nb.predict(X_test)

# Plot outputs: true labels as green stars (shifted down by 0.1 so they do not
# sit exactly on top of the predictions) and predicted labels as blue dots.
plt.plot(y_test - 0.1, 'g*')
plt.plot(y_pred, 'b.')
plt.show()

# evaluation
print(metrics.confusion_matrix(y_test, y_pred))
print("accuracy: %.2f" % metrics.accuracy_score(y_test, y_pred))

# Per-class precision, recall and F-score (which combines precision and recall).
# The labels are passed explicitly so that the returned arrays are ordered
# exactly like `labels`; indexing the arrays by the label value itself is only
# correct when the labels happen to be exactly 0..k-1.
labels = np.unique(y_test)
precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
    y_test, y_pred, labels=labels)
for c, c_precision, c_recall, c_fscore in zip(labels, precision, recall, fscore):
    print("class %s, precision: %.2f, recall: %.2f,  f-score: %.2f" % (c, c_precision, c_recall, c_fscore))



### 10.2 Naive Bayes using objects

Try running the following Naive Bayes binary classifier, which is wrapped in a class. Have a look at how the class structure of `myGaussianNB` works; refer to the Python documentation on classes if needed.


In [None]:
class myGaussianNB:
    """Thin object-oriented wrapper that delegates to ``naive_bayes.GaussianNB``."""

    def __init__(self):
        # initialise the object: the wrapped estimator does all the real work
        self.nb = naive_bayes.GaussianNB()
        
    def fit(self, X_train, y_train):
        """Fit the wrapped estimator on the given training data and labels."""
        self.nb.fit(X_train, y_train)
    
    def predict(self, X_test):
        """Return the wrapped estimator's predictions for ``X_test``."""
        return self.nb.predict(X_test)
        
# evaluate as a binary classifier (i.e. only consider class 0 vs. rest from the above):
# samples of class 0 become the positive class (1.0), all others 0.0
target_binary = (target == 0).astype(float)
y_train_bin = target_binary[idx_train]
y_test_bin = target_binary[idx_test]

myNB = myGaussianNB()

myNB.fit(X_train, y_train_bin)
y_pred_bin = myNB.predict(X_test)

# now do the evaluation -- the accuracy must be computed on the binary labels
# and binary predictions, not on the multi-class y_test / y_pred of section 10.1
print(metrics.confusion_matrix(y_test_bin, y_pred_bin))
print("accuracy: %.2f" % metrics.accuracy_score(y_test_bin, y_pred_bin))



### 10.3 Implement your own Naive Bayes class

Now try to re-implement the code from 10.2, but instead of using sklearn's `naive_bayes.GaussianNB()`, fill in the gaps below to implement your own version using Bayes' theorem. Your classifier should return the same results as the version in 10.2.



In [None]:
import math
import scipy.stats as stats

class myGaussianNB:
    """Exercise skeleton for a hand-written Gaussian Naive Bayes classifier.

    Every ``#** YOUR CODE **`` marker is a gap that the reader is expected to
    fill in; the class is not runnable until all of them are completed.
    """
    
    def __init__(self):
        # initialise the attributes of this class
        # class labels and column count; both are overwritten in fit()
        self.classes = []
        self.numObservations = 0
        
        # per-class quantities, keyed by class label
        self.class_prior = dict()
        self.class_mean = dict()
        self.class_std = dict()
        self.class_likelihood = dict()
        # filled in by predict()
        self.posteriors = []
        self.predictions = []
                
    def fit(self, X_train, y_train):
        """Estimate the prior, mean and standard deviation for every class.

        ``X_train`` is read as a 2-D array (``X_train.shape[1]`` is used) and
        ``y_train`` must support ``astype`` and ``len``.
        """
        # distinct class labels, cast to int
        self.classes = np.unique( y_train.astype(int) )
        # NOTE: despite the name, this is the size of axis 1 of X_train; predict()
        # uses it as the number of columns of X_test to loop over
        self.numObservations = X_train.shape[1] 
        # number of training labels -- presumably the denominator for the priors
        total = len(y_train)
        # train your model for each class
        for c in self.classes:
            # 1. calculate your prior ( P(Class) )
            self.class_prior[c] = #** YOUR CODE **
            # 2. train a Gaussian model of the likelihood P( Obs | Class )
            #   hint: calculate the mean and standard deviation of your classes
            #   (predict() indexes both as [c][column], so keep one value per column)
            self.class_mean[c] = #** YOUR CODE **
            self.class_std[c] = #** YOUR CODE **
            
    def predict(self, X_test):
        """Compute and return ``self.predictions`` for the rows of ``X_test``.

        NOTE(review): ``self.posteriors`` is created once in ``__init__`` and is
        only appended to here, so a second call to ``predict`` on the same
        instance appears to keep the entries of the first call -- confirm, and
        reset the list at the top of this method if that is unintended.
        """
        #1. evaluate (log) likelihoods of test data for each class 
        # NOTE(review): stats.norm.pdf below yields densities rather than
        # log-densities and the accumulator starts at 1, which suggests a plain
        # product is intended despite the "(log)" wording -- confirm.
        for c in self.classes:
            
            # there will be multiple gaussians that need to be combined using the naive assumption
            likelihood = 1
            for obs in np.arange(0,self.numObservations).astype(int):
                # use a normal pdf to calculate the likelihood for a single feature observation vector
                # (column `obs` of every test row, under class c's mean/std for that column)
                feature_likelihood = stats.norm.pdf(X_test[:,obs], self.class_mean[c][obs], self.class_std[c][obs]) 
                # now combine these 
                likelihood = #** YOUR CODE **

            self.class_likelihood[c] = likelihood
                      
            #2. approximate the posterior using P(X|Y)P(Y)
            self.posteriors.append( #** YOUR CODE ** )
        
        #3. take the maximum posterior probability as our final class
        self.predictions = #** YOUR CODE **
        
        return self.predictions
        
     #self.nb.predict(X_test)
    
# Train and evaluate the hand-written classifier on the same binary
# (class 0 vs. rest) task as in section 10.2.
myNB = myGaussianNB()

myNB.fit(X_train, y_train_bin)

y_pred_bin = myNB.predict(X_test)

# now do the evaluation -- score the binary predictions against the binary
# test labels (the multi-class y_test / y_pred belong to section 10.1)
print(metrics.confusion_matrix(y_test_bin, y_pred_bin))
print("accuracy: %.2f" % metrics.accuracy_score(y_test_bin, y_pred_bin))


