In [None]:
import numpy as np
import math
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import confusion_matrix
from collections import Counter

## Penguins_af.csv dataset

In [None]:
penguins_af = pd.read_csv('penguins_af.csv', index_col = 0)
penguins_af.head()

In [None]:
f_names = ['bill_length_mm', 'bill_depth_mm','flipper_length_mm', 'body_mass_g']
penguins = penguins_af[f_names + ['species']]
#two classes
penguins2C = penguins.loc[penguins['species'].isin(['Adelie','Chinstrap'])]
#three classes
#penguins2C = penguins.loc[penguins['species'].isin(['Adelie','Chinstrap','Gentoo'])]
#Changed the target values into numeric values.
penguins2C['species'] = penguins2C['species'].astype('category')
penguins2C['species']=penguins2C['species'].cat.codes


In [None]:
penguins2C['species'].unique()

In [None]:
#y is the target value
y = penguins2C.pop('species').values
X_raw = penguins2C.values
feature_names = penguins2C.columns
X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, random_state=2, test_size=1/2)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_tr_raw)
X_test = scaler.transform(X_ts_raw)
max_k = X_train.shape[1]
X_train.shape, X_test.shape

## My GaussianNB Implementation

In [None]:
class MyGaussianNB(BaseEstimator, ClassifierMixin):

        
    def fit(self, X, y):
        # set the number of samples and the number of features to the dataset shape
        n_samples,n_features = X.shape
        #get the number of classes the number of unique values of y
        self.classes = np.unique(y)
        n_classes = len(self.classes)
        #initalise numpy arrays to hold the mean, variance and prior probability values. The shape is determined by 
        #the number of classes and the number of features.
        self.mean = np.zeros((n_classes,n_features))
        self.var = np.zeros((n_classes,n_features))
        self.prior = np.zeros((n_classes))
        
        #for loop to get the correct values for the mean, variance and prior probability for each row the the dataset 
        #by the class they belong to.
        for c in self.classes:
            X_class= X[c==y]
            #print(c,X_class)
            self.mean[c,:] = X_class.mean(axis=0)
            #print("mean",self.mean)
            self.var[c,:] = X_class.var(axis=0)
            # prior probability calculated by dividing feature value by class by the total number of samples
            #Calculating prior probability in this method as opposed to the predict method will mean the prior 
            #probability will be calculated on the training data.
            self.prior[c] = X_class.shape[0]/n_samples
            #print (self.prior)
            

    # calls the _predict function to make the prediction about what class the test dataset features belong to.
    def predict(self,X):
        y_pred=[self.posterior(x) for x in X]
        return y_pred
    
    # A function to calculate the conditional probability and the posterior probability of each feature in the dataset
    def posterior(self,X):
        posterior_list=[]
        for i, c in enumerate(self.classes):
            prior = self.prior[i]
            #calls the conditional probability function to calculate the conditional probability.
            c_probability= self.conditional_probability(i,X)
           #calculate the product of each conditional probability array of features
            prod=np.prod(c_probability)
            #add the product of conditional features to the prior probability to get the posterior probability.
            posterior= prod + prior
            posterior_list.append(posterior)
        #return the maximum value from the posterior probability list and shows the class it has predicted the 
        #value belongs to.
        return self.classes[np.argmax(posterior_list)]
    
    
    #Function to calculate the conditional probability based on the given formula.        
    def conditional_probability(self,class_i,x):
        e= np.e
        pi = np.pi
        mean = self.mean[class_i]
        var = self.var[class_i]
        equation1= 1/(np.sqrt(2*pi*var))
        numerator = (x-mean)**2
        denom = 2*var
        expo= np.exp(-(numerator/denom))
        prob = equation1 * expo
        return prob
    

## Scikit-Learn GaussianNB Implementation on the penguins dataset

In [None]:
gnb = GaussianNB()
gnb.fit(X_train,y_train)

In [None]:
gnb.fit(X_train,y_train)
y_pred=gnb.predict(X_test)

In [None]:
gnb.score(X_test, y_test)

In [None]:
confusion = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n{}".format(confusion))

### MyGaussianNb implementation on penguins.csv

In [None]:
mgnb = MyGaussianNB()

In [None]:
mgnb.fit(X_train,y_train)

In [None]:
 y_pred=mgnb.predict(X_test)

In [None]:
mgnb.score(X_test,y_test)

In [None]:
confusion = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n{}".format(confusion))

### Comparison of the two approaches
Tested with a binary target Adelie or Chinstrap:<br>
MyGaussianNB<br>
Confusion matrix:<br>
[[70  0]<br>
[ 6 31]]<br>
 
GaussianNB<br>
Confusion matrix:<br>
[[69  1]<br>
[ 3 34]]<br>

We can see from the ClassifierMixin.score which calculates the mean accuracy across the datset that GaussianNB performed better at predicting the appropriate class. The GaussianNB class had 1 flase positive and 3 false negatives. The MyGaussianNB class had no false positives making it better at predicting the correct class for this particular class, but it had 6 false negative which means it incorrectly labelled the type of penguin. 

Tested with all three class Adelie, Chinstrap or Gentoo:<br>
Confusion matrix:<br>
    a  c  g <br>
a[[68  2  0]<br>
c[ 4 30  0]<br>
g[ 0  0 63]]<br>
 
True positive Adelie is 68<br>
False negative is 2<br>
False positive is 4<br>
True negative Adelie is 93<br>
Chinstrap<br>
True positive Chinstrap is 2<br>
False negative is 4<br>
False positive is 30<br>
True negative Chinstrap is 131<br>
Gentoo<br>
True positive Gentoo is 0<br>
False negative is 0<br>
False positive is 63<br>
True negative Gentoo is 104<br>


So while the score looks to have improved slightly we can see from the confusion matrix that when we add in a third class Gentoo the predictive power gets worse and more false negatives and positives occur. This is probably due to limited Gentoo data and data that is more similar.

When i changed the test parameter to 1/3 instead of 1/2 the confusion matrix inproved significantly. <br>
Confusion matrix:<br>
[[44  1  0]<br>
 [ 0 22  0]<br>
 [ 0  0 44]]<br>
 <br>
 
 and again when i made the test portion 1/4 of the dataset<br>
 Confusion matrix:<br>
[[37  1  0]<br>
 [ 0 15  0]<br>
 [ 0  0 31]]<br>

## Testing on further datasets

### Diabetes dataset

In [None]:
diabetes = pd.read_csv('diabetes.csv', index_col = False)
diabetes.head()

In [None]:
f_names = ['preg','plas', 'pres','skin', 'insu','mass','pedi','age']
diabetes2C = diabetes[f_names + ['neg_pos']]
diabetes2C['neg_pos'] = diabetes2C['neg_pos'].astype('category')
diabetes2C['neg_pos']= diabetes2C['neg_pos'].cat.codes


In [None]:
y = diabetes2C.pop('neg_pos').values
X_raw = diabetes2C.values
feature_names = diabetes2C.columns
X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, random_state=2, test_size=1/2)
#Testing if training dataset was larger and testing portion was smaller do we get better outcomes?
#X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, random_state=2, test_size=1/3)
#X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, random_state=2, test_size=1/4)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_tr_raw)
X_test = scaler.transform(X_ts_raw)
max_k = X_train.shape[1]
X_train.shape, X_test.shape

### My Gaussian Naive Bayes prediction on the Diabetes dataset

In [None]:
mgnb.fit(X_train,y_train)

In [None]:
y_pred=mgnb.predict(X_test)

In [None]:
mgnb.score(X_test,y_test)

In [None]:
confusion = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n{}".format(confusion))

### Scikit learn Gaussian Naive Bayes example on the Diabetes dataset

In [None]:
gnb = GaussianNB()
gnb.fit(X_train,y_train)

In [None]:
y_pred=gnb.predict(X_test)

In [None]:
gnb.score(X_test,y_test)

In [None]:
confusion = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n{}".format(confusion))

This dataset does not perform as well as the penguins dataset with either the GaussianNb or the MyGaussianNB predictions mean accuracy falls from the mid 90's when predicting for the penguins dataset to the low 70's with this new data. We also get a lot more false positives and negatives in the confusion matrix.
MyGaussianNB<br>
Confusion matrix:<br>
[[208  48]<br>
[ 62  66]]<br>
48 false positives and 62 false negatives this is much more serious that misclassifying penguins. As the people will eaither be recieving medication they don't need (the false positives) or not recieving the medication they do need false negatives.

GaussianNb<br>
Confusion matrix:<br>
[[211  45]<br>
[ 56  72]]<br>



## GlassV2 example

In [None]:
glass = pd.read_csv('glassV2.csv', index_col = False)
glass.head()

In [None]:
#f_names = ['RI','Na', 'Mg','Al', 'Si','K','Ca','Ba','Fe']
f_names = ['RI','Na', 'Mg','Al', 'Si','K','Ca']
glass2C = glass[f_names + ['Type']]
glass2C['Type'] = glass2C['Type'].astype('category')
glass2C['Type']= glass2C['Type'].cat.codes


In [None]:
y = glass2C.pop('Type').values
X_raw = glass2C.values
feature_names = glass2C.columns
X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, random_state=2, test_size=1/2)
#Testing if training dataset was larger and testing portion was smaller do we get better outcomes?
#X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, random_state=2, test_size=1/3)
#X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, random_state=2, test_size=1/4)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_tr_raw)
X_test = scaler.transform(X_ts_raw)
max_k = X_train.shape[1]
X_train.shape, X_test.shape

### My Gaussian Naive Bayes prediction on the glassV2 dataset

In [None]:
mgnb.fit(X_train,y_train)

In [None]:
y_pred=mgnb.predict(X_test)

In [None]:
mgnb.score(X_test,y_test)

In [None]:
confusion = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n{}".format(confusion))

### Scikit learn Gaussian Naive Bayes example on the gassV2 dataset

In [None]:
gnb=GaussianNB()

In [None]:
gnb.fit(X_train,y_train)

In [None]:
y_pred=gnb.predict(X_test)

In [None]:
gnb.score(X_test,y_test)

In [None]:
confusion = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n{}".format(confusion))

I removed Fe and BA from the dataset for the purpose of testing the accuracy of MyGaussianNB as it did not handle divinging by zero gracefully and the predictive power was very low.<br>
Confusion matrix:<br>
[[ 0  0 35  0  0]<br>
 [ 0  0 34  0  0]<br>
 [ 0  0  7  0  0]<br>
 [ 0  0  9  0  0]<br>
 [ 0  0 18  0  0]]<br>
 
With a mean accuracy of 0.068.
 
The GaussianNB predictions seemed to handle this just fine:<br>
Confusion matrix:<br>
[[ 9  4 22  0  0]<br>
 [ 9 12 13  0  0]<br>
 [ 5  0  2  0  0]<br>
 [ 0  3  0  3  3]<br>
 [ 1  3  0  1 13]]<br>
 
 and a mean accuracy of 0.378.
 
with the removal of the two fetures that consisted of mostly zero we see a marked improvement in the predictive power of MyGaussianNB with mean accuracy rising to 0.368 and the confusion matrix display more predictive capabilities.<br>
Confusion matrix:<br>
[[ 8  3 24  0  0]<br>
 [ 9 11 14  0  0]<br>
 [ 5  0  2  0  0]<br>
 [ 0  5  0  4  0]<br>
 [ 1  4  0  0 13]]<br>
 
The GaussianNB implementation remains relatively unchanged leading me to believe that it handles data that results in division by zero, so that it does not affect the prediction. 

Testing with different train/test data splits:<br>
When we give a larger training set this dataset is different to the two previous ones in that the predictive power goes down and the predictions get worse with lower True positives and True negatives.


### Conclusion
The GaussianNB class has out performed the MyGaussianNB implementation But only slightly across the various datasets. What was more evident was that the GaussianNB implementation accounted for things like values being divided by zero while mine did not. Trying differnt feature combinations did not yield much success except in the case of removing the two mostly zero value columns in the Diabetes data. While sometimes initially you noticed a better mean average score when you looked at the confusion matrix sometimes things deteriorate with more false positives and negatives. So it's important to look at a few evaluation methods and not rely on just the mean average score but it was helpful in developing the MyGaussianNB as it was a quick way to check you were on the right track.<br>
The worst performing of the three datasets was GlassV2 and we can see a trend between the three dataset as we process them. The independance of the features seems to decrease from penguins to glassV2 and this can cause Naive Gaussian Bayes to perform poorly. The backbone of Naive Bayes is that features are independent of one another but as we've seen thats not the case with all datasets. So given the three datasets in the future I would look at other prediction models for both the Diabetes and Glass V2 datasets to see if improvement could be found with another model. 