In [55]:
import numpy as np
import pandas as pd

In [56]:
class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier for continuous features.

    Each feature is modelled, independently per class, as a normal
    distribution whose mean and variance are estimated from the training
    data. A sample is assigned to the class with the highest log-posterior
    ``log P(class) + sum_j log N(x_j | mean_cj, var_cj)``.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest per-feature variance of the training data
        that is added to every class variance. This keeps the variances
        strictly positive, so a feature that is constant within a class no
        longer produces a 0/0 (NaN) likelihood.

    Attributes (set by ``fit``)
    ---------------------------
    _classes : ndarray of shape (n_classes,)
        Sorted unique class labels.
    _mean, _var : ndarray of shape (n_classes, n_features)
        Per-class feature means and (smoothed) variances.
    _priors : ndarray of shape (n_classes,)
        Relative frequency of each class in the training data.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class means, variances and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric training features.
        y : array-like of shape (n_samples,)
            Class label of every training sample.

        Returns
        -------
        self : GaussianNaiveBayes
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from
            the length of ``y``.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")
        if X.shape[0] == 0 or X.shape[1] == 0:
            raise ValueError("X must contain at least one sample and one feature")
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        # Variance floor: a feature that is constant inside a class has
        # variance 0, which would make the Gaussian density undefined.
        epsilon = self.var_smoothing * X.var(axis=0).max()
        if not epsilon > 0:
            # Every feature is constant over the whole training set.
            epsilon = self.var_smoothing

        # Mean, variance and prior P(class) for each class.
        for i, c in enumerate(self._classes):
            X_for_class_c = X[y == c]
            self._mean[i, :] = X_for_class_c.mean(axis=0)
            self._var[i, :] = X_for_class_c.var(axis=0) + epsilon
            self._priors[i] = X_for_class_c.shape[0] / float(n_samples)
        return self

    def _log_likelihood(self, class_idx, x):
        """Return the per-feature log Gaussian density of ``x`` for one class.

        Working directly in log space avoids the underflow of ``exp`` to 0
        (and the resulting ``log(0) = -inf``) for samples far from the mean.
        """
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        return -0.5 * np.log(2.0 * np.pi * var) - (x - mean) ** 2 / (2.0 * var)

    def _calculate_likelihood(self, class_idx, x):
        """Return the per-feature Gaussian density of ``x`` for one class."""
        return np.exp(self._log_likelihood(class_idx, x))

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels, taken from the labels seen in ``fit``.
        """
        X = np.asarray(X, dtype=np.float64)
        y_pred = [self._classify_sample(x) for x in X]
        return np.array(y_pred)

    def _classify_sample(self, x):
        """Return the label whose log-posterior is highest for sample ``x``."""
        log_posteriors = [
            np.log(self._priors[i]) + np.sum(self._log_likelihood(i, x))
            for i in range(len(self._classes))
        ]
        return self._classes[np.argmax(log_posteriors)]

In [57]:
# Locations of the labelled training split and the test split.
train_csv_path = 'E:\\MASABAH\\MS DS 2k21\\Machine Learning\\Assignment1\\TrainingSet.csv'
test_csv_path = 'E:\\MASABAH\\MS DS 2k21\\Machine Learning\\Assignment1\\TestSet1.csv'

# Read both splits into DataFrames.
dataset_train = pd.read_csv(train_csv_path)
dataset_test = pd.read_csv(test_csv_path)

In [58]:
# Columns used as model inputs; the 'plant' column holds the class label.
feature_columns = ['leaf.length','leaf.width','flower.length','flower.width']

# Feature matrix and label vector for the training split.
X_train, y_train = (
    dataset_train[feature_columns].values,
    dataset_train['plant'].values,
)

# Feature matrix and label vector for the test split.
X_test, y_test = (
    dataset_test[feature_columns].values,
    dataset_test['plant'].values,
)

In [59]:
# Fit the classifier on the training split, then label every test row.
nb = GaussianNaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
# NOTE: this replaces the 'plant' column of dataset_test with the predicted
# labels, so the frame no longer shows the values it was loaded with.
# y_test was read from this column before the overwrite.
dataset_test['plant'] = predictions

In [60]:
# Display the test frame; its 'plant' column is assigned from `predictions`
# in the preceding cell.
dataset_test

Unnamed: 0,leaf.length,leaf.width,flower.length,flower.width,plant
0,4.4,2.9,1.4,0.2,Arctica
1,4.6,3.1,1.5,0.2,Arctica
2,4.6,3.4,1.4,0.3,Arctica
3,4.7,3.2,1.3,0.2,Arctica
4,4.9,3.0,1.4,0.2,Arctica
5,4.9,3.1,1.5,0.1,Arctica
6,4.9,2.4,3.3,1.0,Harlequin
7,4.9,2.5,4.5,1.7,Harlequin
8,5.0,3.6,1.4,0.2,Arctica
9,5.0,3.4,1.5,0.2,Arctica


In [52]:
# Write the test frame (with its current 'plant' column) to a CSV file.
results_csv_path = "E:\\MASABAH\\MS DS 2k21\\Machine Learning\\Assignment1\\NaiveBayes_results.csv"
dataset_test.to_csv(results_csv_path)