In [29]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [30]:

class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated from the training data.
    Prediction picks the class with the largest log-posterior
    ``log P(c) + sum_j log N(x_j | mean_cj, var_cj)``.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest per-feature variance of the training data that
        is added to every class variance. This keeps the density finite when a
        feature is constant inside a class (zero variance).

    Attributes set by ``fit``
    -------------------------
    classes_ : ndarray of shape (n_classes,)
        Sorted unique labels seen in ``y``; row ``i`` of ``mean``/``variance``
        and entry ``i`` of ``priors`` describe ``classes_[i]``.
    mean, variance : ndarray of shape (n_classes, n_features)
        Per-class feature means and (smoothed) variances.
    priors : ndarray of shape (n_classes,)
        Relative frequency of each class in the training data.
    n_samples, n_features, n_classes : int
        Dimensions of the training problem.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class means, variances and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels. Any values accepted by ``np.unique`` work (ints that
            do not start at 0, strings, ...).

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or disagrees with ``y`` in length.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty dataset")
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        # number of samples (rows) and features (columns)
        self.n_samples, self.n_features = X.shape
        # keep the actual labels so they need not be 0..n_classes-1
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)
        # zero-matrices for the summary statistics and priors
        self.mean = np.zeros((self.n_classes, self.n_features))
        self.variance = np.zeros((self.n_classes, self.n_features))
        self.priors = np.zeros(self.n_classes)

        # variance floor: a constant feature inside a class would otherwise
        # produce a division by zero in the density
        largest_variance = np.var(X, axis=0).max()
        if largest_variance > 0:
            epsilon = self.var_smoothing * largest_variance
        else:
            epsilon = self.var_smoothing

        for row, label in enumerate(self.classes_):
            # subset of the data belonging to this class
            X_c = X[y == label]
            # rows = classes, cols = features
            self.mean[row, :] = np.mean(X_c, axis=0)
            self.variance[row, :] = np.var(X_c, axis=0) + epsilon
            self.priors[row] = X_c.shape[0] / self.n_samples

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Labels taken from ``classes_``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or its feature count differs from training.
        """
        X = np.asarray(X, dtype=float)
        if X.size == 0:
            # nothing to classify; keep the label dtype
            return self.classes_[:0]
        if X.ndim != 2 or X.shape[1] != self.n_features:
            raise ValueError(
                "X must have shape (n_samples, %d)" % self.n_features
            )
        winners = np.argmax(self._joint_log_likelihood(X), axis=1)
        return self.classes_[winners]

    def get_class_probability(self, X):
        """Return the predicted class label for a single sample ``X``.

        Despite its name (kept for backward compatibility) this returns the
        label with the highest log-posterior, not a probability.
        """
        sample = np.asarray(X, dtype=float).reshape(1, -1)
        scores = self._joint_log_likelihood(sample)[0]
        return self.classes_[np.argmax(scores)]

    def _joint_log_likelihood(self, X):
        """Log prior plus summed per-feature Gaussian log-density.

        Works in log-space directly instead of taking ``log`` of the density,
        because the density underflows to 0 (and its log to ``-inf``) for
        samples far away from a class mean.

        Returns an array of shape (n_samples, n_classes).
        """
        # (n_samples, 1, n_features) - (n_classes, n_features)
        deviation = X[:, np.newaxis, :] - self.mean
        log_density = -0.5 * (
            np.log(2 * np.pi * self.variance) + deviation ** 2 / self.variance
        )
        return np.log(self.priors) + log_density.sum(axis=2)

    def gaussian_density(self, X, mean, var):
        """Gaussian probability density of ``X`` for the given mean/variance.

        Retained as a public helper; prediction itself uses the log-space
        computation in ``_joint_log_likelihood`` for numerical stability.
        """
        const = 1 / np.sqrt(var * 2 * np.pi)
        proba = np.exp(-0.5 * ((X - mean) ** 2 / var))

        return const * proba


# helper function to calculate accuracy
def get_accuracy(y_true, y_hat):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true, y_hat : array-like of shape (n_samples,)
        Ground-truth and predicted labels. Plain Python lists are accepted;
        they are converted to arrays so the comparison is elementwise rather
        than a single list-equality check.

    Returns
    -------
    float
        Value in [0, 1]. For empty inputs the result is ``nan``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_hat = np.asarray(y_hat)
    if y_true.shape != y_hat.shape:
        raise ValueError("y_true and y_hat must have the same shape")
    return np.sum(y_true == y_hat) / len(y_true)


In [31]:
# Load Dataset
def load_iris_data(test_size=0.2, random_state=1):
    """Load the iris dataset and split it into train and test parts.

    Parameters
    ----------
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default=1
        Forwarded to ``train_test_split`` as the shuffling seed.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # load iris dataset
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    # split into train and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


# Bind the four pieces of the iris train/test split to notebook-level names.
X_train, X_test, y_train, y_test = load_iris_data()

In [32]:
# Train the classifier on the training split.
nb = NaiveBayes()
nb.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
train_predictions = nb.predict(X_train)
test_predictions = nb.predict(X_test)

# Score each split, then report.
train_accuracy = get_accuracy(y_train, train_predictions)
test_accuracy = get_accuracy(y_test, test_predictions)
print('Naive Bayes Train Accuracy: ', train_accuracy)
print('Naive Bayes Test Accuracy: ', test_accuracy)

Naive Bayes Train Accuracy:  0.95
Naive Bayes Test Accuracy:  0.9666666666666667
