In [1]:
from sklearn import datasets
import pandas as pd

iris = datasets.load_iris()
iris_df=pd.DataFrame(iris.data)
iris_df['class']=iris.target

iris_df.columns=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
iris_df.dropna(how="all", inplace=True) # remove any empty lines

iris_X=iris_df.iloc[:,[0,1,2,3]]
vals = iris_X.values.tolist()


In [2]:
import numpy as np 


class NaiveBayesClassifier:

    def __init__(self):
        pass

    def separate_classes(self, X, y):
        separated_classes = {}
        for i in range(len(X)):
            feature_values = X[i]
            class_name = y[i]
            if class_name not in separated_classes:
                separated_classes[class_name] = []
            separated_classes[class_name].append(feature_values)
        return separated_classes


    def stat_info(self, X):
        for feature in zip(*X):
            yield {
                'std' : np.std(feature),
                'mean' : np.mean(feature)
            }
            

    def fit (self, X, y):
        separated_classes = self.separate_classes(X, y)
        self.class_summary = {}

        for class_name, feature_values in separated_classes.items():
            self.class_summary[class_name] = {
                'prior_proba': len(feature_values)/len(X),
                'summary': [i for i in self.stat_info(feature_values)],
            }
        return self.class_summary


    def distribution(self, x, mean, std):
        exponent = np.exp(-((x-mean)**2 / (2*std**2)))

        return exponent / (np.sqrt(2*np.pi)*std)


    def predict(self, X):       
        MAPs = []

        for row in X:
            joint_proba = {}
            
            for class_name, features in self.class_summary.items():
                total_features =  len(features['summary'])
                likelihood = 1

                for idx in range(total_features):
                    feature = row[idx]
                    mean = features['summary'][idx]['mean']
                    stdev = features['summary'][idx]['std']
                    normal_proba = self.distribution(feature, mean, stdev)
                    likelihood *= normal_proba
                prior_proba = features['prior_proba']
                joint_proba[class_name] = prior_proba * likelihood

            MAP = max(joint_proba, key= joint_proba.get)
            MAPs.append(MAP)

        return MAPs


    def accuracy(self, y_test, y_pred):
        true_true = 0

        for y_t, y_p in zip(y_test, y_pred):
            if y_t == y_p:
                true_true += 1 
        return true_true / len(y_test)

In [3]:
import random

def split_data(data, weight):
    train_length = int(len(data) * weight)
    train = []
    for i in range(train_length):
        idx = random.randrange(len(data))
        train.append(data[idx])
        data.pop(idx)
    return [train, data]

train, test = split_data(vals, 0.8)

X_train = []
y_train = []
X_test = []
y_test = []

for i in range(len(train)):
    y_train.append(train[i][0])
    X_train.append(train[i][1:])
      
for i in range(len(test)):
    y_test.append(test[i][0])
    X_test.append(test[i][1:])

model = NaiveBayesClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("NaiveBayesClassifier accuracy: {0:.3f}".format(model.accuracy(y_test, y_pred)))

NaiveBayesClassifier accuracy: 0.100


