In [379]:
import pandas as pd
import numpy as np
from random import randint
# Load the dataset from a path relative to the current working directory.
# Later cells look up the class label by the column name "variety" and treat
# every column except the last one as a feature (they slice with iloc[:, :-1]).
iris = pd.read_csv("iris.csv")

In [380]:
class ProcessingData(object):
    """Preprocessing helpers for a DataFrame whose last column is the class label.

    ``normalization`` and ``shuffling`` modify the frame they receive in place
    and return ``None``; ``divide`` returns two slices of the frame.
    """

    @staticmethod
    def normalization(data):
        """Min-max scale every column except the last one to the range [0, 1].

        The frame is modified in place. A column whose values are all equal has
        a zero range; it is mapped to 0 instead of producing NaN from 0 / 0.

        Args:
            data: DataFrame with feature columns followed by one label column.
        """
        features = data.iloc[:, :-1]
        lowest = features.min()
        span = features.max() - lowest
        # Guard against division by zero for constant columns.
        span = span.where(span != 0, 1)
        scaled = (features - lowest) / span
        # Assign column by column so each feature column is replaced outright
        # (no dtype coercion issues when the original column held integers).
        for column in scaled.columns:
            data[column] = scaled[column]

    @staticmethod
    def shuffling(data):
        """Randomly permute the rows of ``data`` in place (Fisher-Yates).

        Row contents move; the index labels stay where they were, so the frame
        keeps its original index. Randomness comes from ``random.randint``.

        Args:
            data: DataFrame to shuffle. Any index is accepted.
        """
        order = list(range(len(data)))
        for i in range(len(order) - 1, 0, -1):
            # randint is inclusive on both ends. The upper bound must be i (not
            # i - 1) so that an element may stay in place; otherwise only
            # cyclic permutations are produced and the shuffle is biased.
            j = randint(0, i)
            order[i], order[j] = order[j], order[i]

        # Positional take builds a new frame, so nothing aliases ``data`` while
        # it is being overwritten; re-using the original index keeps the labels.
        shuffled = data.iloc[order]
        shuffled.index = data.index
        for column in data.columns:
            data[column] = shuffled[column]

    @staticmethod
    def divide(data, n):
        """Split ``data`` by position into its first ``n`` rows and the rest.

        Args:
            data: DataFrame to split.
            n: Number of leading rows placed in the first part.

        Returns:
            Tuple ``(first_n_rows, remaining_rows)``.
        """
        return data.iloc[:n], data.iloc[n:]

In [381]:
# Number of shuffled rows used for training; the remaining rows are held out
# for validation.
TRAIN_SIZE = 105

# Work on a copy: normalization and shuffling both write into the frame they
# are given, so this keeps `iris` untouched and the cell safe to re-run.
data = iris.copy()
ProcessingData.normalization(data)
ProcessingData.shuffling(data)
train_data, validation_data = ProcessingData.divide(data, TRAIN_SIZE)

In [382]:
class Bayes(object):
    """Gaussian naive Bayes classifier evaluated on a held-out set.

    Both frames are expected to hold the feature columns first and the class
    label last, with the label column named ``"variety"``.
    """

    # Lower bound for per-feature standard deviations, so a feature that is
    # constant within a class does not cause a division by zero.
    _MIN_STD = 1e-9

    @staticmethod
    def bayes(train_data, validation_data):
        """Fit per-class Gaussians on ``train_data`` and score ``validation_data``.

        Classes are taken from the labels present in ``train_data``, so a class
        missing from the training split is simply never predicted. Class priors
        are the empirical class frequencies in the training data.

        Args:
            train_data: DataFrame used to estimate class means, deviations and priors.
            validation_data: DataFrame whose rows are classified and compared with
                their ``variety`` label.

        Returns:
            Percentage (0-100) of validation rows classified correctly, or NaN
            when ``validation_data`` has no rows.

        Raises:
            ValueError: If ``train_data`` has no labelled rows.
        """
        labels = sorted(train_data["variety"].dropna().unique())
        if not labels:
            raise ValueError("train_data must contain at least one labelled row")

        mean_array = []
        std_dev_array = []
        priors = []
        for label in labels:
            rows = train_data[train_data["variety"] == label]
            mean = rows.iloc[:, :-1].mean()
            mean_array.append(mean)
            std_dev_array.append(Bayes.__std_dev(rows, mean))
            priors.append(len(rows) / len(train_data))

        # Labels are passed along with the statistics so the predicted index is
        # always mapped back to the class whose statistics produced it.
        return Bayes.__gauss(validation_data, np.array(mean_array), std_dev_array, labels, priors)

    @staticmethod
    def __std_dev(x, m):
        """Return the population standard deviation of each feature column.

        Args:
            x: DataFrame of rows belonging to one class (label in the last column).
            m: Per-feature means of ``x``, in column order.

        Returns:
            List with one standard deviation (divisor ``n``) per feature.
        """
        deviations = x.iloc[:, :-1].to_numpy(dtype=float) - np.asarray(m, dtype=float)
        return list(np.sqrt(np.mean(deviations ** 2, axis=0)))

    @staticmethod
    def __gauss(x, m, std_dev, labels=("Setosa", "Versicolor", "Virginica"), priors=None):
        """Classify each row of ``x`` and return the accuracy in percent.

        Each class score is the log prior plus the sum over features of the log
        of the normal density ``1 / (sqrt(2*pi) * s) * exp(-(v - mu)^2 / (2 * s^2))``.
        Working with logs avoids underflow from multiplying many small densities.

        Args:
            x: DataFrame to classify; the leading columns are the features and
                the ``variety`` column holds the true label.
            m: 2-D array of means, shape (classes, features).
            std_dev: Per-class sequences of per-feature standard deviations.
            labels: Class label for each row of ``m``, in the same order.
            priors: Optional prior probability per class; uniform when omitted.

        Returns:
            Percentage (0-100) of rows whose predicted label equals ``variety``,
            or NaN when ``x`` has no rows.

        Raises:
            ValueError: If ``m`` does not describe at least one class.
        """
        means = np.asarray(m, dtype=float)
        if means.ndim != 2 or means.shape[0] == 0:
            raise ValueError("at least one class is required for classification")
        if len(x) == 0:
            return float("nan")

        labels = list(labels)
        stds = np.maximum(np.asarray(std_dev, dtype=float), Bayes._MIN_STD)
        if priors is None:
            log_priors = np.zeros(means.shape[0])
        else:
            log_priors = np.log(np.asarray(priors, dtype=float))

        features = x.iloc[:, :means.shape[1]].to_numpy(dtype=float)
        # Broadcast to (rows, classes, features).
        diff = features[:, np.newaxis, :] - means[np.newaxis, :, :]
        log_density = -np.log(stds * np.sqrt(2 * np.pi)) - diff ** 2 / (2 * stds ** 2)
        scores = log_density.sum(axis=2) + log_priors

        predicted = np.argmax(scores, axis=1)
        correct = sum(
            1 for index, truth in zip(predicted, list(x["variety"])) if labels[index] == truth
        )
        return correct * 100 / len(x)


In [383]:
# Percentage of validation rows whose predicted variety matches the true label.
result = Bayes.bayes(train_data, validation_data)