In [21]:
import pandas as pd
import numpy as np
import csv
import random
from math import sqrt
from math import pi
from math import exp
from random import randrange

In [22]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))

In [23]:
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
    return sqrt(variance)

In [24]:
def str_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column])

In [25]:
def str_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

In [26]:
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

In [27]:
def confusion(actual , predicted):
    mtrx =np.array([[0,0,0],[0,0,0],[0,0,0]])
    for x in range(len(actual)):
        i = actual[x]
        j = predicted[x]
        mtrx[i][j] +=1
    return mtrx

In [28]:
def accuracy(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

In [29]:
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    conf_mat = []
    for fold in folds:
        train = list(folds)
        train.remove(fold)
        train = sum(train, [])
        test = list()
        for row in fold:
            row_copy = list(row)
            test.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train, test, *args)
        actual = [row[-1] for row in fold]
        acc = accuracy(actual, predicted)
        scores.append(acc)
    conf_mat.append(confusion(actual , predicted))
    return scores , conf_mat

In [30]:
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if (class_value not in separated):
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated

In [31]:
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[-1])
    return summaries
 
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries

def calculate_probability(x, mean, stdev):
    exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

In [32]:
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
    return probabilities

In [33]:
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

In [34]:
def naive_bayes(train, test):
    summarize = summarize_by_class(train)
    preds = list()
    for row in test:
        output = predict(summarize, row)
        preds.append(output)
    return(preds)

In [35]:
with open('iris.csv', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)

In [36]:
data

[['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'],
 ['4.9', '3', '1.4', '0.2', 'Iris-setosa'],
 ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'],
 ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'],
 ['5', '3.6', '1.4', '0.2', 'Iris-setosa'],
 ['5.4', '3.9', '1.7', '0.4', 'Iris-setosa'],
 ['4.6', '3.4', '1.4', '0.3', 'Iris-setosa'],
 ['5', '3.4', '1.5', '0.2', 'Iris-setosa'],
 ['4.4', '2.9', '1.4', '0.2', 'Iris-setosa'],
 ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa'],
 ['5.4', '3.7', '1.5', '0.2', 'Iris-setosa'],
 ['4.8', '3.4', '1.6', '0.2', 'Iris-setosa'],
 ['4.8', '3', '1.4', '0.1', 'Iris-setosa'],
 ['4.3', '3', '1.1', '0.1', 'Iris-setosa'],
 ['5.8', '4', '1.2', '0.2', 'Iris-setosa'],
 ['5.7', '4.4', '1.5', '0.4', 'Iris-setosa'],
 ['5.4', '3.9', '1.3', '0.4', 'Iris-setosa'],
 ['5.1', '3.5', '1.4', '0.3', 'Iris-setosa'],
 ['5.7', '3.8', '1.7', '0.3', 'Iris-setosa'],
 ['5.1', '3.8', '1.5', '0.3', 'Iris-setosa'],
 ['5.4', '3.4', '1.7', '0.2', 'Iris-setosa'],
 ['5.1', '3.7', '1.5', '0.4', 'Iris-setosa'],


In [37]:
data = data[1:]

In [38]:
data

[['4.9', '3', '1.4', '0.2', 'Iris-setosa'],
 ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'],
 ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'],
 ['5', '3.6', '1.4', '0.2', 'Iris-setosa'],
 ['5.4', '3.9', '1.7', '0.4', 'Iris-setosa'],
 ['4.6', '3.4', '1.4', '0.3', 'Iris-setosa'],
 ['5', '3.4', '1.5', '0.2', 'Iris-setosa'],
 ['4.4', '2.9', '1.4', '0.2', 'Iris-setosa'],
 ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa'],
 ['5.4', '3.7', '1.5', '0.2', 'Iris-setosa'],
 ['4.8', '3.4', '1.6', '0.2', 'Iris-setosa'],
 ['4.8', '3', '1.4', '0.1', 'Iris-setosa'],
 ['4.3', '3', '1.1', '0.1', 'Iris-setosa'],
 ['5.8', '4', '1.2', '0.2', 'Iris-setosa'],
 ['5.7', '4.4', '1.5', '0.4', 'Iris-setosa'],
 ['5.4', '3.9', '1.3', '0.4', 'Iris-setosa'],
 ['5.1', '3.5', '1.4', '0.3', 'Iris-setosa'],
 ['5.7', '3.8', '1.7', '0.3', 'Iris-setosa'],
 ['5.1', '3.8', '1.5', '0.3', 'Iris-setosa'],
 ['5.4', '3.4', '1.7', '0.2', 'Iris-setosa'],
 ['5.1', '3.7', '1.5', '0.4', 'Iris-setosa'],
 ['4.6', '3.6', '1', '0.2', 'Iris-setosa'],
 [

In [43]:
for i in range(len(data[0])-1):
    str_to_float(data, i)

str_to_int(data, len(data[0])-1)

n_folds = 2
s , conf_mat= evaluate_algorithm(data, naive_bayes, n_folds)
print('accuracy ={} %'.format(sum(s)/float(len(s))))
print('confusion matrix is : \n {} '.format(conf_mat[0]))

accuracy =95.27027027027026 %
confusion matrix is : 
 [[19  2  0]
 [ 2 23  0]
 [ 0  0 28]] 


In [44]:
n_folds = 4
s , conf_mat= evaluate_algorithm(data, naive_bayes, n_folds)
print('accuracy {} %'.format(sum(s)/float(len(s))))
print('confusion matrix : \n {} '.format(conf_mat[0]))

accuracy 94.5945945945946 %
confusion matrix : 
 [[14  1  0]
 [ 1 12  0]
 [ 0  0  9]] 
