In [10]:
import csv
import random
import math
import numpy as np
import pandas as pd
import sys
import time
import matplotlib.pyplot as plt
from collections import Counter

iris_dataset = './iris.data'
pima_dataset = './pima-indians-diabetes.data'
iris_df = pd.read_csv(iris_dataset, sep=',', header=None)
pima_df = pd.read_csv(pima_dataset, sep=',', header=None)

iris_df =  pd.DataFrame(iris_df)
iris_df = iris_df.sample(frac=1) # suffle dataset
pima_df =  pd.DataFrame(pima_df)
pima_df = pima_df.sample(frac=1)


iris_data = iris_df.values
pima_data = pima_df.values

In [14]:
def datasetClassDict(train_dataset, features):
    iris_class = {}
    for i in range(len(train_dataset)):
        vector = train_dataset[i]
        if (vector[-1] not in iris_class):
            iris_class[vector[-1]] = []
        iris_class[vector[-1]].append(vector[0:features])
    return iris_class

def mean(values):
    m = sum(values)/float(len(values))
    return m

def stdev(values):
    m = mean(values)
    stdev = math.sqrt( sum([pow(v-m,2) for v in values])/float(len(values)-1) )
    return stdev

def summarize(x):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*x)]
    
    return summaries

def summarizeByClass(train_dataset):

    summaries = {}
    for key, value in train_dataset.items():
        summaries[key] = summarize(value)
    return summaries


def gaussianProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, test_row):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = test_row[i]
            probabilities[classValue] *= gaussianProbability(x, mean, stdev)
    return probabilities

def predict(summaries, test_row):
    probabilities = calculateClassProbabilities(summaries, test_row)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel


In [15]:
""" 0.33 SPLIT TRAINING """

split_ratio = 0.333
iris_features = np.size(iris_data,1) -1
pima_features = np.size(pima_data,1) -1

########   IRIS DATASET    ########

split_val = int( len(iris_data) * split_ratio) +1
iris_train = iris_data[0:2*split_val]
iris_test = iris_data[2*split_val:3*split_val]

iris_train_dict = datasetClassDict(iris_train, iris_features)
summaries = summarizeByClass(iris_train_dict) # prepare model
# summaries now has mean and standard deviation for each feature in each class

predictions = []
    
for i in range(len(iris_test)):
    result = predict(summaries, iris_test[i])
    predictions.append(result)

match = 0
if(len(predictions) == len(iris_test)):
    for i in range(len(iris_test) ):
        if iris_test[i][-1] == predictions[i]:
            match += 1

accuracy = (match/float(len(iris_test))) * 100.0

print('Total Accuracy for IRIS dataset with 1/3 split: ', accuracy,'%')

########   PIMA DATASET    ########

split_val = int( len(pima_data) * split_ratio) +1

pima_train = pima_data[0:2*split_val]
pima_test = pima_data[2*split_val:3*split_val]
pima_train_dict = datasetClassDict(pima_train, pima_features)
summaries = summarizeByClass(pima_train_dict) # prepare model
predictions = [] 
for i in range(len(pima_test)):
    result = predict(summaries, pima_test[i])
    predictions.append(result)
match = 0
if(len(predictions) == len(pima_test)):
    for i in range(len(pima_test) ):
        if pima_test[i][-1] == predictions[i]:
            match += 1
accuracy = (match/float(len(pima_test))) * 100.0
print('Total Accuracy for PIMA dataset with 1/3 split: ', accuracy,'%')


""" 10 FOLD VALIDATION """

folds = 10
fold_size = int(len(iris_data) / folds)

# dataset_dict = datasetClassDict(iris_data)
# total_summaries = summarizeByClass(dataset_dict)
total_accuracy = []
for i in range(folds):
    val_from = fold_size*i
    val_to = (i+1)*fold_size

    validation_set = iris_data[val_from:val_to]
    
    train_set = np.delete(iris_data, np.s_[val_from:val_to], 0)
    train_set_dict = datasetClassDict(train_set, iris_features)
    summaries = summarizeByClass(train_set_dict)

    predictions = []
    for i in range(len(validation_set)):
        result = predict(summaries, validation_set[i])
        predictions.append(result)

    match = 0
    if(len(predictions) == len(validation_set)):
        for i in range(len(validation_set) ):
            if validation_set[i][-1] == predictions[i]:
                match += 1

    accuracy = (match/float(len(validation_set))) * 100.0
    total_accuracy.append(accuracy)
print('Total Accuracy after for IRIS using 10-fold cross validation: ', np.mean(total_accuracy),'%')


########   PIMA DATASET    ########

fold_size = int(len(pima_data) / folds) 

total_accuracy = []
for i in range(folds):
    val_from = fold_size*i
    val_to = (i+1)*fold_size
    
    if i == 9:
        val_to = (len(pima_data))
    validation_set = pima_data[val_from:val_to]
    
    train_set = np.delete(pima_data, np.s_[val_from:val_to], 0)
    train_set_dict = datasetClassDict(train_set, pima_features)
    summaries = summarizeByClass(train_set_dict)

    predictions = []
    for i in range(len(validation_set)):
        result = predict(summaries, validation_set[i])
        predictions.append(result)

    match = 0
    if(len(predictions) == len(validation_set)):
        for i in range(len(validation_set) ):
            if validation_set[i][-1] == predictions[i]:
                match += 1

    accuracy = (match/float(len(validation_set))) * 100.0
    total_accuracy.append(accuracy)
print('Total Accuracy after for PIMA using 10-fold cross validation: ', np.mean(total_accuracy),'%')


Total Accuracy for IRIS dataset with 1/3 split:  96.0 %
Total Accuracy for PIMA dataset with 1/3 split:  71.09375 %
Total Accuracy after for IRIS using 10-fold cross validation:  95.33333333333334 %
Total Accuracy after for PIMA using 10-fold cross validation:  74.52380952380952 %
