In [1]:
import numpy as np

In [7]:
import random

def separate_train_test(csv_file, train_file, test_file, split_proportion=0.3, seed=None):
    """Randomly split the lines of ``csv_file`` into a train file and a test file.

    Every line of the input (no header handling is done) is copied verbatim,
    in its original relative order, to exactly one of the two output files.

    Args:
        csv_file: Path of the file to split.
        train_file: Path that receives the lines not selected for testing.
        test_file: Path that receives the randomly selected test lines.
        split_proportion: Fraction of lines (0..1) sent to ``test_file``;
            the count is ``round(number_of_lines * split_proportion)``.
        seed: Optional seed for a private random generator so the split can
            be reproduced. When ``None`` the module-level ``random`` state is
            used, exactly as before this parameter existed.

    Raises:
        ValueError: If ``split_proportion`` is not within [0, 1].
    """
    if not 0 <= split_proportion <= 1:
        raise ValueError("split_proportion must be between 0 and 1")

    # Read everything before opening the outputs for writing, so nothing is
    # created or truncated until the input has been read successfully.
    with open(csv_file, mode='r') as infile:
        inrows = infile.readlines()

    rng = random if seed is None else random.Random(seed)
    # A set gives O(1) membership tests in the loop below (a list made the
    # whole split quadratic in the number of rows).
    test_indexes = set(rng.sample(range(len(inrows)), round(len(inrows) * split_proportion)))

    with open(train_file, mode='w') as trainfile, open(test_file, mode='w') as testfile:
        for ix, row in enumerate(inrows):
            target = testfile if ix in test_indexes else trainfile
            target.write(row)

In [8]:
# Folder that holds iris.csv; the train/test split files are written into the same folder.
directory_path = 'C:\\Users\\maniksri\\Documents\\DataAnalysis\\'
separate_train_test(
    csv_file=directory_path + 'iris.csv',
    train_file=directory_path + 'iris_train.csv',
    test_file=directory_path + 'iris_test.csv',
    split_proportion=0.33,
)

In [92]:
def separate_features_classes(csv_file, class_index):
    import csv
    features_array = []
    classes_array = []
    with open(csv_file,mode='r') as infile:
        reader = csv.reader(infile)
        for row in reader:
            classes_array.append(row.pop(class_index))
            features_array.append([float(item) for item in row])
    return (features_array, classes_array)            

In [122]:
# Load the training split; the label is taken from column index 4.
train_features, train_classes = separate_features_classes(
    csv_file=directory_path + 'iris_train.csv',
    class_index=4,
)
train_features[0]

[5.1, 3.5, 1.4, 0.2]

In [121]:
# Load the held-out split the same way; the label is taken from column index 4.
test_features, test_classes = separate_features_classes(
    csv_file=directory_path + 'iris_test.csv',
    class_index=4,
)
test_features[0]

[4.6, 3.1, 1.5, 0.2]

In [95]:
#group by class
def group_by_class(features,classes):
    """Bucket feature rows under their class label.

    ``features`` and ``classes`` are walked in parallel (stopping at the
    shorter of the two). Returns a dict mapping each label to the list of
    feature rows that carried it, in the order they were encountered.
    """
    buckets = {}
    for row, label in zip(features, classes):
        # setdefault creates the empty bucket the first time a label is seen.
        buckets.setdefault(label, []).append(row)
    return buckets

In [205]:
#summarize by class
# Module-level results of the most recent summarize_by_class() call:
# N is the number of training rows, train_class_proportion maps each class
# label to its share of those rows.
N = 0
train_class_proportion = {}
def summarize_by_class(features,classes):
    """Compute per-class, per-feature (mean, stdev) and refresh the class priors.

    Args:
        features: Sequence of feature rows (each a sequence of numbers).
        classes: Class label for each row, parallel to ``features``.

    Returns:
        Dict mapping each label to a list with one ``(mean, stdev)`` tuple per
        feature column, computed with ``statistics.mean`` / ``statistics.stdev``.

    Side effects:
        Rebinds the global ``N`` to the number of grouped rows and replaces
        the contents of the global ``train_class_proportion`` (in place) with
        ``count / N`` for the labels seen in THIS call. Both globals are only
        touched after the statistics were computed successfully.

    Raises:
        statistics.StatisticsError: If a class has fewer than two rows
            (``statistics.stdev`` needs at least two data points).
    """
    import statistics
    global N

    grouped_class = group_by_class(features, classes)

    summarized_class = {}  # label -> [(mean, stdev), ...] one entry per feature
    for label, rows in grouped_class.items():
        # zip(*rows) transposes the rows so each item is one feature column.
        summarized_class[label] = [
            (statistics.mean(column), statistics.stdev(column))
            for column in zip(*rows)
        ]

    class_counts = {label: len(rows) for label, rows in grouped_class.items()}
    N = sum(class_counts.values())

    # Clear first: otherwise labels left over from an earlier call would stay
    # in the dict and their already-normalised values would be divided by N
    # again. Mutating in place keeps existing references to the dict valid.
    train_class_proportion.clear()
    for label, count in class_counts.items():
        train_class_proportion[label] = count / N

    return summarized_class
                
            

In [207]:
summarize_by_class(train_features,train_classes)
train_class_proportion

{'setosa': 0.32, 'versicolor': 0.34, 'virginica': 0.34}

In [196]:
#calculate Gaussian probability
def calculate_gaussian_probability(x, stats):
    """Evaluate the Gaussian density described by ``stats`` at ``x``.

    ``stats`` is a ``(mean, stdev)`` pair. The result is
    ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)``.
    A ``stdev`` of zero divides by zero and raises ``ZeroDivisionError``.
    """
    import math
    mu, sigma = stats
    squared_deviation = math.pow(x - mu, 2)
    twice_variance = 2 * math.pow(sigma, 2)
    decay = math.exp(-(squared_deviation / twice_variance))
    normalizer = 1 / (math.sqrt(2 * math.pi) * sigma)
    return normalizer * decay


In [210]:

def calculate_test_features_probabilities(test_features_array):
    """Score every test row against every trained class.

    The class statistics are recomputed on each call by running
    ``summarize_by_class`` on the globals ``train_features`` and
    ``train_classes``. For each row the result holds, per class, the class
    proportion from ``train_class_proportion`` multiplied by the value of
    ``calculate_gaussian_probability`` for each feature. The scores are not
    normalised across classes.

    Returns a list (one entry per input row) of ``{class_label: score}`` dicts.
    """
    class_stats = summarize_by_class(train_features, train_classes)

    def score_row(sample):
        # One score per class: start from the class proportion and multiply
        # in the per-feature values from left to right.
        scores = {}
        for label, per_feature_stats in class_stats.items():
            running = train_class_proportion[label]
            for value, stats in zip(sample, per_feature_stats):
                running = running * calculate_gaussian_probability(value, stats)
            scores[label] = running
        return scores

    return [score_row(sample) for sample in test_features_array]

In [217]:
# Score every test row, then predict the class with the largest score.
test_probabilities = calculate_test_features_probabilities(test_features)
test_class_pred = []

for class_scores in test_probabilities:
    # Only strictly positive scores can win; if none is positive the
    # prediction for that row is None. Ties go to the class seen first.
    positive_scores = {label: score for label, score in class_scores.items() if score > 0}
    if positive_scores:
        test_class_pred.append(max(positive_scores, key=positive_scores.get))
    else:
        test_class_pred.append(None)

# Compare predictions with the true labels pairwise.
scored_pairs = list(zip(test_class_pred, test_classes))
total_test_items = len(scored_pairs)
total_test_correct = sum(1 for predicted, actual in scored_pairs if predicted == actual)

accuracy = total_test_correct / float(total_test_items)

In [218]:
accuracy

0.96

In [219]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Cross-check: fit scikit-learn's GaussianNB on the same train split and
# score it on the same test split as the hand-written classifier.
clf = GaussianNB()
clf.fit(train_features,train_classes)
clf_pred = clf.predict(test_features)

# accuracy_score is declared as (y_true, y_pred): the true labels go first.
print(accuracy_score(test_classes, clf_pred))

0.96
