In [161]:
import math
import random
import csv
import pandas as pd

In [162]:
# the categorical class names are changed to numberic data
# eg: yes and no encoded to 1 and 0
def encode_class(mydata):
    classes = []
    for i in range(len(mydata)):
        if mydata[i][-1] not in classes:
            classes.append(mydata[i][-1])
    for i in range(len(classes)):
        for j in range(len(mydata)):
            if mydata[j][-1] == classes[i]:
                mydata[j][-1] = i
    return mydata 

In [163]:
# Splitting the data
def splitting(mydata, ratio):
    train_num = int(len(mydata) * ratio)
    train = []
    # initially testset will have all the dataset
    test = list(mydata)
    while len(train) < train_num:
        # index generated randomly from range 0
        # to length of testset
        index = random.randrange(len(test))
        # from testset, pop data rows and put it in train
        train.append(test.pop(index))
    return train, test

In [164]:
# Group the data rows under each class yes or no in dictionary eg: dict[yes] and dict[no]
def groupUnderClass(mydata):
      dict = {}
      for i in range(len(mydata)):
          if (mydata[i][-1] not in dict):
              dict[mydata[i][-1]] = []
          dict[mydata[i][-1]].append(mydata[i])
      return dict
 

In [165]:
# Mean
def mean(numbers):
    return sum(numbers) / float(len(numbers))
 

In [166]:
# Standard Deviation
def std_dev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    
    return math.sqrt(variance)

In [167]:
def MeanAndStdDev(mydata):
    info = [(mean(attribute), std_dev(attribute)) for attribute in zip(*mydata)]
    del info[-1]
    return info

In [168]:
# find Mean and Standard Deviation under each class
def MeanAndStdDevForClass(mydata):
    info = {}
    dict = groupUnderClass(mydata)
    for classValue, instances in dict.items():
        info[classValue] = MeanAndStdDev(instances)
    return info

In [169]:
# Calculate Gaussian Probability Density Function
def calculateGaussianProbability(x, mean, stdev):
    expo = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * expo

In [170]:
# Calculate Class Probabilities
def calculateClassProbabilities(info, test):
    probabilities = {}
    for classValue, classSummaries in info.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, std_dev = classSummaries[i]
            x = test[i]
            probabilities[classValue] *= calculateGaussianProbability(x, mean, std_dev)
    return probabilities

In [171]:
# Make prediction - highest probability is the prediction
def predict(info, test):
    probabilities = calculateClassProbabilities(info, test)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [172]:
# returns predictions for a set of examples
def getPredictions(info, test):
    predictions = []
    for i in range(len(test)):
        result = predict(info, test[i])
        predictions.append(result)
    return predictions

In [173]:
# Accuracy
def accuracy_rate(test, predictions):
    correct = 0
    for i in range(len(test)):
        if test[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(test))) * 100.0

In [174]:
df = pd.read_csv('pc1.arff')
df

Unnamed: 0,0,5,1,0.1,0.2,8,3,0.38,4,2,...,9,0.33.1,17.1,24,5.1,10.1,9.1,0.4,8.2,0.5
0,1,3,2,0,0,4,2,0.40,2,2.00,...,8,0.29,10,10,7,8,7,0.00,5,0
1,18,19,5,1,58,34,10,0.16,16,2.13,...,33,0.07,177,215,55,25,138,49.58,61,0
2,2,3,0,0,9,4,2,0.20,2,2.00,...,6,0.09,33,35,17,7,22,47.37,10,0
3,36,13,8,0,42,18,7,0.06,8,2.25,...,33,0.03,458,544,107,28,218,25.93,120,1
4,43,39,20,1,35,56,22,0.20,24,2.33,...,78,0.10,194,270,91,31,221,25.17,108,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753,56,71,13,42,24,140,36,0.26,70,2.00,...,147,0.16,489,567,79,29,230,40.00,141,0
754,6,7,2,2,7,12,4,0.14,6,2.00,...,15,0.10,51,57,16,12,42,25.71,28,0
755,4,20,8,2,2,24,12,0.27,12,2.00,...,38,0.23,99,122,45,26,52,8.51,45,0
756,4,11,4,0,3,18,6,0.25,8,2.25,...,18,0.19,52,59,20,15,32,11.11,24,0


In [175]:
df.shape

(758, 38)

In [176]:
# add file
filename = 'pc1.arff'
 
 
# load the file and store it in mydata list
mydata = csv.reader(open(filename))
mydata = list(mydata)
mydata = encode_class(mydata)
for i in range(len(mydata)):
    mydata[i] = [float(x) for x in mydata[i]]
 
     
# split ratio = 0.7
# 70% of data is training data and 30% is test data used for testing
ratio = 0.7
train_data, test_data = splitting(mydata, ratio)
print('Total number of examples are: ', len(mydata))
print('Out of these, training examples are: ', len(train_data))
print("Test examples are: ", len(test_data))
 
# prepare model
info = MeanAndStdDevForClass(train_data)
 
# test model
predictions = getPredictions(info, test_data)
accuracy = accuracy_rate(test_data, predictions)
print("Accuracy of your model is: ", accuracy)

Total number of examples are:  759
Out of these, training examples are:  531
Test examples are:  228
Accuracy of your model is:  85.08771929824562
