In [1]:
import math
import random
import csv
import pandas as pd

In [2]:
# the categorical class names are changed to numberic data
def encode_class(mydata):
    classes = []
    for i in range(len(mydata)):
        if mydata[i][-1] not in classes:
            classes.append(mydata[i][-1])
    for i in range(len(classes)):
        for j in range(len(mydata)):
            if mydata[j][-1] == classes[i]:
                mydata[j][-1] = i
    return mydata 

In [3]:
# Splitting the data
def splitting(mydata, ratio):
    train_num = int(len(mydata) * ratio)
    train = []
    # initially testset will have all the dataset
    test = list(mydata)
    while len(train) < train_num:
        # index generated randomly from range 0
        # to length of testset
        index = random.randrange(len(test))
        # from testset, pop data rows and put it in train
        train.append(test.pop(index))
    return train, test

In [4]:
# Group the data rows under each class yes or no in dictionary eg: dict[yes] and dict[no]
def groupUnderClass(mydata):
      dict = {}
      for i in range(len(mydata)):
          if (mydata[i][-1] not in dict):
              dict[mydata[i][-1]] = []
          dict[mydata[i][-1]].append(mydata[i])
      return dict
 

In [5]:
# Mean
def mean(numbers):
    return sum(numbers) / float(len(numbers))
 

In [6]:
# Standard Deviation
def std_dev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    
    return math.sqrt(variance)

In [7]:
def MeanAndStdDev(mydata):
    info = [(mean(attribute), std_dev(attribute)) for attribute in zip(*mydata)]
    del info[-1]
    return info

In [8]:
# find Mean and Standard Deviation under each class
def MeanAndStdDevForClass(mydata):
    info = {}
    dict = groupUnderClass(mydata)
    for classValue, instances in dict.items():
        info[classValue] = MeanAndStdDev(instances)
    return info

In [9]:
# Calculate Gaussian Probability Density Function
def calculateGaussianProbability(x, mean, stdev):
    expo = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * expo

In [10]:
# Calculate Class Probabilities
def calculateClassProbabilities(info, test):
    probabilities = {}
    for classValue, classSummaries in info.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, std_dev = classSummaries[i]
            x = test[i]
            probabilities[classValue] *= calculateGaussianProbability(x, mean, std_dev)
    return probabilities

In [11]:
# Make prediction - highest probability is the prediction
def predict(info, test):
    probabilities = calculateClassProbabilities(info, test)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [12]:
# returns predictions for a set of examples
def getPredictions(info, test):
    predictions = []
    for i in range(len(test)):
        result = predict(info, test[i])
        predictions.append(result)
    return predictions

In [13]:
# Accuracy
def accuracy_rate(test, predictions):
    correct = 0
    for i in range(len(test)):
        if test[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(test))) * 100.0

In [14]:
df = pd.read_csv('JM1.arff')
df

Unnamed: 0,447,826,12,157,470,385,113,2824,210.28,384.45,...,8441,0,1726654.57,80843.08,3021,5420,609,155,3442,1
0,37,29,8,42,19,19,6,133,108.14,46.32,...,685,0.02,12891.31,5009.32,295,390,121,38,222,1
1,11,405,0,17,404,2,1,814,101.20,206.01,...,2033,0.00,238607.05,20848.47,813,1220,811,411,844,1
2,106,240,7,344,127,105,33,952,218.17,215.17,...,5669,0.00,561159.25,46943.69,2301,3368,262,49,1411,1
3,101,464,11,75,263,256,140,1339,106.50,337.36,...,4308,0.00,673377.60,35928.07,1556,2752,226,98,1532,1
4,67,187,4,1,94,63,27,391,233.07,58.04,...,1780,0.02,43621.22,13527.84,718,1062,167,27,466,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9587,2,7,0,0,4,4,1,13,32.93,7.33,...,52,0.14,98.38,241.48,22,30,15,10,18,0
9588,2,3,0,0,2,2,1,5,15.72,8.25,...,30,0.12,59.43,129.66,11,19,8,12,9,0
9589,10,7,0,1,4,2,1,29,19.68,26.40,...,103,0.04,762.04,519.57,44,59,15,18,42,0
9590,2,1,0,0,1,1,1,6,17.44,8.44,...,36,0.12,68.98,147.15,15,21,8,9,10,0


In [15]:
df.shape

(9592, 22)

In [16]:
# add file
filename = 'JM1.arff'
 
 
# load the file and store it in mydata list
mydata = csv.reader(open(filename))
mydata = list(mydata)
mydata = encode_class(mydata)
for i in range(len(mydata)):
    mydata[i] = [float(x) for x in mydata[i]]
 
     
# split ratio = 0.7
# 70% of data is training data and 30% is test data used for testing
ratio = 0.7
train_data, test_data = splitting(mydata, ratio)
print('Total number of examples are: ', len(mydata))
print('Out of these, training examples are: ', len(train_data))
print("Test examples are: ", len(test_data))
 
# prepare model
info = MeanAndStdDevForClass(train_data)
 
# test model
predictions = getPredictions(info, test_data)
accuracy = accuracy_rate(test_data, predictions)
print("Accuracy of your model is: ", accuracy)

Total number of examples are:  9593
Out of these, training examples are:  6715
Test examples are:  2878
Accuracy of your model is:  81.93189715079917
