In [18]:
import csv
import random
import math

def loadcsv(filename):
    """Load a purely numeric CSV file as a list of rows of floats.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-blank line of the file; each entry is
        a list holding every field of that line converted with ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If any field cannot be converted to ``float`` (for
            example a header line).
    """
    # The context manager guarantees the handle is closed even when a field
    # fails to convert; newline="" is what the csv module asks for when a
    # file object is handed to csv.reader.
    with open(filename, "r", newline="") as handle:
        # csv.reader yields an empty list for a blank line; such rows carry
        # no attributes or label, so they are skipped rather than kept as [].
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

def splitDataset(dataset, splitratio):
    """Randomly partition ``dataset`` into a training part and a remainder.

    ``int(len(dataset) * splitratio)`` rows are drawn one at a time, without
    replacement, using ``random.randrange``. The input list itself is not
    modified; the rows are shared with it, not copied.

    Returns:
        A two-element list ``[training_rows, remaining_rows]``.
    """
    remaining = list(dataset)  # shallow copy so the caller's list stays intact
    target = int(len(dataset) * splitratio)
    picked = []
    for _ in range(target):
        # Pick a random position among the rows not yet chosen and move it.
        picked.append(remaining.pop(random.randrange(len(remaining))))
    return [picked, remaining]


def separatebyclass(dataset):
    """Group the rows of ``dataset`` by their last element (the class label).

    Returns:
        A dict mapping each distinct label to the list of rows carrying it,
        with rows kept in their original order.
    """
    grouped = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label is seen.
        grouped.setdefault(row[-1], []).append(row)
    return grouped


def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises ZeroDivisionError when ``numbers`` is empty.
    """
    count = float(len(numbers))
    return sum(numbers) / count


def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from the mean are divided by ``len(numbers) - 1``,
    so a sequence with exactly one element raises ZeroDivisionError.
    """
    centre = mean(numbers)
    squared_deviations = [pow(value - centre, 2) for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return math.sqrt(sum(squared_deviations) / degrees_of_freedom)


def summarize(dataset):
    """Return per-attribute ``(mean, stdev)`` pairs for the rows in ``dataset``.

    The rows are transposed into columns, every column is summarised, and
    the entry for the last column (the class label) is then discarded.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per attribute column.
    """
    stats = []
    for column in zip(*dataset):  # zip(*rows) walks the data column by column
        stats.append((mean(column), stdev(column)))
    del stats[-1]  # the last column holds the labels, not an attribute
    return stats


def summarizeByClass(dataset):
    """Build the per-class attribute statistics for ``dataset``.

    Rows are grouped by class label with ``separatebyclass`` and each group
    is passed to ``summarize``.

    Returns:
        A dict mapping each class label to that class's list of
        ``(mean, stdev)`` tuples.
    """
    return {
        label: summarize(rows)
        for label, rows in separatebyclass(dataset).items()
    }


def calculateprobability(x, mean, stdev):
    """Evaluate the normal (Gaussian) density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution; a value of 0 raises
            ZeroDivisionError.

    Returns:
        ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)``.
    """
    squared_distance = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stdev, 2)
    exponent = math.exp(-(squared_distance / twice_variance))
    normaliser = math.sqrt(2 * math.pi) * stdev
    return (1 / normaliser) * exponent


def calculateclassprobabilities(summaries, inputvector):
    """Score ``inputvector`` against every class in ``summaries``.

    For each class, the Gaussian density of every attribute value (via
    ``calculateprobability``) is multiplied together, starting from 1.
    NOTE: no class prior is included in the product, so the result is the
    product of per-attribute densities only.

    Args:
        summaries: Dict mapping class label to a list of ``(mean, stdev)``
            tuples, one per attribute.
        inputvector: Sequence whose leading elements are the attribute
            values, in the same order as the tuples in ``summaries``.

    Returns:
        A dict mapping each class label to its product of densities.
    """
    scores = {}
    for label, attribute_stats in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(attribute_stats):
            # Attribute ``position`` of the input is scored against the
            # distribution learned for that attribute in this class.
            likelihood *= calculateprobability(inputvector[position], mu, sigma)
        scores[label] = likelihood
    return scores


def predict(summaries, inputvector):
    """Return the class label with the highest score for ``inputvector``.

    Scores come from ``calculateclassprobabilities``. When several classes
    share the highest score, the one encountered first wins. ``None`` is
    returned when ``summaries`` contains no classes.
    """
    scores = calculateclassprobabilities(summaries, inputvector)
    winner, winning_score = None, -1
    for label, score in scores.items():
        # Skip any candidate that does not beat the current leader.
        if winner is not None and not score > winning_score:
            continue
        winner, winning_score = label, score
    return winner


def getPredictions(summaries, testset):
    """Return the predicted class label for every row of ``testset``.

    Each row is classified independently with ``predict``; the result list
    is in the same order as ``testset``.
    """
    return [predict(summaries, row) for row in testset]


def getAccuracy(testset, predictions):
    """Return the percentage of rows whose label matches its prediction.

    Args:
        testset: Rows whose last element is the true class label.
        predictions: Predicted labels; ``predictions[i]`` belongs to
            ``testset[i]``.

    Returns:
        A float between 0.0 and 100.0. An empty ``testset`` yields 0.0
        instead of raising ZeroDivisionError.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testset``.
    """
    if not testset:
        # Nothing to evaluate (e.g. a split ratio of 1.0 leaves no test rows).
        return 0.0
    correct = sum(
        1 for i, row in enumerate(testset) if row[-1] == predictions[i]
    )
    return (correct / float(len(testset))) * 100.0


def main():
    """Run the Naive Bayes pipeline on ``naivedata.csv`` and print the results.

    Loads the dataset, splits it 80/20 into training and test rows, prints
    up to the first five rows of each part, builds the per-class summaries
    from the training rows, predicts a label for every test row and prints
    the predictions and the accuracy.
    """
    filename = 'naivedata.csv'
    splitRatio = 0.80
    dataset = loadcsv(filename)
    print("\n The length of the Data Set : ", len(dataset))
    print("\n The Data Set Splitting into Training and Testing \n")
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('\n Number of Rows in Training Set:{0} rows'.format(len(trainingSet)))
    print('\n Number of Rows in Testing Set:{0} rows'.format(len(testSet)))
    print("\n First Five Rows of Training Set:\n")
    # Slicing instead of indexing 0..4 so a split with fewer than five rows
    # prints what is available rather than raising IndexError.
    for row in trainingSet[:5]:
        print(row, "\n")
    print("\n First Five Rows of Testing Set:\n")
    for row in testSet[:5]:
        print(row, "\n")
    # prepare model
    summaries = summarizeByClass(trainingSet)
    print("\n Model Summaries:\n", summaries)
    # test model
    predictions = getPredictions(summaries, testSet)
    print("\nPredictions:\n", predictions)
    accuracy = getAccuracy(testSet, predictions)
    print('\n Accuracy: {0}%'.format(accuracy))

# Run the pipeline only when executed as a script or notebook cell, not when
# this module is imported for its functions.
if __name__ == "__main__":
    main()


 The length of the Data Set :  768

 The Data Set Splitting into Training and Testing 


 Number of Rows in Training Set:614 rows

 Number of Rows in Testing Set:154 rows

 First Five Rows of Training Set:

[0.0, 118.0, 84.0, 47.0, 230.0, 45.8, 0.551, 31.0, 1.0] 

[4.0, 110.0, 76.0, 20.0, 100.0, 28.4, 0.118, 27.0, 0.0] 

[6.0, 144.0, 72.0, 27.0, 228.0, 33.9, 0.255, 40.0, 0.0] 

[2.0, 99.0, 0.0, 0.0, 0.0, 22.2, 0.108, 23.0, 0.0] 

[0.0, 137.0, 70.0, 38.0, 0.0, 33.2, 0.17, 22.0, 0.0] 


 First Five Rows of Testing Set:

[1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0, 0.0] 

[3.0, 126.0, 88.0, 41.0, 235.0, 39.3, 0.704, 27.0, 0.0] 

[11.0, 143.0, 94.0, 33.0, 146.0, 36.6, 0.254, 51.0, 1.0] 

[1.0, 97.0, 66.0, 15.0, 140.0, 23.2, 0.487, 22.0, 0.0] 

[3.0, 158.0, 76.0, 36.0, 245.0, 31.6, 0.851, 28.0, 1.0] 


 Model Summaries:
 {1.0: [(4.785046728971962, 3.7304181376052497), (140.57009345794393, 32.07637954747781), (69.77570093457943, 22.392700190170313), (21.598130841121495, 17.3263943205681