In [1]:
import math
import random
import csv
import numpy as np

Loading file from csv

In [2]:
def LoadCsv(fileName):
    lines=csv.reader(open(fileName,'r'))
    dataSet=list(lines)
    for i in range(len(dataSet)):
        dataSet[i]=[float(x) for x in dataSet[i]]
    return dataSet

Split Train and Test dataSets 

In [3]:
def splitDataSets(dataSet,splitRatio):
    trainSize=int(len(dataSet) * splitRatio)
    trainSet=[]
    copy=list(dataSet)
    while len(trainSet) < trainSize:
        index=random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return (trainSet,copy)
    

In [4]:
def separateByClass(dataSet):
    separated={}
    for i in range(len(dataSet)):
        vector=dataSet[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

In [5]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))

In [6]:
def stdDev(numbers):
    avg=mean(numbers)
    variance=sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

In [7]:
def summarize(dataSet):
    summaries=[(mean(attribute),stdDev(attribute)) for attribute in zip(*dataSet)]
    del summaries[-1]
    return summaries
    

In [8]:
def summarizeByClass(dataSet):
    separate=separateByClass(dataSet)
    summaries={}
    for classValue, instance in separate.items():
        summaries[classValue]=summarize(instance)
    return summaries

In [9]:
def calculateProbability(x,mean,stdev):
    exponent=math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1/(math.sqrt(2*math.pi)*stdev))*exponent

In [10]:
def calculateClassProbability(summaries,inputVector):
    prbabilities={}
    for classValue, classSummary in summaries.items():
        prbabilities[classValue]=1
        for i in range(len(classSummary)):
            mean, stdev = classSummary[i]
            x=inputVector[i]
            prbabilities[classValue] *= calculateProbability(x, mean, stdev)
        return prbabilities
        

In [11]:
def predict(summaries, inputVector):
    probabilities=calculateClassProbability(summaries,inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel


In [12]:
def getPrediction(summaries, testSet):
    predictions=[]
    for i in range(len(testSet)):
        result=predict(summaries,testSet[i])
        predictions.append(result)
    return predictions

In [13]:
def getAccuracy(testSet, predictions):
    correct=0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
        
    return (correct/float(len(testSet)))*100.0

In [18]:
def main():
    fileName='DataForAlgo/pima-india-diabetes/pima-indians-diabetes.csv'
    splitRatio=0.67
    dataSet=LoadCsv(fileName)
    trainSet, testSet = splitDataSets(dataSet,splitRatio)
    print(np.shape(trainSet))
    print(np.shape(testSet))
    print('Split {0} rows into TrainSet={1} testSet={2}'.format(len(dataSet),len(trainSet),len(testSet)))
    #Prepare the Model
    summaries=summarizeByClass(trainSet)
    #Testing the model 
    predictions=getPrediction(summaries,testSet)
    print(np.shape(predictions))
    accuracy=getAccuracy(testSet,predictions)
    print('Accuracy of this model is {0}%'.format(accuracy))

In [19]:
main()

(514, 9)
(254, 9)
Split 768 rows into TrainSet=514 testSet=254
(254,)
Accuracy of this model is 62.59842519685039%
