In [26]:
#load dataset
import csv

def load_file(file):
    """Load a purely numeric CSV file into a list of rows of floats.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty CSV row; each entry is a list
        holding that row's fields converted with ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager closes the handle even if a conversion fails;
    # newline='' is what the csv module expects for file objects it reads.
    with open(file, 'r', newline='') as handle:
        # csv.reader yields [] for blank lines (e.g. a trailing newline);
        # skip them so every returned row has a last (class) column.
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

# Path of the CSV file handed to load_file (a relative path, so it is resolved
# against the current working directory).
filename = 'pima-indians-diabetes.data.csv'
# load_file converts every field of every row to float, including the last
# column, which the later cells treat as the class label.
DataSet = load_file(filename)
print("The number of rows in the dataset is {}".format(len(DataSet)))

The number of rows in the dataset is 768


In [36]:
# split dataset between training and test sets
import random
import copy

def split(dataset, split_ratio, seed=None):
    """Randomly partition ``dataset`` into a training set and a test set.

    Rows are moved one at a time from a deep copy of ``dataset`` into the
    test set until only ``int(len(dataset) * split_ratio)`` rows remain, so
    the caller's ``dataset`` is never modified.

    Args:
        dataset: List of rows to split.
        split_ratio: Fraction of the rows to keep for training. A value of 0
            sends every row to the test set; a value of 1 or more keeps
            every row for training.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the split is reproducible; when ``None`` the
            module-level ``random`` generator is used.

    Returns:
        A ``(training, test)`` tuple of lists.

    Raises:
        ValueError: If ``split_ratio`` is negative.
    """
    if split_ratio < 0:
        raise ValueError(
            "split_ratio must be non-negative, got {}".format(split_ratio))

    rng = random if seed is None else random.Random(seed)
    len_training = int(len(dataset) * split_ratio)
    test = []
    training = copy.deepcopy(dataset)
    while len(training) > len_training:
        # Draw over the whole remaining list. Drawing over
        # range(len_training) would never select the trailing rows and
        # would raise on an empty range when len_training is 0.
        random_index = rng.randrange(len(training))
        test.append(training.pop(random_index))
    return training, test

# Seed the module-level generator that split() draws from, so the partition
# (and every figure derived from it further down) is identical on each
# Restart & Run All.
random.seed(42)
# Keep 95 % of the rows for training and hold out the remainder for testing.
Training_Set, Test_Set = split(DataSet, 0.95)
print("Size of the Original DataSet : {}".format(len(DataSet)))
print("Size of the Training DataSet : {}".format(len(Training_Set)))
print("Size of the Test DataSet : {}".format(len(Test_Set)))

Size of the Original DataSet : 768
Size of the Training DataSet : 729
Size of the Test DataSet : 39


In [43]:
# Separate dataset by class

# assumption : the last value refers to the class
def separatedbyclass(ds):
    """Group the rows of ``ds`` by their last element.

    Args:
        ds: Sequence of rows; the last value of each row is taken as its
            class label.

    Returns:
        A dict mapping each class label to the list of rows carrying it,
        with rows kept in their order of appearance in ``ds``.
    """
    groups = {}
    for row in ds:
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(row[-1], []).append(row)
    return groups

# Group the held-out rows by class label (the last value of each row).
# NOTE(review): despite its name this is built from Test_Set, and no later
# cell shown here reads DataSet_Class -- confirm whether it is still needed.
DataSet_Class = separatedbyclass(Test_Set)

[1.0, 103.0, 30.0, 38.0, 83.0, 43.3, 0.183, 33.0, 0.0]


In [145]:
import math

# Calculate mean
def average(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    total = sum(numbers)
    count = len(numbers)
    return total / float(count)

# Calculate the sample standard deviation (French: écart type)
def standard_deviation(numbers):
    """Return the sample standard deviation of ``numbers``.

    The sum of squared deviations from the mean is divided by
    ``len(numbers) - 1`` before the square root is taken.

    Raises:
        ZeroDivisionError: If ``numbers`` holds fewer than two values.
    """
    centre = average(numbers)
    squared_gaps = [math.pow(value - centre, 2) for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return math.sqrt(sum(squared_gaps) / degrees_of_freedom)

In [146]:

#	summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]

# summarizing dataset
def summarize(dataset):
    """Compute (mean, standard deviation) for every attribute column.

    Args:
        dataset: List of rows; the last value of each row is the class
            label and is excluded from the summary.

    Returns:
        A list with one ``(average, standard_deviation)`` tuple per
        attribute column, in column order. An empty ``dataset`` yields an
        empty list.
    """
    columns = list(zip(*dataset))
    # Drop the label column before computing statistics: its summary was
    # only ever thrown away, and skipping it lets labels be non-numeric.
    # Slicing (instead of del summaries[-1]) also tolerates an empty dataset.
    return [(average(attribute), standard_deviation(attribute))
            for attribute in columns[:-1]]

# Toy check: each row has two attributes followed by a class label, so
# summarize() is expected to return two (mean, stdev) pairs.
dataset = [[1,20,0], [2,21,1], [3,22,0], [3,22,0]]
summary = summarize(dataset)
print('Attribute summaries: {0}'.format(summary))

Attribute summaries: [(2.25, 0.9574271077563381), (21.25, 0.9574271077563381)]


In [98]:
# summarizing dataset per class

def summarizeByClass(ds):
    """Summarize the attributes of ``ds`` separately for each class.

    Args:
        ds: List of rows whose last value is the class label.

    Returns:
        A dict mapping each class label to the list of (mean, standard
        deviation) tuples produced by ``summarize`` for that class's rows.
    """
    return {label: summarize(rows)
            for label, rows in separatedbyclass(ds).items()}

# Toy check: two rows per class (labels 1 and 0 in the last column), so each
# class gets its own pair of (mean, stdev) tuples.
dataset = [[1,20,1], [2,21,0], [3,22,1], [4,22,0]]
summary = summarizeByClass(dataset)
print('Summary by class value: {0}'.format(summary))

#Summary by class value: 
#{0: [(3.0, 1.4142135623730951), (21.5, 0.7071067811865476)], 
#1: [(2.0, 1.4142135623730951), (21.0, 1.4142135623730951)]}

Summary by class value: {1: [(2.0, 1.4142135623730951), (21.0, 1.4142135623730951)], 0: [(3.0, 1.4142135623730951), (21.5, 0.7071067811865476)]}


In [117]:
# Calculate Gaussian Probability Density Function
import math

def gauss(x, avg, std):
    """Evaluate the Gaussian probability density function at ``x``.

    Args:
        x: Point at which the density is evaluated.
        avg: Mean of the distribution.
        std: Standard deviation of the distribution.

    Returns:
        The density ``exp(-(x - avg)^2 / (2 std^2)) / (std * sqrt(2 pi))``.

    Raises:
        ZeroDivisionError: If ``std`` is 0.
    """
    exponent = -(math.pow(x - avg, 2) / (2 * math.pow(std, 2)))
    normaliser = std * math.sqrt(2 * math.pi)
    return math.exp(exponent) / normaliser

# Sanity check: evaluate the Gaussian density at x = 19 for a distribution
# with mean 20 and standard deviation 1.
# NOTE: the printed value is a probability density, not a probability.
x = 19
mean = 20
stdev = 1
probability = gauss(x, mean, stdev)
print('Probability of belonging to this class: {0}'.format(probability))

Probability of belonging to this class: 0.24197072451914337


In [120]:

def calculateClassProbabilities(summaries, inputVector):
    """Score ``inputVector`` against every class summary.

    For each class the Gaussian densities of the individual attributes are
    multiplied together.

    Args:
        summaries: Dict mapping a class label to a list of
            (mean, standard deviation) tuples, one per attribute.
        inputVector: Sequence of attribute values. Only as many leading
            entries as the class has summaries are read, so trailing
            values are ignored.

    Returns:
        A dict mapping each class label to the product of its
        per-attribute densities (1 for a class with no summaries).
    """
    probabilities = {}
    for label, attribute_stats in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(attribute_stats):
            likelihood *= gauss(inputVector[position], mu, sigma)
        probabilities[label] = likelihood
    return probabilities

# Toy check: each class has a single (mean, stdev) pair, so only
# inputVector[0] is read; the remaining entries (3 and '?') are ignored.
summaries = {0:[(1, 0.5)], 1:[(20, 5.0)]}
inputVector = [1, 3, '?']
probabilities = calculateClassProbabilities(summaries, inputVector)
print('Probabilities for each class: {0}'.format(probabilities))

Probabilities for each class: {0: 0.7978845608028654, 1: 5.838938515829206e-05}


In [121]:
# return highest probability
def predict(summaries, inputVector):
    """Return the class label with the highest score for ``inputVector``.

    Args:
        summaries: Per-class attribute summaries, as accepted by
            ``calculateClassProbabilities``.
        inputVector: Attribute values of the sample to classify.

    Returns:
        The label whose score is largest; on ties the label encountered
        first wins. ``None`` when ``summaries`` is empty.
    """
    winner, top_score = None, -1
    scores = calculateClassProbabilities(summaries, inputVector)
    for label in scores:
        score = scores[label]
        # Skip any candidate that does not strictly beat the current best.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner

# Toy check: classify the example inputVector against the two-class toy
# summaries defined for the calculateClassProbabilities check.
print('Class {}'.format(predict(summaries, inputVector)))

Class 0


In [122]:
def getPredictions(summaries, testSet):
    """Predict a class label for every row of ``testSet``.

    Args:
        summaries: Per-class attribute summaries passed on to ``predict``.
        testSet: Sequence of rows to classify.

    Returns:
        A list of predicted labels, in the same order as ``testSet``.
    """
    return [predict(summaries, row) for row in testSet]

In [123]:
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose label was predicted correctly.

    Args:
        testSet: Sequence of rows; the last value of each row is its true
            class label.
        predictions: Predicted labels, indexed like ``testSet``.

    Returns:
        A float between 0.0 and 100.0.

    Raises:
        ZeroDivisionError: If ``testSet`` is empty.
    """
    hits = sum(
        1 for idx, row in enumerate(testSet) if row[-1] == predictions[idx]
    )
    return (hits / float(len(testSet))) * 100.0

In [167]:
# Fit the per-class (mean, stdev) summaries on the training rows, predict a
# label for each held-out row, and report the share predicted correctly.
summaries = summarizeByClass(Training_Set)
predictions = getPredictions(summaries, Test_Set)
print("The accuracy of the Gaussian model is %.2f %%" % getAccuracy(Test_Set, predictions))

The accuracy of the Gaussian model is 74.36 %
