# Naive Bayes Classifer
---
### Gaussian pdf

A personal [tutorial](http://machinelearningmastery.com/naive-bayes-classifier-scratch-python/)

In [76]:
import random, math
import numpy as np
from __future__ import division # screw integer math

In [77]:
def loadCSV(filename):
    """Self-explantory"""
    dataset = []
    with open(filename) as lines:
        for line in lines:
            line = line.split(",")
            line = map(float, line)
            dataset.append(line)
    return dataset

In [78]:
name = "pima-indians-diabetes.data.txt"
dataset = loadCSV(name)
print "Loaded %s correctly with %d instances" % (name, len(dataset))

Loaded pima-indians-diabetes.data.txt correctly with 768 instances


In [79]:
def splitDataset(dataset, splitRatio):
    """Does not include cross-validation"""
    trainSize = int(len(dataset)*splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

In [80]:
dataset = [[1],[2],[3],[4],[5]]
[train, tmp] = splitDataset(dataset, 0.6)
print train

[[4], [3], [2]]


In [114]:
def separateByClass(dataset):
    """Assumes last attribute is class"""
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated): # could be replaced by default dict
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated;

In [101]:
dataset = [[1,20,1], [2,21,0], [3,22,1]]
separated = separateByClass(dataset)
print('Separated instances: {0}').format(separated)

[1, 20, 1]
[2, 21, 0]
[3, 22, 1]
Separated instances: {0: [[2, 21, 0]], 1: [[1, 20, 1], [3, 22, 1]]}


In [105]:
def mean(numbers):
    return sum(numbers)/len(numbers)

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

In [106]:
numbers = [1,2,3,4,5]
print('Summary of {0}: mean={1}, stdev={2}').format(numbers, mean(numbers), stdev(numbers))

Summary of [1, 2, 3, 4, 5]: mean=3.0, stdev=1.58113883008


In [107]:
def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del(summaries[-1]) # deleting summary of class 'attribute'
    return summaries

In [109]:
dataset = [[1,20,0], [2,21,1], [3,22,0]]
summary = summarize(dataset)
print('Attribute summaries: {0}').format(summary)

Attribute summaries: [(2.0, 1.0), (21.0, 1.0)]


In [110]:
def summarizeByClass(dataset):
	separated = separateByClass(dataset)
	summaries = {}
	for classValue, instances in separated.iteritems():
		summaries[classValue] = summarize(instances)
	return summaries

In [113]:
#dataset = [[1,20,1], [2,21,0], [3,22,1], [4,22,0]]
summary = summarizeByClass(dataset)
#print('Summary by class value: {0}').format(summary)

[1, 20, 1]
[2, 21, 0]
[3, 22, 1]
[4, 22, 0]


# Make Prediction
---

We are now ready to make predictions using the summaries prepared from our training data. Making predictions involves calculating the probability that a given data instance belongs to each class, then selecting the class with the largest probability as the prediction.

We can divide this part into the following tasks:

* Calculate Gaussian Probability Density Function
* Calculate Class Probabilities
* Make a Prediction
* Estimate Accuracy

In [87]:
def calculateProbability(x, mean, stdev):
    exponent = np.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

$$ p(x) = \frac{1}{\sqrt{2\pi\sigma^{2}}}e^{\frac{-( x - \mu )^{2}}{2\sigma^{2}}} $$

In [88]:
x = 71.5
mean = 73
stdev = 6.2
probability = calculateProbability(x, mean, stdev)
print('Probability of belonging to this class: {0}').format(probability)

Probability of belonging to this class: 0.0624896575937


# Calculate Class Probabilities

Now that we can calculate the probability of an attribute belonging to a class, we can combine the probabilities of all of the attribute values for a data instance and come up with a probability of the entire data instance belonging to the class.

**We combine probabilities together by multiplying them.** In the calculateClassProbabilities() below, the probability of a given data instance is calculated by multiplying together the attribute probabilities for each class. the result is a map of class values to probabilities.

In [89]:
def calculateClassProbability(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.iteritems(): # iteritems() returns [key, value] of dictionaries
        probabilities[classValue] = 1
        for i in range(len(classSummaries)): # classSummaries contains 1 tuple, composed of mean & stdev
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

### An explanation of inputVector

Because summaries is a dict composed:
        
        {'class': [(mean, stdev)]}
        
calculateClassProbability only goes through 1st item in inputVector bc it only iterates over 1 item in value

*Yes it's very unwieldy and I hate it, lowkey*

In [90]:
summaries = {0:[(1, 0.5)], 1:[(20, 5.0)]}
inputVector = [1.1, '?']
probabilities = calculateClassProbability(summaries, inputVector)
print('Probabilities for each class: {0}').format(probabilities)

Probabilities for each class: {0: 0.78208538795091176, 1: 6.2987362581504424e-05}


In [91]:
def predict(summaries, inputVector):
    p = calculateClassProbability(summaries, inputVector)
    best, maxP = None, 0
    for value, prob in p.iteritems():
        if prob > maxP:
            maxP = prob
            best = value
    return best

In [92]:
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
inputVector = [1.1]
result = predict(summaries, inputVector)
print('Prediction: {0}').format(result)

Prediction: A


# Classifying Datasets

In [93]:
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

In [94]:
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
testSet = [[1.1, '?'], [19.1, '?']]
predictions = getPredictions(summaries, testSet)
print('Predictions: {0}').format(predictions)

Predictions: ['A', 'B']


# Accuracy

In [95]:
def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return correct/len(testSet) * 100

In [96]:
testSet = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]
predictions = ['a', 'a', 'a']
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: {0}').format(accuracy)

Accuracy: 66.6666666667


# Tying it together

In [97]:
def main():
    filename = "pima-indians-diabetes.data.txt"
    splitRatio = 0.67
    dataset = loadCSV(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet), len(testSet))
    # preparing model
    summaries = summarizeByClass()