# Program 5 - Naive Bayes classifier for a dataset

---

The `statistics` module is used to compute the mean and standard deviation.

In [1]:
import csv, random, math
import statistics as st

## Get dataset - `loadCsv`

In [2]:
def loadCsv(filename):
    """Load a numeric CSV file into a list of rows of floats.

    Args:
        filename: Path of the CSV file to read. Every field of every
            non-empty row must be parseable by ``float``.

    Returns:
        A list with one entry per non-empty CSV row; each entry is a list
        of floats in column order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager closes the handle (the earlier version left it
    # open); newline="" is the mode the csv module asks for.
    with open(filename, "r", newline="") as csv_file:
        # csv.reader yields an empty list for a blank line; skip those so
        # downstream code can safely read row[-1] as the class label.
        return [[float(x) for x in row] for row in csv.reader(csv_file) if row]

## Split dataset function - `splitDataset`
Split the dataset into training and testing sets. `splitRatio` is the fraction of records placed in the **test** set.

In [17]:
def splitDataset(dataset, splitRatio):
    """Randomly partition ``dataset`` into a training set and a test set.

    ``splitRatio`` is the fraction of records moved into the *test* set:
    the test set receives ``int(len(dataset) * splitRatio)`` records drawn
    at random, and the training set keeps the rest.

    Returns a two-element list ``[trainSet, testSet]``. The list passed in
    is copied (shallowly) and is not modified itself.
    """
    remaining = list(dataset)  # starts as the whole dataset
    held_out = []
    wanted = int(len(dataset) * splitRatio)

    # Move one randomly chosen record at a time from the training pool to
    # the test set until the test set has the requested size.
    for _ in range(wanted):
        pick = random.randrange(len(remaining))
        held_out.append(remaining.pop(pick))

    return [remaining, held_out]

## `separateByClass` - Separate dataset based on class

`separated` is a dictionary in which each item is a key: value pair where:

- key : is the class value 
- value : is a list containing the dataset records which have target equal to that class value  

In [4]:
def separateByClass(dataset):
    """Group the rows of ``dataset`` by their class label.

    The class label is taken to be the last element of each row.

    Returns a dictionary mapping each label to the list of rows carrying
    that label, with rows kept in their original order.
    """
    grouped = {}
    for record in dataset:
        # setdefault creates the bucket the first time a label is seen.
        grouped.setdefault(record[-1], []).append(record)
    return grouped

## `compute_mean_std` - to compute Mean and Standard Dev.

The mean and standard deviation are calculated using the `statistics` package.

They are calculated for every attribute; the last column (the target label) is excluded.

`zip(*dataset)` transposes a matrix so that each row is an attribute, and the mean and STD are calculated for that row, which is an attribute.

In [15]:
def compute_mean_std(dataset):
    """Compute (mean, standard deviation) for every attribute column.

    Args:
        dataset: A list of equally long rows whose last element is the
            class label.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per attribute column in
        column order. The label column is not included. An empty dataset
        yields an empty list.

    Raises:
        statistics.StatisticsError: If a column has fewer than two values,
            because ``statistics.stdev`` needs at least two data points.
    """
    # zip(*dataset) transposes the rows so that each tuple is one column.
    columns = list(zip(*dataset))

    # Drop the label column *before* computing anything: this avoids wasted
    # work on the target and makes an empty dataset return [] instead of
    # failing on a delete from an empty list.
    return [(st.mean(column), st.stdev(column)) for column in columns[:-1]]

## `summarizeByClass` - Mean and STD of each target class

In [6]:
def summarizeByClass(dataset):
    """Summarize each class by the mean and std of its attributes.

    The rows are first grouped by class label with ``separateByClass``;
    ``compute_mean_std`` is then applied to each group.

    Returns a dictionary mapping each class label to that group's list of
    ``(mean, stdev)`` tuples.
    """
    rows_by_label = separateByClass(dataset)
    return {
        label: compute_mean_std(rows)
        for label, rows in rows_by_label.items()
    }

## `estimateProbability` - Gaussian (Normal) Distribution function

In [7]:
#For continuous attributes, p is estimated using Gaussian distribution
def estimateProbability(x, mean, stdev):
    """Evaluate the Gaussian (normal) density at ``x``.

    Args:
        x: The attribute value to score.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The value of the normal probability density function with the
        given ``mean`` and ``stdev`` at ``x``. When ``stdev`` is zero the
        density is undefined; by convention this returns ``1.0`` if ``x``
        equals ``mean`` and ``0.0`` otherwise.
    """
    # An attribute that is constant within a class has stdev == 0, which
    # used to raise ZeroDivisionError. Treat it as a point mass at the mean.
    if stdev == 0:
        return 1.0 if x == mean else 0.0

    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

## Calculate Probabilities of each target class

In [16]:
# calculate class probabilities of that entire row (testVector)
def calculateClassProbabilities(summaries, testVector):
    """Score ``testVector`` against every class in ``summaries``.

    ``summaries`` maps each class label to a list of ``(mean, stdev)``
    tuples, one per attribute. For each class, the score is the product of
    ``estimateProbability`` over those attributes, using the value at the
    same position in ``testVector``. No class prior is multiplied in.

    Returns a dictionary mapping each class label to its score; a class
    with no attribute summaries gets a score of 1.
    """
    scores = {}
    for label, attribute_stats in summaries.items():
        product = 1
        for position, (mu, sigma) in enumerate(attribute_stats):
            # Attribute i of the test row is scored against attribute i's
            # Gaussian for this class.
            product *= estimateProbability(testVector[position], mu, sigma)
        scores[label] = product
    return scores

## Prediction

Compute the probability for every class, then select the class label with the highest probability.

In [9]:
# calculate best out of all class probabilities of that entire row (testVector)
def predict(summaries, testVector):
    """Return the class label with the highest score for ``testVector``.

    Scores come from ``calculateClassProbabilities``. If several classes
    share the highest score, the first one encountered wins. Returns
    ``None`` when ``summaries`` contains no classes.
    """
    scores = calculateClassProbabilities(summaries, testVector)
    # max() keeps the first maximal key, matching a strict ">" scan.
    return max(scores, key=scores.get, default=None)

## Classification of record

Here, we go through the **test set** and find the final predicted target values for all test data.

In [10]:
# find predicted class for each row in testSet
def perform_classification(summaries, testSet):
    """Predict a class label for every row of ``testSet``.

    Returns a list of the labels produced by ``predict``, in the same
    order as the rows of ``testSet``.
    """
    return [predict(summaries, row) for row in testSet]

## Accuracy

Accuracy (%) = (number of correct predictions / total number of test records) × 100

In [11]:
def getAccuracy(testSet, predictions):
    """Return the percentage of test rows whose label was predicted correctly.

    Args:
        testSet: List of rows; the last element of each row is the true
            class label.
        predictions: Predicted labels, where ``predictions[i]`` belongs to
            ``testSet[i]``.

    Returns:
        The accuracy as a float in the range 0.0 to 100.0. An empty
        ``testSet`` yields ``0.0``.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    # Guard the division: with no test rows there is nothing to score, and
    # the earlier version raised ZeroDivisionError here.
    if not testSet:
        return 0.0

    correct = sum(
        1 for i, row in enumerate(testSet) if row[-1] == predictions[i]
    )
    return (correct / float(len(testSet))) * 100.0

## Load dataset and do prediction.

In [12]:
# Load the dataset into a list of float rows.
# Alternative file name kept from the original notebook:
#   dataset = loadCsv('pima-indians-diabetes.csv')
dataset = loadCsv('prog5_dataset.csv')

print('Pima Indian Diabetes Dataset loaded...')
print('Total instances available :', len(dataset))
# The last column is the class label, so it is not counted as an attribute.
print('Total attributes present :', len(dataset[0]) - 1)
print("First Five instances of dataset:")
# Slice rather than index 0..4 so a dataset with fewer than five rows
# prints what it has instead of raising IndexError.
for i, row in enumerate(dataset[:5], start=1):
    print(i, ':', row)
    

Pima Indian Diabetes Dataset loaded...
Total instances available : 768
Total attributes present : 8
First Five instances of dataset:
1 : [6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
2 : [1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 0.0]
3 : [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0]
4 : [1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0, 0.0]
5 : [0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0, 1.0]


### Split and test

In [13]:
# splitRatio is the fraction of records held out for testing.
splitRatio = 0.2
trainingSet, testSet = splitDataset(dataset, splitRatio)

print('\nDataset is split into training and testing set.')
print(
    'Training examples = {0} \nTesting examples = {1}'.format(
        len(trainingSet), len(testSet)
    )
)

# Build the per-class (mean, std) summaries from the training rows only,
# then classify the held-out rows and score the predictions.
summaries = summarizeByClass(trainingSet)
predictions = perform_classification(summaries, testSet)
accuracy = getAccuracy(testSet, predictions)

print('\nAccuracy of the Naive Baysian Classifier is :', accuracy)


Dataset is split into training and testing set.
Training examples = 615 
Testing examples = 153

Accuracy of the Naive Baysian Classifier is : 75.16339869281046
