In [2]:
import pandas as pd
import random
import numpy as np

In [3]:
# Read the dataset
data = pd.read_csv("pima-indians-diabetes.csv",header = None)

In [4]:
# Visualize some data samples from the dataset
print('Pima Indians Diabetes Data Set\n')
print(data.head())

Pima Indians Diabetes Data Set

   0    1   2   3    4     5      6   7  8
0  6  148  72  35    0  33.6  0.627  50  1
1  1   85  66  29    0  26.6  0.351  31  0
2  8  183  64   0    0  23.3  0.672  32  1
3  1   89  66  23   94  28.1  0.167  21  0
4  0  137  40  35  168  43.1  2.288  33  1


In [5]:
# 8th column is the class label
print('\n\n\nStats for the 7 features over the dataset and the 2 classes {8th column}{diabetic/not-diabetic}\n')
print(data.describe())




Stats for the 7 features over the dataset and the 2 classes {8th column}{diabetic/not-diabetic}

                0           1           2           3           4           5  \
count  768.000000  768.000000  768.000000  768.000000  768.000000  768.000000   
mean     3.845052  120.894531   69.105469   20.536458   79.799479   31.992578   
std      3.369578   31.972618   19.355807   15.952218  115.244002    7.884160   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      1.000000   99.000000   62.000000    0.000000    0.000000   27.300000   
50%      3.000000  117.000000   72.000000   23.000000   30.500000   32.000000   
75%      6.000000  140.250000   80.000000   32.000000  127.250000   36.600000   
max     17.000000  199.000000  122.000000   99.000000  846.000000   67.100000   

                6           7           8  
count  768.000000  768.000000  768.000000  
mean     0.471876   33.240885    0.348958  
std      0.331329   11.760232    0.476

In [6]:
TRAIN_TEST_RATIO = 0.8        # 80% training data
picker = list(range(data.shape[0]))        # get all indices as a list
## sometimes the data is arranged classwise and not randomly
## therefore we shuffle the indices
random.shuffle(picker)
trainMax = int(data.shape[0] * TRAIN_TEST_RATIO)

train_features = []
test_features = []
train_labels = []
test_labels = []

for pick in picker[:trainMax]:
    train_features.append(data.values[pick][:-1])
    train_labels.append(int(data.values[pick][-1]))
for pick in picker[trainMax:]:
    test_features.append(data.values[pick][:-1])
    test_labels.append(int(data.values[pick][-1]))

train_features = np.array(train_features)
test_features = np.array(test_features)

In [7]:
data.values[pick]

array([  2.   , 100.   ,  54.   ,  28.   , 105.   ,  37.8  ,   0.498,
        24.   ,   0.   ])

In [8]:
print(train_features.shape, len(train_labels), test_features.shape, len(test_labels))

(614, 8) 614 (154, 8) 154


### Exercise 1: Calculate Prior 



In [9]:
# Get the number of unique classes & corresponding number of elements belonging to each class
classes, counts = np.unique(train_labels, return_counts=True)
print(classes)
print(counts)

[0 1]
[401 213]


In [10]:
### I assume my classes are from 0 ... N for some N (Here, we have just 2 classes)
num_classes = len(classes)
num_feats = train_features.shape[1]  #total number of features
total_samples = len(train_labels)    #total number of samples

### **Exercise 1: Find the prior probability of each class as the list `prior`**

In [11]:
# Prior for any class = {number of samples belonging to that class/ total_samples}
prior = np.array([ x*1.0/total_samples for x in counts ])

In [12]:
print(prior)

[0.65309446 0.34690554]


In [13]:
### Calculate the mean and variance per feature dimension here 
### from the training set from samples belonging to each class label.

means = np.zeros((num_feats, num_classes)) # every feature, for each class
stddev = np.zeros((num_feats, num_classes)) # every feature, for each class

# For each class
for y in classes: # selecting a class 'y'
    pts = train_features[np.where( train_labels == y )[0], :]    # get all samples belonging to 'y'
    # For each feature
    for i in range(num_feats):
        means[i, y] = np.mean(pts[:, i])
        stddev[i, y] = np.std(pts[:, i])

### This completes the training phase
### We know have estimated both the prior probability and the posterior distributions from our training set.



### Exercise 2: Complete the Gaussian function based on the above equation ###

In [14]:
def gaussian(x, m, v):
    g = np.sqrt(1.0/2*np.pi*v*v)*np.exp( -1.0*(((x - m)/v)**2) )
    return g

### Exercise 3: Find the likelihood for each class 'y', once you have $P(X_{i}|y)$ from Exercise 2 ###

In [15]:
def get_likelihood(point, means, stddev):
    
    feat_prob = np.zeros((num_feats, num_classes))
    for y in classes:
        for i in range(num_feats):
            feat_prob[i, y] = gaussian(point[i], means[i, y], stddev[i, y]) # get the probability
    
    likelihood = np.zeros((num_classes, 1)) # likelihood for each class 'y'
    for y in classes:
        # Take the product of all the feature likelihoods of the class considered
        likelihood[y] = np.prod(feat_prob[np.nonzero(feat_prob), y]) # mutliply for each feature 'Xi'
    
    return likelihood

In [None]:
## Predict using Naive Bayes classifier

In [16]:
predictions = []
# For each test sample
for i in range(len(test_labels)):
    
    # Get its likelihood of belong to either class
    likelihood = get_likelihood(test_features[i, :], means, stddev)
    
    # Calculate the approximate posterior = likelihood * prior
    approx_posterior = [ np.asscalar(x*y) for x,y in zip(likelihood, prior) ]
    #approx because of missing P(X) (constant) in the denominator
    
    # Make the prediction as that class with the maximum approximate posterior
    prediction = np.argmax(approx_posterior)
    predictions.append(prediction)

In [17]:
print("Accuracy")
print(np.mean([x == y for x, y in zip(predictions, test_labels)]))

Accuracy
0.6558441558441559
