Download dataset from https://www.kaggle.com/uciml/pima-indians-diabetes-database#diabetes.csv 

1. PregnanciesNumber of times pregnant
2. GlucosePlasma glucose concentration a 2 hours in an oral glucose tolerance test
3. BloodPressureDiastolic blood pressure (mm Hg)
4. SkinThicknessTriceps skin fold thickness (mm)
5. Insulin2-Hour serum insulin (mu U/ml)
6. BMIBody mass index (weight in kg/(height in m)^2)
7. DiabetesPedigreeFunctionDiabetes pedigree function
8. AgeAge (years)
9. OutcomeClass variable (0 or 1)


In [1]:
!curl https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv --output data/pima_indian.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 23278  100 23278    0     0  21427      0  0:00:01  0:00:01 --:--:-- 21434


In [2]:
import pandas as pd
import numpy as np

data = pd.read_csv("data/pima_indian.csv", header=None)
data.columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI','DiabetesPedigreeFunction', 'Age', 'Outcome'
]
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


## Summarize Data
1. Separate data By class
2. Calculate Mean and Standard Deviation by attributes by class

In [4]:
np.sort(data['Pregnancies'].unique())

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 17])

In [5]:
np.sort(data['Outcome'].unique())

array([0, 1])

### 1. Separate Data By Class

In [6]:
# Class 0

data_out_0 = data.loc[data['Outcome']==0]
data_out_0 = data_out_0.drop(['Outcome'], axis=1)
data_out_0.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
1,1,85,66,29,0,26.6,0.351,31
3,1,89,66,23,94,28.1,0.167,21
5,5,116,74,0,0,25.6,0.201,30
7,10,115,0,0,0,35.3,0.134,29
10,4,110,92,0,0,37.6,0.191,30


In [7]:
#Class 1

data_out_1 = data.loc[data['Outcome']==1]
data_out_1 = data_out_1.drop(['Outcome'], axis=1)
data_out_1.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
2,8,183,64,0,0,23.3,0.672,32
4,0,137,40,35,168,43.1,2.288,33
6,3,78,50,32,88,31.0,0.248,26
8,2,197,70,45,543,30.5,0.158,53


### 2. Calculate Class Prior Probability

It'll calculate prior probability for each class. Only it needs frequencies.

In [8]:
print(data_out_0.shape)
print(data_out_1.shape)
print(data.shape)

(500, 8)
(268, 8)
(768, 9)


In [9]:
prior_class = {}
prior_class[0] = data_out_0.shape[0] / data.shape[0]
prior_class[1] = data_out_1.shape[0] / data.shape[0] 

print(prior_class[0] + prior_class[1])

1.0


### 3. Calculate Mean and Standard Deviation by attributes by class

This will be useful to calculate the likelihood.

For outcome 0

In [10]:
desc_0 = data_out_0.describe()
desc_0

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
std,3.017185,26.1412,18.063075,14.889947,98.865289,7.689855,0.299085,11.667655
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,93.0,62.0,0.0,0.0,25.4,0.22975,23.0
50%,2.0,107.0,70.0,21.0,39.0,30.05,0.336,27.0
75%,5.0,125.0,78.0,31.0,105.0,35.3,0.56175,37.0
max,13.0,197.0,122.0,60.0,744.0,57.3,2.329,81.0


In [11]:
mean_stan_0 = desc_0.loc[['mean', 'std']]
mean_stan_0

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
mean,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
std,3.017185,26.1412,18.063075,14.889947,98.865289,7.689855,0.299085,11.667655


For outcome 1

In [12]:
desc_1 = data_out_1.describe()
desc_1

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,268.0,268.0,268.0,268.0,268.0,268.0,268.0,268.0
mean,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164
std,3.741239,31.939622,21.491812,17.679711,138.689125,7.262967,0.372354,10.968254
min,0.0,0.0,0.0,0.0,0.0,0.0,0.088,21.0
25%,1.75,119.0,66.0,0.0,0.0,30.8,0.2625,28.0
50%,4.0,140.0,74.0,27.0,0.0,34.25,0.449,36.0
75%,8.0,167.0,82.0,36.0,167.25,38.775,0.728,44.0
max,17.0,199.0,114.0,99.0,846.0,67.1,2.42,70.0


In [13]:
mean_stan_1 = desc_1.loc[['mean', 'std']]
mean_stan_1

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
mean,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164
std,3.741239,31.939622,21.491812,17.679711,138.689125,7.262967,0.372354,10.968254


## Calculate the Posterior probability
Because, it's a Gaussian Naive Bayes; we need the Gaussian Density Function
$$p(x) = \frac{1}{\sqrt{ 2 \pi \sigma^2 }} e^{ - \frac{ (x - \mu)^2 } {2 \sigma^2} }, $$


In [14]:
import math

def calculateLikehood(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1/(math.sqrt(2*math.pi)*stdev))*exponent

def calculatedPosteriorProbability(summ, x, prior_prob):
    prob = prior_prob
    i = 0
    for column in summ:
        prob *= calculateLikehood(x[i], summ[column]['mean'], summ[column]['std'])
        i += 1
    return prob   

In [15]:
a0_0 = data_out_0.iloc[0].values
print("data test of outcome0")
print(a0_0)
print("probability for outcome 0: " + str(calculatedPosteriorProbability(mean_stan_0, a0_0, prior_class[0])))
print("probability for outcome 1: " + str(calculatedPosteriorProbability(mean_stan_1, a0_0, prior_class[1])))

a0_0 = data_out_1.iloc[0].values
print("\n\ndata test of outcome1")
print(a0_0)
print("probability for outcome 0: " + str(calculatedPosteriorProbability(mean_stan_0, a0_0, prior_class[0])))
print("probability for outcome 1: " + str(calculatedPosteriorProbability(mean_stan_1, a0_0, prior_class[1])))

data test of outcome0
[ 1.    85.    66.    29.     0.    26.6    0.351 31.   ]
probability for outcome 0: 1.9369577299087912e-12
probability for outcome 1: 3.8633100305024415e-14


data test of outcome1
[  6.    148.     72.     35.      0.     33.6     0.627  50.   ]
probability for outcome 0: 1.5601046357664668e-13
probability for outcome 1: 3.1582472589428555e-13


In [16]:
def getClassMoreProb(x):
    val0 = calculatedPosteriorProbability(mean_stan_0, x, prior_class[0])
    val1 = calculatedPosteriorProbability(mean_stan_1, x, prior_class[1])
    if val1 > val0:
        return 1
    return 0

def getAccuracy(testSet):
    correct = 0;
    rows, _ = testSet.shape
    for i in range(0, rows):
        val = testSet.iloc[i].values
        if getClassMoreProb(val[:-1]) == val[-1]:
            correct += 1        
    return (correct/len(testSet))*100

print("Accuracy " + str(getAccuracy(data)))


Accuracy 76.171875


#### Low values

In [17]:
a1_0 = data_out_1.iloc[0].values
print(a1_0)

[  6.    148.     72.     35.      0.     33.6     0.627  50.   ]


In [18]:
mean_10 = mean_stan_1['Pregnancies']['mean']
std_10 = mean_stan_1['Pregnancies']['std']
print(mean_10)
print(std_10)
print(calculateLikehood(6, mean_stan_1['Pregnancies']['mean'], mean_stan_1['Pregnancies']['std']))
print(calculateLikehood(148, mean_stan_1['Glucose']['mean'], mean_stan_1['Glucose']['std']))
print(calculateLikehood(72, mean_stan_1['BloodPressure']['mean'], mean_stan_1['BloodPressure']['std']))
print(calculateLikehood(35, mean_stan_1['SkinThickness']['mean'], mean_stan_1['SkinThickness']['std']))
print(calculateLikehood(0, mean_stan_1['Insulin']['mean'], mean_stan_1['Insulin']['std']))
print(calculateLikehood(33.6, mean_stan_1['BMI']['mean'], mean_stan_1['BMI']['std']))
print(calculateLikehood(0.627, mean_stan_1['DiabetesPedigreeFunction']['mean'], mean_stan_1['DiabetesPedigreeFunction']['std']))
print(calculateLikehood(50, mean_stan_1['Age']['mean'], mean_stan_1['Age']['std']))
print(prior_class[1])

4.865671641791045
3.7412390440415546
0.10184336702156156
0.012215275781225858
0.01853478569464016
0.017337063737873915
0.0022141880265826655
0.053703320224608955
1.0490297688915273
0.01814972153928338
0.3489583333333333


In [19]:
array = np.array([0.10184336789019185,
0.012215275885410928,
0.018534785852724837,
0.017337063885743128,
0.0022141880454676537,
0.053703320682648895,
1.0490297778387865,
0.018149721694083815,
         0.3489583333333333
        ])
print(np.prod(array))

3.158247474438422e-13


### References
- https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/
- https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.describe.html
- https://jakevdp.github.io/PythonDataScienceHandbook/05.05-naive-bayes.html