## Naive Bayes Classifier from Scratch--Jason Brownlee

Michael Muschitiello // AI in Asset Management 


In [2]:
import numpy as np 
import pandas as pd 
from random import seed
from random import randrange
from math import sqrt
from math import pi
from math import exp 

In [3]:
# Load the raw CSV (the file has no header row), then label the columns.
# The last column, 'Outcome', holds the class label.
pima = pd.read_csv('pima-indians-diabetes.data (1).csv', header=None)
pima.columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
pima.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Separating by Class

In [4]:
def separate_by_class(df):
    """Group the rows of ``df`` by the value stored in the last column.

    Args:
        df: DataFrame whose final column holds the class label.

    Returns:
        dict mapping each distinct class value to the list of rows
        (``pandas.Series`` objects, in their original order) that carry it.
    """
    separated = {}
    for _, row in df.iterrows():
        # .iloc makes the lookup explicitly positional. Plain row[-1] is
        # treated as a label lookup: it emits a FutureWarning on string-labelled
        # columns and raises KeyError on integer-labelled ones.
        class_value = row.iloc[-1]
        separated.setdefault(class_value, []).append(row)
    return separated

# Preview the grouping: show at most `max_rows_to_print` rows per class and
# flag when a class has more rows than were shown.
separated = separate_by_class(pima)
max_rows_to_print = 5
for label, class_rows in separated.items():
    print(f"Class {label}:")
    for row in class_rows[:max_rows_to_print]:
        print(row.values.tolist())
    if len(class_rows) > max_rows_to_print:
        print("... output truncated ...\n")

Class 1.0:
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
[8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0]
[0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0, 1.0]
[3.0, 78.0, 50.0, 32.0, 88.0, 31.0, 0.248, 26.0, 1.0]
[2.0, 197.0, 70.0, 45.0, 543.0, 30.5, 0.158, 53.0, 1.0]
... output truncated ...

Class 0.0:
[1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 0.0]
[1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0, 0.0]
[5.0, 116.0, 74.0, 0.0, 0.0, 25.6, 0.201, 30.0, 0.0]
[10.0, 115.0, 0.0, 0.0, 0.0, 35.3, 0.134, 29.0, 0.0]
[4.0, 110.0, 92.0, 0.0, 0.0, 37.6, 0.191, 30.0, 0.0]
... output truncated ...



  class_value = row[-1]


## Summarize the dataset

In [5]:
# arithmetic mean of a sequence of numbers
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises ZeroDivisionError when ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

# sample standard deviation of a sequence of numbers
def stdev(numbers):
    """Return the sample standard deviation (n - 1 denominator) of ``numbers``.

    Raises ZeroDivisionError when ``numbers`` has fewer than two elements.
    """
    center = mean(numbers)
    squared_deviations = [(value - center) ** 2 for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return sqrt(sum(squared_deviations) / degrees_of_freedom)

# per-column (mean, standard deviation, count) for every feature column
def summarize_dataset(df):
    """Summarize each column of ``df`` except the last one.

    Args:
        df: DataFrame of numeric columns; the final column is treated as the
            class label and its summary is discarded.

    Returns:
        list of ``(mean, stdev, count)`` tuples, one per remaining column.
    """
    summaries = []
    # df.values iterates row by row; zip(*...) transposes that into columns.
    for column in zip(*df.values):
        summaries.append((mean(column), stdev(column), len(column)))
    # Drop the statistics computed for the trailing class-label column.
    summaries.pop()
    return summaries

# Per-column (mean, standard deviation, row count) over the whole dataset.
# The bare name on the last line displays the result as the cell output.
summary = summarize_dataset(pima)
summary

[(3.8450520833333335, 3.3695780626988623, 768),
 (120.89453125, 31.97261819513622, 768),
 (69.10546875, 19.355807170644777, 768),
 (20.536458333333332, 15.952217567727677, 768),
 (79.79947916666667, 115.24400235133837, 768),
 (31.992578124999977, 7.8841603203754405, 768),
 (0.4718763020833327, 0.33132859501277484, 768),
 (33.240885416666664, 11.76023154067868, 768)]

In [6]:
def summarize_by_class(df):
    """Compute the per-column summaries separately for each class.

    Args:
        df: DataFrame whose final column holds the class label.

    Returns:
        dict mapping each class value to the list of ``(mean, stdev, count)``
        tuples produced by ``summarize_dataset`` for that class's rows.
    """
    return {
        label: summarize_dataset(pd.DataFrame(class_rows))
        for label, class_rows in separate_by_class(df).items()
    }

# Same (mean, standard deviation, row count) statistics, grouped by class label.
# The bare name on the last line displays the result as the cell output.
class_summaries = summarize_by_class(pima)
class_summaries

  class_value = row[-1]


{1.0: [(4.865671641791045, 3.741239044041554, 268),
  (141.25746268656715, 31.939622058007195, 268),
  (70.82462686567165, 21.49181165060413, 268),
  (22.16417910447761, 17.67971140046571, 268),
  (100.33582089552239, 138.6891247315351, 268),
  (35.14253731343278, 7.262967242346376, 268),
  (0.5505, 0.372354483554611, 268),
  (37.06716417910448, 10.968253652367915, 268)],
 0.0: [(3.298, 3.01718458262189, 500),
  (109.98, 26.14119975535359, 500),
  (68.184, 18.063075413305828, 500),
  (19.664, 14.889947113744254, 500),
  (68.792, 98.86528929231767, 500),
  (30.30419999999996, 7.689855011650112, 500),
  (0.42973400000000017, 0.29908530435741093, 500),
  (31.19, 11.667654791631156, 500)]}

This calculates the summary statistics (mean, standard deviation, row count) for each of the 8 input variables and prints them organized by class value (0 or 1).
We can see that there are 500 rows with class 0 and 268 rows with class 1.

## Gaussian Probability Density Function

$$
f(x) = \frac{1}{\sqrt{2 \pi}\,\sigma} 
\exp\!\Biggl(-\frac{(x - \mu)^{2}}{2\,\sigma^{2}}\Biggr)
$$


In [7]:
# Gaussian probability density function evaluated at x
def calculate_probability(x, mean, stdev):
    """Return the normal density with the given mean and stdev, evaluated at ``x``.

    Args:
        x: value at which to evaluate the density.
        mean: mean of the distribution.
        stdev: standard deviation of the distribution.
    """
    squared_distance = (x - mean) ** 2
    exponent = np.exp(-(squared_distance / (2 * stdev ** 2)))
    normalizer = 1 / (np.sqrt(2 * np.pi) * stdev)
    return normalizer * exponent

# sanity check against a standard normal (mean 0, stdev 1) at a few points
for test_x in (1.64, 2.53, 1.96):
    print(calculate_probability(test_x, 0.0, 1.0))

0.10396109532876423
0.016254450460600506
0.05844094433345147


## Class Probabilities

- The probability that a piece of data belongs to a class is calculated as: P(class|data) = P(data|class) * P(class)

The division is removed from the typical Bayes theorem to simplify the calculation. This means that the result is no longer strictly a probability of the data belonging to a class. The value is still maximized, meaning that the calculation for the class that results in the largest value is taken as the prediction.

- The input variables are treated separately, which gives the technique its name, “naive”. For example, with 2 input variables, the probability that a row belongs to the first class 0 can be calculated as: P(class=0|X1,X2) = P(X1|class=0) * P(X2|class=0) * P(class=0)

In [8]:
# calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class using the per-class column summaries.

    Each score is the class prior (class row count / total row count)
    multiplied by the Gaussian density of every feature value under that
    class's mean and standard deviation. The scores are not normalized, so
    they are only meaningful relative to one another.

    Args:
        summaries: dict mapping class value to a list of
            ``(mean, stdev, count)`` tuples, one per feature column.
        row: sequence of feature values (list or ``pandas.Series``) read by
            position; entries beyond the number of summarized columns, such
            as a trailing class label, are ignored.

    Returns:
        dict mapping each class value to its unnormalized score.
    """
    total_rows = sum(class_summaries[0][2] for class_summaries in summaries.values())
    # Materialize the row once so lookups are purely positional. Integer
    # indexing into a labelled Series (row[i]) is a deprecated fallback that
    # emits a FutureWarning for every feature of every row.
    values = list(row)
    probabilities = {}
    for class_value, class_summaries in summaries.items():
        probability = class_summaries[0][2] / float(total_rows)
        # mu/sigma avoid shadowing the module-level mean() and stdev().
        for i, (mu, sigma, _) in enumerate(class_summaries):
            probability *= calculate_probability(values[i], mu, sigma)
        probabilities[class_value] = probability
    return probabilities

# Score the first row of the dataset against each class summary.
# The bare name on the last line displays the result as the cell output.
probabilities = calculate_class_probabilities(class_summaries, pima.iloc[0])
probabilities

  probabilities[class_value] *= calculate_probability(row[i], mean, stdev)


{1.0: 3.158247258942858e-13, 0.0: 1.5601046357664668e-13}

## Naive Bayes Algorithm

In [9]:
# randomly partition a dataset into n_folds equally sized folds
def cross_validation_split(dataset, n_folds):
    """Split ``dataset`` into ``n_folds`` folds by sampling without replacement.

    Every fold receives ``int(len(dataset) / n_folds)`` rows drawn with
    ``random.randrange``; rows left over after the last fold is filled are
    not assigned to any fold. The input sequence itself is not modified.

    Args:
        dataset: sequence of rows.
        n_folds: number of folds to create.

    Returns:
        list of ``n_folds`` lists of rows.
    """
    remaining = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        current = []
        while len(current) < rows_per_fold and len(remaining) > 0:
            picked = randrange(len(remaining))
            current.append(remaining.pop(picked))
        folds.append(current)
    return folds

# percentage of predictions that match the actual labels
def accuracy_metric(actual, predicted):
    """Return the share of positions where ``predicted`` equals ``actual``, in percent.

    Args:
        actual: sequence of true labels.
        predicted: sequence of predicted labels, indexed in step with ``actual``.

    Returns:
        float between 0.0 and 100.0. Raises ZeroDivisionError when ``actual``
        is empty.
    """
    hits = sum(1 for i in range(len(actual)) if actual[i] == predicted[i])
    return hits / float(len(actual)) * 100.0

# evaluate an algorithm using a cross validation split
def evaluate_algo(dataset, algorithm, n_folds, *args):
    """Score ``algorithm`` with k-fold cross-validation.

    Each fold is held out in turn: the algorithm is trained on the rows of
    all other folds and asked to predict the held-out rows, whose last
    element (the class label) is replaced by ``None`` in the copies it sees.

    Args:
        dataset: sequence of rows; the last element of each row is the label.
        algorithm: callable ``algorithm(train_set, test_set, *args)``
            returning one prediction per test row.
        n_folds: number of cross-validation folds.
        *args: extra positional arguments forwarded to ``algorithm``.

    Returns:
        list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for fold in folds:
        # Flatten every other fold in a single pass. Repeated list
        # concatenation via sum(lists, []) is quadratic, and list.remove()
        # matches by equality, which compares whole folds element by element.
        # Identity is the intended test here: skip exactly the held-out fold.
        train_set = [row for other in folds if other is not fold for row in other]
        test_set = []
        for row in fold:
            # Work on a copy so the label can be hidden without touching `fold`.
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores

# pick the class with the highest score for a given row
def predict(summaries, row):
    """Return the class whose score for ``row`` is the largest.

    Ties go to the class encountered first. Returns ``None`` when
    ``summaries`` yields no classes.
    """
    scores = calculate_class_probabilities(summaries, row)
    winner = None
    winning_score = -1
    for label in scores:
        score = scores[label]
        # Keep the current winner unless this score is strictly greater.
        if winner is not None and not (score > winning_score):
            continue
        winner, winning_score = label, score
    return winner

# Naive Bayes Algorithm
def naive_bayes(train, test):
    """Fit Gaussian Naive Bayes summaries on ``train`` and predict ``test``.

    Args:
        train: list of rows (lists); the last element of each row is the
            class label.
        test: list of rows (lists) to classify; rows are read by position.

    Returns:
        list with one predicted class value per row of ``test``.
    """
    # Derive the column labels from the data instead of hard-coding the nine
    # Pima column names, so any row width is accepted. The labels are strings
    # on purpose: with pandas' default integer labels, a lookup such as
    # row[-1] would be read as "the column named -1".
    width = len(train[0]) if train else 0
    columns = [f"col_{i}" for i in range(width)]
    train_df = pd.DataFrame(train, columns=columns)

    summaries = summarize_by_class(train_df)
    # Each test row stays a plain list; predict() only reads it by position.
    return [predict(summaries, row) for row in test]

In [10]:
# Test the Naive Bayes Algorithm on the pima dataset
# Fixed seed so the random fold assignment, and hence the scores, are repeatable.
seed(42)

data = pima.values.tolist()
scores = evaluate_algo(data, naive_bayes, 12)
mean_accuracy = sum(scores) / float(len(scores))
# Real f-string placeholders instead of f-prefixed literals fed to the %
# operator; the printed text is unchanged.
print(f'Scores {scores}')
print(f'Mean Accuracy: {mean_accuracy:.3f}%')

  class_value = row[-1]


Scores [76.5625, 75.0, 79.6875, 75.0, 81.25, 78.125, 73.4375, 73.4375, 82.8125, 71.875, 78.125, 70.3125]
Mean Accuracy: 76.302%
