# Naive Bayes from scratch
### From scratch

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import random
from csv import reader
import sklearn.model_selection as skms   

 
 ![](img/naivebayes1.png)
 ![](img/naivebayes2.png)
 
 ###   Six steps:
 ##### 1. Group rows by class : split the dataset in row groups one for each class.
 ##### 2. Take column statistics : mean, stdev and count to apply Bayes formula.
 ##### 3. Take column statistics by class : put steps 1 and 2 together.
 ##### 4. Buid Gaussian PDF : Gaussian PDF based on statistics calculated for each feature.
 ##### 5. Class probabilities: Probabilities of a piece of data belonging to a class based on Bayes formula.
 ##### 6. Prediction: Using all above do calculate predictions.

### Step 1: Group rows by class
We group the rows of the dataset by class in order to calculate the probability of data by the class they belong to (base rate).  
To do so, we create a dictionary object where each key is the class and the value is a list of all the records.

In [15]:
def group_rows_by_class(dataset):
    groups = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if (class_value not in groups):
            groups[class_value] = list()
        groups[class_value].append(vector)
    return groups

In [16]:
# Test separating dataset rows by class
dataset = [[3.393533211,2.331273381,0], [3.110073483,1.781539638,0], [1.343808831,3.368360954,0], [3.582294042,4.67917911,0], [2.280362439,2.866990263,0], [7.423436942,4.696522875,1], [5.745051997,3.533989803,1], [9.172168622,2.511101045,1], [7.792783481,3.424088941,1], [7.939820817,0.791637231,1]]
groups = group_rows_by_class(dataset)
for label in groups:
    print(label)
    for row in groups[label]:
        print(row)

0
[3.393533211, 2.331273381, 0]
[3.110073483, 1.781539638, 0]
[1.343808831, 3.368360954, 0]
[3.582294042, 4.67917911, 0]
[2.280362439, 2.866990263, 0]
1
[7.423436942, 4.696522875, 1]
[5.745051997, 3.533989803, 1]
[9.172168622, 2.511101045, 1]
[7.792783481, 3.424088941, 1]
[7.939820817, 0.791637231, 1]


### Step 2: Take column statistics
We require statistics: the mean, standard deviation and count. We will use these to calculate normal distribution of data for each column (no interactions, thus naive)

In [17]:
# Calculate the mean, stdev and count for each column in a dataset (except label)
def col_stats(dataset):
    stats = [(np.mean(column), np.std(column), len(column)) for column in zip(*dataset)]
    del(stats[-1]) # delete statistics on the class (not needed)
    return stats

In [18]:
# Test taking column statistics
dataset = [[3.393533211,2.331273381,0], [3.110073483,1.781539638,0], [1.343808831,3.368360954,0], [3.582294042,4.67917911,0], [2.280362439,2.866990263,0], [7.423436942,4.696522875,1], [5.745051997,3.533989803,1], [9.172168622,2.511101045,1], [7.792783481,3.424088941,1], [7.939820817,0.791637231,1]]
summary = col_stats(dataset)
print(summary)

[(5.178333386499999, 2.624612513030006, 10), (2.9984683241, 1.1560240509233746, 10)]


### Step 3: Column stats by class  
Above, we have developed the group_rows_by_class() and the stats_by_column(). We can put all of this together and summarize the columns in the dataset organized by class values.

In [19]:
# Split dataset by class then calculate statistics for each group of rows
def col_stats_by_class(dataset):
    row_groups = group_rows_by_class(dataset)
    stats = dict()
    for clasS, rows in row_groups.items():
        stats[clasS] = col_stats(rows)
    return stats

In [20]:
# Test column statistics by class
dataset = [[3.393533211,2.331273381,0], [3.110073483,1.781539638,0], [1.343808831,3.368360954,0], [3.582294042,4.67917911,0], [2.280362439,2.866990263,0], [7.423436942,4.696522875,1], [5.745051997,3.533989803,1], [9.172168622,2.511101045,1], [7.792783481,3.424088941,1], [7.939820817,0.791637231,1]]
csbc = col_stats_by_class(dataset)
for label in csbc:
    print(label, ':')
    for row in csbc[label]:
        print(row)

0 :
(2.7420144012, 0.8287479077141687, 5)
(3.0054686692, 0.9904256942385166, 5)
1 :
(7.6146523718, 1.1041096849046812, 5)
(2.9914679790000003, 1.3006698840042743, 5)


### Step 4: Build Gaussian PDF  
Calculating the probability or likelihood of observing a given real-value like X1 assuming normality.

$ \large P(x) = \frac{1}{\sqrt{ 2\pi \sigma}} . e^{-\frac{1}{2} (\frac{x-\mu}{\sigma})^2} $

In [22]:
# Calculate the Gaussian probability distribution function for x
def prob(x, mean, stdev):
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * math.exp(-((x-mean)**2 / (2 * stdev**2 )))

In [26]:
# Test Gaussian PDF 
print(prob(0.0, 1.0, 1.0))
print(prob(1.0, 1.0, 1.0))
print(prob(2.0, 1.0, 1.0))

0.24197072451914337
0.3989422804014327
0.24197072451914337


### Step 5: Class probabilities
The probability that a piece of data belongs to a class. Remember Bayes formula is : 
$ P(class|data) = P(X|class) . P(class) $ 

In [27]:
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(csbc, row):
    total_rows = sum([csbc[label][0][2] for label in csbc]) # sum of counts
    probs = dict()
    for clasS, class_stats in csbc.items():
        print('class: ', clasS)
        print('class_stats: ', class_stats) # one class_stat per column (except labels)
        Pclass = csbc[clasS][0][2] / float(total_rows) # rows count of class / all rows = P(class)
        probs[clasS] = Pclass
        for i in range(len(class_stats)):
            mean, stdev, count = class_stats[i]
            PXclass = prob(row[i], mean, stdev) # Probability of this row belonging to that class knowing the column value row[i] and given the distribution for that column and class P(X|class). 
            probs[clasS] *= PXclass # Same for each column, then product (naive). And multiply by the P(X|class) so that we have P(X|class) * P(class)
    return probs

In [29]:
# Test calculating class probabilities (probability of belonging to each of the classes)
dataset = [[3.393533211,2.331273381,0], [3.110073483,1.781539638,0], [1.343808831,3.368360954,0], [3.582294042,4.67917911,0], [2.280362439,2.866990263,0], [7.423436942,4.696522875,1], [5.745051997,3.533989803,1], [9.172168622,2.511101045,1], [7.792783481,3.424088941,1], [7.939820817,0.791637231,1]]
csbc = col_stats_by_class(dataset)
probs = calculate_class_probabilities(csbc, dataset[0])
print(probs)

class:  0
class_stats:  [(2.7420144012, 0.8287479077141687, 5), (3.0054686692, 0.9904256942385166, 5)]
class:  1
class_stats:  [(7.6146523718, 1.1041096849046812, 5), (2.9914679790000003, 1.3006698840042743, 5)]
{0: 0.05645767579601407, 1: 3.264562167486243e-05}


### Step 6: Prediction

In [30]:
# Predict the class for a given row
def nb_predict(csbc, row):
    probs = calculate_class_probabilities(csbc, row)
    best_class, best_prob = None, -1
    for clasS, prob in probs.items():
        if best_class is None or prob > best_prob:
            best_prob = prob
            best_class = clasS
    return best_class

### Naive Bayes on Iris dataset

In [31]:
# Load a CSV file
def load_csv(filename):
    header = True
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            if header:
                header = False
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = random.randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores


# Naive Bayes Algorithm
def naive_bayes(train, test):
    csbc = col_stats_by_class(train)
    preds = list()
    for row in test:
        output = nb_predict(csbc, row)
        preds.append(output)
    return(preds)

In [32]:
# Test Naive Bayes on Iris Dataset
random.seed(1)
filename = 'D:/data/csv/iris.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)
    
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)

# evaluate algorithm
n_folds = 5
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: ', scores)
print( 'Mean Accuracy:', sum(scores)/float(len(scores)) )

class:  1
class_stats:  [(6.610256410256411, 0.6460113803057572, 39), (3.015384615384615, 0.31422551230577445, 39), (5.587179487179487, 0.5515384019358883, 39), (2.082051282051282, 0.2570760586562929, 39)]
class:  2
class_stats:  [(5.012820512820513, 0.3736030302267486, 39), (3.4410256410256417, 0.40173849685505414, 39), (1.4666666666666666, 0.18719704870198953, 39), (0.2487179487179487, 0.10830108759970705, 39)]
class:  0
class_stats:  [(5.935714285714285, 0.4873746132139447, 42), (2.7809523809523813, 0.2977999450238966, 42), (4.2714285714285705, 0.41820803338012713, 42), (1.3333333333333333, 0.1872948613266394, 42)]
class:  1
class_stats:  [(6.610256410256411, 0.6460113803057572, 39), (3.015384615384615, 0.31422551230577445, 39), (5.587179487179487, 0.5515384019358883, 39), (2.082051282051282, 0.2570760586562929, 39)]
class:  2
class_stats:  [(5.012820512820513, 0.3736030302267486, 39), (3.4410256410256417, 0.40173849685505414, 39), (1.4666666666666666, 0.18719704870198953, 39), (0.2

## Extensions

### 1. Other distributions

- Non-normal distributions  (Bernouilli, Multinomial)
- Empyrical

### 2. Log probabilities
A way to mitigate the vanishing effect of probabilities as a result of many multiplications.

### Credits & Links

Based on the following sources:
    
https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/  
https://www.saedsayad.com/naive_bayesian.htm  
https://slideplayer.com/slide/4996705/  
    