## Make Predictions with Naive Bayes On The Iris Dataset
### Libraries

In [165]:
# Make a prediction with Naive Bayes on Iris Dataset
from math import sqrt
from math import exp
from math import pi
from sklearn.datasets import load_iris 

## Step 1: Load Data

In [166]:
# Load the Iris dataset object returned by scikit-learn's load_iris().
setiris = load_iris()
# Container for the per-class rows; it is populated further down the notebook.
setdata = {}

## Step 2: Separate By Class

In [167]:
# Step 2: Separate By Class
def separate_by_class(setiris):
    """Group the rows of a dataset by class name.

    Args:
        setiris: Object exposing ``data``, ``target`` and ``target_names``
            attributes (for example the value returned by ``load_iris()``).
            ``data`` is indexed with the boolean mask ``target == i``.

    Returns:
        A dict that maps every entry of ``target_names`` to
        ``data[target == i]``, where ``i`` is the position of that entry
        in ``target_names``.
    """
    grouped = {}
    for class_index, class_name in enumerate(setiris.target_names):
        # Boolean-mask selection keeps only the rows labelled class_index.
        grouped[class_name] = setiris.data[setiris.target == class_index]
    return grouped
# Echo the per-class grouping as the cell's output (result is not stored here).
separate_by_class(setiris)

{'setosa': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],


## Step 3: Summarize Dataset

In [168]:
# Calculate the mean of a list of numbers
def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    Args:
        numbers: Sized iterable of numeric values.

    Returns:
        The sum of the values divided by how many there are.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [169]:
# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from the mean are divided by ``len(numbers) - 1``
    before taking the square root.

    Args:
        numbers: Sized iterable of numeric values.

    Returns:
        The square root of the sample variance.

    Raises:
        ZeroDivisionError: If ``numbers`` has exactly one element, because
            the ``n - 1`` denominator is then zero (an empty input already
            fails inside ``mean``).
    """
    centre = mean(numbers)
    squared_deviations = [(value - centre) ** 2 for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return sqrt(sum(squared_deviations) / degrees_of_freedom)

In [170]:
# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(setdata):
    """Summarize every column of ``setdata`` except the last one.

    Args:
        setdata: Iterable of equally long rows; it is transposed with
            ``zip(*setdata)`` so that each column is summarized on its own.

    Returns:
        A list of ``(mean, stdev, count)`` tuples, one per column, with the
        entry for the final column removed.
    """
    per_column = []
    for column in zip(*setdata):
        per_column.append((mean(column), stdev(column), len(column)))
    # The last column's statistics are discarded.
    # NOTE(review): the rows built by separate_by_class appear to contain
    # only feature values (no trailing label column), so this slice drops
    # the last feature from the model - confirm that this is intended.
    return per_column[:-1]
#summarize_dataset(setdata)

## Step 4: Summarize Data By Class

In [171]:
# Split dataset by class then calculate statistics for each column
def summarize_by_class(separated):
    """Compute the column summaries for each class separately.

    Args:
        separated: Mapping of class value to the rows belonging to it.

    Returns:
        A dict with the same keys as ``separated`` whose values are the
        results of ``summarize_dataset`` for that class's rows.
    """
    return {
        class_value: summarize_dataset(rows)
        for class_value, rows in separated.items()
    }
#summarize_by_class(setdata)

## Step 5: Gaussian Probability Density Function

In [172]:
# Calculate the Gaussian probability density function for x
def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the Gaussian.
        stdev: Standard deviation of the Gaussian.

    Returns:
        ``exp(-(x - mean)**2 / (2 * stdev**2)) / (sqrt(2 * pi) * stdev)``.

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    squared_deviation = (x - mean) ** 2
    twice_variance = 2 * stdev ** 2
    kernel = exp(-(squared_deviation / twice_variance))
    normaliser = 1 / (sqrt(2 * pi) * stdev)
    return normaliser * kernel

## Step 6: Class Probabilities

In [173]:
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class in ``summaries``.

    Args:
        summaries: Mapping of class value to a list of
            ``(mean, stdev, count)`` tuples, one tuple per summarized column.
        row: Indexable sequence of feature values; ``row[i]`` is scored
            against the i-th tuple of each class.

    Returns:
        A dict mapping each class value to its class fraction (class count
        divided by the total count) multiplied by the Gaussian density of
        every summarized column value. The scores are not normalized.
    """
    # The row count of a class is read from the third slot of its first
    # column summary.
    total_rows = sum(columns[0][2] for columns in summaries.values())
    probabilities = {}
    for class_value, class_summaries in summaries.items():
        score = class_summaries[0][2] / float(total_rows)
        for index, (mu, sigma, _count) in enumerate(class_summaries):
            score *= calculate_probability(row[index], mu, sigma)
        probabilities[class_value] = score
    return probabilities

## Step 7: Prediction

In [174]:
# Predict the class for a given row
def predict(summaries, row):
    """Return the class whose score for ``row`` is the highest.

    Args:
        summaries: Per-class column summaries, passed through to
            ``calculate_class_probabilities``.
        row: Feature values to classify.

    Returns:
        The class value with the largest score; on ties the class seen
        first wins. ``None`` when there are no classes.
    """
    scores = calculate_class_probabilities(summaries, row)
    winner = None
    top_score = -1
    for candidate, score in scores.items():
        # Once a leader exists, only a strictly larger score replaces it.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = candidate, score
    return winner

## Step 8: Executing Algorithm

In [175]:
# Print the class-name -> integer-code mapping and store the rows of each
# class in setdata under that integer code.
for i, item in enumerate(setiris.target_names):
    print('[%s] => %d' % (item, i))
    setdata[i] = setiris.data[setiris.target == i]

[setosa] => 0
[versicolor] => 1
[virginica] => 2


In [176]:
# Prediction for setosa:
# record to classify
row = [3.6, 5.9, 2.2, 4.1]
# group the dataset rows by class name
setdata = separate_by_class(setiris)
# fit the model (per-class column statistics)
model = summarize_by_class(setdata)
# classify the record and report the result
label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))

Data=[3.6, 5.9, 2.2, 4.1], Predicted: setosa


In [177]:
# Prediction for versicolor:
# record to classify
row = [5.7, 2.9, 4.2, 1.3]
# group the dataset rows by class name
setdata = separate_by_class(setiris)
# fit the model (per-class column statistics)
model = summarize_by_class(setdata)
# classify the record and report the result
label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))

Data=[5.7, 2.9, 4.2, 1.3], Predicted: versicolor


In [178]:
# Prediction for virginica:
# record to classify
row = [7.1, 0.5, 6.4, 0.2]
# group the dataset rows by class name
setdata = separate_by_class(setiris)
# fit the model (per-class column statistics)
model = summarize_by_class(setdata)
# classify the record and report the result
label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))

Data=[7.1, 0.5, 6.4, 0.2], Predicted: virginica
