In [1]:
import numpy as np

In [44]:
def separate_by_class(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

In [71]:
data = [[1, 2, 3, 0],
        [1, 3, 2, 0],
        [0, 3, 2, 1],
        [0, 2, 3, 1]]

testset = [[1, 3, 3, 0],
           [0, 3, 3, 1]]

separate_by_class(data)

{0: [[1, 2, 3, 0], [1, 3, 2, 0]], 1: [[0, 3, 2, 1], [0, 2, 3, 1]]}

In [91]:
def mean(numbers):
    return sum(numbers) / float(len(numbers))

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return np.sqrt(variance)

def summary(dataset):
    summaries = [(mean(attr), stdev(attr)) for attr in zip(*dataset)]
    del summaries[-1]
    return summaries

def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    print("separated {}".format(separated))
    summaries = {}
    for cls_val, instances in separated.items():
        summaries[cls_val] = summary(instances)
    return summaries

summarize_by_class(data)

separated {0: [[1, 2, 3, 0], [1, 3, 2, 0]], 1: [[0, 3, 2, 1], [0, 2, 3, 1]]}


{0: [(1.0, 0.0), (2.5, 0.7071067811865476), (2.5, 0.7071067811865476)],
 1: [(0.0, 0.0), (2.5, 0.7071067811865476), (2.5, 0.7071067811865476)]}

In [83]:
def calc_prob(x, mean, stdev):
    exp = np.exp(-(pow(x-mean, 2)/(2*pow(stdev, 2))))
    return (1/(np.sqrt(2*np.pi)*stdev))*exp

def calc_cls_prob(summaries, new_vector):
    probs = {}
    for cls_val, cls_summaries in summaries.items():
        probs[cls_val] = 1
        for i in range(len(cls_summaries)):
            mean, stdev = cls_summaries[i]
            probs[cls_val] *= calc_prob(new_vector[i], mean, stdev)
    return probs

In [84]:
def predict(summaries, new_vector):
    probs = calc_cls_prob(summaries, new_vector)
    best_label, best_prob = None, -1
    for cls_val, prob in probs.items():
        if best_label is None or prob > best_prob:
            best_label = cls_val
            best_prob = prob
    return best_label

def get_pred(summaries, testset):
    preds = []
    for i in range(len(testset)):
        result = predict(summaries, testset[i])
        preds.append(result)
    return preds

def get_acc(testset, preds):
    correct = 0
    for x in range(len(testset)):
        if testset[x][-1] == preds[x]:
            correct += 1
    return (correct / float(len(testset))) * 100.0

In [85]:
data

[[1, 2, 3, 0], [1, 3, 2, 0], [0, 3, 2, 1], [0, 2, 3, 1]]

In [86]:
summaries = summarize_by_class(data)
print(summaries)
preds = get_pred(summaries, testset)
acc = get_acc(testset, preds)
print(preds)
print(acc)

{0: [(1.0, 0.0), (2.5, 0.7071067811865476), (2.5, 0.7071067811865476)], 1: [(0.0, 0.0), (2.5, 0.7071067811865476), (2.5, 0.7071067811865476)]}
[0, 0]
50.0


  
  This is separate from the ipykernel package so we can avoid doing imports until
  
  This is separate from the ipykernel package so we can avoid doing imports until
