# Program 5 - Naive Bayesian Classifier for Dataset

---

We use `pandas` and `math`

First we split the data set into training and testing data

In [18]:
import pandas
import math

def prepare_data(path='prog5_dataset.csv', split_ratio=0.95):
    """Load a CSV dataset and split it, in file order, into training and test rows.

    Parameters
    ----------
    path : str
        CSV file to read. Defaults to 'prog5_dataset.csv'.
    split_ratio : float
        Fraction of rows, counted from the top of the file, that go into the
        training set. Must lie in [0, 1]. Defaults to 0.95.

    Returns
    -------
    tuple
        (training_data, test_data): the first int(len(dataset) * split_ratio)
        rows and the remaining rows, as produced by DataFrame.values.

    Raises
    ------
    ValueError
        If split_ratio is outside [0, 1].
    """
    # A ratio outside [0, 1] would yield a negative or oversized slice index
    # and therefore a silently wrong split.
    if not 0 <= split_ratio <= 1:
        raise ValueError("split_ratio must be between 0 and 1, got %r" % (split_ratio,))

    # read data
    # NOTE(review): read_csv treats the first line as a header by default; if
    # the file has no header row, the first record is lost - confirm.
    dataset = pandas.read_csv(path).values

    # split dataset into train and test data (no shuffling: rows keep file order)
    split_index = int(len(dataset) * split_ratio)
    return (dataset[:split_index], dataset[split_index:])

#     Alternative (disabled): a random split instead of the sequential split above.
#     Enabling it requires `import random`; note that splitRatio here is the share of rows moved into the TEST set.

#     splitRatio = 0.7
#     testSize = int(len(dataset) * splitRatio);
#     trainSet = list(dataset);
#     testSet = []
#     while len(testSet) < testSize:
#         index = random.randrange(len(trainSet));
#         testSet.append(trainSet.pop(index))
#     return (trainSet, testSet)

Then we split the dataset into "yes" and "no" valued data sets.

In [9]:
def split_yes_no(data) :
    """Partition rows by the value in their last column.

    Rows whose last element equals 1 go into the first list; every other row
    (whatever its last element is) goes into the second. Row order is kept.

    Returns a tuple (yes_rows, no_rows).
    """
    positives, negatives = [], []

    for row in data:
        # Only an exact match with 1 counts as "yes"
        bucket = positives if row[-1] == 1 else negatives
        bucket.append(row)

    return (positives, negatives)

## Mean and Standard Deviation

In [10]:
# Calculates the mean 
def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    ``numbers`` must work with ``sum`` and ``len``. An empty input divides by
    zero.
    """
    total = sum(numbers)
    count = len(numbers)
    return total / count

In [11]:
# Calculates the standard deviation
# sample std. dev = root( sum( (x - x')^2 ) / (n - 1) ), where x' is the mean
def standard_deviation(numbers):
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from the mean are summed and divided by
    ``len(numbers) - 1`` before taking the square root, so an input with a
    single value divides by zero.
    """
    centre = mean(numbers)
    squared_deviations = [(value - centre) ** 2 for value in numbers]
    degrees_of_freedom = len(numbers) - 1
    return math.sqrt(sum(squared_deviations) / degrees_of_freedom)

Calculate the Mean and STD of each attribute

In [12]:
# Calculates the mean and standard deviation of each attribute
def summarize(data):
    """Return a (mean, standard deviation) pair for every column except the last.

    ``data`` is a sequence of equal-length rows. Statistics are computed
    column by column and the entry for the final column is then dropped.
    Empty ``data`` raises IndexError because there is no entry to drop.
    """
    per_column = []
    for column in zip(*data):
        per_column.append((mean(column), standard_deviation(column)))

    # The last column is the one the rest of the notebook treats as the
    # target, so its statistics are not part of the summary.
    del per_column[-1]
    return per_column

## Calculate probability

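Each attribute is modelled with a Gaussian (normal) distribution. The likelihood of a record under a class is the product, over all attributes, of the density $\frac{1}{\sqrt{2\pi}\,\sigma}\exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)$, where $\mu$ and $\sigma$ are that attribute's mean and standard deviation within the class.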

In [13]:
def calculate_probability(summary, item):
    """Return the product of Gaussian densities of ``item`` under ``summary``.

    ``summary`` is a sequence of (mean, standard deviation) pairs, and
    ``item[k]`` is evaluated against ``summary[k]``. Elements of ``item``
    beyond ``len(summary)`` are ignored. An empty ``summary`` yields 1.
    A zero standard deviation leads to a division by zero.
    """
    root_two_pi = math.sqrt(2 * math.pi)
    likelihood = 1
    for position, (mu, sigma) in enumerate(summary):
        deviation = item[position] - mu
        # Normal density: exp(-(x - mu)^2 / (2 sigma^2)) / (sqrt(2 pi) sigma)
        numerator = math.exp(-pow(deviation, 2) / (2 * sigma ** 2))
        likelihood *= numerator / (root_two_pi * sigma)
    return likelihood

## Finally, predict.

In [14]:
# Load the data and partition the training rows by class label
training_data, test_data = prepare_data()
yes_data, no_data = split_yes_no(training_data)

# From here on yes_data / no_data hold the per-attribute (mean, std. dev)
# summaries of each class rather than the raw rows
yes_data = summarize(yes_data)
no_data = summarize(no_data)

# One 0/1 prediction per test record, in test_data order
predictions = []

for record in test_data :
    # Likelihood of the record under each class; class priors are not applied
    yes_probability = calculate_probability(yes_data, record)
    no_probability = calculate_probability(no_data, record)

    # Class 1 only on a strictly larger likelihood; ties fall to class 0
    if yes_probability > no_probability:
        predictions.append(1)
    else:
        predictions.append(0)

Check the number of correct predictions, for accuracy.

In [15]:
# Count the test records whose label (last column) equals the prediction
correct_prediction = sum(
    1 for i in range(len(test_data)) if test_data[i][-1] == predictions[i]
)

Finally, print accuracy of the predictions.

In [16]:
# Table of true label vs. predicted label, one row per test record
print("ACTUAL VALUE \t PREDICTED VALUE")
for i in range(len(test_data)):
    actual, predicted = test_data[i][-1], predictions[i]
    print(actual, "\t\t", predicted)

# Share of correct predictions, as a percentage of the test set
accuracy = (correct_prediction / len(test_data)) * 100
print("\nAccuracy is ", accuracy, "%")

ACTUAL VALUE 	 PREDICTED VALUE
0.0 		 0
1.0 		 0
1.0 		 0
1.0 		 1
0.0 		 0
0.0 		 0
0.0 		 0
0.0 		 0
0.0 		 0
0.0 		 0
1.0 		 0
1.0 		 1
0.0 		 0
0.0 		 0
1.0 		 1
0.0 		 1
0.0 		 1
1.0 		 1
0.0 		 1
1.0 		 1
1.0 		 1
1.0 		 1
0.0 		 0
0.0 		 0
1.0 		 1
1.0 		 1
1.0 		 1
0.0 		 1
1.0 		 0
0.0 		 0
1.0 		 1
0.0 		 0
1.0 		 1
0.0 		 0
0.0 		 1
0.0 		 0
0.0 		 0
1.0 		 0
0.0 		 0

Accuracy is  74.35897435897436 %
