In [177]:
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
import numpy as np
import scipy.stats as stats

In [179]:
# Read the dataset and apply basic type conversions
def load_data_and_basic_preprocessing(filename):
    print("Reading dataset {}".format(filename))
    dataset = list()
    file_process_map = processing_map[filename]
    int_map, float_map, del_map = [], [], []
    if "int" in file_process_map:
        int_map = file_process_map["int"]
        print("Processing integer columns...")
    if "float" in file_process_map:
        float_map = file_process_map["float"]
        print("Processing float columns...")
    if "del" in file_process_map:
        del_map = file_process_map["del"]
        print("Processing unwanted columns...")
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            for i in range(len(row)):
                # Preprocessing
                if i in int_map:
                    try:
                        row[i] = int(row[i])
                    except:
                        row[i] = 5 if row[i] == "more" else 6
                if i in float_map:
                    row[i] = float(row[i])
                if i in del_map:
                    del row[i]
            dataset.append(row)
    class_process_indexes = []    
    if "class" in file_process_map:
        class_process_indexes = file_process_map["class"]
        print("Processing class label columns...")
    for index in class_process_indexes:
        class_values = [row[index] for row in dataset]
        unique = set(class_values)
        lookup = dict()
        for i, value in enumerate(unique):
            lookup[value] = i
        for row in dataset:
            row[index] = lookup[row[index]]
        print(filename, "Index:", index, lookup)
    print("Dataset reading completed !!!")
    print("Dataset preprocessing completed !!!")
    return dataset

In [180]:
# Create a dict having target class name as key and their all columns as their values of complete dataset
def group_by_target_labels(dataset):
	group = dict()
	for i in range(len(dataset)):
		record = dataset[i]
		label = record[-1]
		if label not in group:
			group[label] = []
		group[label].append(record)
	return group

In [181]:
# Calculate the mean, stdev and count for each column in a dataset
def summarize_label(label_data):
	summaries = [(mean(column), stdev(column), len(column)) for column in zip(*label_data)]
	del(summaries[-1])
	return summaries

In [182]:
# Calculate statistics of target class
def target_label_summary_stats(dataset):
	label_groups = group_by_target_labels(dataset)
	summary_status = dict()
	for label_value, rows in label_groups.items():
		summary_status[label_value] = summarize_label(rows)
	return summary_status

In [183]:
# Split a dataset into k folds
def get_n_folds(dataset, folds = 10):
	dup,res,fold_size = list(dataset),[],int(len(dataset) / folds)
	for i in range(folds):
		fold = []
		while len(fold) < fold_size:
			fold.append(dup.pop(randrange(len(dup))))
		res.append(fold)
	return res

In [184]:
# Calculate model accuracy percentage
def get_model_accuracy(act, pred):
	correct = 0
	for i in range(0, len(act)):
		correct += (act[i] == pred[i]) # True => 1 False => 0
	ans = correct / len(act) * 100.0
	return ans

In [185]:
def eval_algo(dataset, algo, *args):
	folds = get_n_folds(dataset)
	accuracies = []
	for fold in folds:
		test_dataset,act = [],[]
		train_dataset = list(folds)
		train_dataset.remove(fold)
		train_dataset = sum(train_dataset, [])
		for row in fold:
			row_cp = list(row)
			test_dataset.append(row_cp)
			row_cp[-1] = None
			act.append(row[-1])
		pred = algo(train_dataset, test_dataset, *args)
		accuracies.append(get_model_accuracy(act, pred))
	return accuracies

In [186]:
# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    try:
        exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
        return (1 / (sqrt(2 * pi) * stdev)) * exponent
    except:
        return 0

In [187]:
# Calculate the mean of a list of numbers
def mean(numbers):
	return sum(numbers)/float(len(numbers))

In [188]:
# Calculate the standard deviation of a list of numbers
def stdev(numbers):
	avg = mean(numbers)
	variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
	return sqrt(variance)

In [189]:
# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
	total_rows = sum([summaries[label][0][2] for label in summaries])
	probabilities = dict()
	for class_value, class_summaries in summaries.items():
		probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
		for i in range(len(class_summaries)):
			mean, stdev, _ = class_summaries[i]
			probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
	return probabilities

In [190]:
# Predict the class for a given row
def predict(summaries, row):
	probabilities = calculate_class_probabilities(summaries, row)
	best_label, best_prob = None, -1
	for class_value, probability in probabilities.items():
		if best_label is None or probability > best_prob:
			best_prob, best_label = probability, class_value
	return best_label

In [191]:
# Naive Bayes Algorithm
def naive_bayes(train, test):
	summarize, predictions = target_label_summary_stats(train), []
	for row in test:
		output = predict(summarize, row)
		predictions.append(output)
	return(predictions)

In [192]:
for filename in processing_map:
    seed(2)
    print("Dataset: %s" % filename.split(".")[0])
    dataset = load_data_and_basic_preprocessing(filename)
    scores = eval_algo(dataset, naive_bayes)
    print("+","-"*41, "+", sep = "")
    print(f"|{'Accuracies' : ^41}|")
    print("+","-"*41, "+", sep = "")
    for fold_number in range(1, 11):
        fold_index = fold_number - 1
        print("| Fold: {:>2} | Accuracy: {:>18} |".format(fold_number, scores[fold_index]))
    print("+","-"*41, "+", sep = "")
    print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
    print("\n"*3)
    p_acc.append(sum(scores)/float(len(scores)))
t_value,p_value=stats.ttest_ind(p_acc,weka_accuracy)
print("Ttest value:",t_value)
print("Pvalue :",p_value)



   

Dataset: car
Reading dataset car.data
Processing integer columns...
Processing class label columns...
car.data Index: 0 {'low': 0, 'med': 1, 'high': 2, 'vhigh': 3}
car.data Index: 1 {'low': 0, 'med': 1, 'high': 2, 'vhigh': 3}
car.data Index: 4 {'med': 0, 'small': 1, 'big': 2}
car.data Index: 5 {'low': 0, 'med': 1, 'high': 2}
car.data Index: 6 {'good': 0, 'acc': 1, 'unacc': 2, 'vgood': 3}
Dataset reading completed !!!
Dataset preprocessing completed !!!
+-----------------------------------------+
|               Accuracies                |
+-----------------------------------------+
| Fold:  1 | Accuracy:  73.25581395348837 |
| Fold:  2 | Accuracy:  83.13953488372093 |
| Fold:  3 | Accuracy:   81.3953488372093 |
| Fold:  4 | Accuracy:  86.62790697674419 |
| Fold:  5 | Accuracy:  81.97674418604652 |
| Fold:  6 | Accuracy:  80.23255813953489 |
| Fold:  7 | Accuracy:  84.88372093023256 |
| Fold:  8 | Accuracy:  80.23255813953489 |
| Fold:  9 | Accuracy:   81.3953488372093 |
| Fold: 10 | Ac