In [2]:
# Task List 6 - Bayes classifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics as mtr

In [3]:
def load_dataset():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
    columns = ['Price', 'Maintenance', 'Doors', 'Persons', 'Luggage_boot', 'Safety', 'Class']
    return pd.read_csv(url, header=None, names=columns)


def calc_attr_probs(ds, attr_name):
    attr_counts = dict(dataset.groupby(attr_name).size())
    return {k: v / ds.shape[0] for k, v in attr_counts.items()}


def calc_attr_prob_when_class(ds, attribute_name, attribute_value, class_name):
    ds_attr_and_class = ds[(ds['Class'] == class_name) & (ds[attribute_name] == attribute_value)]
    ds_class = ds[ds['Class'] == class_name]
    return ds_attr_and_class.shape[0] / ds_class.shape[0]


def calc_all_probs(ds, attribute_names):
    probs = {}
    classes = list(set(ds['Class']))
    
    for attr_name in attribute_names:
        attr_vals = list(set(ds[attr_name]))
        for attr_val in attr_vals:
            for cls in classes:
                probs[(attr_name, attr_val, cls)] = calc_attr_prob_when_class(ds, attr_name, attr_val, cls)
                
    return probs


def make_instance(x):
    return {'Price': x[0], 
            'Maintenance': x[1], 
            'Doors': x[2],
            'Persons': x[3],
            'Luggage_boot': x[4], 
            'Safety': x[5]}

In [57]:
class BayesClassfier(object):
    def __init__(self):
        self.probs = None
        self.class_probs = None
    
    def fit(self, ds):
        attributes = list(dataset.columns.values)
        attributes.remove('Class')
        
        self.probs = calc_all_probs(ds, attributes)
        self.class_probs = calc_attr_probs(ds, 'Class')
    
    def predict(self, x):
        best_posterior = -1
        best_class = None
        
        instance = make_instance(x)
        classes = list(self.class_probs.keys())
        evidence = self._calc_evidence(instance)
        
        for cls_name in classes:
            likelihood = self._calc_likelihood(instance, cls_name)
            prior = self.class_probs[cls_name]
            posterior = likelihood * prior / evidence
            
            if posterior > best_posterior:
                best_posterior = posterior
                best_class = cls_name
                
        return best_class, best_posterior
            
    def _calc_likelihood(self, instance, cls_name):
        likelihood = 1
        for attr_name, attr_val in instance.items():
            likelihood *= self.probs[(attr_name, attr_val, cls_name)]
        return likelihood
    
    def _calc_evidence(self, instance):
        classes = list(self.class_probs.keys())
        evidence = 0
        
        for cls_name in classes:
            evidence += self._calc_likelihood(instance, cls_name) * self.class_probs[cls_name]
            
        return evidence

In [5]:
# Separate cell cause of loading time ...
dataset = load_dataset()
#print(dataset)

In [66]:
def example_test():
    bc = BayesClassfier()

    bc.fit(dataset)

    x = ['vhigh', 'vhigh', '2', '2', 'small', 'low']
    y_pred = bc.predict(x)

    print(x, '=>', y_pred)
    
example_test()

['vhigh', 'vhigh', '2', '2', 'small', 'low'] => ('unacc', 1.0)


In [68]:
def evaluate_classifier_own(train_ds, test_ds):
    bc = BayesClassfier()
    bc.fit(train_ds)
    
    y_true = test_ds['Class'].tolist()
    y_pred = [bc.predict(x.tolist())[0] for _, x in test_ds.iterrows()]
    
    cnf_matrix = mtr.confusion_matrix(y_true, y_pred)
    accuracy = mtr.accuracy_score(y_true, y_pred) * 100.0
    precision = mtr.precision_score(y_true, y_pred, average='macro') * 100.0
    recall = mtr.recall_score(y_true, y_pred, average='macro') * 100.0
    f1 = mtr.f1_score(y_true, y_pred, average='macro') * 100.0
    
    print("---- Confusion matrix ----")
    print(cnf_matrix)
    print('Accuracy: %.2f' % accuracy)
    print('Precision: %.2f' % precision)
    print('Recall: %.2f' % recall)
    print('F1: %.2f' % f1)
    
    # Accuracy = (TP + TN) / (TP + FP + FN + TN)
    # Precision = TP / (TP + FP)
    # Recall = TP / (TP + FN)
    # F1 =  2 * precision * recall / (precision + recall)


In [215]:
from copy import deepcopy
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import cross_val_predict

def preprocess(ds):
    ds_cpy = deepcopy(ds)
    
    ds_cpy['Price'] = ds_cpy['Price'].map({'low': 0, 'med': 1, 'high': 2, 'vhigh': 3})
    ds_cpy['Maintenance'] = ds_cpy['Maintenance'].map({'low': 0, 'med': 1, 'high': 2, 'vhigh': 3})
    ds_cpy['Doors'] = ds_cpy['Doors'].map({'2': 2, '3': 3, '4': 4, '5more': 5})
    ds_cpy['Persons'] = ds_cpy['Persons'].map({'2': 2, '4': 4, 'more': 5})
    ds_cpy['Luggage_boot'] = ds_cpy['Luggage_boot'].map({'small': 0, 'med': 1, 'big': 2})
    ds_cpy['Safety'] = ds_cpy['Safety'].map({'low': 0, 'med': 1, 'high': 2})
    ds_cpy['Class'] = ds_cpy['Class'].map({'unacc': 0, 'acc': 1, 'good': 2, 'vgood': 3})
    
    return ds_cpy


def evaluate_classifier_sklearn(train_ds, test_ds):
    bc = MultinomialNB(fit_prior=False)
    
    train_ds_cpy = preprocess(train_ds)
    test_ds_cpy = preprocess(test_ds)
    
    train_X = train_ds_cpy[['Price', 'Maintenance', 'Doors', 'Persons', 'Luggage_boot', 'Safety']]
    train_Y = train_ds_cpy['Class']
    
    bc.fit(train_X, train_Y)
    
    test_X = test_ds_cpy[['Price', 'Maintenance', 'Doors', 'Persons', 'Luggage_boot', 'Safety']]
    
    y_true = test_ds_cpy['Class']
    y_pred = bc.predict(test_X)
    #y_pred = cross_val_predict(bc, test_X, y_true, cv=8)
    
    cnf_matrix = mtr.confusion_matrix(y_true, y_pred)
    accuracy = mtr.accuracy_score(y_true, y_pred) * 100.0
    precision = mtr.precision_score(y_true, y_pred, average='macro') * 100.0
    recall = mtr.recall_score(y_true, y_pred, average='macro') * 100.0
    f1 = mtr.f1_score(y_true, y_pred, average='macro') * 100.0
    
    print("---- Confusion matrix ----")
    print(cnf_matrix)
    print('Accuracy: %.2f' % accuracy)
    print('Precision: %.2f' % precision)
    print('Recall: %.2f' % recall)
    print('F1: %.2f' % f1)
    
    print('Log_class_prior:', bc.class_log_prior_)
    print('Feature_count:', bc.feature_count_)
    

In [216]:
train_ds, test_ds = train_test_split(dataset, test_size=0.2)


print('Own:')
evaluate_classifier_own(train_ds, test_ds)

print('\n\nSklearn:')
evaluate_classifier_sklearn(train_ds, test_ds)

Own:
---- Confusion matrix ----
[[ 64   1  20   0]
 [ 11   6   0   1]
 [ 11   0 222   0]
 [  3   0   0   7]]
Accuracy: 86.42
Precision: 84.21
Recall: 68.48
F1: 73.20


Sklearn:
---- Confusion matrix ----
[[150  44  21  18]
 [ 12  60   3  10]
 [  0   0  13   5]
 [  0   0   4   6]]
Accuracy: 66.18
Precision: 49.34
Recall: 66.80
F1: 52.00
Log_class_prior: [-1.38629436 -1.38629436 -1.38629436 -1.38629436]
Feature_count: [[1623. 1596. 3383. 3267.  916.  719.]
 [ 422.  411. 1067. 1344.  325.  452.]
 [  17.   17.  182.  227.   49.   75.]
 [  22.   44.  203.  251.   87.  110.]]


In [178]:
print(calc_attr_probs(dataset, 'Class'))

{'acc': 0.2222222222222222, 'good': 0.03993055555555555, 'unacc': 0.7002314814814815, 'vgood': 0.03761574074074074}
