In [25]:
# Task List 6 - Bayes classifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics as mtr

In [3]:
def load_dataset():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
    columns = ['Price', 'Maintenance', 'Doors', 'Persons', 'Luggage_boot', 'Safety', 'Class']
    return pd.read_csv(url, header=None, names=columns)


def calc_attr_probs(ds, attr_name):
    attr_counts = dict(dataset.groupby(attr_name).size())
    return {k: v / ds.shape[0] for k, v in attr_counts.items()}


def calc_attr_prob_when_class(ds, attribute_name, attribute_value, class_name):
    ds_attr_and_class = ds[(ds['Class'] == class_name) & (ds[attribute_name] == attribute_value)]
    ds_class = ds[ds['Class'] == class_name]
    return ds_attr_and_class.shape[0] / ds_class.shape[0]


def calc_all_probs(ds, attribute_names):
    probs = {}
    classes = list(set(ds['Class']))
    
    for attr_name in attribute_names:
        attr_vals = list(set(ds[attr_name]))
        for attr_val in attr_vals:
            for cls in classes:
                probs[(attr_name, attr_val, cls)] = calc_attr_prob_when_class(ds, attr_name, attr_val, cls)
                
    return probs


def make_instance(x):
    return {'Price': x[0], 
            'Maintenance': x[1], 
            'Doors': x[2],
            'Persons': x[3],
            'Luggage_boot': x[4], 
            'Safety': x[5]}

In [4]:
class BayesClassfier(object):
    def __init__(self):
        self.probs = None
        self.class_probs = None
        self.attr_probs = {}
    
    def fit(self, ds):
        attributes = list(dataset.columns.values)
        attributes.remove('Class')
        
        self.probs = calc_all_probs(ds, attributes)
        self.class_probs = calc_attr_probs(ds, 'Class')
        for attr in attributes:
            self.attr_probs[attr] = calc_attr_probs(ds, attr)
    
    def predict(self, x):
        best_posterior = -1
        best_class = None
        
        instance = make_instance(x)
        classes = list(self.class_probs.keys())
        evidence = self._calc_evidence(instance)
        
        for cls_name in classes:
            likelihood = self._calc_likelihood(instance, cls_name)
            prior = self.class_probs[cls_name]
            posterior = likelihood * prior / evidence
            
            if posterior > best_posterior:
                best_posterior = posterior
                best_class = cls_name
                
        return best_class, best_posterior
            
    def _calc_likelihood(self, instance, cls_name):
        likelihood = 1
        for attr_name, attr_val in instance.items():
            likelihood *= self.probs[(attr_name, attr_val, cls_name)]
        return likelihood
    
    def _calc_evidence(self, instance):
        classes = list(self.class_probs.keys())
        evidence = 0
        
        for cls_name in classes:
            evidence += self._calc_likelihood(instance, cls_name) * self.class_probs[cls_name]
            
        return evidence

In [5]:
# Separate cell cause of loading time ...
dataset = load_dataset()
#print(dataset)

In [6]:
def example_test():
    bc = BayesClassfier()

    bc.fit(dataset)

    x = ['vhigh', 'vhigh', '2', '2', 'small', 'low']
    y_pred = bc.predict(x)

    print(x, '=>', y_pred)
    
example_test()

['vhigh', 'vhigh', '2', '2', 'small', 'low'] => ('unacc', 1.0)


In [34]:
def evaluate_classifier():
    train_ds, test_ds = train_test_split(dataset, test_size=0.2)
    bc = BayesClassfier()
    bc.fit(train_ds)
    
    y_true = test_ds['Class'].tolist()
    y_pred = [bc.predict(x.tolist())[0] for _, x in test_ds.iterrows()]
    
    cnf_matrix = mtr.confusion_matrix(y_true, y_pred)
    accuracy = mtr.accuracy_score(y_true, y_pred) * 100.0
    precision = mtr.precision_score(y_true, y_pred, average='macro') * 100.0
    recall = mtr.recall_score(y_true, y_pred, average='macro') * 100.0
    f1 = mtr.f1_score(y_true, y_pred, average='macro') * 100.0
    
    print("---- Confusion matrix ----")
    print(cnf_matrix)
    print('Accuracy: %.2f' % accuracy)
    print('Precision: %.2f' % precision)
    print('Recall: %.2f' % recall)
    print('F1: %.2f' % f1)
    
    
evaluate_classifier()

---- Confusion matrix ----
[[ 56   3  22   0]
 [  5   2   0   1]
 [  4   1 236   0]
 [  9   1   0   6]]
Accuracy: 86.71
Precision: 70.36
Recall: 57.39
F1: 61.42
