In [1]:
import pandas as pd
import numpy as np
import math
from anytree import Node, RenderTree, find, Walker,DoubleStyle,LevelOrderIter,findall

class ConfussionMatrix:
    """One-vs-rest confusion matrix for a single label, plus derived metrics.

    Stores the four cell counts (``tp``, ``fp``, ``fn``, ``tn``), the label's
    ``support`` and the ``label`` itself. Every metric returns ``0.0`` instead
    of raising when its denominator is zero.
    """

    def __init__(self,tp=0,fp=0,fn=0,tn=0,support=0,label=None):
        self.tp = tp
        self.fp = fp
        self.fn = fn
        self.tn = tn
        self.support = support
        self.label=label

    def get_p(self):
        """Return ``tp + fn`` and cache it on ``self.p``."""
        self.p = self.tp + self.fn
        return self.p

    def get_n(self):
        """Return ``fp + tn`` and cache it on ``self.n``."""
        self.n = self.fp + self.tn
        return self.n

    def accuracy(self):
        """Return ``(tp + tn) / (p + n)``; ``0.0`` only when the matrix is empty."""
        total = self.get_p() + self.get_n()
        # Only an empty matrix is undefined. A matrix with no positives (or no
        # negatives) still has a meaningful accuracy and must not collapse to 0.
        if total == 0:
            return 0.0
        return float(self.tp+self.tn)/total

    def error_rate(self):
        """Return ``(fp + fn) / (p + n)``; ``0.0`` only when the matrix is empty."""
        total = self.get_p() + self.get_n()
        if total == 0:
            return 0.0
        return float(self.fp+self.fn)/total

    def recall(self):
        """Return ``tp / (tp + fn)``, or ``0.0`` when that sum is zero."""
        p = self.get_p()
        if p == 0:
            return 0.0
        return float(self.tp)/p

    def specificity(self):
        """Return ``tn / (fp + tn)``, or ``0.0`` when that sum is zero."""
        n = self.get_n()
        if n == 0:
            return 0.0
        return float(self.tn)/n

    def percision(self):
        """Return ``tp / (tp + fp)``, or ``0.0`` when that sum is zero."""
        divider = self.tp + self.fp
        if divider == 0:
            return 0.0
        return float(self.tp)/divider

    def f1(self):
        """Return the harmonic mean of percision and recall (``0.0`` if both are zero)."""
        percision = self.percision()
        recall = self.recall()
        total = percision + recall
        if total == 0:
            return 0.0
        return float(2*percision*recall)/total

    def weighted_f1(self):
        """Return ``f1()`` multiplied by ``support``."""
        return float(self.f1()) * self.support

    def weighted_recall(self):
        """Return ``recall()`` multiplied by ``support``."""
        return float(self.recall()) * self.support

    def weighted_percision(self):
        """Return ``percision()`` multiplied by ``support``."""
        return float(self.percision()) * self.support

    def weighted_error(self):
        """Return ``error_rate()`` multiplied by ``support``."""
        return float(self.error_rate()) * self.support

    def display_report(self,name):
        """Print one row: name, accuracy, percision, recall, F1 and support."""
        print('%7s'% name,end =' ')
        print('%7.2f' % self.accuracy(),end=' ')
        print('%7.2f' %self.percision(),end=' ')
        print('%7.2f' %self.recall(),end=' ')
        print('%7.2f' %self.f1(),end=' ')
        print('%7d' %self.support)

    def display_matrix(self):
        """Print the four cell counts on two lines."""
        print(f'TP = {self.tp} FP = {self.fp}')
        print(f'FN = {self.fn} TN = {self.tn}')

class Report:
    """Builds per-label confusion matrices and prints a classification report."""

    def __init__(self):
        pass

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or ``0.0`` when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    def create_cm_list(self,actualList,predictionList,labels):
        """Build one one-vs-rest ``ConfussionMatrix`` per label.

        Args:
            actualList: sequence of true labels (list, array or pandas Series).
            predictionList: sequence of predicted labels, same length and order.
            labels: iterable of labels to build a matrix for.

        Returns:
            dict mapping each label to its ``ConfussionMatrix``.

        Raises:
            ValueError: if the two sequences differ in length.
        """
        # Work on positional arrays so a pandas Series with a non-default index
        # is paired element-by-element instead of being looked up by index label.
        actual = np.asarray(actualList)
        predicted = np.asarray(predictionList)
        if len(actual) != len(predicted):
            raise ValueError(
                f'actualList has {len(actual)} entries but predictionList has {len(predicted)}')
        total = len(actual)

        cm_list = {}
        for label in labels:
            is_actual = actual == label
            is_predicted = predicted == label
            TP = int(np.count_nonzero(is_actual & is_predicted))
            FP = int(np.count_nonzero(is_predicted)) - TP   # predicted label, truth differs
            FN = int(np.count_nonzero(is_actual)) - TP      # truth is label, prediction differs
            TN = total - TP - FP - FN                       # neither truth nor prediction is label
            support = TP + FN                               # number of true occurrences of label
            cm_list[label] = ConfussionMatrix(TP,FP,FN,TN,support,label)
        return cm_list

    def create_report(self,cm_list,labels):
        """Print per-label metrics followed by micro/macro/weighted averages.

        Args:
            cm_list: mapping label -> confusion-matrix object exposing ``tp``,
                ``fp``, ``fn``, ``support``, ``label`` and the metric methods
                used below.
            labels: labels to include, in print order.

        Every average is printed as 0.00 when its denominator is zero (for
        example when ``labels`` is empty) instead of raising.
        """
        totalTP =0
        totalFP =0
        totalFN =0
        totalSupport =0
        totalF1 =0
        totalRecall =0
        totalPercision =0
        totalWeightedRecall =0
        totalWeightedPercision =0
        totalWeightedF1=0
        totalError=0
        totalWeightedError=0
        for label in labels:
            cm = cm_list[label]
            totalTP += cm.tp
            totalFP += cm.fp
            totalFN += cm.fn
            totalF1 += cm.f1()
            totalWeightedF1 += cm.weighted_f1()
            totalSupport += cm.support
            totalRecall += cm.recall()
            totalPercision += cm.percision()
            totalWeightedRecall += cm.weighted_recall()
            totalWeightedPercision += cm.weighted_percision()
            totalError += cm.error_rate()
            totalWeightedError += cm.weighted_error()
            print('-----------------------------')
            print(f'Label:{cm.label}')
            print(f'Accuracy: {cm.accuracy():.2f}')
            print(f'Error: {cm.error_rate():.2f}')
            print(f'Specificity: {cm.specificity():.2f}')
            print(f'Percision: {cm.percision():.2f}')
            print(f'Recall: {cm.recall():.2f}')
            print(f'F1-score: {cm.f1():.2f}')
            print(f'support: {cm.support:d}')
            cm.display_matrix()

        print('-----------------------------')
        ratio = self._ratio
        nol = len(labels)
        P = totalTP+totalFN
        micro_f1 = ratio(totalTP, P)
        macro_f1 = ratio(totalF1, nol)
        weigthed_f1 = ratio(totalWeightedF1, totalSupport)
        print(f'Micro f1/Accuracy: {micro_f1:.2f}')
        print(f'Macro f1: {macro_f1:.2f}')
        print(f'Weighted f1: {weigthed_f1:.2f}')

        # NOTE(review): in single-label classification each mistake is counted
        # once as an FP and once as an FN, so this ratio may be twice the plain
        # misclassification rate - confirm the intended definition.
        micro_error = ratio(totalFP+totalFN, totalSupport)
        macro_error = ratio(totalError, nol)
        weigthed_error = ratio(totalWeightedError, totalSupport)
        print(f'Micro error: {micro_error:.2f}')
        print(f'Macro error: {macro_error:.2f}')
        print(f'Weighted error: {weigthed_error:.2f}')

        micro_percision = ratio(totalTP, totalTP+totalFP)
        macro_percision = ratio(totalPercision, nol)
        weighted_percision = ratio(totalWeightedPercision, totalSupport)
        print(f'Micro percision: {micro_percision:.2f}')
        print(f'Macro percision: {macro_percision:.2f}')
        print(f'Weighted percision: {weighted_percision:.2f}')

        micro_recall = ratio(totalTP, P)
        macro_recall = ratio(totalRecall, nol)
        weighted_recall = ratio(totalWeightedRecall, totalSupport)
        print(f'Micro recall: {micro_recall:.2f}')
        print(f'Macro recall: {macro_recall:.2f}')
        print(f'Weighted recall: {weighted_recall:.2f}')

In [23]:
import pandas as pd
import numpy as np
from ConfussionMatrix import Report,ConfussionMatrix
from collections import defaultdict

class Feature:
    """Plain record describing one column of a data set.

    Keeps the column ``name``, its ``unique`` values, the ``info``, ``gain``
    and ``split_info`` numbers handed to the constructor, and the frame
    passed as ``df`` (stored as ``dataset``). ``gain_ratio`` always starts
    at ``0.0``.
    """

    def __init__(self, name=None, unique=None,info=0.0,
                 df=None,gain=0.0,split_info=0.0):
        # Identity of the column.
        self.name = name
        self.unique = unique
        self.dataset = df
        # Numeric measures; gain_ratio is not a constructor argument.
        self.info = info
        self.gain = gain
        self.split_info = split_info
        self.gain_ratio = 0.0

class Prior:
    """Counts behind one conditional probability P(feature value | label).

    ``feature_support`` is how often the value occurs together with the label,
    ``label_support`` is how often the label occurs, and ``label_count`` is
    the number added to the denominator when a zero count is smoothed.
    """

    def __init__(self, feature=None, label=None,feature_support=0,label_support=0,label_count=0):
        self.feature, self.label = feature, label
        self.feature_support, self.label_support = feature_support, label_support
        self.label_count = label_count

    def probability(self):
        """Return ``feature_support / label_support`` with a zero-count correction.

        * ``label_support == 0`` -> ``0``.
        * ``feature_support == 0`` -> ``1 / (label_support + label_count)``.
        * otherwise the plain ratio.
        """
        numerator = self.feature_support
        denominator = self.label_support
        if denominator == 0:
            return 0
        if numerator == 0:
            # Unseen combination: add one to the count and widen the denominator.
            numerator, denominator = numerator + 1, denominator + self.label_count
        return numerator/denominator
            
class NaiveBayesian:
    """Naive Bayes classifier over the categorical columns of a DataFrame.

    Training data comes from a ';'-separated file whose last column is the
    class label. ``create_model`` counts, for every (feature, value, label)
    combination, how many rows match and stores the counts as ``Prior``
    objects in ``self.prior_dict`` (``{feature: {value: {label: Prior}}}``).
    ``predict`` multiplies those conditional probabilities with the label
    prior and returns the label with the highest score.
    """

    def __init__(self,verbose=True):
        self.except_features = []
        self.feature_list={}
        self.verbose = verbose

    def read_csv(self,filename):
        """Load the ';'-separated training file; the last column is the label."""
        df = pd.read_table(filename, sep=';', engine='python')
        self.label_name = df.columns[-1]
        self.number_of_entries = len(df)
        self.df = df

    def remove_feature(self,feature):
        """Exclude ``feature`` from the model (``self.df`` itself is left untouched)."""
        if feature not in self.except_features:
            self.except_features.append(feature)
            self.features = [item for item in self.df.columns if item not in self.except_features]
        else:
            # Printed when the feature was already excluded by an earlier call.
            print(f'{feature} is removed!')

    def find_feature(self,dataset):
        """Return ``{column: Feature}`` for every column that is not excluded."""
        feature_list = {}
        for col in dataset:
            if col not in self.except_features:
                feature = Feature(name=col,unique=dataset[col].unique())
                feature_list[col] = feature
        return feature_list

    def create_model(self):
        """Count label and (feature value, label) frequencies from ``self.df``.

        Sets ``self.prior_dict``, ``self.label_dict`` (label -> row count) and
        ``self._feature_list``. The label column is not stored as a feature:
        keeping it would make ``predict`` use the true label of the row being
        classified.
        """
        feature_list = self.find_feature(self.df)
        labelObj = feature_list[self.label_name]
        label_column = self.df[labelObj.name]

        label_dict = {}
        for label in labelObj.unique:
            label_dict[label] = int((label_column == label).sum())

        prior_dict = {}
        for key, featureObj in feature_list.items():
            if key == self.label_name:
                continue  # the label is the target, not a predictor
            feature_column = self.df[featureObj.name]
            value_dict={}
            for unique in featureObj.unique:
                value_mask = feature_column == unique
                unique_list={}
                for label in labelObj.unique:
                    feature_count = int((value_mask & (label_column == label)).sum())
                    unique_list[label] = Prior(unique, label,feature_count,
                                               label_dict[label],len(labelObj.unique))
                value_dict[unique] = unique_list
            prior_dict[key] = value_dict
        self.prior_dict = prior_dict
        self.label_dict = label_dict
        self._feature_list = feature_list

    def get_model(self):
        """Return the model as a DataFrame with one row per (feature, value, label)."""
        data = []
        for key in self.prior_dict:
            value_dict = self.prior_dict[key]
            for value_key in value_dict:
                label_dict = value_dict[value_key]
                for label_key in label_dict:
                    prior = label_dict[label_key]
                    ls = {'feature':key,
                          'X':prior.feature,
                          'C':prior.label,
                          'Xi':prior.feature_support,
                          'Ci':prior.label_support,
                          'LabelCount':prior.label_count,
                          'Probability':prior.probability()
                         }
                    data.append(ls)
        df = pd.DataFrame(data)
        return df

    def save(self,file):
        """Write the table produced by ``get_model`` to ``file`` as CSV."""
        df = self.get_model()
        df.to_csv(file)

    def read_testset(self,file):
        """Read a ';'-separated test file.

        Returns:
            ``(label_name, number_of_entries, df, features)`` where
            ``label_name`` is the last column and ``features`` are all others.
        """
        df = pd.read_table(file, sep=';', engine='python')
        label_name = df.columns[-1]
        number_of_entries = len(df)
        features = [item for item in df.columns if item != label_name]
        return label_name,number_of_entries,df,features

    def predict(self,data):
        """Return the most probable label for one row.

        Args:
            data: mapping-like row (dict or pandas Series) keyed by feature name.

        Returns:
            The label with the highest ``P(label) * prod P(value | label)``.
            On ties (including all-zero scores) the first label wins.

        Raises:
            ValueError: if the model holds no labels.

        The label column (``self.label_name``, when known) is never used as a
        predictor. A feature that is absent from ``data`` or whose value was
        not seen in training is reported and left out of the product.
        """
        if self.verbose:
            print('-----------------------Predict for-----------------------')
            print(data)
            print()
        if not self.label_dict:
            raise ValueError('The model has no labels; call create_model() or load() first.')

        label_name = getattr(self, 'label_name', None)
        total_entries = getattr(self, 'number_of_entries', 0)
        label_result = {}
        for label_key, label_count in self.label_dict.items():
            pX = 1.0
            if self.verbose:
                print(f'Label: {label_key}')
            for class_key, value_dict in self.prior_dict.items():
                if class_key == label_name:
                    continue  # never let the row's own label influence the score
                if class_key not in data:
                    print(f'Missing data for {class_key}')
                    continue
                feature = data[class_key]
                # .get() keeps a defaultdict-based (loaded) model from growing
                # empty nodes for values it has never seen.
                prior = (value_dict.get(feature) or {}).get(label_key)
                if prior is None or not hasattr(prior, 'probability'):
                    print(f'Missing data for {class_key} {feature}')
                    continue
                probability = prior.probability()
                if self.verbose:
                    print(feature,probability)
                pX *= probability
            pC = float(pX) * (label_count/total_entries) if total_entries else 0.0
            label_result[label_key] = pC
            if self.verbose:
                print(f"P({label_key}|X): {pC} / P(X)")
                print()

        # max() returns the first key that reaches the maximum.
        bestKey = max(label_result, key=label_result.get)
        if self.verbose:
            print(f'Selected label: {bestKey}')
            print('-----------------------End predict-----------------------')
        return bestKey

    def predict_file(self,file,verbose=None):
        """Predict every row of a test file.

        Args:
            file: ';'-separated file in the format ``read_testset`` expects.
            verbose: controls only the summary header printed here; defaults
                to ``self.verbose``. Per-row output of ``predict`` follows
                ``self.verbose``.

        Returns:
            ``(predictions, dataset)``: the predicted labels and the test frame.
        """
        label_name,number_of_entries,dataset,features = self.read_testset(file)
        predictions = []
        if verbose is None:
            verbose =self.verbose
        if verbose:
            print(f'Number of entries: {number_of_entries}')
            print(f'Label: {label_name}')
            print(f'Features: {features}')
        for index in dataset.index:
            data = dataset.loc[index]
            label = self.predict(data)
            predictions.append(label)
        return predictions,dataset

    def display_predictions(self,predictions,dataset):
        """Print the first column, the true label column and the predictions."""
        df = pd.DataFrame(columns=[dataset.columns[0],self.label_name,'Predictions'])
        df['Predictions'] = predictions
        df[dataset.columns[0]]=dataset[dataset.columns[0]]
        df[self.label_name]=dataset[self.label_name]
        print(df)

    def multi_level_dict(self):
        """Return a defaultdict whose missing keys create further nested defaultdicts."""
        return defaultdict(self.multi_level_dict)

    def load(self,file,label_name=None):
        """Rebuild the model from a CSV written by ``save``.

        Args:
            file: CSV with the columns 'feature', 'X', 'C', 'Xi', 'Ci' and
                'LabelCount'.
            label_name: optional name of the label column; when given it is
                stored as ``self.label_name`` so ``predict`` ignores a feature
                of that name.

        Returns:
            The DataFrame read from ``file``.

        Label counts are restored from 'Ci' (the per-label row count that
        ``get_model`` writes) and ``number_of_entries`` is set to their sum, so
        that ``predict`` can compute the label prior for a loaded model.
        """
        df = pd.read_csv(file)
        model_list = self.multi_level_dict()
        label_dict = {}
        rows = zip(df['feature'], df['X'], df['C'], df['Xi'], df['Ci'], df['LabelCount'])
        for feature_class, feature_value, label, feature_support, label_support, label_count in rows:
            # Prior.feature holds the feature *value*, as in create_model.
            prior = Prior(feature_value, label,feature_support,label_support,label_count)
            model_list[feature_class][feature_value][label] = prior
            label_dict[label] = int(label_support)
        self.prior_dict = model_list
        self.label_dict = label_dict
        self.number_of_entries = sum(label_dict.values())
        if label_name is not None:
            self.label_name = label_name
        return df

    def info(self):
        """Print the excluded features, the remaining columns and the entry count."""
        self.features = [item for item in self.df.columns if item not in self.except_features]
        print(f'Remove feature: {self.except_features}')
        print(f'Available feature: {self.features}')
        print(f'Number of entries: {self.number_of_entries}')    

In [33]:
# Train a quiet model on the lens training file: the first column is excluded
# from the feature set, then the summary is printed and the counts are built.
model = NaiveBayesian(verbose=False)
model.read_csv('lense_train.txt')
model.remove_feature(model.df.columns[0])
model.info()
model.create_model()

Remove feature: ['NO']
Available feature: ['Age', 'spectacle-prescrip', 'astigmatism', 'tear-prod-rate', 'contact-lenses']
Number of entries: 15


In [25]:
# Rebuild a model from a saved CSV and predict every row of the mushroom test file.
model = NaiveBayesian(verbose=False)
df = model.load('mushroom_update.csv')
model.prior_dict  # NOTE(review): bare expression that is not the cell's last line, so nothing is displayed
predictions,dataset = model.predict_file('mushroom_test.txt',verbose=True)

Number of entries: 31
Label: class_label
Features: ['no', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']


In [None]:
# Print the predictions next to the true labels, then build and print the
# per-label confusion matrices and averaged metrics.
# NOTE(review): `model.df` and `model.label_name` are only assigned by
# read_csv(); a model created via load() alone may not have them - confirm
# which `model` this cell is meant to run against.
model.display_predictions(predictions,dataset)
actualList = dataset[model.label_name]
predictionList = np.array(predictions)
labels = model.df[model.label_name].unique()
report = Report()
newList = report.create_cm_list(actualList,predictionList,labels)
report.create_report(newList,labels)

In [None]:
# Print the test frame returned by the last predict_file() call.
print(dataset)

In [29]:
from collections import defaultdict
def multi_level_dict():
    """Build an arbitrarily deep nested dictionary.

    Accessing a missing key creates another dictionary of the same kind, so
    ``d[a][b][c] = value`` works without creating the intermediate levels.
    """
    nested = defaultdict(multi_level_dict)
    return nested

In [53]:
# Rebuild a nested {feature: {value: {label: Prior}}} mapping by hand from a
# saved model CSV, one Prior per row.
df = pd.read_csv('file/student_model.csv')
data = []  # not used by the rest of this cell
model_list = multi_level_dict()
for index in df.index:
    feature_class= df['feature'][index]
    feature_value = df['X'][index]
    label = df['C'][index]
    feature_suppot_count = df['Xi'][index]
    label_support_count = df['Ci'][index]
    # NOTE(review): NaiveBayesian.load reads this count from 'LabelCount';
    # this file is read through a 'FeatureType' column instead - confirm the schema.
    label_count = df['FeatureType'][index]
    prior = Prior(feature_value, label,feature_suppot_count,label_support_count,label_count)
    model_list[feature_class][feature_value][label] = prior

In [54]:
# Display the model's nested feature -> value -> label -> Prior mapping.
model.prior_dict

{'Age': {'young': {'none': <__main__.Prior at 0x1e1ff88a280>,
   'soft': <__main__.Prior at 0x1e1ff8877f0>,
   'hard': <__main__.Prior at 0x1e1ff68d8b0>},
  'pre-presbyopic': {'none': <__main__.Prior at 0x1e1ff68dca0>,
   'soft': <__main__.Prior at 0x1e1ff887490>,
   'hard': <__main__.Prior at 0x1e1ff68ea90>},
  'prebyopic': {'none': <__main__.Prior at 0x1e1ff826670>,
   'soft': <__main__.Prior at 0x1e1ff826940>,
   'hard': <__main__.Prior at 0x1e1ff85cfa0>}},
 'spectacle-prescrip': {'myope': {'none': <__main__.Prior at 0x1e1ff5823a0>,
   'soft': <__main__.Prior at 0x1e1ff887640>,
   'hard': <__main__.Prior at 0x1e1ff5820d0>},
  'hypermetrope': {'none': <__main__.Prior at 0x1e1ff887c10>,
   'soft': <__main__.Prior at 0x1e1ff68d6a0>,
   'hard': <__main__.Prior at 0x1e1ff8879a0>}},
 'astigmatism': {'no': {'none': <__main__.Prior at 0x1e1ff68db50>,
   'soft': <__main__.Prior at 0x1e1ff68d4f0>,
   'hard': <__main__.Prior at 0x1e1ff68d460>},
  'yes': {'none': <__main__.Prior at 0x1e1ff6e3d6

In [55]:
# Walk the three levels of model_list and print each Prior's fields together
# with the probability it yields.
for class_key in model_list:
    class_value = model_list[class_key]
    # NOTE: the keys at this level are feature values, despite the name `label_value`.
    for label_value in class_value:
        objs = class_value[label_value]
        for key in objs:
            obj = objs[key]
            print(obj.feature,obj.label,obj.feature_support,obj.label_support,obj.label_count,obj.probability())

youth no 3 4 2 0.75
youth yes 1 6 2 0.16666666666666666
middle_aged no 0 4 2 0.16666666666666666
middle_aged yes 2 6 2 0.3333333333333333
senior no 1 4 2 0.25
senior yes 3 6 2 0.5
high no 2 4 2 0.5
high yes 1 6 2 0.16666666666666666
medium no 1 4 2 0.25
medium yes 2 6 2 0.3333333333333333
low no 1 4 2 0.25
low yes 3 6 2 0.5
no no 3 4 2 0.75
no yes 2 6 2 0.3333333333333333
yes no 1 4 2 0.25
yes yes 4 6 2 0.6666666666666666
fair no 2 4 2 0.5
fair yes 5 6 2 0.8333333333333334
excellent no 2 4 2 0.5
excellent yes 1 6 2 0.16666666666666666
no no 4 4 2 1.0
no yes 0 6 2 0.125
yes no 0 4 2 0.16666666666666666
yes yes 6 6 2 1.0


In [58]:
# Display the test row whose index label is 0.
dataset.loc[0]

no                          1837
cap-shape                      x
cap-surface                    y
cap-color                      n
bruises                        t
odor                           n
gill-attachment                f
gill-spacing                   c
gill-size                      b
gill-color                     w
stalk-shape                    t
stalk-root                     b
stalk-surface-above-ring       s
stalk-surface-below-ring       s
stalk-color-above-ring         p
stalk-color-below-ring         g
veil-type                      p
veil-color                     w
ring-number                    o
ring-type                      p
spore-print-color              n
population                     y
habitat                        d
class_label                    e
Name: 0, dtype: object

In [74]:
# Predict every row of the lens test file; verbose=True prints the summary header.
predictions,dataset = model.predict_file('lense_test.txt',verbose=True)

Number of entries: 2
Label: contact-lenses
Features: ['NO', 'Age', 'spectacle-prescrip', 'astigmatism', 'tear-prod-rate']
