### `import` Packages

In [1]:
import pandas as pd
import numpy as np

### Data

In [2]:
try:
    data_source = 'http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
    cdf = pd.read_csv(
        filepath_or_buffer=data_source, 
        names=['buying','maint','doors','persons','lug_boot','safety','class'], 
        sep=','
    )
    # cdf['doors'] = cdf['doors'].apply(func=lambda x: str(x))
    # cdf['persons'] = cdf['persons'].apply(func=lambda x: str(x))
except Exception as e:
    cdf = pd.read_csv('tennis_anyone.csv')

In [3]:
cdf.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


### Train Test Splitter

In [4]:
def splitter(dframe, percentage=0.8, random_state=True):
    if random_state:
        dframe = dframe.sample(frac=1)
    
    thresh = round(len(dframe) * percentage)
    train_df = dframe.iloc[:thresh]
    test_df = dframe.iloc[thresh:]
    
    return train_df, test_df

In [5]:
train_df, test_df = splitter(dframe=cdf)

In [6]:
train_df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
836,high,low,4,more,big,high,acc
117,vhigh,high,2,4,small,low,unacc
657,high,med,2,4,small,low,unacc
1397,low,vhigh,5more,more,small,high,acc
586,high,high,3,more,small,med,unacc


In [7]:
test_df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
629,high,high,5more,2,big,high,unacc
409,vhigh,low,5more,2,med,med,unacc
1399,low,vhigh,5more,more,med,med,acc
365,vhigh,low,3,4,med,high,acc
44,vhigh,vhigh,3,4,big,high,unacc


### Categorical Naive Bayes

In [8]:
class CategoricalNB():
    def __init__(self, train_df, test_df, label):
        self.X_train, self.y_train = self.split_features_targets(df=train_df, label=label)
        self.X_test, self.y_test = self.split_features_targets(df=test_df, label=label)
        
        self.X_test_vals = self.X_test.values
        self.y_test_vals = self.y_test.values
        
        self.X_likelihood, self.y_likelihood = self.compute_likelihood()
        
    def split_features_targets(self, df, label):
        X = df.drop(columns=[label], axis=1)
        y = df[label]
        return X, y
    
    def compute_likelihood(self):
        X_likelihood = {}
        
        yc_df = self.y_train.value_counts().to_frame()
        yc_df.reset_index(inplace=True)
        yc_df.columns = ['class', 'count']
        
        y_vc = {i : j for (i, j) in zip(yc_df['class'], yc_df['count'])}
        y_vc_k = list(y_vc.keys())
        
        for col in self.X_train:
            each_col_dict = {}
            x_col_vals = self.X_train[col].value_counts().to_frame().index.to_list()
            fydf = pd.DataFrame(data={col : self.X_train[col], 'y' : self.y_train})
            
            for ex in x_col_vals:
                each_x_dict = {}
                x_ex_df = fydf[fydf[col] == ex]
                
                for ey in y_vc_k:
                    x_y_df = x_ex_df[x_ex_df['y'] == ey]
                    each_x_dict[ey] = len(x_y_df) / y_vc[ey]
                
                each_col_dict[ex] = each_x_dict
            X_likelihood[col] = each_col_dict
        y_likelihood = {i : j / sum(list(y_vc.values())) for (i, j) in y_vc.items()}
        
        return X_likelihood, y_likelihood
    
    def predictor(self, X_new):
        cols = list(self.X_likelihood.keys())
        col_new = {i : j for (i, j) in zip(cols, X_new)}

        lprobs = {}
        for l, v in self.y_likelihood.items():
            cate_v = [self.X_likelihood[cn][cl][l] for (cn, cl) in col_new.items()]
            lprobs[l] = round((np.prod(cate_v) * v), 4)
        
        prob_ks = list(lprobs.keys())
        prob_vs = list(lprobs.values())

        return prob_ks[np.argmax(prob_vs)]
    
    def predict(self):
        if len(self.X_test_vals) == 1:
            return self.predictor(X_new=self.X_test_vals[0])
        preds = [self.predictor(X_new=i) for i in self.X_test_vals]
        return preds
    
    def accuracy_score(self, preds):
        actual_vals = np.array(self.y_test_vals)
        preds = np.array(preds)
        corrects = np.count_nonzero(np.where((actual_vals == preds), 1, 0))
        return corrects / len(actual_vals)

### Implementation

In [9]:
nb = CategoricalNB(train_df=train_df, test_df=test_df, label='class')

In [10]:
preds = nb.predict()

In [11]:
acc = nb.accuracy_score(preds=preds)

In [12]:
acc

0.8092485549132948

### End