In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

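# CARA 1 (Approach 1): class-based categorical Naive Bayes (`CategoricalNaiveBayes`), evaluated with stratified 5-fold cross-validation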

In [18]:
class CategoricalNaiveBayes:
    """Naive Bayes classifier for categorical (discrete-valued) features.

    All probabilities are plain relative frequencies computed from the
    training data; no smoothing is applied.

    Attributes:
        probs: ``{column: {value: P(column == value)}}`` for every column,
            including the target column (whose entries are the class priors).
        cond_probs: ``{feature: {"<value>-<target>": P(feature == value | target)}}``.
        targets: distinct target values seen by ``fit``, in order of first
            appearance.
        columns: the ``column_names`` sequence passed to ``fit`` (feature
            names first, target name last).
    """

    # Constructor
    def __init__(self):
        self._reset()

    def _reset(self):
        """Drop every fitted statistic so the instance is back to its unfitted state."""
        self.probs = dict()
        self.cond_probs = dict()
        self.targets = list()
        self.columns = list()

    @staticmethod
    def _key(value, target):
        """Build the ``cond_probs`` lookup key for a (feature value, target) pair.

        NOTE(review): the key is the string ``"<value>-<target>"``, so values
        that themselves contain ``-`` can in principle collide; the format is
        kept unchanged so the stored dictionaries stay inspectable as before.
        """
        return f'{value}-{target}'

    # Fit method
    def fit(self, x, y, column_names):
        """Estimate marginal, prior and conditional probabilities from training data.

        Args:
            x: 2-D array-like of feature values, one row per sample and one
                column per entry of ``column_names[:-1]``.
            y: 1-D array-like of target values, one per row of ``x``.
            column_names: sequence of feature names followed by the target
                column name as its last element.
        """
        # Reset through a dedicated helper instead of re-invoking __init__,
        # which would break subclasses whose constructor takes arguments.
        self._reset()

        # Preparing DataFrame
        target_column_name = column_names[-1]
        dataset = pd.DataFrame(data=x, index=None, columns=column_names[:-1])
        dataset[target_column_name] = y
        n_rows = len(dataset)

        # Preparing probabilities dictionary.
        # Boolean masks are used instead of DataFrame.query so that column
        # names that are not valid identifiers (spaces, numeric labels, ...)
        # work as well.
        for column in dataset.columns:
            series = dataset[column]
            self.probs[column] = {
                value: int((series == value).sum()) / n_rows
                for value in series.unique()
            }

        self.targets = dataset[target_column_name].unique()

        # One row mask per class, computed once and shared by every feature.
        target_series = dataset[target_column_name]
        class_masks = [(target, target_series == target) for target in self.targets]

        # Preparing conditional_probabilities dictionary
        for column in dataset.columns:
            if column == target_column_name:
                continue
            series = dataset[column]
            table = dict()
            for value in series.unique():
                value_mask = series == value
                for target, class_mask in class_masks:
                    joint_count = int((value_mask & class_mask).sum())
                    table[self._key(value, target)] = joint_count / int(class_mask.sum())
            self.cond_probs[column] = table

        self.columns = column_names

    # Predict method
    def predict(self, x):
        """Predict the most probable target for every row of ``x``.

        Args:
            x: iterable of rows; each row lists the feature values in the
                same order as the feature names given to ``fit``.

        Returns:
            list with one predicted target per row. Ties are resolved in
            favour of the target that appeared first in the training data.

        Raises:
            ValueError: if a row is predicted before ``fit`` has been called.
        """
        # Only feature columns take part in the likelihood; the last entry of
        # self.columns is the target name.
        feature_columns = self.columns[:-1]

        predicts = list()
        for row in x:
            target_prob_dict = dict()
            for target in self.targets:
                # A feature value never seen during fit has an empirical
                # frequency of 0, so it contributes 0.0 instead of raising
                # KeyError.
                row_cond_probs = [
                    self.cond_probs[column].get(self._key(value, target), 0.0)
                    for column, value in zip(feature_columns, row)
                ]
                prior = self.probs[self.columns[-1]][target]
                target_prob_dict[target] = np.prod(row_cond_probs) * prior
            if not target_prob_dict:
                raise ValueError('predict() called before fit(): no target classes are known')
            predicts.append(max(target_prob_dict, key=target_prob_dict.get))
        return predicts

In [19]:
# Load the first example workbook into a DataFrame. "NB.xlsx" is a relative
# path, so it is resolved against the kernel's working directory.
dataset = pd.read_excel("NB.xlsx")
# Bare last expression so the notebook renders the frame as a table.
dataset

Unnamed: 0,Angin,Cuaca,Main
0,Kuat,Cerah,Tidak
1,Lemah,Mendung,YA
2,Lemah,Hujan,YA
3,Kuat,Cerah,YA
4,Kuat,Cerah,YA
5,Kuat,Mendung,YA
6,Kuat,Hujan,Tidak
7,Lemah,Hujan,Tidak
8,Lemah,Cerah,YA
9,Kuat,Hujan,YA


In [20]:
# Split the loaded frame into the NumPy inputs expected by CategoricalNaiveBayes.
# `columns` keeps every column name; "Main" is the label column and is passed
# to fit() as the last entry.
columns = dataset.columns.to_numpy()
target = dataset["Main"].to_numpy()
# drop() returns a new frame, so `dataset` itself is left intact and this cell
# can be re-run without a KeyError on "Main".
data = dataset.drop(columns="Main").to_numpy()

In [21]:
# Display the feature matrix to confirm that only the feature columns remain
# (the "Main" label column is held separately in `target`).
data

array([['Kuat', 'Cerah'],
       ['Lemah', 'Mendung'],
       ['Lemah', 'Hujan'],
       ['Kuat', 'Cerah'],
       ['Kuat', 'Cerah'],
       ['Kuat', 'Mendung'],
       ['Kuat', 'Hujan'],
       ['Lemah', 'Hujan'],
       ['Lemah', 'Cerah'],
       ['Kuat', 'Hujan'],
       ['Kuat', 'Cerah'],
       ['Lemah', 'Mendung'],
       ['Lemah', 'Mendung'],
       ['Kuat', 'Hujan']], dtype=object)

In [22]:
# Evaluate the classifier's average accuracy with stratified 5-fold cross-validation.
# random_state pins the shuffle so that re-running the notebook yields the same
# folds and therefore the same score.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
efficency = []  # per-fold accuracy; the name is kept because the reporting cell reads it
for train_index, test_index in cv.split(data, target):
    train_x, test_x = data[train_index], data[test_index]
    train_y, test_y = target[train_index], target[test_index]
    # A fresh model per fold so no statistics leak between folds.
    nb = CategoricalNaiveBayes()
    nb.fit(train_x, train_y, columns)
    pred_y = nb.predict(test_x)
    efficency.append(accuracy_score(test_y, pred_y))

In [23]:
# Report the mean per-fold accuracy as a percentage. The data evaluated in this
# section was loaded from NB.xlsx, so the message names that file.
print(f"Average classification efficency (\"NB.xlsx\" dataset) = {np.average(efficency) * 100}%")

Average classification efficency ("categ.csv" dataset) = 56.666666666666664%


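# CARA 2 (Approach 2): function-based categorical Naive Bayes on the integer-encoded data (NB2.xlsx), evaluated on a single train/test split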

In [39]:
# Load the second workbook; the rendered table shows the same column names as
# the first dataset but with integer-coded values.
# NOTE(review): the mapping between the integer codes and the original
# category names is not recorded in this notebook — confirm against the source file.
dataset2 = pd.read_excel("NB2.xlsx")
# Bare last expression so the notebook renders the frame as a table.
dataset2

Unnamed: 0,Angin,Cuaca,Main
0,1,2,0
1,0,1,1
2,0,0,1
3,1,2,1
4,1,2,1
5,1,1,1
6,1,0,0
7,0,0,0
8,0,2,1
9,1,0,1


In [40]:
def calculate_likelihood_categorical(df, feat_name, feat_val, Y, label):
    """Estimate P(feat_name == feat_val | Y == label) as a relative frequency in df.

    Args:
        df: DataFrame holding the feature column ``feat_name`` and the label
            column ``Y``.
        feat_name: name of the feature column.
        feat_val: feature value whose likelihood is requested.
        Y: name of the label column.
        label: class value to condition on.

    Returns:
        float: fraction of rows with ``Y == label`` whose ``feat_name`` equals
        ``feat_val``; 0.0 when no row carries ``label``.
    """
    class_rows = df[df[Y] == label]
    if len(class_rows) == 0:
        # No evidence for this class at all: report a zero likelihood rather
        # than failing with ZeroDivisionError.
        return 0.0
    matching_rows = class_rows[class_rows[feat_name] == feat_val]
    return len(matching_rows) / len(class_rows)

In [41]:
def naive_bayes_categorical(df, X, Y):
    """Classify every sample in X with categorical Naive Bayes trained on df.

    Args:
        df: training DataFrame. Every column except the last is treated as a
            feature, so the label column ``Y`` is assumed to be the last one.
        X: iterable of samples; ``x[i]`` must be the value of the i-th
            feature column of ``df``.
        Y: name of the label column in ``df``.

    Returns:
        np.ndarray with the predicted label for each sample of ``X``.
    """
    # get feature names
    features = list(df.columns)[:-1]

    # calculate prior
    # NOTE(review): calculate_prior is not defined in the visible cells; it is
    # presumably expected to return one prior per label in sorted label order
    # (it is indexed with the same j as `labels` below) — confirm.
    prior = calculate_prior(df, Y)

    # The label set depends only on the training frame, so compute it once
    # instead of once per sample.
    labels = sorted(list(df[Y].unique()))

    Y_pred = []
    # loop over every data sample
    for x in X:
        # posterior probability (numerator only) for each candidate label
        post_prob = []
        for j, label in enumerate(labels):
            likelihood = 1
            for i, feature in enumerate(features):
                likelihood *= calculate_likelihood_categorical(df, feature, x[i], Y, label)
            post_prob.append(likelihood * prior[j])

        # Map the winning position back to the actual label value; returning
        # the bare argmax index is only correct when labels are exactly 0..k-1.
        Y_pred.append(labels[int(np.argmax(post_prob))])

    return np.array(Y_pred)

In [42]:
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded dataset for testing; the fixed seed keeps the split stable.
train, test = train_test_split(dataset2, test_size=0.2, random_state=41)

# Features are every column but the last; the last column holds the label.
X_test, Y_test = test.iloc[:, :-1].to_numpy(), test.iloc[:, -1].to_numpy()
Y_pred = naive_bayes_categorical(train, X=X_test, Y="Main")

# Summarise the hold-out performance.
print(confusion_matrix(Y_test, Y_pred))
print(f1_score(Y_test, Y_pred))

[[0 2]
 [0 1]]
0.5
