In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Seed for the train/test split. Without a fixed seed every kernel restart
# produces a different split, so none of the accuracies below can be reproduced.
RANDOM_STATE = 42

# Column 0 of the CSV is used as the label; all remaining columns are features.
data = pd.read_csv('A_Z Handwritten Data.csv')
csv_dataset = data.to_numpy()
X, y = csv_dataset[:, 1:], csv_dataset[:, 0]

# Hold out 10% of the rows for evaluation. The inputs are ndarrays, so the
# returned splits are ndarrays as well and need no further conversion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=RANDOM_STATE
)


# Per-class statistics gathered from the training split:
#   class_probs[c]   -> fraction of training rows labelled c
#   feature_freqs[c] -> for every feature, how many rows of class c have a
#                       strictly positive value in that column
class_probs = {}
feature_freqs = {}
for c in np.unique(y_train):
    in_class = y_train == c
    class_probs[c] = in_class.sum() / len(y_train)
    feature_freqs[c] = (X_train[in_class] > 0).sum(axis=0)


def predict(X, alpha=1.0):
    """Classify rows of ``X`` with a Bernoulli naive Bayes model.

    The model is read from the module-level tables ``class_probs`` (label ->
    prior) and ``feature_freqs`` (label -> per-feature count of training rows
    with a positive value), together with ``y_train`` (used for the per-class
    row counts). A feature is considered "on" when its value is > 0.

    For every class ``c`` the score is computed in log space::

        log P(c) + sum_j [ on_j * log p_cj + (1 - on_j) * log(1 - p_cj) ]

    with ``p_cj = (feature_freqs[c][j] + alpha) / (n_c + 2 * alpha)``.
    Working with logs avoids the underflow of multiplying hundreds of
    probabilities, and the smoothing keeps ``log`` finite for features that
    were never (or always) on within a class.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples to classify.
    alpha : float, default 1.0
        Additive smoothing strength; should be > 0 so no probability is
        exactly 0 or 1.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        The predicted label (a key of ``class_probs``) for every row of ``X``.
    """
    X = np.asarray(X)
    labels = list(class_probs)

    # Rewrite the per-class score as an affine function of the on/off vector:
    #   score = on @ (log p - log(1 - p)) + [log prior + sum(log(1 - p))]
    # so all samples and classes are scored with a single matrix product.
    weights = np.empty((X.shape[1], len(labels)))
    bias = np.empty(len(labels))
    for k, label in enumerate(labels):
        n_c = (y_train == label).sum()
        p_on = (np.asarray(feature_freqs[label]) + alpha) / (n_c + 2 * alpha)
        log_on, log_off = np.log(p_on), np.log1p(-p_on)
        weights[:, k] = log_on - log_off
        bias[k] = np.log(class_probs[label]) + log_off.sum()

    log_joint = (X > 0).astype(np.float64) @ weights + bias
    # Map the winning column back to its label instead of returning the
    # column index, so non-contiguous labels are handled correctly.
    return np.asarray(labels)[np.argmax(log_joint, axis=1)]

# Score the hand-written classifier on the held-out split and report the
# share of correct predictions as a percentage.
predictions = predict(X_test)
accuracy = np.mean(predictions == y_test)
print("Accuracy:", 100 * accuracy)


Accuracy: 3.7347294938917974


In [None]:

class MultinomialNB:
    """Multinomial naive Bayes classifier with additive smoothing.

    Feature values are treated as non-negative counts. For each class the
    model stores a prior (share of training rows) and a per-feature
    likelihood (smoothed share of that class's total feature mass).

    Parameters
    ----------
    alpha : float, default 0.01
        Value added to every per-class feature total before normalising.
        With ``alpha == 0`` a feature that never occurs in a class gets
        likelihood 0, whose log is ``-inf``.
    """

    def __init__(self, alpha=0.01):
        self.alpha = alpha

    def fit(self, X_train, y_train):
        """Estimate class priors and per-feature likelihoods.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,)

        Returns
        -------
        MultinomialNB
            The fitted estimator itself, so calls can be chained.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        m, n = X_train.shape
        self._classes = np.unique(y_train)
        n_classes = len(self._classes)

        self._priors = np.zeros(n_classes)
        self._likelihoods = np.zeros((n_classes, n))

        for idx, c in enumerate(self._classes):
            X_train_c = X_train[y_train == c]
            self._priors[idx] = X_train_c.shape[0] / m
            # Smoothed feature totals, normalised so each row sums to 1.
            smoothed = X_train_c.sum(axis=0) + self.alpha
            self._likelihoods[idx, :] = smoothed / smoothed.sum()

        # The logs depend only on the fitted parameters, so compute them once
        # here instead of once per (sample, class) pair at prediction time.
        self._log_priors = np.log(self._priors)
        self._log_likelihoods = np.log(self._likelihoods)
        return self

    def predict(self, X_test):
        """Return a list with the predicted class label for each row of ``X_test``."""
        return [self._predict(x_test) for x_test in np.asarray(X_test)]

    def _predict(self, x_test):
        """Return the label with the highest log-posterior for one sample."""
        # Row k is sum_j x_j * log p_kj + log prior_k, i.e. the unnormalised
        # log-posterior of class k.
        posteriors = (self._log_likelihoods * x_test).sum(axis=1) + self._log_priors
        return self._classes[np.argmax(posteriors)]

    def calc_likelihood(self, cls_likeli, x_test):
        """Return the per-feature log-likelihood terms ``log(cls_likeli) * x_test``."""
        return np.log(cls_likeli) * x_test

    def score(self, X_test, y_test):
        """Return the fraction of rows in ``X_test`` whose prediction equals ``y_test``."""
        # Convert both sides to arrays: comparing two plain lists with ``==``
        # yields a single bool rather than an element-wise result.
        y_test = np.asarray(y_test)
        y_pred = np.asarray(self.predict(X_test))
        return np.sum(y_pred == y_test) / len(y_test)



# Train the multinomial model on the raw feature values and report its
# accuracy on the held-out split.
obj = MultinomialNB()
obj.fit(X_train, y_train)

raw_accuracy = obj.score(X_test, y_test)
print(raw_accuracy)




0.698697811786817


In [None]:
def bin_feature(feature, n_bins):
    """Bin a numerical feature into ``n_bins`` evenly spaced bins.

    The range ``[feature.min(), feature.max()]`` is split into ``n_bins``
    equal-width intervals and every value is replaced by the 1-based index of
    the interval it falls in. Intervals are closed on the left and open on
    the right, except the last one, which also includes the maximum.

    Parameters
    ----------
    feature : array-like
        Values to discretise; must be non-empty.
    n_bins : int
        Number of bins.

    Returns
    -------
    numpy.ndarray
        Integer bin indices in ``1..n_bins`` with the same shape as ``feature``.
        If all values are equal, every element is assigned to bin ``n_bins``.
    """
    feature = np.asarray(feature)
    bin_edges = np.linspace(feature.min(), feature.max(), n_bins + 1)
    # Drop the right-most edge: np.digitize treats bins as half-open, so
    # passing it would push the maximum value into an extra bin n_bins + 1.
    return np.digitize(feature, bin_edges[:-1])


from numpy import apply_along_axis

# Discretise every feature column into 10 bins, then refit and rescore.
# apply_along_axis forwards the trailing positional argument to bin_feature.
# NOTE(review): the test columns are binned using the test set's own min/max,
# so train and test bin edges can differ for the same column.
X_train_binned = apply_along_axis(bin_feature, 0, X_train, 10)
X_test_binned = apply_along_axis(bin_feature, 0, X_test, 10)

obj = MultinomialNB()
obj.fit(X_train_binned, y_train)

# NOTE(review): despite its name, y_pred holds the accuracy returned by score().
y_pred = obj.score(X_test_binned, y_test)
print(y_pred)


0.543804537521815


In [None]:
# Same experiment as the 10-bin run, with 1000 bins per feature column.
# apply_along_axis forwards the trailing positional argument to bin_feature.
# NOTE(review): the test columns are binned using the test set's own min/max,
# so train and test bin edges can differ for the same column.
X_train_binned = apply_along_axis(bin_feature, 0, X_train, 1000)
X_test_binned = apply_along_axis(bin_feature, 0, X_test, 1000)

obj = MultinomialNB()
obj.fit(X_train_binned, y_train)

# NOTE(review): despite its name, y_pred holds the accuracy returned by score().
y_pred = obj.score(X_test_binned, y_test)
print(y_pred)

0.569660357094912
