# Naive Bayes for Text Classification 

## Bernoulli Distribution

In [1]:
import numpy as np
import scipy.sparse

In [2]:
train_X, test_X = scipy.sparse.load_npz('training_feats.npz').toarray(), scipy.sparse.load_npz('test_feats.npz').toarray()
train_y, test_y = np.load('training_labels.npy', allow_pickle=True), np.load('test_labels.npy', allow_pickle=True)

In [3]:
def getSplitedClass(train_X):
    res = []
    for label in range(0, 20):
        post_index = np.where(train_y == label)[0].tolist()
        a_class = train_X[post_index[0]]
        for index in range(1, len(post_index)):
            a_class = np.vstack((a_class,train_X[post_index[index]]))
        res.append(a_class)
    return res

In [4]:
for post in range(len(train_X)):
    train_X[post] = np.where(train_X[post]!=0.0,1,train_X[post])
for post in range(len(test_X)):
    test_X[post] = np.where(test_X[post]!=0.0,1,test_X[post])
classified_data = getSplitedClass(train_X)
classified_data

[array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        ...,
        [1., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 arr

In [7]:
def getPriorProb(classified_data):
    prior_prob = []
    for label in range(0, 20):
        prior_prob.append(len(classified_data[label]) / len(train_X))
    return np.array(prior_prob)

In [8]:
prior_prob = getPriorProb(classified_data)
prior_prob

array([0.04242531, 0.05161747, 0.05223617, 0.05214778, 0.05108715,
       0.05241294, 0.05170585, 0.05250133, 0.05285487, 0.05276648,
       0.05303164, 0.05258971, 0.05223617, 0.05250133, 0.05241294,
       0.05294326, 0.04825879, 0.04984974, 0.04109952, 0.03332155])

In [25]:
def getBernoulliParams(classified_data): 
    alpha = 1
    params = []
    for label in range(0,20):
        classified_data_T = classified_data[label].T # shape(1000, 480)
        row_params = []
        for feature in range(0, 1000):
            param=(np.count_nonzero(classified_data_T[feature])+ alpha) / (classified_data_T.shape[1]+ 20)
            row_params.append(param)
        params.append(row_params)
    params = np.array(params)
    return params

In [26]:
BernoulliParams = getBernoulliParams(classified_data)
BernoulliParams.shape

(20, 1000)

In [103]:
def predict(test_X):
    pred = []
    for post in range(0, len(test_X)):
        if post % 100 == 0:
            print("post =",post)
        products = []
        for label in range(0, 20):
            product = 1
            for feature in range (0,1000):
                value = test_X[post][feature]
                param = BernoulliParams[label][feature]
                product *= param *value + (1 - param) * (1 - value)
            products.append(product)
        products = np.multiply(products, prior_prob)
        index = np.argmax(products)
        pred.append(index)
    return pred

In [104]:
pred = predict(test_X) 

post = 0
post = 100
post = 200
post = 300
post = 400
post = 500
post = 600
post = 700
post = 800
post = 900
post = 1000
post = 1100
post = 1200
post = 1300
post = 1400
post = 1500
post = 1600
post = 1700
post = 1800
post = 1900
post = 2000
post = 2100
post = 2200
post = 2300
post = 2400
post = 2500
post = 2600
post = 2700
post = 2800
post = 2900
post = 3000
post = 3100
post = 3200
post = 3300
post = 3400
post = 3500
post = 3600
post = 3700
post = 3800
post = 3900
post = 4000
post = 4100
post = 4200
post = 4300
post = 4400
post = 4500
post = 4600
post = 4700
post = 4800
post = 4900
post = 5000
post = 5100
post = 5200
post = 5300
post = 5400
post = 5500
post = 5600
post = 5700
post = 5800
post = 5900
post = 6000
post = 6100
post = 6200
post = 6300
post = 6400
post = 6500
post = 6600
post = 6700
post = 6800
post = 6900
post = 7000
post = 7100
post = 7200
post = 7300
post = 7400
post = 7500


In [105]:
def getAccuracy(test_y, pred):
    cnt = 0
    for i in range(len(pred)):
        if test_y[i]==pred[i]:
            cnt += 1
    acc = float(cnt / len(pred))
    return acc

In [107]:
accuracy = getAccuracy(test_y, pred)
accuracy

0.5011949017525226