In [1]:
import os

import numpy as np
from scipy.sparse import coo_matrix # for sparse matrix
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score # for evaluating results

In [16]:
# Directory that holds the feature/label text files.
# Defaults to the original author's location; set the EX6_DATA_DIR environment
# variable to run the notebook against a different directory without editing it.
# os.path.join(..., '') guarantees a trailing separator, so code that appends a
# file name directly to `path` keeps working even if the override omits it.
path = os.path.join(
    os.environ.get('EX6_DATA_DIR', '/Users/vuhoangnguyen/ex6DataPrepared/'),
    '',
)

In [32]:
# Default number of feature columns in every matrix built by read_data.
nwords = 2500


def read_data(data_fn, label_fn, data_dir=None, n_words=None):
    """Load a sparse feature matrix and its label list from two text files.

    The label file holds one integer per line. The data file holds one
    whitespace-separated triple ``row col value`` per line, with 1-based
    row and column indices. Blank lines in either file are ignored.

    Parameters
    ----------
    data_fn : str
        Name of the data (triples) file, relative to the data directory.
    label_fn : str
        Name of the label file, relative to the data directory.
    data_dir : str, optional
        Directory containing both files. Defaults to the module-level
        ``path`` variable.
    n_words : int, optional
        Number of columns of the returned matrix. Defaults to the
        module-level ``nwords`` variable.

    Returns
    -------
    (data, label) : tuple
        ``data`` is a ``scipy.sparse.coo_matrix`` of shape
        ``(len(label), n_words)``; ``label`` is a list of ints.

    Raises
    ------
    IndexError
        If a non-blank data line has fewer than three fields.
    ValueError
        If a field is not an integer, or an index falls outside the
        matrix shape (raised by ``coo_matrix``).
    """
    # Resolve the defaults at call time so later changes to the globals are honoured.
    if data_dir is None:
        data_dir = path
    if n_words is None:
        n_words = nwords

    # Labels: one integer per non-blank line (int() tolerates the trailing newline).
    with open(os.path.join(data_dir, label_fn)) as f:
        label = [int(line) for line in f if line.strip()]

    # Data: keep the first three whitespace-separated fields of each non-blank line.
    triples = []
    with open(os.path.join(data_dir, data_fn)) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            triples.append((int(fields[0]), int(fields[1]), int(fields[2])))

    # reshape keeps a well-formed (0, 3) array when the data file has no entries.
    dat = np.array(triples, dtype=int).reshape(-1, 3)

    # coo_matrix((values, (rows, cols)), shape=(M, N)); the file uses 1-based
    # coordinates, so subtract 1 to obtain Python's 0-based indices.
    # See https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html
    data = coo_matrix(
        (dat[:, 2], (dat[:, 0] - 1, dat[:, 1] - 1)),
        shape=(len(label), n_words),
    )
    return (data, label)

In [33]:
# Baseline run: train multinomial naive Bayes on the un-suffixed training
# files and report its accuracy on the test files.
train_data_fn, train_label_fn = 'train-features.txt', 'train-labels.txt'
test_data_fn, test_label_fn = 'test-features.txt', 'test-labels.txt'

train_data, train_label = read_data(train_data_fn, train_label_fn)
test_data, test_label = read_data(test_data_fn, test_label_fn)

# fit() returns the estimator itself, so construction and training are chained.
clf = MultinomialNB().fit(train_data, train_label)
y_pred = clf.predict(test_data)

# Accuracy as a percentage rounded to two decimals.
accuracy_pct = round(accuracy_score(test_label, y_pred) * 100, 2)
print(f'Training size = {train_data.shape[0]}, accuracy = {accuracy_pct}%')

Training size = 700, accuracy = 98.08%


In [34]:
# Same experiment with the "-100" training files; the test files are unchanged.
train_data_fn, train_label_fn = 'train-features-100.txt', 'train-labels-100.txt'
test_data_fn, test_label_fn = 'test-features.txt', 'test-labels.txt'

train_data, train_label = read_data(train_data_fn, train_label_fn)
test_data, test_label = read_data(test_data_fn, test_label_fn)

# fit() returns the estimator itself, so construction and training are chained.
clf = MultinomialNB().fit(train_data, train_label)
y_pred = clf.predict(test_data)

# Accuracy as a percentage rounded to two decimals.
accuracy_pct = round(accuracy_score(test_label, y_pred) * 100, 2)
print(f'Training size = {train_data.shape[0]}, accuracy = {accuracy_pct}%')

Training size = 100, accuracy = 97.69%


In [35]:
# Same experiment with the "-50" training files, this time comparing two
# naive Bayes variants on identical data.
train_data_fn, train_label_fn = 'train-features-50.txt', 'train-labels-50.txt'
test_data_fn, test_label_fn = 'test-features.txt', 'test-labels.txt'

train_data, train_label = read_data(train_data_fn, train_label_fn)
test_data, test_label = read_data(test_data_fn, test_label_fn)

# First printed line: MultinomialNB. Second printed line: BernoulliNB with
# features thresholded at 0.5. After the loop, `clf` and `y_pred` refer to
# the BernoulliNB model and its predictions.
for clf in (MultinomialNB(), BernoulliNB(binarize = .5)):
    clf.fit(train_data, train_label)
    y_pred = clf.predict(test_data)
    accuracy_pct = round(accuracy_score(test_label, y_pred) * 100, 2)
    print(f'Training size = {train_data.shape[0]}, accuracy = {accuracy_pct}%')

Training size = 50, accuracy = 97.31%
Training size = 50, accuracy = 69.62%
