In [8]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# training set: four count vectors (presumably per-document word counts)
d1 = [2, 1, 1, 0, 0, 0, 0, 0, 0]
d2 = [1, 1, 0, 1, 1, 0, 0, 0, 0]
d3 = [0, 1, 0, 0, 1, 1, 0, 0, 0]
d4 = [0, 1, 0, 0, 0, 0, 1, 1, 1]
train_data = np.array([d1, d2, d3, d4])
# one class label per training row
label = np.array(['B', 'B', 'B', 'N'])

# query vectors, each wrapped in an outer list so they are 2-D (1 row)
d5 = np.array([[2, 0, 0, 1, 0, 0, 0, 1, 0]])
d6 = np.array([[0, 1, 0, 0, 0, 0, 0, 1, 1]])

# fit() returns the estimator itself, so construction and training can chain
model = MultinomialNB().fit(train_data, label)

# hard prediction for d5, per-class probabilities for d6
d5_class = model.predict(d5)[0]
d6_proba = model.predict_proba(d6)
print('Predicting class of d5:', str(d5_class))
print('Probability of d6 in each class:', d6_proba)

Predicting class of d5: B
Probability of d6 in each class: [[0.29175335 0.70824665]]


In [9]:
from sklearn.naive_bayes import BernoulliNB
import numpy as np

# training set: binary vectors -- only whether a word appears matters,
# not how many times it appears
d1 = [1, 1, 1, 0, 0, 0, 0, 0, 0]
d2 = [1, 1, 0, 1, 1, 0, 0, 0, 0]
d3 = [0, 1, 0, 0, 1, 1, 0, 0, 0]
d4 = [0, 1, 0, 0, 0, 0, 1, 1, 1]
train_data = np.array([d1, d2, d3, d4])
# one class label per training row
label = np.array(['B', 'B', 'B', 'N'])

# query vectors, binarised the same way (presence / absence of each word)
d5 = np.array([[1, 0, 0, 1, 0, 0, 0, 1, 0]])
d6 = np.array([[0, 1, 0, 0, 0, 0, 0, 1, 1]])

# fit() returns the estimator itself, so construction and training can chain
model = BernoulliNB().fit(train_data, label)

# hard prediction for d5, per-class probabilities for d6
d5_class = model.predict(d5)[0]
d6_proba = model.predict_proba(d6)
print('Predicting class of d5:', str(d5_class))
print('Probability of d6 in each class:', d6_proba)

Predicting class of d5: B
Probability of d6 in each class: [[0.16948581 0.83051419]]


In [22]:
import numpy as np
from scipy.sparse import coo_matrix # sparse matrix
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

# number of columns of the feature matrix built by read_data
nwords = 2500

# data directory; file names are appended to it directly, so the
# trailing slash is required
path = 'ex6DataPrepared/'

# feature files (train / test)
train_data_fn, test_data_fn = 'train-features.txt', 'test-features.txt'
# label files (train / test)
train_label_fn, test_label_fn = 'train-labels.txt', 'test-labels.txt'

def read_data(data_fn, label_fn, data_dir=None, n_features=None):
    """Load a sparse feature matrix and its labels from two text files.

    The label file holds one integer per line. The feature file holds one
    ``row col value`` integer triple per line, with 1-based ``row`` and
    ``col`` indices (presumably document id, word id and word count).
    Blank lines are skipped and fields may be separated by any run of
    whitespace. Tokens after the third one on a line are ignored.

    Parameters
    ----------
    data_fn : str
        Name of the feature file, appended to ``data_dir``.
    label_fn : str
        Name of the label file, appended to ``data_dir``.
    data_dir : str, optional
        Prefix prepended verbatim to both file names, so it must end with
        a path separator. Defaults to the module-level ``path``.
    n_features : int, optional
        Number of columns of the returned matrix. Defaults to the
        module-level ``nwords``.

    Returns
    -------
    data : scipy.sparse.coo_matrix
        Matrix of shape ``(len(label), n_features)``.
    label : list of int
        One label per matrix row.

    Raises
    ------
    ValueError
        If a field cannot be parsed as an integer.
    IndexError
        If a non-blank feature line has fewer than three fields.
    """
    # Fall back to the notebook-level configuration only when the caller
    # did not override it, so existing two-argument calls keep working.
    if data_dir is None:
        data_dir = path
    if n_features is None:
        n_features = nwords

    # Labels: one integer per non-blank line.
    with open(data_dir + label_fn) as f:
        label = [int(line.strip()) for line in f if line.strip()]

    # Features: (row, col, value) triples. split() without an argument
    # tolerates tabs and repeated spaces, unlike split(' ').
    triples = []
    with open(data_dir + data_fn) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # blank line, e.g. a trailing newline at end of file
            triples.append((int(fields[0]), int(fields[1]), int(fields[2])))
    # reshape keeps the array 2-D even when the file has no entries
    dat = np.array(triples, dtype=int).reshape(-1, 3)

    # The files use 1-based indices; shift to Python's 0-based coordinates.
    rows = dat[:, 0] - 1
    cols = dat[:, 1] - 1
    data = coo_matrix((dat[:, 2], (rows, cols)), shape=(len(label), n_features))
    return data, label

# load the train and test splits as (sparse matrix, label list) pairs
train_data, train_label = read_data(train_data_fn, train_label_fn)
test_data, test_label = read_data(test_data_fn, test_label_fn)

# multinomial model: fit() returns the estimator, so it chains with the constructor
clf_MNB = MultinomialNB().fit(train_data, train_label)
y_pred_MNB = clf_MNB.predict(test_data)

# Bernoulli model: binarize=0.5 maps every feature value above 0.5 to 1, the rest to 0
clf_BNB = BernoulliNB(binarize=0.5).fit(train_data, train_label)
y_pred_BNB = clf_BNB.predict(test_data)

# report test accuracy of both classifiers
print(f'MultinomialNB: Training size = {train_data.shape[0]}, accuracy = {accuracy_score(test_label, y_pred_MNB)*100:.2f}%')
print(f'BernoulliNB: Training size = {train_data.shape[0]}, accuracy = {accuracy_score(test_label, y_pred_BNB)*100:.2f}%')

MultinomialNB: Training size = 700, accuracy = 98.08%
BernoulliNB: Training size = 700, accuracy = 85.38%
