In [1]:
## packages 
from __future__ import division, print_function, unicode_literals

import os

import numpy as np
from scipy.sparse import coo_matrix # for sparse matrix
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score # for evaluating results
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
# Directory that holds the dataset files (joined with each file name on load).
path = 'data/'

# Default feature/label file names, grouped per split.
train_data_fn, train_label_fn = 'train-features.txt', 'train-labels.txt'
test_data_fn, test_label_fn = 'test-features.txt', 'test-labels.txt'

### Data Description
The dataset is split into two subsets: a 700-email subset for training and a 260-email subset for testing. Each of the training and testing subsets contains 50% spam messages and 50% nonspam messages. Additionally, the emails have been preprocessed in the following ways:

**1. Stop word removal:** Certain words like "and," "the," and "of," are very common in all English sentences and are not very meaningful in deciding spam/nonspam status, so these words have been removed from the emails.

**2. Lemmatization:** Words that have the same meaning but different endings have been adjusted so that they all have the same form. For example, "include," "includes," and "included" would all be represented as "include." All words in the email body have also been converted to lower case.

**3. Removal of non-words:** Numbers and punctuation have both been removed. All white space (tabs, newlines, spaces) has been trimmed to a single space character.

In [3]:
# Vocabulary size: the model's dictionary is limited to the 2500 most
# frequent words, so every feature matrix has this many columns by default.
nwords = 2500


def read_data(data_fn, label_fn, data_dir=None, n_words=None):
    """Load one dataset split as a sparse feature matrix plus its labels.

    The label file holds one integer per line. The feature file holds
    whitespace-separated integer triples ``row col value`` where ``row`` and
    ``col`` are 1-based (they are shifted to 0-based when the matrix is
    built). Rows line up with the entries of the label file and columns
    index the vocabulary; ``value`` is presumably the word count for that
    (row, word) pair -- confirm against the dataset description.

    Blank lines (for example a trailing newline at the end of a file) are
    ignored in both files, and fields may be separated by any run of
    whitespace.

    Parameters
    ----------
    data_fn : str
        Name of the feature file, relative to ``data_dir``.
    label_fn : str
        Name of the label file, relative to ``data_dir``.
    data_dir : str, optional
        Directory containing both files. Defaults to the module-level
        ``path``.
    n_words : int, optional
        Number of columns of the returned matrix. Defaults to the
        module-level ``nwords``.

    Returns
    -------
    (data, label) : tuple
        ``data`` is a ``scipy.sparse.coo_matrix`` of shape
        ``(len(label), n_words)``; ``label`` is a list of ints.

    Raises
    ------
    ValueError
        If a non-blank line contains a field that is not an integer.
    IndexError
        If a non-blank feature line has fewer than three fields.
    """
    # Fall back to the notebook-level configuration when not overridden.
    if data_dir is None:
        data_dir = path
    if n_words is None:
        n_words = nwords

    with open(os.path.join(data_dir, label_fn)) as f:
        label = [int(line) for line in f if line.strip()]

    triples = []
    with open(os.path.join(data_dir, data_fn)) as f:
        for line in f:
            fields = line.split()  # any whitespace; also drops the newline
            if not fields:
                continue
            triples.append((int(fields[0]), int(fields[1]), int(fields[2])))

    # reshape keeps the (n, 3) layout even when the feature file is empty.
    dat = np.array(triples, dtype=int).reshape(-1, 3)

    # for more information about coo_matrix function
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html
    data = coo_matrix((dat[:, 2], (dat[:, 0] - 1, dat[:, 1] - 1)),
                      shape=(len(label), n_words))
    return (data, label)

### Use MultinomialNB model
Train the model on the training set and predict the spam/nonspam classification on the test set. 

In [4]:
# Train on the 400-email subset and evaluate on the test set.
train_data_fn, train_label_fn = 'train-features-400.txt', 'train-labels-400.txt'
test_data_fn, test_label_fn = 'test-features.txt', 'test-labels.txt'

train_data, train_label = read_data(train_data_fn, train_label_fn)
test_data, test_label = read_data(test_data_fn, test_label_fn)

# fit() returns the estimator itself, so construction and training are chained.
clf = MultinomialNB().fit(train_data, train_label)
y_pred = clf.predict(test_data)

# Accuracy is reported as a percentage of the test emails.
print('Training size = %d, accuracy = %.4f%%'
      % (train_data.shape[0], accuracy_score(test_label, y_pred) * 100))

Training size = 400, accuracy = 97.6923%


In [5]:
# Per-class precision / recall / F1 for the predictions currently in y_pred.
print(classification_report(y_true=test_label, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       130
           1       0.98      0.97      0.98       130

   micro avg       0.98      0.98      0.98       260
   macro avg       0.98      0.98      0.98       260
weighted avg       0.98      0.98      0.98       260



In [6]:
# Rows are true labels, columns are predicted labels. Pinning labels=[0, 1]
# fixes the 2x2 layout (even if a class were missing from the predictions)
# and matches the other confusion-matrix cells in this notebook.
confusion_matrix(test_label, y_pred, labels=[0, 1])

array([[128,   2],
       [  4, 126]], dtype=int64)

### Smaller training sets

In [7]:
# Same experiment with the 100-email training subset.
train_data_fn, train_label_fn = 'train-features-100.txt', 'train-labels-100.txt'
test_data_fn, test_label_fn = 'test-features.txt', 'test-labels.txt'

train_data, train_label = read_data(train_data_fn, train_label_fn)
test_data, test_label = read_data(test_data_fn, test_label_fn)

# fit() returns the estimator itself, so construction and training are chained.
clf = MultinomialNB().fit(train_data, train_label)
y_pred = clf.predict(test_data)

print('Training size = %d, accuracy = %.4f%%'
      % (train_data.shape[0], accuracy_score(test_label, y_pred) * 100))

Training size = 100, accuracy = 97.6923%


In [8]:
# Confusion matrix for the 100-email model, with the label order fixed to [0, 1].
confusion_matrix(y_true=test_label, y_pred=y_pred, labels=[0, 1])

array([[127,   3],
       [  3, 127]], dtype=int64)

#### 50 emails

In [9]:
# Same experiment with the 50-email training subset.
train_data_fn, train_label_fn = 'train-features-50.txt', 'train-labels-50.txt'
test_data_fn, test_label_fn = 'test-features.txt', 'test-labels.txt'

train_data, train_label = read_data(train_data_fn, train_label_fn)
test_data, test_label = read_data(test_data_fn, test_label_fn)

# fit() returns the estimator itself, so construction and training are chained.
clf = MultinomialNB().fit(train_data, train_label)
y_pred = clf.predict(test_data)

print('Training size = %d, accuracy = %.4f%%'
      % (train_data.shape[0], accuracy_score(test_label, y_pred) * 100))

Training size = 50, accuracy = 97.3077%


In [10]:
# Confusion matrix for the 50-email model, with the label order fixed to [0, 1].
confusion_matrix(y_true=test_label, y_pred=y_pred, labels=[0, 1])

array([[126,   4],
       [  3, 127]], dtype=int64)

### The errors on the smaller training sets.
How many documents did you misclassify? 
- 50 training documents: 7 misclassified, 2.7%.
- 100 training documents: 6 misclassified, 2.3%.
- 400 training documents: 6 misclassified, 2.3%.

### Try BernoulliNB

In [11]:
# Bernoulli NB on the train/test split still in scope from the 50-email cell
# above. binarize=.5 is the threshold used to turn feature values into
# booleans before fitting.
clf = BernoulliNB(binarize=.5).fit(train_data, train_label)
y_pred = clf.predict(test_data)

print('Training size = %d, accuracy = %.2f%%'
      % (train_data.shape[0], accuracy_score(test_label, y_pred) * 100))

Training size = 50, accuracy = 69.62%


In [12]:
# Confusion matrix for the BernoulliNB predictions, with the label order fixed to [0, 1].
confusion_matrix(y_true=test_label, y_pred=y_pred, labels=[0, 1])

array([[125,   5],
       [ 74,  56]], dtype=int64)

--> On this problem, with the 50-email training set, the MultinomialNB model (97.31% accuracy) performs much better than the BernoulliNB model (69.62% accuracy).