# Example
## North or South

In [1]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB

### Multinomial NB

In [6]:
# train set
d1 = [2, 1, 1, 0, 0, 0, 0, 0, 0]
d2 = [1, 1, 0, 1, 1, 0, 0, 0, 0]
d3 = [0, 1, 0, 0, 1, 1, 0, 0, 0]
d4 = [0, 1, 0, 0, 0, 0, 1, 1, 1]
train_data = np.array([d1, d2, d3, d4])
label = np.array([['N'], ['N'], ['N'], ['S']])
# implement MultinomialNB
model = MultinomialNB()
# training
model.fit(train_data, label)

  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
# test set
d5 = np.array([[2, 0, 0, 1, 0, 0, 0, 1, 0]])
d6 = np.array([[0, 1, 0, 0, 0, 0, 0, 1, 1]])
# predict d5
print('Predict class of d5 based on NBC:', model.predict(d5)[0])
print('Probability of d5 in each class:', model.predict_proba(d5))
# predict d6
print('Predict class of d6 based on NBC:', model.predict(d6)[0])
print('Probability of d6 in each class:', model.predict_proba(d6))

Predict class of d5 based on NBC: N
Probability of d5 in each class: [[ 0.89548823  0.10451177]]
Predict class of d6 based on NBC: S
Probability of d6 in each class: [[ 0.29175335  0.70824665]]


* The probability of each class in **d5** is similar with calculation in document

### Bernoulli NB
* Instead of considering the number of feature in document, **Bernoulli NB** just care whether this feature is in the document or not

In [11]:
from sklearn.naive_bayes import BernoulliNB

In [12]:
# test set
d1 = [1, 1, 1, 0, 0, 0, 0, 0, 0]
d2 = [1, 1, 0, 1, 1, 0, 0, 0, 0]
d3 = [0, 1, 0, 0, 1, 1, 0, 0, 0]
d4 = [0, 1, 0, 0, 0, 0, 1, 1, 1]
train_data = np.array([d1, d2, d3, d4])
label = np.array([['N'], ['N'], ['N'], ['S']])
# implement MultinomialNB
model = BernoulliNB()
# training
model.fit(train_data, label)

  y = column_or_1d(y, warn=True)


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [13]:
# test set
d5 = np.array([[1, 0, 0, 1, 0, 0, 0, 1, 0]])
d6 = np.array([[0, 1, 0, 0, 0, 0, 0, 1, 1]])
# predict d5
print('Predict class of d5 based on NBC:', model.predict(d5)[0])
print('Probability of d5 in each class:', model.predict_proba(d5))
# predict d6
print('Predict class of d6 based on NBC:', model.predict(d6)[0])
print('Probability of d6 in each class:', model.predict_proba(d6))

Predict class of d5 based on NBC: N
Probability of d5 in each class: [[ 0.76554295  0.23445705]]
Predict class of d6 based on NBC: S
Probability of d6 in each class: [[ 0.16948581  0.83051419]]


* For the small dataset like this, the result of **Multinomial NB** and **Bernoulli NB** is not different

## Email Spam Filtering
* **Preprocess** text
    * Remove **stop words**: `'and'`, `'the'`, `'of'` should be removed
    * Lemmatization: e.g. `'includes'`, `'included'` should be change to `'include'`
    * Remove non-stop: remove number, punctuation, special character
* Our dataset includes 960 english email which is separated into training set and test set in the ratio of 700:260 and half of each of them is spam
* Our dataset **was preprocessed**
* Example
    * Before Preprocessing of non-spam email:
![caption](http://res.cloudinary.com/dqagyeboj/image/upload/v1531103065/Chapter11_0_esyfsg.png)
    * After Preprocessing of non-spam email:
![caption](http://res.cloudinary.com/dqagyeboj/image/upload/v1531103065/Chapter11_1_vmvll4.png)
    * After Preprocessing of spam email:
![caption](https://res.cloudinary.com/dqagyeboj/image/upload/v1531103064/Chapter11_2_fpxkrv.png)

* The word like `'financial'`, `'extraodinary'`, `'earn'`, `'oppotunity'` is the common words in span email

* In the dataset, we have
    * test-features.txt
    * test-labels.txt
    * train-features-50.txt
    * train-features-100.txt
    * train-features-400.txt
    * train-features.txt
    * train-labels-50.txt
    * train-labels-100.txt
    * train-labels-400.txt
    * train-labels.txt
* `'train-features-50.txt'` is truncated form of training set with 50 emails
* In `'train-labels.txt'`, each row has 0 or 1 to show this email is spam or not
* In `'train-features.txt'`, each row has 3 number:
    * `'1 564 1'`
    * `'1 19 2'`
* First value is index of email, start from 1. Second value is index of word in the dictionary (dictionary has 2500 words), third value is amount of the word in email. E.g. `'1 564 1'` means in email #1, the word has index of 564 appearing once time
* If feature does not appear in the email we are considering, it means word does not exist in the email

In [98]:
import numpy as np
from scipy.sparse import coo_matrix # for sparse matrix
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
# data path and file name
path = 'Chapter11/'
train_data_fn = 'train-features.txt'
test_data_fn = 'test-features.txt'
train_label_fn = 'train-labels.txt'
test_label_fn = 'test-labels.txt'

In [101]:
vocabulary_size = 2500

def read_data(data_fn, label_fn):
    # read data filename
    with open(path + data_fn) as file:
        content = file.readlines()
    content = [x.strip().split() for x in content]
    content = np.array(content, dtype=np.int32)
    
    # read label filename
    with open(path + label_fn) as file:
        label = file.readlines()
    label = np.array([int(x.strip()) for x in label]).reshape(-1, 1)
    
    # Sparse Matrix: row is document, column is 2500 features
    data = coo_matrix((content[:, 2], (content[:, 0] - 1, content[:, 1] - 1)), shape=(len(label), vocabulary_size))
    
    return data, label

In [102]:
train_data, train_label = read_data(train_data_fn, train_label_fn)
test_data, test_label = read_data(test_data_fn, test_label_fn)

model = MultinomialNB()
model.fit(train_data, train_label)

y_pred = model.predict(test_data)

print('Training size:', train_data.shape[0], 'and Accuracy Score:', accuracy_score(test_label, y_pred) * 100)

Training size: 700 and Accuracy Score: 98.0769230769


  y = column_or_1d(y, warn=True)


* Continue with smaller training set. We test with 100-document and 50-document training set

In [105]:
train_data_fn = 'train-features-100.txt'
test_data_fn = 'test-features.txt'
train_label_fn = 'train-labels-100.txt'
test_label_fn = 'test-labels.txt'

In [106]:
train_data, train_label = read_data(train_data_fn, train_label_fn)
test_data, test_label = read_data(test_data_fn, test_label_fn)

model = MultinomialNB()
model.fit(train_data, train_label)

y_pred = model.predict(test_data)

print('Training size:', train_data.shape[0], 'and Accuracy Score:', accuracy_score(test_label, y_pred) * 100)

Training size: 100 and Accuracy Score: 97.6923076923


  y = column_or_1d(y, warn=True)


In [107]:
train_data_fn = 'train-features-50.txt'
test_data_fn = 'test-features.txt'
train_label_fn = 'train-labels-50.txt'
test_label_fn = 'test-labels.txt'

In [108]:
train_data, train_label = read_data(train_data_fn, train_label_fn)
test_data, test_label = read_data(test_data_fn, test_label_fn)

model = MultinomialNB()
model.fit(train_data, train_label)

y_pred = model.predict(test_data)

print('Training size:', train_data.shape[0], 'and Accuracy Score:', accuracy_score(test_label, y_pred) * 100)

Training size: 50 and Accuracy Score: 97.3076923077


  y = column_or_1d(y, warn=True)


* Even with much smaller training set, result of NBC is very impressive
* Now testing with **Bernoulli NB**

In [112]:
train_data_fn = 'train-features.txt'
test_data_fn = 'test-features.txt'
train_label_fn = 'train-labels.txt'
test_label_fn = 'test-labels.txt'

train_data, train_label = read_data(train_data_fn, train_label_fn)
test_data, test_label = read_data(test_data_fn, test_label_fn)

model = BernoulliNB(binarize=.5)
model.fit(train_data, train_label)
y_pred = model.predict(test_data)
print('Training size:', train_data.shape[0], 'and Accuracy Score:', accuracy_score(test_label, y_pred) * 100)

Training size: 700 and Accuracy Score: 85.3846153846


  y = column_or_1d(y, warn=True)


* `coo_matrix` is not binary, so w add `binarize=.5` as threshold of 0.5. If value > 0.5, it will be 1 and o.w.
* In the same trainging size of 700, **MultinomialNB** is much better than **BernoulliNB**