<!--BOOK_INFORMATION-->
<a href="https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-opencv" target="_blank"><img align="left" src="data/cover.jpg" style="width: 76px; height: 100px; background: white; padding: 1px; border: 1px solid black; margin-right:10px;"></a>
*This notebook contains an excerpt from the upcoming book [Machine Learning for OpenCV](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-opencv) by Michael Beyeler (expected Aug 2017).
The code is released under the [MIT license](https://opensource.org/licenses/MIT),
and is available on [GitHub](https://github.com/mbeyeler/opencv-machine-learning).*

*Note that this excerpt contains only the raw code - the book is rich with additional explanations and illustrations.
If you find this content useful, please consider supporting the work by
[buying the book](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-opencv)!*

<!--NAVIGATION-->
< [Implementing Your First Bayesian Classifier](07.01-Implementing-Your-First-Bayesian-Classifier.ipynb) | [Contents](../README.md) | [Discovering Hidden Structures with Unsupervised Learning](08.00-Discovering-Hidden-Structures-with-Unsupervised-Learning.ipynb) >

# Implementing the Spam Filter

In [1]:
# Class labels attached to every message of an archive.
HAM, SPAM = 0, 1

# Directory the archive paths are built from.
datadir = 'data/chapter7'

# Archive file names, grouped by the label given to their messages.
_ham_archives = (
    'beck-s.tar.gz',
    'farmer-d.tar.gz',
    'kaminski-v.tar.gz',
    'kitchen-l.tar.gz',
    'lokay-m.tar.gz',
    'williams-w3.tar.gz',
)
_spam_archives = (
    'BG.tar.gz',
    'GP.tar.gz',
    'SH.tar.gz',
)

# (archive name, label) pairs: all ham archives first, then the spam ones.
sources = (
    [(archive, HAM) for archive in _ham_archives]
    + [(archive, SPAM) for archive in _spam_archives]
)

In [2]:
def extract_tar(datafile, extractdir):
    """Extract every member of the tar archive `datafile` into `extractdir`.

    Parameters
    ----------
    datafile : str
        Path of the archive to open with ``tarfile.open``.
    extractdir : str
        Directory the archive members are extracted into.

    Raises
    ------
    ImportError
        If the ``tarfile`` module cannot be imported.

    Prints a confirmation message once extraction has finished.
    """
    try:
        import tarfile
    except ImportError:
        raise ImportError("You do not have tarfile installed. "
                          "Try unzipping the file outside of Python.")

    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use this on archives from a trusted source.
    # The context manager closes the archive even if extraction fails midway.
    with tarfile.open(datafile) as tar:
        tar.extractall(path=extractdir)
    print("%s successfully extracted to %s" % (datafile, extractdir))

In [3]:
# Uncomment on the first run to unpack every archive listed in `sources` into `datadir`:
# for source, _ in sources:
#     datafile = '%s/%s' % (datadir, source)
#     extract_tar(datafile, datadir)

In [4]:
import os
def read_single_file(filename):
    """Return the body of a message file, skipping everything up to the header end.

    The header is taken to end at the first empty line; only the lines after
    it are kept. The file is decoded as latin-1.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    tuple of (str, str)
        ``(filename, content)``. ``content`` is the empty string when
        `filename` is not a regular file or contains no empty line.
    """
    lines = []
    if os.path.isfile(filename):
        # The context manager closes the handle even if reading raises.
        with open(filename, encoding="latin-1") as f:
            past_header = False
            for line in f:
                if past_header:
                    lines.append(line)
                elif line == '\n':
                    # First empty line: everything after it is body text.
                    past_header = True
    # Each kept line still ends with its own newline, so joining with '\n'
    # puts a blank line between consecutive lines. Kept as-is so the
    # returned text is unchanged.
    content = '\n'.join(lines)
    return filename, content

In [5]:
def read_files(path):
    """Yield the ``read_single_file`` result for every file below `path`.

    Directories and files are visited in sorted order. ``os.walk`` otherwise
    reports them in an arbitrary, filesystem-dependent order, which would make
    the row order of the resulting data (and any seeded train/test split of
    it) differ between machines.

    Parameters
    ----------
    path : str
        Root directory to walk recursively.

    Yields
    ------
    Whatever ``read_single_file`` returns for each file path found.
    """
    for root, dirnames, filenames in os.walk(path):
        # Sorting in place fixes the order in which os.walk descends.
        dirnames.sort()
        for filename in sorted(filenames):
            yield read_single_file(os.path.join(root, filename))

## Building a data matrix using Pandas

In [6]:
import pandas as pd

In [7]:
pd.DataFrame({
    'model': ['Normal Bayes', 'Multinomial Bayes', 'Bernoulli Bayes'],
    'class': [
        'cv2.ml.NormalBayesClassifier_create()',
        'sklearn.naive_bayes.MultinomialNB()',
        'sklearn.naive_bayes.BernoulliNB()'
    ]
})

Unnamed: 0,class,model
0,cv2.ml.NormalBayesClassifier_create(),Normal Bayes
1,sklearn.naive_bayes.MultinomialNB(),Multinomial Bayes
2,sklearn.naive_bayes.BernoulliNB(),Bernoulli Bayes


In [8]:
def build_data_frame(extractdir, classification):
    """Collect all messages found under `extractdir` into one labelled frame.

    Parameters
    ----------
    extractdir : str
        Directory handed to ``read_files``.
    classification
        Label stored in the 'class' column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per item yielded by ``read_files``, indexed by file name,
        with the columns 'text' and 'class'.
    """
    messages = list(read_files(extractdir))
    paths = [path for path, _ in messages]
    records = [{'text': body, 'class': classification} for _, body in messages]
    return pd.DataFrame(records, index=paths)

In [9]:
# Build one frame per source and concatenate them in a single call.
# DataFrame.append was removed in pandas 2.0 and re-copied all rows each time.
frames = []
for source, classification in sources:
    # Drop the '.tar.gz' suffix; presumably this is the folder the archive
    # unpacks to - verify against the extracted data.
    extractdir = '%s/%s' % (datadir, source[:-len('.tar.gz')])
    frames.append(build_data_frame(extractdir, classification))

# reindex guarantees both columns exist even if no file was found.
# Appending to the empty seed frame used to leave the labels as float64
# (see the recorded np.unique output), so keep that dtype.
data = (
    pd.concat(frames)
    .reindex(columns=['text', 'class'])
    .astype({'class': 'float64'})
)

## Preprocessing the data

In [10]:
from sklearn import feature_extraction
counts = feature_extraction.text.CountVectorizer()
X = counts.fit_transform(data['text'].values)
X.shape

(52076, 643270)

In [11]:
X

<52076x643270 sparse matrix of type '<class 'numpy.int64'>'
	with 8607632 stored elements in Compressed Sparse Row format>

In [12]:
y = data['class'].values

## Training a Normal Bayes classifier

In [13]:
from sklearn import model_selection as ms
X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2,
                                                       random_state=42)

In [14]:
import cv2
model_norm = cv2.ml.NormalBayesClassifier_create()

The following won't work and will instead result in a `TypeError`, because `X_train` is a SciPy sparse matrix whereas OpenCV expects a dense NumPy array:

    model_norm.train(X_train, cv2.ml.ROW_SAMPLE, y_train)

    ---------------------------------------------------------------------------
    TypeError                                 Traceback (most recent call last)
    <ipython-input-24-7746955e6844> in <module>()
    ----> 1 model_norm.train(X_train, cv2.ml.ROW_SAMPLE, y_train)
    
    TypeError: samples is not a numpy array, neither a scalar

In [19]:
import numpy as np
X_train_small = X_train[:100, :30].toarray().astype(np.float32)
y_train_small = y_train[:100]

In [20]:
np.unique(X_train_small)

array([  0.,   1.,   2.,   3.,   5.,   7.,  20.], dtype=float32)

In [21]:
np.unique(y_train_small)

array([ 0.,  1.])

In [22]:
# NOTE(review): `meow` is never assigned in this notebook, so this cell raises
# NameError (as its recorded output shows). Presumably a deliberate stop that
# keeps "Run All" from reaching the cv2 training cell below - confirm or remove.
meow + 2

NameError: name 'meow' is not defined

In [None]:
model_norm.train(X_train_small, cv2.ml.ROW_SAMPLE, y_train_small.astype(np.float32))

In [None]:
# NOTE(review): `meow` is never assigned in this notebook, so this cell raises
# NameError. Presumably a deliberate stop between the two cv2 training
# attempts - confirm or remove.
meow + 2

In [None]:
model_norm.train(X_train_small, cv2.ml.ROW_SAMPLE, y_train_small.reshape((-1, 1)))

## Bag of words

In [None]:
# Repeats the split from the "Training a Normal Bayes classifier" section
# with the same arguments (test_size=0.2, random_state=42) on the same X and y.
from sklearn import model_selection as ms
X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn import naive_bayes
model_naive = naive_bayes.MultinomialNB()
model_naive.fit(X_train, y_train)

In [None]:
model_naive.score(X_train, y_train)

In [None]:
model_naive.score(X_test, y_test)

## n-grams

In [None]:
counts = feature_extraction.text.CountVectorizer(ngram_range=(1, 2))
X = counts.fit_transform(data['text'].values)

In [None]:
X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model_naive = naive_bayes.MultinomialNB()
model_naive.fit(X_train, y_train)

In [None]:
model_naive.score(X_test, y_test)

## TfIdf

In [None]:
tfidf = feature_extraction.text.TfidfTransformer()

In [None]:
X_new = tfidf.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = ms.train_test_split(X_new, y, test_size=0.2, random_state=42)

In [None]:
model_naive = naive_bayes.MultinomialNB()
model_naive.fit(X_train, y_train)
model_naive.score(X_test, y_test)

In [None]:
from sklearn import metrics

In [None]:
metrics.confusion_matrix(y_test, model_naive.predict(X_test))

<!--NAVIGATION-->
< [Implementing Your First Bayesian Classifier](07.01-Implementing-Your-First-Bayesian-Classifier.ipynb) | [Contents](../README.md) | [Discovering Hidden Structures with Unsupervised Learning](08.00-Discovering-Hidden-Structures-with-Unsupervised-Learning.ipynb) >