# This is an introduction to scikit-learn using a spam-filter example
First we will import all the necessary libraries

In [29]:
import os

import numpy
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

### Data preprocessing
Next we will read the spam and non-spam (ham) examples from the data directory.

In [30]:
# A line consisting only of this string separates an e-mail's header from its body.
NEWLINE = '\n'

# Class labels assigned to the e-mails.
HAM = 'ham'
SPAM = 'spam'

# (directory, label) pairs: every e-mail found below a directory gets that label.
SOURCES = [
    ('data/ham/beck-s',      HAM),
    ('data/ham/farmer-d',    HAM),
    ('data/ham/kaminski-v',  HAM),
    ('data/ham/kitchen-l',   HAM),
    ('data/ham/lokay-m',     HAM),
    ('data/ham/williams-w3', HAM),
    ('data/spam/BG',         SPAM),
    ('data/spam/GP',         SPAM),
    # NOTE(review): this entry used to read 'data/ham/SH' although it is labelled
    # SPAM; assumed to live under data/spam like BG and GP. Confirm against the
    # data directory. A non-existent directory is skipped silently by os.walk.
    ('data/spam/SH',         SPAM)
]

# File names that are not e-mails and must be ignored while reading.
SKIP_FILES = {'cmds'}

def read_files(path):
    """Iterate through all files below ``path`` and yield each e-mail body.

    The body is everything after the first blank line of a file, i.e. the
    line that ends the header block. A file without such a line yields an
    empty body. Files whose name is listed in ``SKIP_FILES`` are ignored.

    Args:
        path: Root directory to search; sub-directories are included.

    Yields:
        ``(file_path, content)`` tuples, where ``content`` is the body text
        with its original line breaks.
    """
    # os.walk already descends into every sub-directory, so no explicit
    # recursion is needed here.
    for root, _dir_names, file_names in os.walk(path):
        for file_name in file_names:
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(root, file_name)
            if not os.path.isfile(file_path):
                continue
            past_header, lines = False, []
            # The context manager closes the file even if reading fails.
            with open(file_path, encoding="latin-1") as f:
                for line in f:
                    if past_header:
                        lines.append(line)
                    elif line == NEWLINE:
                        past_header = True
            # Every line still carries its own line ending, so the lines are
            # joined without a separator to avoid doubling the line breaks.
            yield file_path, ''.join(lines)

We will now build a pandas DataFrame from the files.

In [31]:
def build_data_frame(path, classification):
    """Collect every e-mail found below ``path`` into one labelled DataFrame.

    Args:
        path: Directory handed to ``read_files``.
        classification: Label stored in the ``class`` column of every row.

    Returns:
        A DataFrame indexed by file path with the columns ``text`` (the
        e-mail body) and ``class``.
    """
    emails = list(read_files(path))
    file_names = [file_name for file_name, _ in emails]
    records = [{'text': body, 'class': classification} for _, body in emails]
    return DataFrame(records, index=file_names)

Now we will concatenate the per-source DataFrames into a single pandas DataFrame and shuffle its rows.

In [32]:
# Build one frame per source directory and concatenate them in a single call.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
frames = [build_data_frame(path, classification) for path, classification in SOURCES]
# Directories without any e-mail produce empty frames; leave them out.
frames = [frame for frame in frames if not frame.empty]
if frames:
    data = pd.concat(frames)
else:
    # Nothing was read: keep the expected columns so later cells still find them.
    data = DataFrame({'text': [], 'class': []})

# Shuffle the rows. The permutation is unseeded, so the order differs between runs.
data = data.reindex(numpy.random.permutation(data.index))

### Feature extraction
We will first use the feature extraction and classification without pipelining. <br/>
We start with the basic CountVectorizer, i.e., a bag-of-words approach <br/>
<span style="color:red">Note: This may take some time. Keep calm and let it finish ;-) </span>

In [33]:
# Fit the vectorizer on all e-mail bodies and turn each body into one row
# of token counts (bag of words).
email_texts = data['text'].values
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(email_texts)

### Classification with Naive Bayes Classifier
Train the classifier on the example e-mails.

In [34]:
# Class labels taken from the same frame the counts were built from.
targets = data['class'].values
classifier = MultinomialNB()
# Kept as the last expression so the cell displays the fitted estimator.
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Validate the classifier with some examples

In [48]:
# Vectorize a single message with the already fitted vectorizer and classify it.
example = ["Free Viagra call today"]
example_counts = count_vectorizer.transform(example)
prediction = classifier.predict(example_counts)
# Last expression of the cell: shows the predicted class of the only message.
prediction[0]

'spam'

In [49]:
# Classify several messages at once and print one line per message.
examples = [
    "Free Viagra call today!",
    "I'm going to attend the Python Learning group tomorrow.",
    "Free Viagra Free Viagra Free Viagra",
    "Today we will learn about Machine Learning",
]
example_counts = count_vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
for i, predicted_class in enumerate(predictions):
    print("E-Mail %i: %s" % (i, predicted_class))

E-Mail 0: spam
E-Mail 1: ham
E-Mail 2: spam
E-Mail 3: ham
