In [31]:
import os
import io
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [32]:
def readFiles(path):
    """Yield ``(file_path, message)`` for every file found under ``path``.

    The directory tree rooted at ``path`` is walked recursively. Each file is
    read as latin1 text; everything up to and including the first blank line
    is skipped, and only the lines after it are kept as the message.

    Parameters
    ----------
    path : str
        Root directory to walk.

    Yields
    ------
    tuple of (str, str)
        The full path of the file and its message text. A file that contains
        no blank line yields an empty message.

    Notes
    -----
    Kept lines still carry their own trailing newline and are then joined
    with ``"\\n"``, so consecutive lines end up separated by an empty line.
    That is preserved here so the produced text stays the same as before.
    """
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Use a separate name so the ``path`` argument is not overwritten
            # while the walk is still in progress.
            file_path = os.path.join(root, filename)

            inBody = False
            lines = []
            # ``with`` guarantees the handle is closed even if reading raises
            # part-way through the file.
            with io.open(file_path, 'r', encoding='latin1') as f:
                for line in f:
                    if inBody:
                        lines.append(line)
                    elif line == '\n':
                        # First blank line seen: everything after it is kept.
                        inBody = True
            message = "\n".join(lines)
            yield file_path, message

In [33]:
def dataFrameFromDirectory(path, classification):
    """Load every message under ``path`` into a DataFrame with a single label.

    Parameters
    ----------
    path : str
        Directory handed to ``readFiles``.
    classification : object
        Value stored in the ``'class'`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per file, indexed by the file path yielded by ``readFiles``,
        with the columns ``'message'`` and ``'class'``.
    """
    # Materialise the generator once so it can feed both the index and the rows.
    found = list(readFiles(path))
    labels = [file_path for file_path, _ in found]
    records = [{'message': body, 'class': classification} for _, body in found]
    return pd.DataFrame(records, index=labels)

In [34]:
# Start from an empty frame that already has the two columns used by the later cells.
df = pd.DataFrame({column: [] for column in ('message', 'class')})

In [35]:
# Root folder holding the `spam/` and `ham/` sub-folders; edit this one line to
# point the notebook at a different copy of the corpus.
EMAILS_DIR = '/home/cn87/machine learning/DataScience/emails'

# Build the labelled corpus in one step with pd.concat: DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0. Assigning the
# concatenated result (rather than appending onto the existing `df`) also makes
# this cell idempotent, so re-running it cannot duplicate rows.
df = pd.concat([
    dataFrameFromDirectory(os.path.join(EMAILS_DIR, 'spam'), 'spam'),
    dataFrameFromDirectory(os.path.join(EMAILS_DIR, 'ham'), 'ham'),
])

In [36]:
# Preview the first rows of the combined frame; as the last expression in the
# cell it is rendered as the cell's output.
df.head()

Unnamed: 0,class,message
/home/cn87/machine learning/DataScience/emails/spam/00414.b2312673ca5358901c801eb44c00e310,spam,"<HTML>\n\n<PRE>\n\nDear Valued Member,\n\n\n\n..."
/home/cn87/machine learning/DataScience/emails/spam/00438.41295e1df4b651b7611316331b8468e4,spam,"Dear Homeowner,\n\n \n\nInterest Rates are at ..."
/home/cn87/machine learning/DataScience/emails/spam/00469.ee3b2f31459cc2ec43ae7cae00d40cf6,spam,Lowest rates available for term life insurance...
/home/cn87/machine learning/DataScience/emails/spam/00213.8c42a1c257aa30ff3b3ba668cca59408,spam,<p>We thank you for just a moment of your tim...
/home/cn87/machine learning/DataScience/emails/spam/00226.e0e2704cde3bbd561a98042f4a3baf5f,spam,"Dear Sir or Madam,\n\n\n\nMy name is Petr Stan..."


Now we will use a CountVectorizer to split each message into its list of words and count them, then feed those counts into a MultinomialNB classifier. Call fit() and we have a trained spam filter ready to use.

In [37]:
# Learn the vocabulary from every message in `df` and encode the corpus as a
# matrix of per-message word counts.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(df["message"].values)

In [38]:
# Train a multinomial naive Bayes classifier on the word counts, using each
# message's 'class' label as the target.
model = MultinomialNB()
target = df["class"].values
# Left as the cell's last expression so the fitted estimator's repr is displayed.
model.fit(counts, target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Let's test this model out on two example messages:

In [41]:
# Two hand-written messages to classify.
example_email_content = ["Free insurance now!!!", "Hi Manish, let's meet tomorrow"]
# Use transform (not fit_transform) so the examples are encoded with the
# vocabulary already held by `vectorizer`.
example_email_content_counts = vectorizer.transform(example_email_content)
prediction = model.predict(example_email_content_counts)
# Last expression: display the predicted label for each example.
prediction

array(['spam', 'ham'], 
      dtype='<U4')