# Naive Bayes (the easy way)

We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [1]:
import io
import os

import numpy
from pandas import DataFrame
from pandas import concat
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    """Yield ``(file_path, body)`` for every file found beneath *path*.

    The directory tree is walked recursively with ``os.walk``. In each file,
    everything up to and including the first blank line (the e-mail header
    block) is skipped; the remaining lines form the message body. A file
    that contains no blank line yields an empty body.

    Files are decoded as latin1, which maps every byte to a character, so
    unusual bytes in a message never raise a decoding error.

    This is a generator: pairs are produced lazily, one file at a time, and
    the result can be iterated over only once.
    """
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Use a separate name so the ``path`` argument is not overwritten.
            file_path = os.path.join(root, filename)

            inBody = False
            lines = []
            # The context manager closes the file even if reading fails or
            # the consumer stops iterating part-way through.
            with io.open(file_path, 'r', encoding='latin1') as f:
                for line in f:
                    if inBody:
                        lines.append(line)
                    elif line == '\n':
                        # First blank line: the headers end, the body begins.
                        inBody = True

            # NOTE: each line still carries its own trailing newline, so
            # joining with '\n' double-spaces the body. Kept as-is so the
            # produced messages stay unchanged.
            message = '\n'.join(lines)
            yield file_path, message


def dataFrameFromDirectory(path, classification):
    """Load every message under *path* into a DataFrame labelled *classification*.

    Each row holds one message: its text in the 'message' column and the
    given label in the 'class' column. The row index is the path of the file
    the message was read from.
    """
    # Materialise the generator once so it can be traversed twice below.
    loaded = list(readFiles(path))

    index = [filename for filename, _ in loaded]
    rows = [
        {'message': message, 'class': classification}
        for _, message in loaded
    ]
    return DataFrame(rows, index=index)

# Build the full corpus in one concatenation: spam messages first, then ham.
# pandas.concat replaces the repeated DataFrame.append calls, which copied the
# accumulated frame on every call and were removed in pandas 2.0. The empty
# seed frame that append needed as a starting point is no longer required.
data = concat([
    dataFrameFromDirectory('/Users/miladmahdian/DataScience/emails/spam', 'spam'),
    dataFrameFromDirectory('/Users/miladmahdian/DataScience/emails/ham', 'ham'),
])


Let's have a look at that DataFrame:

In [2]:
# Preview the first rows of the combined frame (index = source file path).
data.head()

Unnamed: 0,class,message
/Users/miladmahdian/DataScience/emails/spam/00001.7848dde101aa985090474a91ec93fcf0,spam,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr..."
/Users/miladmahdian/DataScience/emails/spam/00002.d94f1b97e48ed3b553b3508d116e6a09,spam,1) Fight The Risk of Cancer!\n\nhttp://www.adc...
/Users/miladmahdian/DataScience/emails/spam/00003.2ee33bc6eacdb11f38d052c44819ba6c,spam,1) Fight The Risk of Cancer!\n\nhttp://www.adc...
/Users/miladmahdian/DataScience/emails/spam/00004.eac8de8d759b7e74154f142194282724,spam,##############################################...
/Users/miladmahdian/DataScience/emails/spam/00005.57696a39d7d84318ce497886896bf90d,spam,I thought you might like these:\n\n1) Slim Dow...


Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy.

In [3]:
# Learn the vocabulary from every message and turn each message into a row
# of per-word counts.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

# Fit a multinomial Naive Bayes model on those counts against the
# 'spam' / 'ham' labels.
targets = data['class'].values
classifier = MultinomialNB()
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Let's try it out:

In [4]:
# Two hand-written messages to sanity-check the trained filter.
examples = [
    'Free Viagra now!!!',
    "Hi Bob, how about a game of golf tomorrow?",
]

# Encode with the already-fitted vectorizer (transform, not fit_transform) so
# the columns line up with the vocabulary the classifier was trained on.
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['spam', 'ham'], 
      dtype='|S4')

## Activity

Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.

If you really want to challenge yourself, try applying train/test to this spam classifier - see how well it can predict some subset of the ham and spam emails.

In [5]:
# Randomly pick 80% of the rows for training; random_state seeds the random
# number generator so the same split is drawn on every run.
train = data.sample(frac=0.8, random_state=200)

# Everything that was not sampled into the training set becomes the test set
# (rows are matched by their index label).
test = data.drop(train.index)

In [6]:
import numpy as np

# Fit the vocabulary and the classifier on the training split only.
# train['message'].values is an array of the message strings; fit_transform
# returns a sparse (document, word) matrix holding the number of occurrences
# of each word in each document.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(train['message'].values)
print(counts.shape)  # (number of training messages, vocabulary size)
classifier = MultinomialNB()
targets = train['class'].values
classifier.fit(counts, targets)

# Encode the held-out messages with the vocabulary learned above, then
# predict a label for each of them.
example_counts = vectorizer.transform(test['message'].values)
print(example_counts.shape)
predictions = classifier.predict(example_counts)
targetY = test['class'].values
testY = np.array(test['class'])

# Count matches with one vectorized comparison instead of an index loop.
correct = int(np.sum(predictions == testY))
wrong = len(predictions) - correct

# Single-argument print(...) calls run unchanged on Python 2 and Python 3;
# the format strings reproduce the spacing of the original output exactly.
print("Correct predictions is  {} \nWrong predictions is  {}".format(correct, wrong))
print("Accuracy: {}".format(float(correct) / (correct + wrong)))

print("An easier way to compute accuracy: {}".format(np.mean(predictions == targetY)))
print("The easiest way:{}".format(classifier.score(example_counts, targetY)))



(2400, 54607)
(600, 54607)
Correct predictions is  581 
Wrong predictions is  19
Accuracy: 0.968333333333
An easier way to compute accuracy: 0.968333333333
The easiest way:0.968333333333


In [35]:
# Display the sparse word-count matrix built for the test messages.
example_counts

<600x54607 sparse matrix of type '<type 'numpy.int64'>'
	with 75227 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer

# Tiny two-document corpus to show what the vectorizer and the TF-IDF
# transformer actually produce.
# NOTE: this rebinds the global `vectorizer`; re-run the training cell before
# using `classifier` with `vectorizer` again, since their vocabularies no
# longer match after this cell.
vectorizer = CountVectorizer()
c = vectorizer.fit_transform(["hello milad milad", "hello aybike ulusan"])

# Single-argument print(...) calls run unchanged on Python 2 and Python 3.
print(c)  # one "(document, word index)  count" entry per stored element
print("shape of the matrix c {}".format(c.shape))

# Re-weight the raw counts into TF-IDF scores.
Freqs = TfidfTransformer().fit_transform(c)
print("Freqs:")
print(Freqs)

  (0, 1)	1
  (0, 2)	2
  (1, 1)	1
  (1, 0)	1
  (1, 3)	1
shape of the matrix c (2, 4)
Freqs:
  (0, 2)	0.942155624663
  (0, 1)	0.335175743328
  (1, 3)	0.631667201738
  (1, 0)	0.631667201738
  (1, 1)	0.449436416524
