In [3]:
import io #It provides the python interfaces to stream handling
import os #Provides a portable way of using operating system dependent functionality

import numpy #A fundamental package for scientific computing
from pandas import DataFrame
from pandas import concat
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    """Yield ``(file_path, body)`` for every file found under ``path``.

    The directory tree is traversed with ``os.walk``. Each file is decoded as
    latin1 and scanned line by line: everything up to and including the first
    blank line (the e-mail header) is skipped, and the remaining lines form
    the body.

    Parameters
    ----------
    path : str
        Root directory to search recursively.

    Yields
    ------
    tuple of (str, str)
        Full path of the file and its body text. The body is an empty string
        when the file contains no blank line.
    """
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Use a separate name so the `path` argument is not overwritten.
            file_path = os.path.join(root, filename)

            in_body = False
            body_lines = []
            # The context manager closes the file even if reading raises.
            with io.open(file_path, 'r', encoding='latin1') as f:
                for line in f:
                    if in_body:
                        body_lines.append(line)
                    elif line == '\n':
                        # First blank line marks the end of the header.
                        in_body = True

            # Each collected line still ends with its own newline, so joining
            # on '\n' leaves an empty line between consecutive body lines.
            # This matches the existing output format and is kept on purpose.
            yield file_path, '\n'.join(body_lines)
            
            
def dataFrameFromDirectory(path, classification):
    """Build a DataFrame with one row per file read from ``path``.

    Every ``(file path, message)`` pair produced by ``readFiles(path)``
    becomes a row holding the message text under 'message' and the given
    ``classification`` label under 'class'. Rows are indexed by the file path
    the message was read from.
    """
    found = list(readFiles(path))
    index = [file_path for file_path, _ in found]
    rows = [{'message': body, 'class': classification} for _, body in found]
    return DataFrame(rows, index=index)


# Directory that holds the 'spam' and 'ham' sub-folders.
# Edit this single constant to point at a local copy of the corpus.
DATA_DIR = '/home/norman/Desktop/Data science/spam classifier'

# Build the data set: one row per e-mail, with the body in 'message' and the
# label ('spam' or 'ham') in 'class', indexed by the path of the source file.
# DataFrame.append was removed in pandas 2.0, so the per-directory frames are
# combined with a single concat call instead.
data = concat([
    dataFrameFromDirectory(os.path.join(DATA_DIR, 'spam'), 'spam'),
    dataFrameFromDirectory(os.path.join(DATA_DIR, 'ham'), 'ham'),
])
            
            
    

In [4]:
# Preview the first rows of the combined frame to check that the messages and
# their labels were loaded.
data.head()

Unnamed: 0,class,message
/home/norman/Desktop/Data science/spam classifier/spam/00301.68fe7955b96d085360ca916289e8e716,spam,*** FREE BONUS OFFER - SEE BELOW ***\n\n\n\nWe...
/home/norman/Desktop/Data science/spam classifier/spam/00266.3cf1dcf8df07100b1530493e11f80a25,spam,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr..."
/home/norman/Desktop/Data science/spam classifier/spam/00273.0c7d73771d79e84e2aab8c909c5bb210,spam,This is a multi-part message in MIME format.\n...
/home/norman/Desktop/Data science/spam classifier/spam/00120.58579af867ff9a702cff23e7b8818a59,spam,This is a multi-part message in MIME format.\n...
/home/norman/Desktop/Data science/spam classifier/spam/00115.c97af50ef7ccd816f95bbdc6f4d226b2,spam,------=_NextPart_000_00C3_65E56B8D.B3612A36\n\...


In [5]:
# Labels: the 'class' column holds the known category of every message.
targets = data['class'].values

# Features: CountVectorizer learns a vocabulary from the 'message' column and
# turns each message into a row of per-token occurrence counts.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

# Train a multinomial Naive Bayes model on the count matrix and the labels.
classifier = MultinomialNB()
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [6]:
# Classify two unseen messages. They are encoded with the vocabulary learned
# during training (transform, not fit_transform) before prediction.
example = [
    'Free python classes everyday of the week',
    "Hey dude, how about linking up over the weekend",
]
example_counts = vectorizer.transform(example)
output = classifier.predict(example_counts)
output

array(['ham', 'ham'], 
      dtype='|S4')

In [7]:
# Second pair of unseen messages, encoded with the already-fitted vectorizer
# and passed to the trained classifier.
example2 = [
    'Free Offer Bonus Now !!!!',
    "Siz, Mother wants to talk to you urgently",
]
example2_counts = vectorizer.transform(example2)
prediction = classifier.predict(example2_counts)
prediction

array(['spam', 'ham'], 
      dtype='|S4')