In [25]:
import numpy as np
import re

In [26]:
def createVocabList(docList):
    """Collect every distinct token that occurs in any of the documents.

    Args:
        docList: iterable of documents, each an iterable of hashable tokens.

    Returns:
        list: each distinct token exactly once. The list is produced from a
        set, so the order of its elements is arbitrary.
    """
    uniqueTokens = set()

    for tokens in docList:
        # Union with the tokens of this document; duplicates collapse.
        uniqueTokens = uniqueTokens.union(set(tokens))

    return list(uniqueTokens)

In [27]:
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: list of hashable vocabulary words; position i of the
            result corresponds to vocabList[i].
        inputSet: iterable of words making up the document.

    Returns:
        list[int]: a list as long as vocabList with 1 at the position of
        every vocabulary word found in inputSet and 0 elsewhere. Words that
        are not in the vocabulary are reported with a printed message and
        otherwise ignored.
    """
    # Build the word -> position lookup once instead of scanning the list
    # twice ("in" and ".index") for every word of the document. setdefault
    # keeps the first position of a repeated word, which is what
    # list.index() would return.
    wordPosition = {}
    for position, vocabWord in enumerate(vocabList):
        wordPosition.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordPosition.get(word)
        if position is None:
            print("%s not in vocabulary" % word)
        else:
            returnVec[position] = 1
    return returnVec

In [28]:
def trainNB0(trainMatrix, trainCategory):
    """Estimate the parameters of a two-class naive Bayes model.

    Per-word counts start at 1 and per-class totals at 2.0, so a word that
    never occurs in a class still gets a non-zero ratio and np.log never
    sees 0. The ratios are returned as logarithms.

    Args:
        trainMatrix: 2-D array-like of numbers, one row per training
            document and one column per vocabulary word.
        trainCategory: 1-D array-like with one label per row of
            trainMatrix. Rows labelled 1 form class 1; rows with any other
            label are counted as class 0.

    Returns:
        tuple: (p0Vect, p1Vect, pAbusive) where p0Vect / p1Vect are 1-D
        arrays of log(count / total) for class 0 / class 1, and pAbusive is
        sum(trainCategory) divided by the number of documents.

    Raises:
        ValueError: if trainMatrix is empty or not 2-D, or if trainCategory
            does not hold exactly one label per row.
    """
    matrix = np.asarray(trainMatrix)
    labels = np.asarray(trainCategory)

    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError("trainMatrix must be a non-empty 2-D array of documents")
    numTrainDocs = matrix.shape[0]
    if labels.shape != (numTrainDocs,):
        raise ValueError("trainCategory must contain exactly one label per document")

    pAbusive = labels.sum() / float(numTrainDocs)

    # Split the rows by class once and sum them column-wise, instead of
    # looping over the documents in Python.
    isClass1 = labels == 1
    class1Rows = matrix[isClass1]
    class0Rows = matrix[~isClass1]

    p1Num = 1.0 + class1Rows.sum(axis=0)
    p0Num = 1.0 + class0Rows.sum(axis=0)
    p1Denom = 2.0 + class1Rows.sum()
    p0Denom = 2.0 + class0Rows.sum()

    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

In [29]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger naive Bayes log score.

    Args:
        vec2Classify: 1-D array of word indicators/counts for the document.
        p0Vec: 1-D array of per-word log terms for class 0.
        p1Vec: 1-D array of per-word log terms for class 1.
        pClass1: prior probability of class 1.

    Returns:
        int: 1 when the class-1 score is strictly greater than the class-0
        score, otherwise 0 (ties go to class 0).
    """
    # Score = sum of the log terms of the words present + log prior.
    scoreClass1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    scoreClass0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0

In [78]:
def textParse(bigString):
    """Split a text into lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenise.

    Returns:
        list[str]: the pieces obtained by splitting on runs of non-word
        characters, keeping only pieces with more than two characters,
        each converted to lower case, in their original order.
    """
    tokens = []
    for piece in re.split(r'\W+', bigString):
        # Drops empty strings from leading/trailing separators as well as
        # one- and two-character pieces.
        if len(piece) > 2:
            tokens.append(piece.lower())
    return tokens

In [95]:
def spamTest(numTest=10):
    """Train and evaluate the naive Bayes classifier on a random hold-out split.

    Reads email/spam/1.txt .. email/spam/25.txt (label 1) and
    email/ham/1.txt .. email/ham/25.txt (label 0), tokenises them with
    textParse, holds out numTest randomly chosen documents, trains on the
    rest with trainNB0 and prints the fraction of held-out documents that
    classifyNB labels incorrectly.

    Args:
        numTest: number of documents to hold out for testing (default 10).

    Raises:
        ValueError: if numTest is not at least 1 and smaller than the number
            of documents that could be loaded.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        for pathPattern, label in (('email/spam/%d.txt', 1), ("email/ham/%d.txt", 0)):
            try:
                # "with" closes the file even when reading fails.
                with open(pathPattern % i) as f:
                    text = f.read()
            except (OSError, UnicodeDecodeError):
                # Best effort: a missing or undecodable file is skipped,
                # but programming errors are no longer silenced.
                continue
            docList.append(textParse(text))
            classList.append(label)

    if not 0 < numTest < len(docList):
        raise ValueError(
            "numTest must be at least 1 and smaller than the number of "
            "loaded documents (%d)" % len(docList)
        )

    vocabList = createVocabList(docList)

    # Split by index rather than deleting from docList, so every document
    # stays paired with its label in classList.
    trainIndices = list(range(len(docList)))
    testIndices = []
    for _ in range(numTest):
        randIndex = int(np.random.uniform(0, len(trainIndices)))
        testIndices.append(trainIndices.pop(randIndex))

    trainMat = [setOfWords2Vec(vocabList, docList[docIndex]) for docIndex in trainIndices]
    trainClasses = [classList[docIndex] for docIndex in trainIndices]
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    # Score the held-out documents only, each against its own label.
    errorCount = 0
    for docIndex in testIndices:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print("The error rate is:", float(errorCount) / len(testIndices))

In [101]:
# Run one hold-out evaluation. spamTest picks its test documents with
# np.random and no seed is set in the cells shown here, so the printed error
# rate can differ from one run to the next.
spamTest()

The error rate is: 0.2
