## Word list to vector function

In [6]:
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', \
                          'problems', 'help', 'please'],
                         ['maybe', 'not', 'take', 'him', \
                          'to', 'dog', 'park', 'stupid'],
                         ['my', 'dalmation', 'is', 'so', 'cute', \
                           'I', 'love', 'him'],
                         ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                         ['mr', 'licks', 'ate', 'my', 'steak', 'how',\
                           'to', 'stop', 'him'],
                         ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]  # 1 is abusive, 0 not
    return postingList, classVec

In [2]:
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

In [3]:
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print("the word: {} is not in my Vocabulary".format(word))
    return returnVec

In [14]:
listOPosts,listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
print(myVocabList)
print([0]*3)
returnVec = setOfWords2Vec(myVocabList, listOPosts[0])
print(returnVec)

['garbage', 'mr', 'has', 'dalmation', 'I', 'love', 'worthless', 'maybe', 'buying', 'licks', 'cute', 'so', 'steak', 'stupid', 'please', 'ate', 'dog', 'not', 'my', 'him', 'to', 'how', 'is', 'flea', 'problems', 'quit', 'food', 'take', 'park', 'stop', 'help', 'posting']
[0, 0, 0]
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0]


## Naive Bayes classifier training function

In [20]:
def trainNB0(trainMatrix, trainCategory):
    numTrainDOcs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDOcs) # we only have two classes:0 and 1
    p0Num = ones(numWords); p1Num=ones(numWords)  # deal with zero value effect
    p0Denom = 2.0; p1Denom=2.0
    for i in range(numTrainDOcs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Denom)   #change to log()
    p0Vect = log(p0Num/p0Denom)   #change to log()
    return p0Vect, p1Vect, pAbusive
            

In [17]:
from numpy import *
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat=[]
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
p0V,p1V,pAb=trainNB0(trainMat, listClasses)
print(pAb)
print(p0V)
print(p1V)

0.5
[0.         0.04166667 0.04166667 0.04166667 0.04166667 0.04166667
 0.         0.         0.         0.04166667 0.04166667 0.04166667
 0.04166667 0.         0.04166667 0.04166667 0.04166667 0.
 0.125      0.08333333 0.04166667 0.04166667 0.04166667 0.04166667
 0.04166667 0.         0.         0.         0.         0.04166667
 0.04166667 0.        ]
[0.05263158 0.         0.         0.         0.         0.
 0.10526316 0.05263158 0.05263158 0.         0.         0.
 0.         0.15789474 0.         0.         0.10526316 0.05263158
 0.         0.05263158 0.05263158 0.         0.         0.
 0.         0.05263158 0.05263158 0.05263158 0.05263158 0.05263158
 0.         0.05263158]


## Naive Bayes classify function

In [19]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0: 
        return 1
    else:
        return 0

In [21]:
def testingNB():
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat=[]
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry=['love', 'my','dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry,'classified as: ', classifyNB(thisDoc, p0V,p1V,pAb))
    testEntry=['stupid','garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry,'classified as: ', classifyNB(thisDoc, p0V,p1V,pAb))
    

In [23]:
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


## Naive Bayes bag-of-words model

In [24]:
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

## Example: classifying spam email with naive Bayes

In [47]:
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W*', str(bigString))
    return [tok.lower() for tok in listOfTokens if len(tok)>2]
def spamTest():
    docList = []; classList = []; fullText=[]
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt' % i,'rb').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i,'rb').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50)); testSet=[]
    for i in range(10):
        randIndex=int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat=[]; trainClasses=[]
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam=trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V,p1V,pSpam)!=classList[docIndex]:
            errorCount += 1
    print('the error rate is: {}'.format(float(errorCount)/len(testSet)))

In [56]:
spamTest()

the error rate is: 0.6
