# In [1]:
#coding:utf-8
# Naive Bayes Application: Filtering Web Site's Malicious Messages
# The set-of-words model records only whether each word appears in a document (0/1 features),
# whereas in the bag-of-words model each word can appear multiple times (count features)
from numpy import *
# Vocabularies to vector conversion function
def loadDataSet():
    """Build a tiny hand-labelled corpus for experimenting with the classifier.

    Returns:
        A pair ``(postingList, classVec)``. ``postingList`` is a list of
        tokenized messages (each a list of words) and ``classVec`` is the
        parallel list of labels: 1 for insulting text, 0 for normal speech.
    """
    # Each entry keeps a message next to its label so the two cannot drift apart.
    labelled_posts = [
        (0, 'my dog has flea problems help please'),
        (1, 'maybe not take him to dog park stupid'),
        (0, 'my dalmation is so cute I love him'),
        (1, 'stop posting stupid worthless garbage'),
        (0, 'mr licks ate my steak how to stop him'),
        (1, 'quit buying worthless dog food stupid'),
    ]
    postingList = [sentence.split() for _, sentence in labelled_posts]
    classVec = [label for label, _ in labelled_posts]
    return postingList , classVec

def createVocabList(dataSet):
    """Collect every distinct word that occurs anywhere in the documents.

    Args:
        dataSet: iterable of documents, each an iterable of words.

    Returns:
        A list holding each distinct word once. The words pass through a
        set, so the order of the list is unspecified.
    """
    unique_words = set()
    for document in dataSet:
        # Fold this document's words into the running vocabulary.
        unique_words.update(document)
    return list(unique_words)

#Functions in the word set model
def setOfWords2Vec(vocabList , inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: list of vocabulary words; element i of the result
            corresponds to vocabList[i]. Words must be hashable.
        inputSet: iterable of the words that make up the document.

    Returns:
        A list of ints with the same length as vocabList, holding 1 where
        the vocabulary word occurs in the document and 0 otherwise.

    Words that are not in the vocabulary are reported on stdout and skipped.
    """
    # Build the word -> index map once so each document word costs one O(1)
    # lookup instead of two linear scans (`in` followed by `.index`).
    # setdefault keeps the FIRST position of a repeated vocabulary word,
    # which is what list.index would have returned.
    position = {}
    for idx, vocab_word in enumerate(vocabList):
        position.setdefault(vocab_word, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            # Wrap in a 1-tuple so a tuple-valued word is formatted as a
            # single value instead of being unpacked as format arguments.
            print("the word : %s is not in my Vocabulary !" % (word,))
        else:
            returnVec[idx] = 1
    return returnVec

# Bag of words model function
def bagOfWords2Vec(vocabList , inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Args:
        vocabList: list of vocabulary words; element i of the result
            corresponds to vocabList[i]. Words must be hashable.
        inputSet: iterable of the words that make up the document; a word
            may appear more than once.

    Returns:
        A list of ints with the same length as vocabList, where each
        element is the number of times that vocabulary word occurs in the
        document (0 if it never occurs).

    Words that are not in the vocabulary are reported on stdout and skipped.
    """
    # Build the word -> index map once so each document word costs one O(1)
    # lookup instead of two linear scans (`in` followed by `.index`).
    # setdefault keeps the FIRST position of a repeated vocabulary word,
    # which is what list.index would have returned.
    position = {}
    for idx, vocab_word in enumerate(vocabList):
        position.setdefault(vocab_word, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            # Wrap in a 1-tuple so a tuple-valued word is formatted as a
            # single value instead of being unpacked as format arguments.
            print("the word : %s is not in my Vocabulary !" % (word,))
        else:
            returnVec[idx] += 1
    return returnVec

# Naive Bayes Classifier Training Function
def trainNBO(trainMatrix , trainCategory):
    """Estimate naive Bayes parameters from document vectors and labels.

    Args:
        trainMatrix: sequence of equal-length numeric document vectors,
            one row per training document.
        trainCategory: sequence of labels parallel to trainMatrix; rows
            labelled 1 feed the class-1 statistics, every other row feeds
            the class-0 statistics.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log conditional
        probabilities for class 0 and class 1, and the fraction of the
        training labels that sum to class 1.
    """
    doc_total = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(doc_total)

    # Slot 0 holds the class-0 statistics, slot 1 the class-1 statistics.
    # Counts start at 1 and denominators at 2.0 so that a word never seen
    # in a class does not end up with probability zero.
    word_counts = [ones(vocab_size), ones(vocab_size)]
    word_totals = [2.0, 2.0]

    for i, row in enumerate(trainMatrix):
        slot = 1 if trainCategory[i] == 1 else 0
        word_counts[slot] += row
        word_totals[slot] += sum(row)

    # Logs are returned so the classifier can add scores rather than
    # multiply many small probabilities together.
    p0Vect = log(word_counts[0] / word_totals[0])
    p1Vect = log(word_counts[1] / word_totals[1])
    return p0Vect , p1Vect , pAbusive

# Naive Bayes Classification Function
def classifyNB(vec2Classify , p0Vec , p1Vec , pClass1):
    """Choose the class whose log-score is larger for a document vector.

    Args:
        vec2Classify: numeric document vector to classify.
        p0Vec: per-word log probabilities for class 0.
        p1Vec: per-word log probabilities for class 1.
        pClass1: prior probability of class 1; class 0 uses 1 - pClass1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0
        score, otherwise 0 (ties go to class 0).
    """
    # Each score is the sum of the weighted log likelihoods plus the log prior.
    score_class1 = sum(vec2Classify * p1Vec) + log(pClass1)
    score_class0 = sum(vec2Classify * p0Vec) + log(1 - pClass1)
    return 1 if score_class1 > score_class0 else 0

# Demo run: train the classifier on the sample posts, then classify two test messages
listOPosts , listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)

# One set-of-words vector per training document
trainMat = [setOfWords2Vec(myVocabList , postinDoc) for postinDoc in listOPosts]
p0V , p1V , pAb = trainNBO(array(trainMat) , array(listClasses))

# Classify each sample message and print its predicted label
for testEntry in (['love' , 'my' , 'dalmation', 'please'] , ['stupid' , 'garbage']):
    thisDoc = array(setOfWords2Vec(myVocabList , testEntry))
    print(testEntry , 'classified as :' , classifyNB(thisDoc , p0V , p1V , pAb))

# Output:
# ['love', 'my', 'dalmation', 'please'] classified as : 0
# ['stupid', 'garbage'] classified as : 1
