In [1]:
import numpy as np

# 条件概率
\begin{equation}
p(c|x)= \frac{p(x|c)*p(c)}{p(x)}
\end{equation}

In [2]:
#创建数据
def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec

In [3]:
listOfPosts,listClasses = loadDataSet()
print(listOfPosts)

[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]


In [4]:
#创建数据集的词汇表集合(无重复词汇)
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
        #print(vocabSet)
    return list(vocabSet)

In [5]:
myVocabList = createVocabList(listOfPosts)
print(myVocabList)

['worthless', 'stop', 'take', 'problems', 'my', 'has', 'help', 'quit', 'love', 'dalmation', 'please', 'steak', 'posting', 'is', 'I', 'park', 'dog', 'garbage', 'food', 'maybe', 'to', 'buying', 'not', 'stupid', 'ate', 'how', 'cute', 'so', 'him', 'mr', 'licks', 'flea']


In [6]:
'''
func:词汇集合转换为向量
vocabList: 训练集的词汇集合
inputSet: 待测试的文本

return: 待测文本，相对训练集的向量
'''
def setOfWord2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    #print(returnVec)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word:{} is not in my Vocabulary".format(word))
    
    return returnVec

In [7]:
a=setOfWord2Vec(myVocabList,listOfPosts[0])
print(a)

[0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [8]:
'''
func:朴素贝叶斯算法训练

trainMatrix: 训练数据的向量集
trainCategory: 训练数据的分类向量

return: 
p1Vect：标签1条件下，特征向量，各个特征值的条件概率
p0Vect：标签0条件下，特征向量，各个特征值的条件概率
pAbusive：标签1的概率
'''
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWord = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = np.ones(numWord);
    p1Num = np.ones(numWord)
    #print('p0Num',p0Num)
    p0Denom = 2.
    p1Denom = 2
    
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom)
    p0Vect = np.log(p0Num/p0Denom)
    
    return p0Vect,p1Vect,pAbusive

In [9]:
listOfPosts,listClasses = loadDataSet()

In [10]:
myVocabList = createVocabList(listOfPosts)
print(myVocabList)

['worthless', 'stop', 'take', 'problems', 'my', 'has', 'help', 'quit', 'love', 'dalmation', 'please', 'steak', 'posting', 'is', 'I', 'park', 'dog', 'garbage', 'food', 'maybe', 'to', 'buying', 'not', 'stupid', 'ate', 'how', 'cute', 'so', 'him', 'mr', 'licks', 'flea']


In [11]:
trainMat = []
for postinDoc in listOfPosts:
    #print(postinDoc)
    trainMat.append(setOfWord2Vec(myVocabList,postinDoc))
print(trainMat)
print(listOfPosts[5])

[[0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0], [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]]
['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']


In [12]:
p0V,p1V,pAb = trainNB0(trainMat,listClasses)

In [13]:
pAb

0.5

In [14]:
p0V

array([-3.25809654, -2.56494936, -3.25809654, -2.56494936, -1.87180218,
       -2.56494936, -2.56494936, -3.25809654, -2.56494936, -2.56494936,
       -2.56494936, -2.56494936, -3.25809654, -2.56494936, -2.56494936,
       -3.25809654, -2.56494936, -3.25809654, -3.25809654, -3.25809654,
       -2.56494936, -3.25809654, -3.25809654, -3.25809654, -2.56494936,
       -2.56494936, -2.56494936, -2.56494936, -2.15948425, -2.56494936,
       -2.56494936, -2.56494936])

In [15]:
p1V

array([-1.94591015, -2.35137526, -2.35137526, -3.04452244, -3.04452244,
       -3.04452244, -3.04452244, -2.35137526, -3.04452244, -3.04452244,
       -3.04452244, -3.04452244, -2.35137526, -3.04452244, -3.04452244,
       -2.35137526, -1.94591015, -2.35137526, -2.35137526, -2.35137526,
       -2.35137526, -2.35137526, -2.35137526, -1.65822808, -3.04452244,
       -3.04452244, -3.04452244, -3.04452244, -2.35137526, -3.04452244,
       -3.04452244, -3.04452244])

In [16]:
'''
func: 分类器

输入：待测向量，特征值向量的条件概率，各标签类的概率

return: 待测随机变量与各类标签类联合概率分布的最大值；

'''
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    p1 = np.sum(vec2Classify*p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify*p0Vec) + np.log(1.0-pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

In [69]:
def testingNB():
    listOfPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOfPosts)
    trainMat=[]
    for postinDoc in listOfPosts:
        trainMat.append(setOfWord2Vec(myVocabList,postinDoc))
    p0V,p1V,pAb = trainNB0(np.array(trainMat),np.array(listClasses))
    
    testEntry = ['stupid','garbage']
    testEntry1=['hello','good']
    #thisDoc:文档testEntry的特征向量
    thisDoc = np.array(setOfWord2Vec(myVocabList,testEntry1))
    print("{}clasified as:{}".format(testEntry1,classifyNB(thisDoc,p0V,p1V,pAb)))

In [70]:
testingNB()

the word:hello is not in my Vocabulary
the word:good is not in my Vocabulary
['hello', 'good']clasified as:0


In [55]:
mySent='This book is the best book on Python or M.L I have laid eyes upon.'

In [78]:
import re

In [83]:
print(mySent)
listOfTokens =re.sub('[^A-Za-z]',' ',mySent).split(' ')


This book is the best book on Python or M.L I have laid eyes upon.


In [82]:
[tok.lower() for tok in listOfTokens]

['t',
 'h',
 'i',
 's',
 ' ',
 'b',
 'o',
 'o',
 'k',
 ' ',
 'i',
 's',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'b',
 'e',
 's',
 't',
 ' ',
 'b',
 'o',
 'o',
 'k',
 ' ',
 'o',
 'n',
 ' ',
 'p',
 'y',
 't',
 'h',
 'o',
 'n',
 ' ',
 'o',
 'r',
 ' ',
 'm',
 ' ',
 'l',
 ' ',
 'i',
 ' ',
 'h',
 'a',
 'v',
 'e',
 ' ',
 'l',
 'a',
 'i',
 'd',
 ' ',
 'e',
 'y',
 'e',
 's',
 ' ',
 'u',
 'p',
 'o',
 'n',
 ' ']

In [50]:
emailText = open('email/ham/6.txt').read()

In [51]:
emailText

'Hello,\n\nSince you are an owner of at least one Google Groups group that uses the customized welcome message, pages or files, we are writing to inform you that we will no longer be supporting these features starting February 2011. We made this decision so that we can focus on improving the core functionalities of Google Groups -- mailing lists and forum discussions.  Instead of these features, we encourage you to use products that are designed specifically for file storage and page creation, such as Google Docs and Google Sites.\n\nFor example, you can easily create your pages on Google Sites and share the site (http://www.google.com/support/sites/bin/answer.py?hl=en&answer=174623) with the members of your group. You can also store your files on the site by attaching files to pages (http://www.google.com/support/sites/bin/answer.py?hl=en&answer=90563) on the site. If you抮e just looking for a place to upload your files so that your group members can download them, we suggest you try G