# 朴素贝叶斯/naive bayes
训练集独立同分布时分类效果较好。

优点：在数据集较少的情况下仍然有效，可以处理多类别的问题。

缺点：对于输入数据的准备方式较为敏感。

适用数据类型：标称型数据。

In [7]:
from numpy import *

def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
    return postingList,classVec


In [8]:
def createVocabList(dataSet):
    """获得整个数据集中的文档中所有不重复的单词集合"""
    vocabSet = set([])  #create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document) #符号‘|’表示求并集。union of the two sets
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    """将文档转化为词向量，vocabList为所有单词组成的单词集合，inputSet为某个文档
    如果文档中出现某个人word，则vocabList中该word所对应的位置i取值为1，否则为0
    最终获得的词向量形式为（0,1,...,1,0,1...)"""
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

In [9]:
#测试上述定义函数
listOposts, listClasses = loadDataSet()
myVocabList = createVocabList(listOposts)
print(myVocabList),

['take', 'park', 'help', 'him', 'has', 'is', 'mr', 'posting', 'not', 'stupid', 'cute', 'dog', 'flea', 'stop', 'I', 'worthless', 'garbage', 'ate', 'so', 'dalmation', 'steak', 'maybe', 'love', 'my', 'how', 'please', 'licks', 'buying', 'quit', 'to', 'food', 'problems']


(None,)

In [10]:
print(setOfWords2Vec(myVocabList, listOposts[0]))

[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1]


In [11]:
print(setOfWords2Vec(myVocabList, listOposts[1]))

[1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]


In [12]:
def trainNB0(trainMatrix,trainCategory):
    """trainMatrix:文档矩阵
    trainCategory：文档类别标签向量，类似[1,1,0,0,1,...]
    返回每个类别的条件概率
    p0Vect：类别0的情况下，词向量中每个词的条件概率列表"""
    numTrainDocs = len(trainMatrix) #训练集中文档纵数量
    numWords = len(trainMatrix[0]) #文档中的单词数量
    pAbusive = sum(trainCategory)/float(numTrainDocs)  #侮辱性文档占比，1-pAbusive为正常文档占比
    p0Num = ones(numWords); p1Num = ones(numWords)      #change to ones() ，np.ones(3)-->[1,1,1],啦普拉斯平滑
    p0Denom = 2.0; p1Denom = 2.0                        #change to 2.0,初始化分母值
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Denom)          #change to log()，防止下溢出，即太多很小的数相乘，使结果趋进0
    p0Vect = log(p0Num/p0Denom)          #change to log()
    return p0Vect,p1Vect,pAbusive

In [13]:
#p1Num += trainMatrix[i] ，array列表相加机制
array([1,2,3]) + array([2,1,3])


array([3, 3, 6])

In [14]:
#测试
trainMat = []
for postinDoc in listOposts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))

In [15]:
p0v,p1v,pAb = trainNB0(trainMat,listClasses)
pAb

0.5

In [16]:
p0v

array([-3.25809654, -3.25809654, -2.56494936, -2.15948425, -2.56494936,
       -2.56494936, -2.56494936, -3.25809654, -3.25809654, -3.25809654,
       -2.56494936, -2.56494936, -2.56494936, -2.56494936, -2.56494936,
       -3.25809654, -3.25809654, -2.56494936, -2.56494936, -2.56494936,
       -2.56494936, -3.25809654, -2.56494936, -1.87180218, -2.56494936,
       -2.56494936, -2.56494936, -3.25809654, -3.25809654, -2.56494936,
       -3.25809654, -2.56494936])

In [17]:
p1v

array([-2.35137526, -2.35137526, -3.04452244, -2.35137526, -3.04452244,
       -3.04452244, -3.04452244, -2.35137526, -2.35137526, -1.65822808,
       -3.04452244, -1.94591015, -3.04452244, -2.35137526, -3.04452244,
       -1.94591015, -2.35137526, -3.04452244, -3.04452244, -3.04452244,
       -3.04452244, -2.35137526, -3.04452244, -3.04452244, -3.04452244,
       -3.04452244, -3.04452244, -2.35137526, -2.35137526, -2.35137526,
       -2.35137526, -3.04452244])

In [18]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """vec2Classify：要测试的输入向量
    """
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)    #元素点乘-->[1,2,3]*[1,2,3] = [1,4,9]
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else: 
        return 0

def testingNB():
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat=[]
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))

In [19]:
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


# 使用朴素贝叶斯过滤垃圾邮件

In [20]:
#数据准备工作--->切分文本
mySent = 'This book is the best book on python or M.L. I have ever laid eyes upon.'
#使用正则表达式切分
import re
regEx = re.compile(r'\W+')
listOfTokens = regEx.split(mySent)
print(listOfTokens),

['This', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']


(None,)

In [21]:
#去掉空字符串和长度小于等于1的字符串
print([tok for tok in listOfTokens if len(tok)>1]),

['This', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'have', 'ever', 'laid', 'eyes', 'upon']


(None,)

In [22]:
#将字符串全部转化乘小写或大写
print([tok.lower() for tok in listOfTokens if len(tok)>1]),

['this', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'have', 'ever', 'laid', 'eyes', 'upon']


(None,)

In [23]:
# coding=utf-8
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def textParse(bigString):    #input is big string, #output is word list
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2] 
    
def spamTest():
    docList=[]; classList = []; fullText =[]
    for i in range(1,26):
        spamPath = '/home/wl/文档/machinelearninginaction/Ch04/email/spam/%d.txt' % i
        wordList = textParse(open(spamPath, 'r',encoding='utf8').read())
        docList.append(wordList) #构建数据集矩阵
        fullText.extend(wordList) #构造所有邮件中所包含的单词列表，用于制作词向量
        classList.append(1)  #将spam 文件夹里的邮件设置标签为1
        hamPath = '/home/wl/文档/machinelearninginaction/Ch04/email/ham/%d.txt' % i
        wordList = textParse(open(hamPath,'r',encoding='utf8').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  #将ham 文件夹里的邮件设置标签为0
    vocabList = createVocabList(docList)#create vocabulary
    trainingSet = range(50); testSet=[]           #create test set
    for i in range(10):
        randIndex = int(random.uniform(0,len(trainingSet))) #生成一个随机实数
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  
    trainMat=[]; trainClasses = []
    for docIndex in trainingSet:#train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:        #classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error",docList[docIndex])
    print('the error rate is: ',float(errorCount)/len(testSet))

In [32]:
spamTest()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x92 in position 884: invalid start byte

# 使用朴素贝叶斯分类器型从个人广告中获取区域倾向
RSS程序库的使用


In [2]:
#安装feedparse库

#下载安装包，复制下述网址下载
URL = https://pypi.python.org/pypi/feedparser#downloads

#解压安装，先进入解压后的文件加，终端输入：
cd '/home/wl/feedparser-5.2.1'

#输入以下代码安装
python setup.py install

#大功告成

SyntaxError: invalid syntax (<ipython-input-2-fdcaf9ec857d>, line 4)

In [31]:
#使用Craigslist上的个人广告
import feedparser
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')

In [33]:
ny['entries']
len(ny['entries'])

25

In [26]:
def calcMostFreq(vocabList,fullText):
    """词频排序
    vocabList：所有词的集合
    fullText：某条广告的全部内容"""
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token]=fullText.count(token)
    #将词频从高到低进行排序，取前30个高频词
    #iteritems() --> items()
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True) 
    return sortedFreq[:30]

In [37]:
def localWords(feed1,feed0):
    """RSS源分类器函数
    参数为两个不同的RSS源，RSS需要以参数形式传入，即从函数外导入，原因在于RSS源会随着时间而改变。"""
    import feedparser
    docList=[]; classList = []; fullText =[]
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1) #NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)#create vocabulary
    top30Words = calcMostFreq(vocabList,fullText)   #remove top 30 words
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    #python3中range不返回数组对象，而是返回range对象 ,因此需要在range前加list
    trainingSet = list(range(2*minLen)); 
    testSet=[]           #create test set
    for i in range(10):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])  
    trainMat=[]; trainClasses = []
    for docIndex in trainingSet:#train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:        #classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ',float(errorCount)/len(testSet))
    return vocabList,p0V,p1V


In [34]:
#测试函数
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
print(len(ny['entries']),len(sf['entries']))
# vocaBlist, pSF, pNY = localWords(ny, sf)

25 25


In [38]:
#多次实验，取结果平均值，这样可以得到错误率的精确估计
vocaBlist, pSF, pNY = localWords(ny, sf)

the error rate is:  0.4


In [39]:
vocaBlist, pSF, pNY = localWords(ny, sf)

the error rate is:  0.2


In [40]:
vocaBlist, pSF, pNY = localWords(ny, sf)

the error rate is:  0.1


In [41]:
vocaBlist, pSF, pNY = localWords(ny, sf)

the error rate is:  0.4


In [45]:
def getTopWords(ny,sf):
    """最具表征性的词汇显示函数"""
    import operator
    vocabList,p0V,p1V=localWords(ny,sf)
    topNY=[]; topSF=[]
    for i in range(len(p0V)):
        if p0V[i] > -6.0 : topSF.append((vocabList[i],p0V[i]))
        if p1V[i] > -6.0 : topNY.append((vocabList[i],p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0]),
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print ("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0]),

In [46]:
getTopWords(ny,sf)

the error rate is:  0.3
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**
may
around
there
good
movies
love
seeking
hey
out
fit
anyone
clean
area
here
only
san
jose
interested
work
meet
hello
moved
your
also
going
enjoy
muse
find
live
games
summer
times
different
friendship
issues
eyes
chat
other
else
far
maybe
host
life
man
trying
week
sbm
kind
any
woman
this
healthy
hangout
apprentice
inspirer
day
cut
been
dinner
bay
need
how
could
sun
fandoms
moblie
dating
clara
non
they
listening
married
minded
hit
parts
respectful
his
betterif
then
free
pretty
upfront
grew
soon
usual
hikin
something
music
struggle
doing
riding
right
either
holiday
welcome
now
hang
made
today
cuisines
road
never
13955
workout
interest
twink
food
call
point
sexual
chance
creative
lbs
pen
asian
lonely
occasion
means
indian
communicate
mentor
dte
style
make
down
contra
running
smoker
three
which
harry
wanna
white
booty
time
confident
younger
possibly
milpitas
recently
lies
had
pal
year
drama
busy
safe
s