# 4.朴素贝叶斯
## 调包

In [21]:
import numpy as np
import feedparser
from bayes import *

## 4.1.收集数据：导入RSS源

In [57]:
# Use RSS feeds that are reachable from inside China.
# Both URLs are handed to feedparser.parse; the entry counts printed below
# reflect whatever the feeds contained when this cell was run.
ny = feedparser.parse("http://www.nasa.gov/rss/dyn/image_of_the_day.rss")
sy = feedparser.parse("http://rss.tom.com/happy/happy.xml")
# Report how many entries each feed yielded (localWords later uses the smaller count).
print("ny['entries']的长度 = {}".format(len(ny['entries'])))
print("sy['entries']的长度 = {}".format(len(sy['entries'])))

ny['entries']的长度 = 60
sy['entries']的长度 = 20


## 4.2.RSS源分类器及高频词去除函数

In [17]:
def calcMostFreq(vocabList, fullText, topN=30):
    """
    Return the most frequent vocabulary words found in a token stream.

    Parameters:
        vocabList -- iterable of candidate words (must be hashable)
        fullText -- sequence to count occurrences in, normally a list of
                    word tokens; any object exposing ``.count`` is accepted
        topN -- number of (word, count) pairs to return (default 30)
    Returns:
        A list of at most ``topN`` (word, count) tuples ordered by count,
        highest first. Words with equal counts keep the order in which they
        first appear in ``vocabList``.
    """
    from collections import Counter

    tally = None
    if isinstance(fullText, (list, tuple)):
        try:
            # One pass over the text instead of one full scan per vocabulary word.
            tally = Counter(fullText)
        except TypeError:
            # Unhashable tokens cannot be tallied by Counter; fall back to .count.
            tally = None

    if tally is not None:
        # Counter yields 0 for words that never occur, matching list.count.
        freqDict = {token: tally[token] for token in vocabList}
    else:
        # Strings and other sequence types keep their own .count semantics
        # (e.g. substring counting for str).
        freqDict = {token: fullText.count(token) for token in vocabList}

    # sorted() is stable, so ties stay in vocabulary order.
    sortedFreq = sorted(freqDict.items(), key=lambda pair: pair[1], reverse=True)
    return sortedFreq[:topN]

In [27]:
def localWords(feed1, feed0, numTest=20):
    """
    Train and evaluate a naive Bayes classifier on two parsed RSS feeds.

    The same number of entries is taken from each feed (the smaller of the
    two entry counts). The words reported by calcMostFreq are removed from
    the vocabulary, ``numTest`` documents are held out at random as a test
    set, the classifier is trained on the rest, and the accuracy on the
    held-out documents is printed.

    Parameters:
        feed1 -- parsed feed whose entries are labelled class 1
        feed0 -- parsed feed whose entries are labelled class 0
        numTest -- number of documents to hold out for testing (default 20)
    Returns:
        vocabList -- vocabulary after the high-frequency words were removed
        p0V -- class-0 vector as returned by trainNB0
        p1V -- class-1 vector as returned by trainNB0
    Raises:
        ValueError -- if numTest is < 1 or would leave no training document
    """
    # Use the same number of entries from each feed.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    totalDocs = 2 * minLen
    # Fail early with a clear message instead of an IndexError further down.
    if not 1 <= numTest < totalDocs:
        raise ValueError(
            "cannot hold out {} of {} documents: numTest must be >= 1 and "
            "leave at least one training document".format(numTest, totalDocs))

    docList = []; classList = []; fullText = []
    for i in range(minLen):
        # Documents are interleaved: entry i of feed1 (class 1), then entry i of feed0 (class 0).
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    # Build the vocabulary, then drop the most frequent words.
    vocabList = createVocabList(docList)
    for word, _count in calcMostFreq(vocabList, fullText):
        if word in vocabList:
            vocabList.remove(word)

    # Randomly move numTest document indices from the training set to the test set.
    trainingSet = list(range(totalDocs)); testSet = []
    for _ in range(numTest):
        # int() replaces np.int, which was removed from NumPy in 1.24.
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Bag-of-words vectors and labels for the training documents.
    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex]) for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    # Count misclassified held-out documents.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print("准确率为: {}".format(1 - float(errorCount)/len(testSet)))
    return vocabList, p0V, p1V

测试一下

In [58]:
# localWords returns (vocabList, p0V, p1V). sy is passed as feed0 (class 0) and
# ny as feed1 (class 1), so the class-0 vector belongs to sy and the class-1
# vector to ny.
vocabList, pSY, pNY = localWords(ny, sy)

准确率为: 0.5


  return _compile(pattern, flags).split(string, maxsplit)


## 4.3.得到最有特征的词汇

In [66]:
def getTopWords(ny, sf, threshold=-6.0):
    """
    Print the most characteristic words of two RSS feeds.

    Runs localWords on the two feeds, keeps every vocabulary word whose
    value in the class vector exceeds ``threshold``, and prints the kept
    words for each feed from highest to lowest value.

    Parameters:
        ny -- parsed feed passed to localWords as class 1
        sf -- parsed feed passed to localWords as class 0
        threshold -- minimum value a word must exceed to be listed
                     (default -6.0)
    Returns:
        None; the word lists are printed.
    """
    # Train on the two feeds; p0V describes sf (class 0), p1V describes ny (class 1).
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    # The vectors are indexed as [0][i] with shape[1] entries, i.e. treated as
    # two-dimensional with a single row -- as produced by trainNB0.
    for i in range(p0V.shape[1]):
        if p0V[0][i] > threshold:
            topSF.append((vocabList[i], p0V[0][i]))
        if p1V[0][i] > threshold:
            topNY.append((vocabList[i], p1V[0][i]))
    # Same report for both feeds: banner line, then words by descending value.
    reports = (
        ("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**", topSF),
        ("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**", topNY),
    )
    for banner, pairs in reports:
        ranked = sorted(pairs, key=lambda pair: pair[1], reverse=True)
        print(banner)
        for item in ranked:
            print(item[0])


测试

In [75]:
# Print the characteristic words of both feeds (ny as class 1, sy as class 0).
# getTopWords retrains through localWords, whose test split is random, so the
# reported accuracy and word lists can differ between runs.
getTopWords(ny, sy)

准确率为: 0.5
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**
northern
parker
earth
families
schirra
nick
instruments
jets
home
again
members
transits
joe
fly
juno
serena
veggie
sun
nanoracks
aeronautics
feustel
these
air
designed
procedures
installs
fire
than
preparation
been
hasn
gulf
acronym
oval
views
arnold
lopez
donn
works
better
force
acaba
revolution
southern
california
berenices
jupiter
spacewalk
reduce
show
examines
delta
1930
ovchinin
out
only
cosmonaut
taken
crews
missions
commander
satellite
landing
hardware
well
hour
launches
over
gerst
launch
embrace
can
alexander
airport
october
that
observations
debris
gear
retriever
data
sept
mission
alegria
least
rolled
just
approach
color
plant
together
alexey
early
gemini
reveals
composite
ricky
sunday
barge
per
four
minute
junk
where
pad
serving
programs
shows
three
growth
presented
remove
science
see
lies
shuttle
eye
captured
former
her
born
constellation
funduscope
aboard
observatory
krayniy
thousand
continues
oppor

  return _compile(pattern, flags).split(string, maxsplit)
