In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups #20类新闻文本
from sklearn.feature_extraction.text import CountVectorizer #单词计数向量
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF #TF-IDF
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import brier_score_loss as BS #布里尔分数
from sklearn.calibration import CalibratedClassifierCV #校准可靠性曲线

### （一）文本编码技术简介

#### 1.单词计数向量

在开始分类之前，必须先将文本编码成数字。一种常用的方法是**单词计数向量**。在这种技术中，一个样本可以包含一段话或一篇文章，这个样本中如果出现了10个单词，就会有10个特征(n=10)，**每个特征代表一个单词**，**特征的取值表示这个单词在这个样本中总共出现了几次**，是一个**离散**的、代表**次数**的正整数。在sklearn当中，单词计数向量计数可以通过`feature_extraction.text.CountVectorizer`类实现。

In [2]:
sample = ["Machine learning is fascinating, it is wonderful" #6个单词（重复的算1个）
         ,"Machine learning is a sensational techonology" #2个单词（a不算？）
         ,"Elsa is a popular character"] #3个单词

vec = CountVectorizer()
X = vec.fit_transform(sample)
X #3行（3句话即3个样本）11列（11个单词即11个特征）

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [3]:
#使用接口get_feature_names()调用每个列的名称
vec.get_feature_names()

['character',
 'elsa',
 'fascinating',
 'is',
 'it',
 'learning',
 'machine',
 'popular',
 'sensational',
 'techonology',
 'wonderful']

In [4]:
#注意稀疏矩阵是无法输入pandas的，需要用toarray()
CVresult = pd.DataFrame(X.toarray(),columns = vec.get_feature_names())
CVresult

Unnamed: 0,character,elsa,fascinating,is,it,learning,machine,popular,sensational,techonology,wonderful
0,0,0,1,2,1,1,1,0,0,0,1
1,0,0,0,1,0,1,1,0,1,1,0
2,1,1,0,1,0,0,0,1,0,0,0


从这个编码结果可以发现两个问题：<br>
1.补集朴素贝叶斯让**每个特征的权重除以自己的L2范式**，避免句子特别长的样本（在很多特征下都有值的样本）对$θ_{ci}$的贡献比其他样本更大；<br>
2.通常'is'对语义没什么影响，但出现次数多，会占有较高的权重，对分类来说，这明显是对算法的一种误导。为了解决这个问题，可以使用单词在句子中所占的比例来编码单词，即**TF-IDF方法**。

#### 2.TF-IDF

TF-IDF全称term frequency-inverse document frequency，词频逆文档频率，是通过**单词在文档中出现的频率**来衡量其权重，也就是说，IDF的大小与一个词的常见程度成反比，这个词越常见，编码后为它设置的权重会倾向于越小，以此来压制频繁出现的一些无意义的词。在sklearn当中，使用`feature_extraction.text.TfidfVvectorizer`执行这种编码。

$$TF_i=\frac{某个词i在文章中的出现次数}{文章的总词数}$$

$$IDF=|log(\frac{语料库的文档总数}{包含词条i的文档数+1})|$$

$$TF-IDF=TF_i*IDF$$

In [5]:
vec = TFIDF()
X = vec.fit_transform(sample)
X

<3x11 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [6]:
#使用接口get_feature_names()调用每个列的名称
TFIDFresult = pd.DataFrame(X.toarray(),columns=vec.get_feature_names())
TFIDFresult

Unnamed: 0,character,elsa,fascinating,is,it,learning,machine,popular,sensational,techonology,wonderful
0,0.0,0.0,0.424396,0.50131,0.424396,0.322764,0.322764,0.0,0.0,0.0,0.424396
1,0.0,0.0,0.0,0.315444,0.0,0.406192,0.406192,0.0,0.534093,0.534093,0.0
2,0.546454,0.546454,0.0,0.322745,0.0,0.0,0.0,0.546454,0.0,0.0,0.0


使用TF-IDF编码之后，出现得多的单词的权重（theta）被降低了么？

In [7]:
CVresult.sum(axis=0)/CVresult.sum(axis=0).sum()

character      0.0625
elsa           0.0625
fascinating    0.0625
is             0.2500
it             0.0625
learning       0.1250
machine        0.1250
popular        0.0625
sensational    0.0625
techonology    0.0625
wonderful      0.0625
dtype: float64

In [8]:
TFIDFresult.sum(axis=0)/TFIDFresult.sum(axis=0).sum()

character      0.083071
elsa           0.083071
fascinating    0.064516
is             0.173225
it             0.064516
learning       0.110815
machine        0.110815
popular        0.083071
sensational    0.081192
techonology    0.081192
wonderful      0.064516
dtype: float64

将原本出现次数比较多的词压缩权重；将原本出现次数比较少的词增加权重。

### （二）探索文本数据

In [9]:
data = fetch_20newsgroups()
data.target_names #不同类型的新闻

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [10]:
categories = ["sci.space" #科学技术 - 太空
             ,"rec.sport.hockey" #运动 - 曲棍球
             ,"talk.politics.guns" #政治 - 枪支问题
             ,"talk.politics.mideast"] #政治 - 中东问题
train = fetch_20newsgroups(subset="train",categories = categories) #训练集
test = fetch_20newsgroups(subset="test",categories = categories) #测试集

In [11]:
train.target_names #标签的分类

['rec.sport.hockey',
 'sci.space',
 'talk.politics.guns',
 'talk.politics.mideast']

In [12]:
print(train.data[0])

From: tvartiai@vipunen.hut.fi (Tommi Vartiainen)
Subject: Re: Finland/Sweden vs.NHL teams (WAS:Helsinki/Stockholm & NHL expansion)
Nntp-Posting-Host: vipunen.hut.fi
Organization: Helsinki University of Technology, Finland
Lines: 51

In <1993Apr16.195754.5476@ousrvr.oulu.fi> mep@phoenix.oulu.fi (Marko Poutiainen) writes:

>: FINLAND:  
>: 
>: D-Jyrki Lumme.......20
>: D-Teppo Numminen....20
>: D-Peter Ahola.......13
>: 
>Well well, they don't like our defenders (mainly Lumme and Numminen)...

About 25 is correct for Numminen and Lumme.


>: R-Teemu Selanne.....27
>: 
>Compared to Kurri, Selanne's points are too high, lets make it 25 or 26.

No, Kurri's points are too low. 27 for Kurri and 28 for Sel{nne.

>: well in the Canada Cup and World Championships largely due to the efforts of
>: Markus Ketterer (the goalie), 3-4 or the players listed above and luck. There's
>: presumably a lot of decent players in Finland that wouldn't be superstars at
>: the highest level but still valuable rol

In [13]:
np.unique(train.target)

array([0, 1, 2, 3], dtype=int64)

In [14]:
len(train.target)

2303

In [15]:
#是否存在样本不平衡问题：否
for i in [0,1,2,3]:
    print(i,(train.target == i).sum()/len(train.target))

0 0.26052974381241856
1 0.25749023013460703
2 0.23708206686930092
3 0.24489795918367346


### （三）使用TF-IDF降文本数据编码

In [16]:
Xtrain = train.data
Xtest = test.data
Ytrain = train.target
Ytest = test.target

In [17]:
tfidf = TFIDF().fit(Xtrain) #用训练集拟合
Xtrain_ = tfidf.transform(Xtrain)
Xtest_ = tfidf.transform(Xtest)

In [18]:
Xtrain_ #稀疏矩阵，2303行，40725个特征（不重复的单词）

<2303x40725 sparse matrix of type '<class 'numpy.float64'>'
	with 430306 stored elements in Compressed Sparse Row format>

In [19]:
tosee = pd.DataFrame(Xtrain_.toarray(),columns=tfidf.get_feature_names())
tosee.head()

Unnamed: 0,00,000,0000,00000,000000,000021,000062david42,000152,000246,000256,...,zwrm,zx,zx6wre,zxp,zxqi,zy,zyg,zz,zz_g9q3,zzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.058046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
tosee.shape

(2303, 40725)

### （四）在贝叶斯上分别建模，查看结果

In [21]:
name = ["Multinomial","Complement","Bournulli"]
#注意高斯朴素贝叶斯不接受稀疏矩阵
models = [MultinomialNB(),ComplementNB(),BernoulliNB()]

In [22]:
for name,clf in zip(name,models):
    clf.fit(Xtrain_,Ytrain)
    y_pred = clf.predict(Xtest_)
    proba = clf.predict_proba(Xtest_)
    score = clf.score(Xtest_,Ytest)
    print(name)
    
    #4个不同的标签取值下的布里尔分数
    Bscore = []
    for i in range(len(np.unique(Ytrain))):
        bs = BS(pd.get_dummies(Ytest).iloc[:,i],proba[:,i],pos_label=1)
        Bscore.append(bs)
        print("\tBrier under {}:{:.3f}".format(train.target_names[i],bs))
    
    print("\tAverage Brier:{:.3f}".format(np.mean(Bscore)))
    print("\tAccuracy:{:.3f}".format(score))
    print("\n")

Multinomial
	Brier under rec.sport.hockey:0.018
	Brier under sci.space:0.033
	Brier under talk.politics.guns:0.030
	Brier under talk.politics.mideast:0.026
	Average Brier:0.027
	Accuracy:0.975


Complement
	Brier under rec.sport.hockey:0.023
	Brier under sci.space:0.039
	Brier under talk.politics.guns:0.039
	Brier under talk.politics.mideast:0.033
	Average Brier:0.033
	Accuracy:0.986


Bournulli
	Brier under rec.sport.hockey:0.068
	Brier under sci.space:0.025
	Brier under talk.politics.guns:0.045
	Brier under talk.politics.mideast:0.053
	Average Brier:0.048
	Accuracy:0.902




从结果上来看，两种贝叶斯的效果都很不错。虽然补集贝叶斯的布里尔分数更高，但它的精确度更高。可以使用概率校准来试试看能否让模型进一步突破：

In [23]:
name = ["Multinomial"
        ,"Multinomial + Isotonic"
        ,"Multinomial + Sigmoid"
        ,"Complement"
        ,"Complement + Isotonic"
        ,"Complement + Sigmoid"
        ,"Bernoulli"
        ,"Bernoulli + Isotonic"
        ,"Bernoulli + Sigmoid"]

models = [MultinomialNB()
         ,CalibratedClassifierCV(MultinomialNB(), cv=2, method='isotonic')
         ,CalibratedClassifierCV(MultinomialNB(), cv=2, method='sigmoid')
         ,ComplementNB()
         ,CalibratedClassifierCV(ComplementNB(), cv=2, method='isotonic')
         ,CalibratedClassifierCV(ComplementNB(), cv=2, method='sigmoid')
         ,BernoulliNB()
         ,CalibratedClassifierCV(BernoulliNB(), cv=2, method='isotonic')
         ,CalibratedClassifierCV(BernoulliNB(), cv=2, method='sigmoid')
         ]

for name,clf in zip(name,models):
    clf.fit(Xtrain_,Ytrain)
    y_pred = clf.predict(Xtest_)
    proba = clf.predict_proba(Xtest_)
    score = clf.score(Xtest_,Ytest)
    print(name)
    Bscore = []
    for i in range(len(np.unique(Ytrain))):
        bs = BS(pd.get_dummies(Ytest).iloc[:,i],proba[:,i],pos_label=1)
        Bscore.append(bs)
        print("\tBrier under {}:{:.3f}".format(train.target_names[i],bs))
print("\tAverage Brier:{:.3f}".format(np.mean(Bscore)))
print("\tAccuracy:{:.3f}".format(score))
print("\n")

Multinomial
	Brier under rec.sport.hockey:0.018
	Brier under sci.space:0.033
	Brier under talk.politics.guns:0.030
	Brier under talk.politics.mideast:0.026
Multinomial + Isotonic
	Brier under rec.sport.hockey:0.006
	Brier under sci.space:0.012
	Brier under talk.politics.guns:0.013
	Brier under talk.politics.mideast:0.009
Multinomial + Sigmoid
	Brier under rec.sport.hockey:0.006
	Brier under sci.space:0.012
	Brier under talk.politics.guns:0.013
	Brier under talk.politics.mideast:0.009
Complement
	Brier under rec.sport.hockey:0.023
	Brier under sci.space:0.039
	Brier under talk.politics.guns:0.039
	Brier under talk.politics.mideast:0.033
Complement + Isotonic
	Brier under rec.sport.hockey:0.004
	Brier under sci.space:0.007
	Brier under talk.politics.guns:0.009
	Brier under talk.politics.mideast:0.006
Complement + Sigmoid
	Brier under rec.sport.hockey:0.004
	Brier under sci.space:0.009
	Brier under talk.politics.guns:0.010
	Brier under talk.politics.mideast:0.007
Bernoulli
	Brier under re

可以观察到，多项式分布下无论如何调整，算法的效果都不如补集朴素贝叶斯好。因此在分类的时候，应该选择**补集朴素贝叶斯**。对于补集朴素贝叶斯来说，使用Sigmoid进行概率校准的模型综合最优秀：准确率最高，对数损失和布里尔分数都在0.1以下，可以说是非常理想的模型了。<br>
对于机器学习而言，朴素贝叶斯也许不是最常用的分类算法，但作为概率预测算法中唯——个真正**依赖概率**来进行计算，并且简单快捷的算法，朴素贝叶斯还是常常被人们提起。并且，朴素贝叶斯在文本分类上的效果的确非常优秀。由此可见，只要能够提供足够的数据，合理利用高维数据进行训练，朴素贝叶斯就可以提供意想不到的效果。