In [4]:
import matplotlib.pyplot as plt
import numpy as np
import random
import operator
from sklearn import datasets
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

In [5]:
# 载入数据集
news = fetch_20newsgroups(subset='all')
print(news.target_names)
print(len(news.data))
print(len(news.target))


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
18846
18846


In [12]:
x_train,x_test, y_train, y_test = train_test_split(news.data, news.target, random_state=33)

In [13]:
# 采用普通统计CountVectorizer提取特征向量
# 默认配置不去除停用词，停顿用的词
cv = CountVectorizer()
cv_train = cv.fit_transform(x_train)
cv_test = cv.transform(x_test)

#去除停用词
stop_cv = CountVectorizer(analyzer='word', stop_words='english')
stop_cv_train = stop_cv.fit_transform(x_train)
stop_cv_test = stop_cv.transform(x_test)

In [14]:
# 采用TfidfVectorizer 提取文本特征向量
# 默认配置不去除停用词
tfid_vec = TfidfVectorizer()
x_tfid_train = tfid_vec.fit_transform(x_train)
x_tfid_test = tfid_vec.transform(x_test)
#去除停用词
stop_tfidf = TfidfVectorizer(analyzer='word', stop_words='english')
stop_tfidf_train = stop_tfidf.fit_transform(x_train)
stop_tfidf_test = stop_tfidf.transform(x_test)

In [15]:
# 使用朴素贝叶斯分类器 对两种提词方法得到的特征值进行学习和预测
# CountVectorizer
mnb_count = MultinomialNB()
mnb_count.fit(cv_train, y_train)   # 学习
mnb_count_y_predict = mnb_count.predict(cv_test)   # 预测
# 去除停用词
mnb_count_stop = MultinomialNB()
mnb_count_stop.fit(stop_cv_train, y_train)   # 学习
mnb_count_stop_y_predict = mnb_count_stop.predict(stop_cv_test)    # 预测

# 对TfidfVectorizer提取文本特征向量 学习和预测
mnb_tfid = MultinomialNB()
mnb_tfid.fit(x_tfid_train, y_train)
mnb_tfid_y_predict = mnb_tfid.predict(x_tfid_test)
# 去除停用词
mnb_tfid_stop = MultinomialNB()
mnb_tfid_stop.fit(stop_tfidf_train, y_train)   # 学习
mnb_tfid_stop_y_predict = mnb_tfid_stop.predict(stop_tfidf_test)    # 预测


In [17]:
# 5 模型评估
# 对普通统计CountVectorizer提取的特征学习模型进行评估
print("未去除停用词的CountVectorizer提取的特征学习模型准确率：", mnb_count.score(cv_test, y_test))
print("更加详细的评估指标:\n", classification_report(mnb_count_y_predict, y_test))
print("去除停用词的CountVectorizer提取的特征学习模型准确率：", mnb_count_stop.score(stop_cv_test, y_test))
print("更加详细的评估指标:\n", classification_report(mnb_count_stop_y_predict, y_test))

# 对TfidVectorizer提取的特征学习模型进行评估
print("TfidVectorizer提取的特征学习模型准确率：", mnb_tfid.score(x_tfid_test, y_test))
print("更加详细的评估指标:\n", classification_report(mnb_tfid_y_predict, y_test))
print("去除停用词的TfidVectorizer提取的特征学习模型准确率：", mnb_tfid_stop.score(stop_tfidf_test, y_test))
print("更加详细的评估指标:\n", classification_report(mnb_tfid_stop_y_predict, y_test))


未去除停用词的CountVectorizer提取的特征学习模型准确率： 0.8397707979626485
更加详细的评估指标:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86       201
           1       0.86      0.59      0.70       365
           2       0.10      0.89      0.17        27
           3       0.88      0.60      0.72       350
           4       0.78      0.93      0.85       204
           5       0.84      0.82      0.83       271
           6       0.70      0.91      0.79       197
           7       0.89      0.89      0.89       239
           8       0.92      0.98      0.95       257
           9       0.91      0.98      0.95       233
          10       0.99      0.93      0.96       248
          11       0.98      0.86      0.91       272
          12       0.88      0.85      0.86       259
          13       0.94      0.92      0.93       252
          14       0.96      0.89      0.92       239
          15       0.96      0.78      0.86       285
          16  

CountVectorizer方法构建单词的字典，每个单词转换为特征向量的一个数值特征，特征向量每个元素是特定单词在文本中出现的次数，维度是所有不同的单词个数，从而把所有句子都
统一长度，一般是字典序

In [8]:
# 词袋模型 bag of words
from sklearn.feature_extraction.text import CountVectorizer
texts = ["dog cat fish", "dog cat cat", "fish bird", "bird"]
cv = CountVectorizer()
cv_fit = cv.fit_transform(texts)

print(cv.get_feature_names_out())
print(cv_fit.toarray())

print(cv_fit.toarray().sum(axis=

['bird' 'cat' 'dog' 'fish']
[[0 1 1 1]
 [0 2 1 0]
 [1 0 0 1]
 [1 0 0 0]]
[2 3 2 2]


TfidfVectorizer使用了一个高级的计算方法，称为Term Frequency Inverse Document Frequency (TF-IDF)。可以避免因为有些词出现太过频繁而对一个视力的特征化作用不大的情况
比如 a和 and在英语中出现的频率比较高，但是它们对表征一个文本没什么作用

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
texts = ["The quick brow fox jumped over the lazy dog.", "The dog.", "The fox"]
tfidf = TfidfVectorizer()
tfidf.fit(texts)

print(tfidf.vocabulary_)
print(tfidf.idf_)
#编码文档
vector = tfidf.transform([texts[0]])

# 总结编码文档
print(vector.shape)
print(vector.toarray())


{'the': 7, 'quick': 6, 'brow': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ]
(1, 8)
[[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646
  0.36388646 0.42983441]]
