# 1. Feature extraction 特征抽取

## DictVectorizer对使用字典存储的数据进行特征抽取与向量化

In [1]:
# Import DictVectorizer from sklearn.feature_extraction.
from sklearn.feature_extraction import DictVectorizer

# A list of dicts representing several data samples (one dict per sample).
measurements = [{'city': 'Dubai', 'temperature': 33.}, {'city': 'London', 'temperature': 12.}, {'city': 'San Fransisco', 'temperature': 18.}]

# Initialise the DictVectorizer feature extractor.
vec = DictVectorizer()
# Print the transformed feature matrix; string-valued features are one-hot
# encoded while numeric features are passed through unchanged.
print(vec.fit_transform(measurements).toarray())

# Print the meaning of each feature dimension.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back otherwise.
# .tolist() converts the returned ndarray to a plain list of str so the
# printed output keeps the same format as before.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out().tolist()
else:
    feature_names = vec.get_feature_names()
print(feature_names)

[[  1.   0.   0.  33.]
 [  0.   1.   0.  12.]
 [  0.   0.   1.  18.]]
['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']


## 使用CountVectorizer并且不去掉停用词的条件下，对文本特征进行量化的朴素贝叶斯分类性能测试

In [2]:
# Import the 20-newsgroups text dataset fetcher from sklearn.datasets.
from sklearn.datasets import fetch_20newsgroups
# Import CountVectorizer from sklearn.feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer

# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
# The fallback keeps the cell runnable on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Download the news samples on demand; subset='all' fetches the full set of
# nearly 20,000 documents, stored in the variable `news`.
news = fetch_20newsgroups(subset='all')

# Split the data: 75% of the documents for training, 25% for testing.
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=33)

# Initialise CountVectorizer with its default configuration (which does not
# remove English stop words) and assign it to count_vec.
count_vec = CountVectorizer()

# Convert the raw training and test text into feature vectors using term
# counts only. The vocabulary is learned from the training set and then
# reused, unchanged, to transform the test set.
X_count_train = count_vec.fit_transform(X_train)
X_count_test = count_vec.transform(X_test)




In [6]:
# Import the multinomial naive Bayes classifier from sklearn.naive_bayes.
from sklearn.naive_bayes import MultinomialNB

# Build the classifier with default settings and learn its parameters from
# the CountVectorizer features (stop words not removed) of the training set.
# fit() returns the estimator itself, so construction and training chain.
mnb_count = MultinomialNB().fit(X_count_train, y_train)

# Report the model's accuracy on the held-out test set.
print(
    'The accuracy of classifying 20newsgroups using Naive Bayes (CountVectorizer without filtering stopwords):',
    mnb_count.score(X_count_test, y_test),
)

The accuracy of classifying 20newsgroups using Naive Bayes (CountVectorizer without filtering stopwords): 0.839770797963


In [7]:
# Import classification_report from sklearn.metrics.
from sklearn.metrics import classification_report

# Store the predicted class labels for the test set in y_count_predict.
y_count_predict = mnb_count.predict(X_count_test)

# Print more detailed classification metrics, labelled with the newsgroup
# names instead of the integer class ids.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_count_predict,
        target_names=news.target_names,
    )
)

                          precision    recall  f1-score   support

             alt.atheism       0.86      0.86      0.86       201
           comp.graphics       0.59      0.86      0.70       250
 comp.os.ms-windows.misc       0.89      0.10      0.17       248
comp.sys.ibm.pc.hardware       0.60      0.88      0.72       240
   comp.sys.mac.hardware       0.93      0.78      0.85       242
          comp.windows.x       0.82      0.84      0.83       263
            misc.forsale       0.91      0.70      0.79       257
               rec.autos       0.89      0.89      0.89       238
         rec.motorcycles       0.98      0.92      0.95       276
      rec.sport.baseball       0.98      0.91      0.95       251
        rec.sport.hockey       0.93      0.99      0.96       233
               sci.crypt       0.86      0.98      0.91       238
         sci.electronics       0.85      0.88      0.86       249
                 sci.med       0.92      0.94      0.93       245
         