# 特征提升之特征抽取

In [1]:
measurements = [{'city':'shanghai','temperature':33.},{'city':'beijing','temperature':20.},{'city':'fuzhou','temperature':13.}]

In [3]:
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()
vec.fit_transform(measurements).toarray()

array([[  0.,   0.,   1.,  33.],
       [  1.,   0.,   0.,  20.],
       [  0.,   1.,   0.,  13.]])

In [4]:
# 输出各个维度的特征的含义

vec.get_feature_names()

['city=beijing', 'city=fuzhou', 'city=shanghai', 'temperature']

In [8]:
from sklearn.datasets import fetch_20newsgroups

news = fetch_20newsgroups(subset='all')

In [10]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=  train_test_split(news.data,news.target,test_size=0.25,random_state=33)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# 默认不去除停用词
count_vec = CountVectorizer()

只使用词频统计的方式将原始训练和测试文本转换为特征向量。

In [16]:
X_count_train = count_vec.fit_transform(X_train)
X_count_test = count_vec.transform(X_test)

使用朴素贝叶斯分类器。

In [17]:
from sklearn.naive_bayes import MultinomialNB

mnb_count = MultinomialNB()
mnb_count.fit(X_count_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
mnb_count.score(X_count_test,y_test)

0.83977079796264853

+ 进行预测

In [19]:
y_count_predict = mnb_count.predict(X_count_test)

In [20]:
from sklearn.metrics import accuracy_score

accuracy_score(y_count_predict,y_test)

0.83977079796264853

看一看分类报告

In [38]:
news.description

'the 20 newsgroups by date dataset'

In [41]:
len(news.target_names)

20

注意，查看 API 你会发现，真实值要放在最前面，然后放预测值。

In [24]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_count_predict,target_names=news.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.86      0.86      0.86       201
           comp.graphics       0.59      0.86      0.70       250
 comp.os.ms-windows.misc       0.89      0.10      0.17       248
comp.sys.ibm.pc.hardware       0.60      0.88      0.72       240
   comp.sys.mac.hardware       0.93      0.78      0.85       242
          comp.windows.x       0.82      0.84      0.83       263
            misc.forsale       0.91      0.70      0.79       257
               rec.autos       0.89      0.89      0.89       238
         rec.motorcycles       0.98      0.92      0.95       276
      rec.sport.baseball       0.98      0.91      0.95       251
        rec.sport.hockey       0.93      0.99      0.96       233
               sci.crypt       0.86      0.98      0.91       238
         sci.electronics       0.85      0.88      0.86       249
                 sci.med       0.92      0.94      0.93       245
         

In [43]:
# 下面这种写法是错误的。
# print(classification_report(y_count_predict,y_test,target_names=news.target_names))

使用计算倒数的方式生成词袋模型。

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
X_tfidf_train = tfidf_vec.fit_transform(X_train)
X_tfidf_test = tfidf_vec.transform(X_test)

In [53]:
mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(X_tfidf_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [54]:
mnb_tfidf.score(X_tfidf_test,y_test)

0.84634974533106966

In [59]:
y_tfidf_predict = mnb_tfidf.predict(X_tfidf_test)
sum(y_test==y_tfidf_predict)/len(y_tfidf_predict)

0.84634974533106966

In [60]:
accuracy_score(y_test,y_tfidf_predict)

0.84634974533106966

In [62]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_tfidf_predict,target_names=news.target_names))

+ 使用停用词的情况下训练

                          precision    recall  f1-score   support

             alt.atheism       0.84      0.67      0.75       201
           comp.graphics       0.85      0.74      0.79       250
 comp.os.ms-windows.misc       0.82      0.85      0.83       248
comp.sys.ibm.pc.hardware       0.76      0.88      0.82       240
   comp.sys.mac.hardware       0.94      0.84      0.89       242
          comp.windows.x       0.96      0.84      0.89       263
            misc.forsale       0.93      0.69      0.79       257
               rec.autos       0.84      0.92      0.88       238
         rec.motorcycles       0.98      0.92      0.95       276
      rec.sport.baseball       0.96      0.91      0.94       251
        rec.sport.hockey       0.88      0.99      0.93       233
               sci.crypt       0.73      0.98      0.83       238
         sci.electronics       0.91      0.83      0.87       249
                 sci.med       0.97      0.92      0.95       245
         

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

count_fiter_ect = CountVectorizer(analyzer='word',stop_words='english')
tfidf_fiter_ect = TfidfVectorizer(analyzer='word',stop_words='english')

In [64]:
X_count_fiter_train = count_fiter_ect.fit_transform(X_train)
X_count_fiter_test = count_fiter_ect.transform(X_test)

X_tfidf_fiter_train = tfidf_fiter_ect.fit_transform(X_train)
X_tfidf_fiter_test = count_fiter_ect.transform(X_test)

In [66]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

mnb_count_filter = MultinomialNB()
mnb_count_filter.fit(X_count_fiter_train,y_train)
count_fiter_test_predict = mnb_count_filter.predict(X_count_fiter_test)
print(mnb_count_filter.score(X_count_fiter_test,y_test))


mnb_tfidf_filter = MultinomialNB()
mnb_tfidf_filter.fit(X_tfidf_fiter_train,y_train)
tfidf_fiter_test_predict = mnb_tfidf_filter.predict(X_tfidf_fiter_test)
print(mnb_tfidf_filter.score(X_tfidf_fiter_test,y_test))


0.863752122241
0.864176570458


In [68]:
from sklearn.metrics import classification_report

print(classification_report(y_test,count_fiter_test_predict,target_names=news.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.85      0.89      0.87       201
           comp.graphics       0.62      0.88      0.73       250
 comp.os.ms-windows.misc       0.93      0.22      0.36       248
comp.sys.ibm.pc.hardware       0.62      0.88      0.73       240
   comp.sys.mac.hardware       0.93      0.85      0.89       242
          comp.windows.x       0.82      0.85      0.84       263
            misc.forsale       0.90      0.79      0.84       257
               rec.autos       0.91      0.91      0.91       238
         rec.motorcycles       0.98      0.94      0.96       276
      rec.sport.baseball       0.98      0.92      0.95       251
        rec.sport.hockey       0.92      0.99      0.95       233
               sci.crypt       0.91      0.97      0.93       238
         sci.electronics       0.87      0.89      0.88       249
                 sci.med       0.94      0.95      0.95       245
         

In [69]:
print(classification_report(y_test,tfidf_fiter_test_predict,target_names=news.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.84      0.77      0.80       201
           comp.graphics       0.83      0.76      0.79       250
 comp.os.ms-windows.misc       0.80      0.88      0.84       248
comp.sys.ibm.pc.hardware       0.75      0.84      0.80       240
   comp.sys.mac.hardware       0.92      0.86      0.89       242
          comp.windows.x       0.96      0.83      0.89       263
            misc.forsale       0.90      0.81      0.85       257
               rec.autos       0.86      0.91      0.89       238
         rec.motorcycles       0.97      0.93      0.95       276
      rec.sport.baseball       0.95      0.92      0.94       251
        rec.sport.hockey       0.87      0.99      0.92       233
               sci.crypt       0.78      0.97      0.86       238
         sci.electronics       0.92      0.82      0.87       249
                 sci.med       0.97      0.90      0.93       245
         