# 把字典列表转换成特征向量 DictVectorizer

## 加载字典中的特征

In [5]:
# Sample records: one categorical field ('city') and one numeric field ('temperature').
measurements = [
    dict(city='Dubai', temperature=33.),
    dict(city='London', temperature=12.),
    dict(city='San Fransisco', temperature=18.),
]

In [6]:
import pandas as pd  # no earlier cell of this notebook imports pandas

# Tabular preview of the raw records before they are vectorized.
pd.DataFrame(measurements)

Unnamed: 0,city,temperature
0,Dubai,33.0
1,London,12.0
2,San Fransisco,18.0


### 直接把字典对象传给 DictVectorizer 就可以

In [8]:
from sklearn.feature_extraction import DictVectorizer

# Vectorizer with default settings; it is fitted on the records in the next cell.
vec = DictVectorizer()

In [9]:
# Fit the vectorizer on the records and display the encoded matrix as a dense array.
vec.fit_transform(measurements).toarray()

array([[ 1.,  0.,  0., 33.],
       [ 0.,  1.,  0., 12.],
       [ 0.,  0.,  1., 18.]])

In [10]:
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2. Prefer get_feature_names_out() when the installed version provides it, and
# fall back to the old method on older releases. tolist() keeps the output a plain
# list of strings, matching what the old method returned.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out().tolist()
else:
    feature_names = vec.get_feature_names()
feature_names

['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']

In [11]:
from sklearn.feature_extraction import DictVectorizer

# sparse=False asks the vectorizer not to produce a sparse matrix.
vec = DictVectorizer(sparse=False)

# Learn the feature mapping from the records and encode them in one step.
X = vec.fit_transform(measurements)

In [12]:
# Display the encoded matrix as nested Python lists.
X.tolist()

[[1.0, 0.0, 0.0, 33.0], [0.0, 1.0, 0.0, 12.0], [0.0, 0.0, 1.0, 18.0]]

In [13]:
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2. Prefer get_feature_names_out() when the installed version provides it, and
# fall back to the old method on older releases. tolist() keeps the output a plain
# list of strings, matching what the old method returned.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out().tolist()
else:
    feature_names = vec.get_feature_names()
feature_names

['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']

### 把向量恢复成原始的字典列表

In [14]:
# Map each encoded row back to a dict of its non-zero features. The keys are the
# encoded feature names (e.g. 'city=Dubai'), not the original 'city' key.
vec.inverse_transform(X)

[{'city=Dubai': 1.0, 'temperature': 33.0},
 {'city=London': 1.0, 'temperature': 12.0},
 {'city=San Fransisco': 1.0, 'temperature': 18.0}]

In [15]:
# Inspect the parameters the vectorizer is currently configured with.
vec.get_params(deep=True)

{'dtype': numpy.float64, 'separator': '=', 'sort': True, 'sparse': False}

## 案例：20 类新闻文本分类

### 例1：使用词频统计，没有去掉停用词

In [16]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 newsgroups corpus with subset='all'.
news = fetch_20newsgroups(subset='all')

# news.data is a list whose elements are strings; news.target holds the labels.
X, y = news.data, news.target

In [19]:
# Check the container type of the corpus.
type(X)

list

In [20]:
# Check the type of a single document in the corpus.
type(X[0])

str

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the posts for evaluation; random_state fixes the split.
split = train_test_split(X, y, test_size=0.2, random_state=666)
X_train, X_test, y_train, y_test = split

### 只有词频统计

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# The vectorizer is fitted on the training posts only and then reused,
# without refitting, to encode the test posts.
count_vec = CountVectorizer()
X_count_train, X_count_test = (
    count_vec.fit_transform(X_train),
    count_vec.transform(X_test),
)

### 使用逻辑回归，训练效果还不错

In [25]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression().fit(X_count_train, y_train)
y_count_predict_lr = lr.predict(X_count_test)

# Score on the held-out posts; as the last expression it is shown as the cell output.
lr.score(X_count_test, y_test)

0.8976127320954908

### 朴素贝叶斯分类器

In [23]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes trained on the raw term counts.
mnb_count = MultinomialNB().fit(X_count_train, y_train)
y_count_predict = mnb_count.predict(X_count_test)

score = mnb_count.score(X_count_test, y_test)
print(score)

0.8403183023872679


### 查看更详细的分类报告

In [24]:
from sklearn.metrics import classification_report

# Per-newsgroup precision / recall / F1 for the count-based Naive Bayes predictions.
report = classification_report(
    y_test,
    y_count_predict,
    target_names=news.target_names,
)
print(report)

                          precision    recall  f1-score   support

             alt.atheism       0.89      0.86      0.87       180
           comp.graphics       0.69      0.87      0.77       203
 comp.os.ms-windows.misc       1.00      0.09      0.16       196
comp.sys.ibm.pc.hardware       0.63      0.84      0.72       182
   comp.sys.mac.hardware       0.87      0.85      0.86       194
          comp.windows.x       0.68      0.89      0.77       201
            misc.forsale       0.89      0.72      0.79       176
               rec.autos       0.93      0.91      0.92       217
         rec.motorcycles       0.96      0.93      0.95       198
      rec.sport.baseball       0.97      0.94      0.95       177
        rec.sport.hockey       0.97      0.98      0.97       183
               sci.crypt       0.87      0.98      0.92       209
         sci.electronics       0.86      0.77      0.81       219
                 sci.med       0.93      0.94      0.93       199
         

## 例2：使用 tfidf 统计，没有去掉停用词

In [93]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Reload the corpus inside this cell so the example does not rely on earlier cells.
news = fetch_20newsgroups(subset='all')
X, y = news.data, news.target

# Same test fraction and seed as the other examples in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666)

# The TF-IDF vectorizer is fitted on the training posts, then applied to the test posts.
tfidf = TfidfVectorizer()
X_tfidf_train = tfidf.fit_transform(X_train)
X_tfidf_test = tfidf.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
mnb_clf = MultinomialNB().fit(X_tfidf_train, y_train)
y_tfidf_predict = mnb_clf.predict(X_tfidf_test)

print(classification_report(y_test, y_tfidf_predict, target_names=news.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.88      0.63      0.73       180
           comp.graphics       0.88      0.75      0.81       203
 comp.os.ms-windows.misc       0.86      0.88      0.87       196
comp.sys.ibm.pc.hardware       0.73      0.85      0.78       182
   comp.sys.mac.hardware       0.92      0.84      0.87       194
          comp.windows.x       0.93      0.84      0.88       201
            misc.forsale       0.89      0.72      0.80       176
               rec.autos       0.94      0.92      0.93       217
         rec.motorcycles       0.93      0.96      0.94       198
      rec.sport.baseball       0.93      0.97      0.95       177
        rec.sport.hockey       0.94      0.99      0.97       183
               sci.crypt       0.83      1.00      0.90       209
         sci.electronics       0.89      0.73      0.80       219
                 sci.med       0.92      0.91      0.92       199
         

In [95]:
# Score of the TF-IDF Naive Bayes model on the held-out posts.
mnb_clf.score(X_tfidf_test, y_test)

0.843236074270557

# NOTE(review): this cell has no execution marker and its code is identical to the
# executed cell that follows it — presumably a leftover copy; confirm before removing.
# It appears to open the third example (term counts with English stop words removed),
# which has no section heading of its own.
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

news = fetch_20newsgroups(subset='all')

X = news.data
y = news.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=666)

In [21]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Reload the corpus and rebuild the split so this example can be run on its own.
news = fetch_20newsgroups(subset='all')
X, y = news.data, news.target

# Same test fraction and seed as the other examples in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666)

In [22]:
from sklearn.naive_bayes import MultinomialNB

# Word-level term counts with the 'english' stop-word list filtered out.
count_filter_vec = CountVectorizer(stop_words='english', analyzer='word')
X_count_train = count_filter_vec.fit_transform(X_train)
X_count_test = count_filter_vec.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
mnb_count_clf = MultinomialNB().fit(X_count_train, y_train)
y_count_predict = mnb_count_clf.predict(X_count_test)

score = mnb_count_clf.score(X_count_test, y_test)
print(score)

0.8655172413793103


In [99]:
from sklearn.metrics import classification_report

# Per-newsgroup precision / recall / F1 for the stop-word-filtered count model.
report = classification_report(
    y_test,
    y_count_predict,
    target_names=news.target_names,
)
print(report)

                          precision    recall  f1-score   support

             alt.atheism       0.90      0.93      0.92       180
           comp.graphics       0.71      0.87      0.78       203
 comp.os.ms-windows.misc       1.00      0.20      0.33       196
comp.sys.ibm.pc.hardware       0.65      0.83      0.73       182
   comp.sys.mac.hardware       0.83      0.86      0.84       194
          comp.windows.x       0.70      0.90      0.79       201
            misc.forsale       0.86      0.77      0.81       176
               rec.autos       0.94      0.91      0.92       217
         rec.motorcycles       0.95      0.96      0.96       198
      rec.sport.baseball       0.97      0.95      0.96       177
        rec.sport.hockey       0.96      0.98      0.97       183
               sci.crypt       0.91      0.98      0.94       209
         sci.electronics       0.89      0.84      0.87       219
                 sci.med       0.94      0.96      0.95       199
         

## 例4：使用 tfidf 统计，去掉停用词

In [100]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Reload the corpus and rebuild the split so this example can be run on its own.
news = fetch_20newsgroups(subset='all')
X, y = news.data, news.target

# Same test fraction and seed as the other examples in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666)

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF features with the 'english' stop-word list filtered out.
tfidf_filter_vec = TfidfVectorizer(stop_words='english', analyzer='word')
X_tfidf_train = tfidf_filter_vec.fit_transform(X_train)
X_tfidf_test = tfidf_filter_vec.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
mnb_tfidf_clf = MultinomialNB().fit(X_tfidf_train, y_train)
y_tfidf_predict = mnb_tfidf_clf.predict(X_tfidf_test)

# The score is kept in `score` for the next cell; only the report is printed here.
score = mnb_tfidf_clf.score(X_tfidf_test, y_test)
report = classification_report(
    y_test,
    y_tfidf_predict,
    target_names=news.target_names,
)
print(report)

                          precision    recall  f1-score   support

             alt.atheism       0.90      0.79      0.84       180
           comp.graphics       0.85      0.82      0.84       203
 comp.os.ms-windows.misc       0.87      0.90      0.89       196
comp.sys.ibm.pc.hardware       0.77      0.85      0.81       182
   comp.sys.mac.hardware       0.91      0.85      0.88       194
          comp.windows.x       0.92      0.88      0.90       201
            misc.forsale       0.84      0.81      0.82       176
               rec.autos       0.94      0.92      0.93       217
         rec.motorcycles       0.92      0.97      0.95       198
      rec.sport.baseball       0.95      0.97      0.96       177
        rec.sport.hockey       0.95      0.99      0.97       183
               sci.crypt       0.91      0.99      0.95       209
         sci.electronics       0.92      0.79      0.85       219
                 sci.med       0.93      0.94      0.93       199
         

In [102]:
# Display the score computed in the previous cell.
score

0.880106100795756

---

## 卡方检验

In [22]:
import numpy as np  # no earlier cell of this notebook imports NumPy

# 2x2 table of observed counts for the chi-square walkthrough.
# NOTE: this rebinds X, which earlier cells used for the list of news posts.
X = np.array([[19, 24],
              [34, 10]])

In [23]:
# Display the observed-count table.
X

array([[19, 24],
       [34, 10]])

In [35]:
# Share of the grand total that falls in each column of the observed table.
column_totals = X.sum(axis=0)
rate = column_totals / column_totals.sum()
print('属于娱乐的比例：', rate[0])
print('不属于娱乐的比例：', rate[1])

属于娱乐的比例： 0.6091954022988506
不属于娱乐的比例： 0.39080459770114945


In [41]:
# Row totals of the observed table, printed with one label per row.
content = X.sum(axis=1)
row_labels = ('不包含吴亦凡的新闻总数：', '包含吴亦凡的新闻总数：')
for row_label, row_total in zip(row_labels, content):
    print(row_label, row_total)

不包含吴亦凡的新闻总数： 43
包含吴亦凡的新闻总数： 44


In [46]:
# 理论值矩阵
T = [[content[0]*rate[0],content[0]*rate[1]],
 [content[1]*rate[0],content[1]*rate[1]]
]
T

[[26.195402298850578, 16.804597701149426],
 [26.804597701149426, 17.195402298850574]]

In [48]:
# Deviation of the observed counts from the theoretical values. T is a nested list;
# subtracting it from the NumPy array X yields an array.
X-T

array([[-7.1954023,  7.1954023],
       [ 7.1954023, -7.1954023]])

In [51]:
import math

# Element-wise square of the deviation between observed and theoretical values.
deviation = X - T
np.power(deviation, 2)

array([[51.77381424, 51.77381424],
       [51.77381424, 51.77381424]])

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The TF-IDF vectorizer is fitted on the training posts, then applied to the test posts.
tfidf_vec = TfidfVectorizer()
X_tfidf_train, X_tfidf_test = (
    tfidf_vec.fit_transform(X_train),
    tfidf_vec.transform(X_test),
)

In [90]:
# Multinomial Naive Bayes on the TF-IDF features; fit() returns the estimator itself.
mnb_tfidf = MultinomialNB().fit(X_tfidf_train, y_train)
y_tfidf_predict = mnb_tfidf.predict(X_tfidf_test)

In [91]:
# Per-newsgroup precision / recall / F1 for the TF-IDF Naive Bayes predictions.
report = classification_report(
    y_test,
    y_tfidf_predict,
    target_names=news.target_names,
)
print(report)

                          precision    recall  f1-score   support

             alt.atheism       0.88      0.63      0.73       180
           comp.graphics       0.88      0.75      0.81       203
 comp.os.ms-windows.misc       0.86      0.88      0.87       196
comp.sys.ibm.pc.hardware       0.73      0.85      0.78       182
   comp.sys.mac.hardware       0.92      0.84      0.87       194
          comp.windows.x       0.93      0.84      0.88       201
            misc.forsale       0.89      0.72      0.80       176
               rec.autos       0.94      0.92      0.93       217
         rec.motorcycles       0.93      0.96      0.94       198
      rec.sport.baseball       0.93      0.97      0.95       177
        rec.sport.hockey       0.94      0.99      0.97       183
               sci.crypt       0.83      1.00      0.90       209
         sci.electronics       0.89      0.73      0.80       219
                 sci.med       0.92      0.91      0.92       199
         

In [103]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# Two-step pipeline: CountVectorizer output is fed to TfidfTransformer.
vectorizer = CountVectorizer()
transformer = TfidfTransformer()

term_counts = vectorizer.fit_transform(corpus)
# NOTE: this rebinds `tfidf`, which an earlier cell used for a TfidfVectorizer instance.
tfidf = transformer.fit_transform(term_counts)
print(tfidf)


  (0, 8)	0.4387767428592343
  (0, 3)	0.4387767428592343
  (0, 6)	0.35872873824808993
  (0, 2)	0.5419765697264572
  (0, 1)	0.4387767428592343
  (1, 8)	0.27230146752334033
  (1, 3)	0.27230146752334033
  (1, 6)	0.2226242923251039
  (1, 1)	0.27230146752334033
  (1, 5)	0.8532257361452784
  (2, 6)	0.2884767487500274
  (2, 0)	0.5528053199908667
  (2, 7)	0.5528053199908667
  (2, 4)	0.5528053199908667
  (3, 8)	0.4387767428592343
  (3, 3)	0.4387767428592343
  (3, 6)	0.35872873824808993
  (3, 2)	0.5419765697264572
  (3, 1)	0.4387767428592343
