# CountVectorizer 的用法

In [2]:
# Toy corpus: three short documents; the third is the first two joined by "and".
docs = [
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet',
]

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model: learn the vocabulary from `docs` and count how often
# each token occurs in each document.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [4]:
# Token -> column-index mapping learned by the fitted vectorizer
# (in the output below the indices follow the alphabetical order of the tokens).
print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}


In [5]:
# Dense view of the count matrix: one row per document, one column per vocabulary entry.
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


词表中一共只有 $7$ 个不同的单词，所以每个文档的词袋向量长度都是 $7$。

In [8]:
import numpy as np

# Show floats with two decimals in every array printed from here on.
np.set_printoptions(precision=2)

除了词袋模型，还有 TF-IDF。

### `TfidfTransformer` 的用法

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with smoothed idf, then L2-normalise each
# document row. The input is the count matrix produced by `count`.
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.4  0.48 0.31 0.31 0.31 0.48 0.31]]


### 或者直接用 TfidfVectorizer

`TfidfVectorizer = CountVectorizer + TfidfTransformer`

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Single-step alternative that works directly on raw text; configured with
# the same idf / normalisation settings as the transformer used earlier.
tfidf_vec = TfidfVectorizer(
    use_idf=True,
    smooth_idf=True,
    norm='l2',
)

In [13]:
# Learn the vocabulary and idf weights from the corpus and transform it in one call.
result = tfidf_vec.fit_transform(docs)

In [17]:
# Printing the matrix directly lists only its stored entries as "(row, column) value".
print(result)

  (0, 5)	0.4337078595086741
  (0, 3)	0.5584778353707552
  (0, 1)	0.4337078595086741
  (0, 2)	0.5584778353707552
  (1, 5)	0.4337078595086741
  (1, 1)	0.4337078595086741
  (1, 6)	0.5584778353707552
  (1, 4)	0.5584778353707552
  (2, 5)	0.4781017181969035
  (2, 3)	0.307821505665273
  (2, 1)	0.4781017181969035
  (2, 2)	0.307821505665273
  (2, 6)	0.307821505665273
  (2, 4)	0.307821505665273
  (2, 0)	0.40474828809297636


In [15]:
# Dense view of the same matrix; the two-decimal rounding in the output
# presumably comes from the np.set_printoptions(precision=2) call earlier.
print(result.toarray())

[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.4  0.48 0.31 0.31 0.31 0.48 0.31]]


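下面手动计算第三个文档中单词 "is" 的 tf-idf。当 `smooth_idf=True` 时，scikit-learn 使用的公式是
$$\text{idf}(t) = \ln\frac{1 + n_d}{1 + \text{df}(t)} + 1, \qquad \text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$$
其中 $n_d$ 是文档总数，$\text{df}(t)$ 是包含单词 $t$ 的文档数；最后再对每个文档的向量做 L2 归一化。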

In [18]:
# Manual check of the un-normalised tf-idf of the term "is" in the third
# document, following the smoothed definition used with smooth_idf=True:
#   idf = ln((1 + n_docs) / (1 + df)),   tf-idf = tf * (idf + 1)
tf_is = 2    # "is" occurs twice in the third document
n_docs = 3   # number of documents in the corpus
df_is = 3    # "is" appears in all three documents
idf_is = np.log((n_docs + 1) / (df_is + 1))
tfidf_is = tf_is * (idf_is + 1)
print('tf-idf of term "is" = %.2f' % tfidf_is)

tf-idf of term "is" = 2.00


In [19]:
# Same weighting as before but with norm=None, so the values stay un-normalised.
tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
term_counts = count.fit_transform(docs)
# Keep only the last row, i.e. the third document.
raw_tfidf = tfidf.fit_transform(term_counts).toarray()[-1]
raw_tfidf

array([1.69, 2.  , 1.29, 1.29, 1.29, 2.  , 1.29])

In [20]:
# L2-normalise by hand: divide the vector by the square root of the sum of
# its squared entries.
squared_sum = np.square(raw_tfidf).sum()
l2_tfidf = raw_tfidf / np.sqrt(squared_sum)
l2_tfidf

array([0.4 , 0.48, 0.31, 0.31, 0.31, 0.48, 0.31])