## 词向量

## BOW

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [1]:
# Toy training corpus: one short English sentence per document.
corpus = [
    "the sky is blue",
    "sky is blue and sky is beautiful",
    "the beautiful is so blue",
    "i love blue cheese",
]

# A separate single-document list, vectorised later with the fitted models.
doc = ["loving this blue sky today"]

In [47]:
def bow_extract(corpus, ngrams=(1, 1)):
    """Fit a bag-of-words model on ``corpus``.

    Args:
        corpus: iterable of raw text documents.
        ngrams: ``(min_n, max_n)`` tuple forwarded as ``ngram_range``.

    Returns:
        A ``(vectorizer, features)`` pair: the fitted ``CountVectorizer`` and
        the matrix returned by its ``fit_transform`` call.
    """
    bow_model = CountVectorizer(ngram_range=ngrams, min_df=1)
    term_counts = bow_model.fit_transform(corpus)
    return bow_model, term_counts

In [48]:
def display_features(fea_names, features):
    """Print a feature matrix as a table with one column per feature name.

    Args:
        fea_names: column labels, one per feature.
        features: 2-D data (rows are documents) accepted by ``pd.DataFrame``.
    """
    table = pd.DataFrame(features, columns=fea_names)
    print(table)

In [49]:
bow_vectorizer, bow_features = bow_extract(corpus)

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older releases. `.tolist()` keeps the
# names as plain Python strings.
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    fea_names = bow_vectorizer.get_feature_names_out().tolist()
else:
    fea_names = bow_vectorizer.get_feature_names()

# Densify to a plain ndarray (`toarray`) rather than np.matrix (`todense`):
# this array is later fed to TfidfTransformer, and recent scikit-learn
# versions reject np.matrix input.
bow_features = bow_features.toarray()

print(fea_names)
print(bow_features)

display_features(fea_names, bow_features)

['and', 'beautiful', 'blue', 'cheese', 'is', 'love', 'sky', 'so', 'the']
[[0 0 1 0 1 0 1 0 1]
 [1 1 1 0 2 0 2 0 0]
 [0 1 1 0 1 0 0 1 1]
 [0 0 1 1 0 1 0 0 0]]
   and  beautiful  blue  cheese  is  love  sky  so  the
0    0          0     1       0   1     0    1   0    1
1    1          1     1       0   2     0    2   0    0
2    0          1     1       0   1     0    0   1    1
3    0          0     1       1   0     1    0   0    0


In [51]:
# Vectorise the new document with the vocabulary fitted on the corpus.
# `toarray()` yields a plain ndarray; `todense()` would yield np.matrix,
# which recent scikit-learn versions reject when this array is later passed
# to the TF-IDF transformer.
doc_features = bow_vectorizer.transform(doc).toarray()

display_features(fea_names, doc_features)

   and  beautiful  blue  cheese  is  love  sky  so  the
0    0          0     1       0   0     0    1   0    0


## TFIDF
### 基于词袋模型的词进行计算

In [29]:
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

In [39]:
def tfidf_extract(bow_features):
    """Fit a TF-IDF transformer on a bag-of-words count matrix.

    Args:
        bow_features: document-term counts accepted by
            ``TfidfTransformer.fit_transform``.

    Returns:
        A ``(transformer, tfidf_features)`` pair: the fitted
        ``TfidfTransformer`` and the result of ``fit_transform``.
    """
    tfidf_model = TfidfTransformer(use_idf=True, smooth_idf=True, norm='l2')
    weighted = tfidf_model.fit_transform(bow_features)
    return tfidf_model, weighted

In [52]:
tfidf_transformer, tfidf_features = tfidf_extract(bow_features)

# Show the transformer output as returned, before densifying.
print(tfidf_features)

# Densify and round to two decimals for a readable table.
tfidf_features = np.round(tfidf_features.todense(), decimals=2)
print(tfidf_features)

display_features(fea_names, tfidf_features)

  (0, 8)	0.5685556582078485
  (0, 6)	0.5685556582078485
  (0, 4)	0.4602948056749725
  (0, 2)	0.37632116457664155
  (1, 6)	0.6432036103096459
  (1, 4)	0.5207287563545564
  (1, 2)	0.21286493960380123
  (1, 1)	0.32160180515482295
  (1, 0)	0.40791111090371607
  (2, 8)	0.46115286047158355
  (2, 7)	0.5849139295745928
  (2, 4)	0.37334298451327075
  (2, 2)	0.3052323532361606
  (2, 1)	0.46115286047158355
  (3, 5)	0.6633846138519129
  (3, 3)	0.6633846138519129
  (3, 2)	0.34618161159873423
[[0.   0.   0.38 0.   0.46 0.   0.57 0.   0.57]
 [0.41 0.32 0.21 0.   0.52 0.   0.64 0.   0.  ]
 [0.   0.46 0.31 0.   0.37 0.   0.   0.58 0.46]
 [0.   0.   0.35 0.66 0.   0.66 0.   0.   0.  ]]
    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.38    0.00  0.46  0.00  0.57  0.00  0.57
1  0.41       0.32  0.21    0.00  0.52  0.00  0.64  0.00  0.00
2  0.00       0.46  0.31    0.00  0.37  0.00  0.00  0.58  0.46
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [53]:
# Apply the fitted TF-IDF transformer to the new document's count vector,
# then densify and round to two decimals for display.
doc_tfidf_raw = tfidf_transformer.transform(doc_features)
doc_tfidf_features = np.round(doc_tfidf_raw.todense(), decimals=2)

display_features(fea_names, doc_tfidf_features)

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.55     0.0  0.0   0.0  0.83  0.0  0.0


## TFIDF - 2
### 直接计算

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
def tfidf_extract2(corpus, ngrams=(1, 1)):
    """Fit a ``TfidfVectorizer`` directly on raw text.

    Args:
        corpus: iterable of raw text documents.
        ngrams: ``(min_n, max_n)`` tuple forwarded as ``ngram_range``.

    Returns:
        A ``(vectorizer, features)`` pair: the fitted ``TfidfVectorizer`` and
        the matrix returned by its ``fit_transform`` call.
    """
    options = dict(
        min_df=1,
        ngram_range=ngrams,
        norm='l2',
        smooth_idf=True,
        use_idf=True,
    )
    tfidf_model = TfidfVectorizer(**options)
    weighted = tfidf_model.fit_transform(corpus)
    return tfidf_model, weighted

In [55]:
tfidf_vectorizer, tfidf_features = tfidf_extract2(corpus)

# NOTE(review): the column labels are the names taken from the BOW vectorizer
# earlier; this assumes both vectorizers order their features identically.
tfidf_dense_rounded = np.round(tfidf_features.todense(), decimals=2)
display_features(fea_names, tfidf_dense_rounded)

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.38    0.00  0.46  0.00  0.57  0.00  0.57
1  0.41       0.32  0.21    0.00  0.52  0.00  0.64  0.00  0.00
2  0.00       0.46  0.31    0.00  0.37  0.00  0.00  0.58  0.46
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [54]:
# Vectorise the new document with the fitted TF-IDF vectorizer.
doc_tfidf_features = tfidf_vectorizer.transform(doc)

doc_tfidf_rounded = np.round(doc_tfidf_features.todense(), decimals=2)
display_features(fea_names, doc_tfidf_rounded)

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.55     0.0  0.0   0.0  0.83  0.0  0.0


## Word2Vec

In [2]:
from gensim.models import Word2Vec
import nltk



In [3]:
# Tokenise every corpus sentence and the new document with NLTK.
sentences = [nltk.word_tokenize(sen) for sen in corpus]
docs = [nltk.word_tokenize(sen) for sen in doc]

# A fixed seed plus a single worker thread makes training repeatable between
# runs (Python's string hashing can still vary unless PYTHONHASHSEED is set).
w2v_params = dict(window=10, min_count=2, seed=42, workers=1)

# gensim 4 renamed the `size` argument to `vector_size`; try the new name
# first and fall back to the old one so the cell runs on either version.
try:
    model = Word2Vec(sentences, vector_size=10, **w2v_params)
except TypeError:
    model = Word2Vec(sentences, size=10, **w2v_params)

In [5]:
# Show the learned vector for 'sky', then the words most similar to 'blue'.
wv = model.wv
print(wv['sky'])
print(wv.most_similar('blue'))

[-0.04478759  0.04840298 -0.02347958  0.00326061 -0.02830264  0.01282009
 -0.00129597  0.04345105  0.03344116  0.02269522]
[('is', 0.064363032579422), ('the', -0.11029154062271118), ('beautiful', -0.1127469539642334), ('sky', -0.11970236897468567)]


In [4]:
# stored in a KeyedVectors instance
word_vec = model.wv
del model

In [5]:
# Look up the vector for 'sky' directly through the KeyedVectors handle.
sky_vec = word_vec['sky']
print(sky_vec)

[-0.04805895 -0.04145954  0.0315539  -0.01477276  0.04520844 -0.04808239
 -0.00829656  0.02893448 -0.02975751 -0.04921004]


In [None]:
# save and load model
# `filename` was previously undefined; use a path relative to the working
# directory.
filename = 'word2vec.model'
model.save(filename)
model = Word2Vec.load(filename)

## 句子向量
### 平均词向量表示

In [6]:
import numpy as np

In [32]:
def average_words_vector(sen, model, vocabulary, ndims):
    """Average the word vectors of a sentence's in-vocabulary tokens.

    Args:
        sen: sentence string; tokens are obtained with ``str.split()``.
        model: object whose ``model.wv[word]`` returns the vector of ``word``.
        vocabulary: container of the words that have a vector.
        ndims: length of each word vector.

    Returns:
        A float64 array of shape ``(ndims,)`` holding the mean vector. If no
        token of ``sen`` is in ``vocabulary`` the zero vector is returned
        (previously this case divided by zero and produced NaN values).
    """
    feature = np.zeros((ndims,), dtype="float64")
    known_words = [word for word in sen.split() if word in vocabulary]

    # Nothing to average: return zeros rather than computing 0 / 0.
    if not known_words:
        return feature

    for word in known_words:
        feature = np.add(feature, model.wv[word])

    return np.divide(feature, len(known_words))

In [34]:
def get_sentence_feature(corpus, model, ndims=10):
    """Build one averaged word-vector feature per sentence.

    Args:
        corpus: iterable of sentence strings.
        model: object exposing its word vectors as ``model.wv``.
        ndims: length of each word vector, forwarded to
            ``average_words_vector``.

    Returns:
        ``np.array`` of the per-sentence vectors returned by
        ``average_words_vector``.
    """
    wv = model.wv
    # gensim 4 renamed `index2word` to `index_to_key`; support both so the
    # function works regardless of the installed version.
    words = getattr(wv, 'index_to_key', None)
    if words is None:
        words = wv.index2word
    vocabulary = set(words)
    features = [average_words_vector(sen, model, vocabulary, ndims) for sen in corpus]
    return np.array(features)

In [38]:
# One averaged word-vector row per corpus sentence, printed to 3 decimals.
sen_feature = get_sentence_feature(corpus, model)
sen_feature_rounded = np.round(sen_feature, decimals=3)
print(sen_feature_rounded)

[[-0.016 -0.    -0.004  0.014 -0.014  0.019  0.     0.022  0.029 -0.005]
 [-0.018  0.002 -0.013  0.007  0.003  0.003  0.007  0.011  0.03   0.007]
 [-0.01  -0.016 -0.008  0.013  0.004  0.012  0.002 -0.001  0.016  0.001]
 [ 0.003 -0.009 -0.023  0.015 -0.002  0.034  0.018 -0.035  0.035 -0.05 ]]


### TFIDF加权平均词向量