# 使用 Gensim 進行 Word2Vec 的相關功能

In [1]:
# 載入相關套件
import gzip
import gensim 

## Gensim簡單測試

In [95]:
from gensim.test.utils import common_texts

# Build a small Word2Vec model on gensim's bundled toy corpus.
#   window:    context length considered on each side of a word
#   min_count: minimum number of occurrences for a word to be kept
#   workers:   number of worker threads
model_simple = gensim.models.Word2Vec(
    sentences=common_texts,
    window=1,
    min_count=1,
    workers=4,
)
# train() returns (effective word count, total words processed); the call is
# left as a bare expression so the notebook displays that tuple.
model_simple.train(
    [["hello", "world", "michael"]],
    total_examples=1,
    epochs=2,
)

(0, 6)

In [99]:
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]

# Create an empty model, build its vocabulary explicitly, then train using
# the sentence count and epoch count the model itself reports.
model_simple = gensim.models.Word2Vec(min_count=1)
model_simple.build_vocab(sentences)
model_simple.train(
    sentences,
    total_examples=model_simple.corpus_count,
    epochs=model_simple.epochs,
)

(1, 30)

In [40]:
# Sentence count recorded by the model (2 for the two-sentence corpus above)
model_simple.corpus_count

2

In [38]:
# Epoch count stored on the model; 5 here, presumably the library default
# since the constructor call above did not pass one
model_simple.epochs

5

## 實例測試

In [3]:
# OpinRank corpus: reviews of cars and hotels (gzip-compressed text)
data_file = "./Word2Vec/reviews_data.txt.gz"

# Print only the first raw line (a bytes object) to inspect the format
with gzip.open(data_file, 'rb') as f:
    for line in f:
        print(line)
        break

b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in Be

## 讀取 OpinRank 語料庫，並作分詞

In [4]:
# Read the OpinRank corpus and preprocess it
def read_input(input_file):
    """Yield one token list per line of a gzip-compressed text file.

    Args:
        input_file: Path to the gzip file; it is opened in binary mode.

    Yields:
        The result of ``gensim.utils.simple_preprocess`` for each line.
    """
    with gzip.open(input_file, 'rb') as reviews:
        for raw_line in reviews:
            yield gensim.utils.simple_preprocess(raw_line)

# Load the OpinRank corpus and tokenize it: one token list per line
documents = list(read_input(data_file))
# Bare expression so the notebook displays the tokenized documents
documents

[['oct',
  'nice',
  'trendy',
  'hotel',
  'location',
  'not',
  'too',
  'bad',
  'stayed',
  'in',
  'this',
  'hotel',
  'for',
  'one',
  'night',
  'as',
  'this',
  'is',
  'fairly',
  'new',
  'place',
  'some',
  'of',
  'the',
  'taxi',
  'drivers',
  'did',
  'not',
  'know',
  'where',
  'it',
  'was',
  'and',
  'or',
  'did',
  'not',
  'want',
  'to',
  'drive',
  'there',
  'once',
  'have',
  'eventually',
  'arrived',
  'at',
  'the',
  'hotel',
  'was',
  'very',
  'pleasantly',
  'surprised',
  'with',
  'the',
  'decor',
  'of',
  'the',
  'lobby',
  'ground',
  'floor',
  'area',
  'it',
  'was',
  'very',
  'stylish',
  'and',
  'modern',
  'found',
  'the',
  'reception',
  'staff',
  'geeting',
  'me',
  'with',
  'aloha',
  'bit',
  'out',
  'of',
  'place',
  'but',
  'guess',
  'they',
  'are',
  'briefed',
  'to',
  'say',
  'that',
  'to',
  'keep',
  'up',
  'the',
  'coroporate',
  'image',
  'as',
  'have',
  'starwood',
  'preferred',
  'guest',
  'me

In [56]:
# Number of tokenized documents (one per line of the corpus file)
len(documents)

255404

## Word2Vec 模型訓練

In [5]:
# Train Word2Vec on the tokenized reviews (takes roughly 10 minutes).
# NOTE(review): `size` is the gensim 3.x keyword for the vector dimension;
# gensim 4 renamed it to `vector_size` -- confirm the installed version.
model = gensim.models.Word2Vec(
    documents,
    size=150,
    window=10,
    min_count=2,
    workers=10,
)
# NOTE(review): passing `documents` to the constructor presumably already
# trains the model, so this call adds 10 more epochs on top -- confirm that
# the extra pass is intended.
model.train(documents, total_examples=len(documents), epochs=10)

(303484226, 415193580)

## 測試相似詞

In [41]:
# Words most similar to "dirty"
w1 = "dirty"
model.wv.most_similar(positive=w1) # positive: word(s) to find neighbours for

[('filthy', 0.8602699041366577),
 ('stained', 0.7798251509666443),
 ('dusty', 0.7683317065238953),
 ('unclean', 0.7638086676597595),
 ('grubby', 0.757234513759613),
 ('smelly', 0.7431163787841797),
 ('dingy', 0.7304496169090271),
 ('disgusting', 0.7111263275146484),
 ('soiled', 0.7099645733833313),
 ('mouldy', 0.706375241279602)]

In [10]:
# Words most similar to "polite"
w1 = ["polite"]
model.wv.most_similar (positive=w1, topn=6) # topn: return only the top n results

[('courteous', 0.9168003797531128),
 ('friendly', 0.8217111229896545),
 ('cordial', 0.8119592666625977),
 ('professional', 0.7984355688095093),
 ('curteous', 0.778773307800293),
 ('attentive', 0.76875901222229)]

In [11]:
# Words most similar to "france"
w1 = ["france"]
model.wv.most_similar (positive=w1, topn=6) # topn: return only the top n results

[('germany', 0.6627413034439087),
 ('canada', 0.6545147895812988),
 ('spain', 0.644172728061676),
 ('england', 0.6122641563415527),
 ('mexico', 0.6106705665588379),
 ('rome', 0.6044377684593201)]

In [13]:
# Words similar to "bed", "sheet" and "pillow" but dissimilar to "couch"
w1 = ["bed",'sheet','pillow']
w2 = ['couch']
model.wv.most_similar (positive=w1, negative=w2, topn=10) # negative: word(s) to steer away from

[('duvet', 0.7157680988311768),
 ('blanket', 0.7036269903182983),
 ('mattress', 0.7003698348999023),
 ('quilt', 0.7003640532493591),
 ('matress', 0.6967926621437073),
 ('pillowcase', 0.665346086025238),
 ('sheets', 0.6376352310180664),
 ('pillows', 0.6317484378814697),
 ('comforter', 0.6119856834411621),
 ('foam', 0.6095048785209656)]

## 比較相似度

In [14]:
# Similarity score between two words (a cosine similarity per the gensim
# documentation, not a probability)
model.wv.similarity(w1="dirty",w2="smelly")

0.7431163

In [15]:
# Sanity check: a word compared with itself scores 1.0
model.wv.similarity(w1="dirty",w2="dirty")

1.0

In [16]:
# An antonym pair scores much lower (about 0.29 in the recorded run)
model.wv.similarity(w1="dirty",w2="clean")

0.29161403

## 選出較不相似的字詞

In [17]:
# Pick the word that fits least well with the others ('france' in the recorded run)
model.wv.doesnt_match(["cat","dog","france"])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'france'

## 關鍵詞萃取(Keyword Extraction)

In [2]:
# Keyword extraction
# https://radimrehurek.com/gensim_3.8.3/summarization/keywords.html
# NOTE: gensim.summarization is part of gensim 3.x; it was removed in gensim 4.
from gensim.summarization import keywords

# Sample text
text = '''Challenges in natural language processing frequently involve
speech recognition, natural language understanding, natural language
generation (frequently from formal, machine-readable logical forms),
connecting language and machine perception, dialog systems, or some
combination thereof.'''

# With default arguments keywords() already returns one newline-separated
# string, so it is printed directly; wrapping it in ''.join() only re-joined
# its characters and had no effect.
print(keywords(text))

natural language
machine
frequently


## 預先訓練的模型

In [None]:
# Download the pretrained Google News word2vec vectors through gensim's downloader.
# `wv` is not used by the cells below; they load the vectors from a local file instead.
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [102]:
# Load the pretrained vectors from a local binary file
from gensim.models import KeyedVectors

# Each word vector has 300 elements.
# NOTE: this rebinds `model` (previously the trained Word2Vec model) to a
# KeyedVectors object, which is why the cells below call model.most_similar()
# directly instead of going through model.wv.
model = KeyedVectors.load_word2vec_format(
    './Word2Vec/GoogleNews-vectors-negative300.bin', binary=True)

In [103]:
# Look up the word vector for "dog" (300 elements)
model['dog']

array([ 5.12695312e-02, -2.23388672e-02, -1.72851562e-01,  1.61132812e-01,
       -8.44726562e-02,  5.73730469e-02,  5.85937500e-02, -8.25195312e-02,
       -1.53808594e-02, -6.34765625e-02,  1.79687500e-01, -4.23828125e-01,
       -2.25830078e-02, -1.66015625e-01, -2.51464844e-02,  1.07421875e-01,
       -1.99218750e-01,  1.59179688e-01, -1.87500000e-01, -1.20117188e-01,
        1.55273438e-01, -9.91210938e-02,  1.42578125e-01, -1.64062500e-01,
       -8.93554688e-02,  2.00195312e-01, -1.49414062e-01,  3.20312500e-01,
        3.28125000e-01,  2.44140625e-02, -9.71679688e-02, -8.20312500e-02,
       -3.63769531e-02, -8.59375000e-02, -9.86328125e-02,  7.78198242e-03,
       -1.34277344e-02,  5.27343750e-02,  1.48437500e-01,  3.33984375e-01,
        1.66015625e-02, -2.12890625e-01, -1.50756836e-02,  5.24902344e-02,
       -1.07421875e-01, -8.88671875e-02,  2.49023438e-01, -7.03125000e-02,
       -1.59912109e-02,  7.56835938e-02, -7.03125000e-02,  1.19140625e-01,
        2.29492188e-01,  

In [104]:
# Confirm the vector dimension (300)
len(model['dog'])

300

In [105]:
# Analogy query: words similar to "woman" and "king" but dissimilar to "man"
# ('queen' ranks first in the recorded run)
model.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.518113374710083),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]

In [106]:
# Pick the word that fits least well with the others ('cereal' in the recorded run)
model.doesnt_match("breakfast cereal dinner lunch".split())

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'cereal'

In [108]:
# Similarity score between two words (a cosine similarity per the gensim
# documentation, not a probability)
model.similarity('woman', 'man')

0.76640123

## 比較語句相似度
### 使用 Gensim Doc2Vec ，結果不佳 

In [None]:
import numpy as np
import nltk
import gensim
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

import string

# Test corpus: read every line of the FAQ file (readlines keeps the trailing
# newline on each entry). The context manager closes the file handle, which
# the bare open() call never did.
with open('./FAQ/starbucks_faq.txt', 'r', encoding='utf8') as f:
    corpus = f.readlines()

# Parameter: described in the original notes as "maximum words per line".
# NOTE(review): below it is passed as `max_len` to simple_preprocess and as
# the Doc2Vec `vector_size`; neither looks like a words-per-line limit --
# confirm the intent.
MAX_WORDS_A_LINE = 30

# Punctuation characters
print('標點符號:', string.punctuation)

# Stop words: NLTK's English list plus every punctuation character and the
# newline, stored as a set for fast membership tests
stopword_list = set(
    nltk.corpus.stopwords.words('english')
    + list(string.punctuation)
    + ['\n']
)

## 訓練 Doc2Vec 模型

In [5]:
# Tokenizer
def tokenize(text, stopwords, max_len=MAX_WORDS_A_LINE):
    """Split ``text`` into tokens and drop every token found in ``stopwords``.

    Args:
        text: The string to tokenize.
        stopwords: Container of tokens to exclude (checked with ``in``).
        max_len: Forwarded unchanged to ``gensim.utils.simple_preprocess``.
            NOTE(review): gensim documents that argument as the maximum
            length of a single token, not a words-per-line limit -- confirm
            that the default is what is intended.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    candidates = gensim.utils.simple_preprocess(text, max_len=max_len)
    return [word for word in candidates if word not in stopwords]

# Tokenize every corpus line (stop words removed)
document_tokens = [tokenize(line, stopword_list) for line in corpus]

# Wrap each token list in gensim's TaggedDocument format, tagged with its
# line index so results can be mapped back to `corpus`
tagged_corpus = [
    TaggedDocument(tokens, [idx]) for idx, tokens in enumerate(document_tokens)
]

# Train the Doc2Vec model
# NOTE(review): passing `tagged_corpus` to the constructor presumably already
# trains for `epochs` passes, so the explicit train() call would repeat the
# same number of epochs -- confirm that this is intended.
model_d2v = Doc2Vec(tagged_corpus, vector_size=MAX_WORDS_A_LINE, epochs=200)
model_d2v.train(
    tagged_corpus,
    total_examples=model_d2v.corpus_count,
    epochs=model_d2v.epochs,
)

標點符號: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


## 比較語句相似度

In [7]:
# Infer one vector per corpus line and stack them into a 2-D array
questions = np.array(
    [model_d2v.infer_vector(tokens) for tokens in document_tokens]
)

# Query sentence (alternative to try: "find allergen information")
text = "mobile pay"
filtered_tokens = tokenize(text, stopword_list)

# Cosine similarity between the query vector (reshaped to a single row) and
# every corpus vector
similarity = cosine_similarity(
    model_d2v.infer_vector(filtered_tokens).reshape(1, -1),
    questions,
    dense_output=False,
)

# Indices of the 10 highest scores: argsort is ascending, so reverse it
# before slicing
top_n = np.argsort(np.array(similarity[0]))[::-1][:10]
print(f'前 10 名 index:{top_n}\n')
for i in top_n:
    print(round(similarity[0][i], 4), corpus[i].rstrip('\n'))

前 10 名 index:[164 210  54 134  43  40  46 151 207 169]

0.9619 What is Mobile Order & Pay?
0.9451 Why did you limit the number of times I can add a single modifier to my Mobile Order & Pay beverage order?
0.9059 When will my Mobile Order be ready?
0.8242 How do I choose which store to place my order at with Mobile Order & Pay?
0.8193 What are the new ways to pay and earn Stars with Starbucks Rewards and the Starbucks app? How do I pay with cash, credit/debit, or mobile wallets and earn Stars?
0.8137 Having problems using your Starbucks Mobile app?
0.8084 How do I pick up my Mobile Order & Pay order?
0.7976 Link to a URLHow do I order ahead in the Starbucks Mobile app? - Video Tutorial
0.7923 How many times can I add the same modifier to a drink order when using Mobile Order & Pay?
0.7718 How do I access Mobile Order & Pay?
