# 使用詞向量 (Word2Vec)
- 使用預先訓練好的詞向量轉換來代替 PyTorch 內建的嵌入層轉換
- PyTorch 的 torchtext 套件支援 GloVe, FastText 及 CharNGram 三種預先訓練好的詞向量

## 以 GloVe 為例

## 讀取 GloVe 50 維的詞向量

In [6]:
import torchtext

# Load the pretrained 50-dimensional GloVe vectors (the "6B" variant) and
# look up the vector of a single word.
examples = ["great"]
vec = torchtext.vocab.GloVe(name="6B", dim=50)
# With lower_case_backup=True a token that is not found in its original case
# is looked up again in lower case.
ret = vec.get_vecs_by_tokens(examples, lower_case_backup=True)
ret

tensor([[-0.0266,  1.3357, -1.0280, -0.3729,  0.5201, -0.1270, -0.3543,  0.3782,
         -0.2972,  0.0939, -0.0341,  0.9296, -0.1402, -0.6330,  0.0208, -0.2153,
          0.9692,  0.4765, -1.0039, -0.2401, -0.3632, -0.0048, -0.5148, -0.4626,
          1.2447, -1.8316, -1.5581, -0.3747,  0.5336,  0.2088,  3.2209,  0.6455,
          0.3744, -0.1766, -0.0242,  0.3379, -0.4190,  0.4008, -0.1145,  0.0512,
         -0.1521,  0.2986, -0.4405,  0.1109, -0.2463,  0.6625, -0.2695, -0.4966,
         -0.4162, -0.2549]])

#### 顯示詞向量大小

In [None]:
# Shape of the full pretrained vector table.
vec.vectors.shape

#### 查詢單字的詞向量索引值

In [None]:
# Look up the index that the pretrained vocabulary assigns to "great"
# (stoi maps a token string to its index).
vec.stoi["great"]

### 建立模型 (之前的作法)

In [7]:
import torch
import torchtext
import torch.nn as nn
import numpy as np
import string

class RecurrentNet(nn.Module):
    """Bag-of-embeddings classifier initialised from a pretrained weight matrix.

    Despite its name the network contains no recurrent layer: an
    ``nn.EmbeddingBag`` pools the embeddings of each row of token indices and
    a single linear layer maps the pooled vector to ``num_class`` outputs.

    Args:
        weights_matrix: Initial embedding weights, loaded into the embedding
            layer through ``load_state_dict``.
        num_embeddings: Number of rows of the embedding table.
        embedding_dim: Width of each embedding vector.
        num_class: Number of output units of the linear layer.
    """

    def __init__(self, weights_matrix, num_embeddings, embedding_dim, num_class):
        super().__init__()
        # Create the embedding layer, then overwrite its initial weights
        # with the supplied matrix.
        bag = nn.EmbeddingBag(num_embeddings, embedding_dim)
        bag.load_state_dict({"weight": weights_matrix})
        self.embedding = bag
        self.fc = nn.Linear(embedding_dim, num_class)

    def forward(self, text):
        """Return the linear layer's output for a batch of token-index rows."""
        pooled = self.embedding(text)
        return self.fc(pooled)


### 測試資料轉換 (之前的作法)

In [8]:
# Ten short review sentences used as the training corpus.
docs = [
    "Well done!!",
    "Good work",
    "Great effort",
    "nice work",
    "Excellent!!",
    "Weak",
    "Poor effort!!",
    "not good",
    "poor work",
    "Could have done better",
]

# Punctuation marks are the only tokens that get filtered out.
stopwords = list(string.punctuation)
tokenizer = torchtext.data.utils.get_tokenizer("basic_english")

# Tokenize every sentence and drop the punctuation tokens.
clean_text_list = []    # one list of kept tokens per sentence
clean_tokens_list = []  # all kept tokens, flattened across sentences
for text in docs:
    tokens = tokenizer(text.lower())
    clean_tokens = [w for w in tokens if w not in stopwords]
    clean_tokens_list += clean_tokens
    clean_text_list.append(clean_tokens)

# Build the vocabulary from the distinct tokens and fetch one pretrained
# vector per entry, so row k of weights_matrix belongs to vocab_list[k].
# NOTE: the iteration order of a set of strings can differ between
# interpreter sessions, so the token indices are not stable across runs.
vocab_list = list(set(clean_tokens_list))
weights_matrix = vec.get_vecs_by_tokens(vocab_list)

### 定義 10 個語句的正面 (1) 或負面 (0) 的情緒, 並將 10 個語句轉換為詞彙表索引值 (之前的作法)

In [9]:
maxlen = 4  # number of index slots reserved for each sentence

# Sentiment label of each of the 10 sentences: positive (1) or negative (0).
y = torch.FloatTensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

# Map each vocabulary token to its position once, instead of scanning
# vocab_list with .index() for every token.
token_to_index = {token: index for index, token in enumerate(vocab_list)}

# Integer index matrix, one row per sentence; unused trailing slots stay 0.
# NOTE: 0 is also the index of vocab_list[0], so a padded slot cannot be
# told apart from that word.
X = torch.zeros((len(docs), maxlen), dtype=torch.long)
for i, item in enumerate(clean_text_list):
    # Only the first maxlen tokens fit in a row; longer sentences are
    # truncated instead of raising IndexError.
    for j, token in enumerate(item[:maxlen]):
        if token in token_to_index:
            X[i, j] = token_to_index[token]

X

tensor([[ 5,  1,  0,  0],
        [13,  4,  0,  0],
        [ 3,  9,  0,  0],
        [12,  4,  0,  0],
        [ 8,  0,  0,  0],
        [ 6,  0,  0,  0],
        [ 2,  9,  0,  0],
        [ 7, 13,  0,  0],
        [ 2,  4,  0,  0],
        [11,  0,  1, 10]])

In [10]:
# Display the vocabulary; a token's position in this list is the index
# stored for it in X.
vocab_list

['have',
 'done',
 'poor',
 'great',
 'work',
 'well',
 'weak',
 'not',
 'excellent',
 'effort',
 'better',
 'could',
 'nice',
 'good']

### 模型訓練 (類似之前的作法)

In [11]:
# Build the model. The embedding width is read from the pretrained weight
# matrix so it always matches the vectors that were loaded.
embedding_dim = weights_matrix.size(1)
model = RecurrentNet(torch.FloatTensor(weights_matrix), len(vocab_list), embedding_dim, 1)

# Loss function and optimizer.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# Training loop.
for epoch in range(1000):
    optimizer.zero_grad()
    # Call the module itself rather than .forward() so that registered
    # hooks are run as well.
    outputs = model(X)
    # Flatten the (N, 1) outputs to (N,) to match the shape of y.
    loss = criterion(outputs.reshape(-1), y)
    loss.backward()
    optimizer.step()
    if epoch % 100 == 0:
        print(f"Epoch: {epoch}, loss: {loss.item():1.5f}")

Epoch: 0, loss: 1.10729
Epoch: 100, loss: 0.13446
Epoch: 200, loss: 0.04207
Epoch: 300, loss: 0.01199
Epoch: 400, loss: 0.00545
Epoch: 500, loss: 0.00280
Epoch: 600, loss: 0.00138
Epoch: 700, loss: 0.00063
Epoch: 800, loss: 0.00026
Epoch: 900, loss: 0.00010


### 模型評估 (類似之前的作法)

In [12]:
# Switch the model to evaluation mode and score the sentences currently in X.
# The call is not wrapped in torch.no_grad(), so the displayed tensor still
# carries a grad_fn.
model.eval()
model(X)

tensor([[ 9.9986e-01],
        [ 9.9225e-01],
        [ 1.0063e+00],
        [ 9.9858e-01],
        [ 9.9941e-01],
        [-1.1912e-03],
        [-9.9203e-03],
        [ 4.4055e-03],
        [ 1.1089e-02],
        [-1.1119e-04]], grad_fn=<AddmmBackward0>)

### 測試新資料 (類似之前的作法)

In [13]:
# New sentences to classify.
test_docs = [
    "great effort",
    "well done",
    "poor effort",
]

# Tokenize each sentence and drop the punctuation tokens.
clean_text_list = []
for text in test_docs:
    tokens = tokenizer(text.lower())
    clean_tokens = [w for w in tokens if w not in stopwords]
    clean_text_list.append(clean_tokens)

# Convert the tokens to vocabulary indices, one row per sentence.
token_to_index = {token: index for index, token in enumerate(vocab_list)}
X = torch.zeros((len(test_docs), maxlen), dtype=torch.long)
for i, item in enumerate(clean_text_list):
    # Only the first maxlen tokens fit in a row; longer sentences are
    # truncated instead of raising IndexError.
    for j, token in enumerate(item[:maxlen]):
        # A token that is not in the vocabulary leaves its slot at 0.
        if token in token_to_index:
            X[i, j] = token_to_index[token]

# Predict.
model.eval()
model(X)

tensor([[ 1.0063],
        [ 0.9999],
        [-0.0099]], grad_fn=<AddmmBackward0>)

## 以上作法 (之前的作法) 並不能預測訓練資料以外的單字: 不在詞彙表內的單字, 其索引值會維持為 0.