# 使用詞向量 (Word2Vec)
- 使用預先訓練好的詞向量轉換來代替 PyTorch 內建的嵌入層轉換
- PyTorch 的 torchtext 套件支援 GloVe, FastText 及 CharNGram 三種預先訓練好的詞向量

## 以 GloVe 為例
- 讀取 GloVe 50維的詞向量

In [13]:
import string
import numpy as np
import torch
import torchtext
import torch.nn as nn

# Load the pretrained GloVe vectors: the "6B" set, 50 dimensions per token.
vec = torchtext.vocab.GloVe(name="6B", dim=50)

# Fetch the vector of a sample word. With lower_case_backup=True the
# lowercased token is tried when the original casing is not found.
examples = ["great"]
ret = vec.get_vecs_by_tokens(examples, lower_case_backup=True)
ret

tensor([[-0.0266,  1.3357, -1.0280, -0.3729,  0.5201, -0.1270, -0.3543,  0.3782,
         -0.2972,  0.0939, -0.0341,  0.9296, -0.1402, -0.6330,  0.0208, -0.2153,
          0.9692,  0.4765, -1.0039, -0.2401, -0.3632, -0.0048, -0.5148, -0.4626,
          1.2447, -1.8316, -1.5581, -0.3747,  0.5336,  0.2088,  3.2209,  0.6455,
          0.3744, -0.1766, -0.0242,  0.3379, -0.4190,  0.4008, -0.1145,  0.0512,
         -0.1521,  0.2986, -0.4405,  0.1109, -0.2463,  0.6625, -0.2695, -0.4966,
         -0.4162, -0.2549]])

In [14]:
# Dimensions of the full pretrained matrix: (vocabulary size, vector size).
vec.vectors.shape

torch.Size([400000, 50])

### 查詢單字的詞向量索引值

In [15]:
# stoi maps a token string to its integer index; that index is what the
# embedding layer later receives as input.
vec.stoi["great"]

353

## 建立模型 (將整個詞向量設定為嵌入層權重)

In [16]:
class RecurrentNet2(nn.Module):
    """Pooled pretrained embeddings followed by a single linear layer.

    Args:
        vec: 2-D tensor of pretrained embedding weights, one row per token.
        embedding_dim: Width of one embedding row (input size of ``fc``).
        num_class: Number of output units of the linear layer.
    """

    def __init__(self, vec, embedding_dim, num_class):
        super().__init__()
        # Use the whole pretrained matrix as the embedding weights and
        # freeze it so that training never updates it.
        pretrained_bag = nn.EmbeddingBag.from_pretrained(vec, freeze=True)
        linear_head = nn.Linear(embedding_dim, num_class)
        self.embedding = pretrained_bag
        self.fc = linear_head

    def forward(self, text):
        """Map a (batch, length) tensor of token indices to (batch, num_class).

        EmbeddingBag is left in its default pooling mode ("mean"), so each
        row of ``text`` is reduced to the average of the embeddings of all
        its indices. Indices used as padding are averaged in as well.
        """
        return self.fc(self.embedding(text))

# Build the model on the full GloVe matrix with a single output unit.
model = RecurrentNet2(
    vec=vec.vectors,
    embedding_dim=vec.dim,
    num_class=1,
)

## 將訓練資料轉換為 GloVe 詞向量索引值

In [18]:
# Number of index slots per document. Shorter documents keep trailing
# zeros; longer documents are truncated to the first `maxlen` tokens.
maxlen = 4

# Training documents
docs = [
        "Well done!!",
        "Good work",
        "Great effort",
        "nice work",
        "Excellent!!",
        "Weak",
        "Poor effort",
        "Not good",
        "Poor work",
        "Could have done better"
]

# Punctuation characters that are dropped from the token stream
stopwords = string.punctuation

# Labels: 1 for the first five documents, 0 for the last five
y = torch.FloatTensor([1,1,1,1,1,0,0,0,0,0])
# Index matrix, one row per document, created directly as an integer tensor.
# NOTE(review): 0 is used as the padding value but is presumably also a
# valid vocabulary index -- confirm this is acceptable for the model.
X = torch.zeros((len(docs), maxlen), dtype=torch.long)

tokenizer = torchtext.data.utils.get_tokenizer("basic_english")

for i, text in enumerate(docs):
    tokens = tokenizer(text.lower())
    j = 0
    for w in tokens:
        if j >= maxlen:
            # Row is full: truncate instead of indexing past the last column.
            break
        if w in stopwords:
            continue
        # Convert the token to its word-vector index; tokens missing from
        # the vocabulary are skipped instead of raising KeyError.
        idx = vec.stoi.get(w)
        if idx is None:
            continue
        X[i, j] = idx
        j += 1
X

tensor([[ 143,  751,    0,    0],
        [ 219,  161,    0,    0],
        [ 353,  968,    0,    0],
        [3082,  161,    0,    0],
        [4345,    0,    0,    0],
        [2690,    0,    0,    0],
        [ 992,  968,    0,    0],
        [  36,  219,    0,    0],
        [ 992,  161,    0,    0],
        [  94,   33,  751,  439]])

In [19]:
# Loss function and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# Model training: full-batch updates for 1000 epochs
model.train()  # make sure training mode is active if this cell is re-run
for epoch in range(1000):
    optimizer.zero_grad()
    # Call the module itself rather than .forward() so registered hooks run.
    outputs = model(X)
    loss = criterion(outputs.reshape(-1), y)
    loss.backward()
    optimizer.step()
    if epoch % 100 == 0:
        print(f"Epoch: {epoch}, loss: {loss.item():1.5f}")

# Show the fitted outputs on the training data; no autograd graph is needed.
model.eval()
with torch.no_grad():
    outputs = model(X)
outputs

Epoch: 0, loss: 1.53944
Epoch: 100, loss: 0.20655
Epoch: 200, loss: 0.12307
Epoch: 300, loss: 0.08522
Epoch: 400, loss: 0.06062
Epoch: 500, loss: 0.04381
Epoch: 600, loss: 0.03203
Epoch: 700, loss: 0.02366
Epoch: 800, loss: 0.01766
Epoch: 900, loss: 0.01331


tensor([[ 0.8660],
        [ 0.9514],
        [ 0.9599],
        [ 0.9797],
        [ 0.9491],
        [ 0.0131],
        [-0.0166],
        [ 0.2174],
        [ 0.1499],
        [-0.0766]], grad_fn=<AddmmBackward0>)

## 模型評估

In [21]:
# Evaluate on the training data without building an autograd graph.
model.eval()
with torch.no_grad():
    outputs = model(X)
outputs

tensor([[ 0.8660],
        [ 0.9514],
        [ 0.9599],
        [ 0.9797],
        [ 0.9491],
        [ 0.0131],
        [-0.0166],
        [ 0.2174],
        [ 0.1499],
        [-0.0766]], grad_fn=<AddmmBackward0>)

## 測試新資料 (輸入訓練資料以外的單字)

In [20]:
# Test documents
test_docs = [
    "great job",
    "sample test",
    "cycle test",
]

# Convert to an index matrix, one row per document, pre-filled with 0
X_test = torch.zeros((len(test_docs), maxlen), dtype=torch.long)
for i, text in enumerate(test_docs):
    tokens = tokenizer(text.lower())
    j = 0
    for w in tokens:
        if j >= maxlen:
            # Row is full: truncate instead of indexing past the last column.
            break
        if w in stopwords:
            continue
        # New text may contain tokens missing from the vocabulary; skip
        # them instead of raising KeyError.
        idx = vec.stoi.get(w)
        if idx is None:
            continue
        X_test[i, j] = idx
        j += 1
X_test

tensor([[ 353,  664,    0,    0],
        [5863,  728,    0,    0],
        [4124,  728,    0,    0]])

## 觀察新資料的預測結果

In [22]:
# Predict on the new documents without building an autograd graph.
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
test_outputs

tensor([[0.6359],
        [1.0099],
        [1.1694]], grad_fn=<AddmmBackward0>)