## 載入相關套件

In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import numpy as np


## 嵌入層測試
- 第一個參數為 6, 表示詞彙表含 6 個單字
- 第二個參數為 5, 表示每個單字以 5 個實數表示

In [49]:
x = torch.LongTensor([[0,1,2], [3,4,5]])
embeds = nn.Embedding(6,5)
print(embeds(x))

tensor([[[ 0.8112,  0.3393, -1.1498,  0.7866,  0.4077],
         [ 1.4390,  1.3084,  0.8373, -0.9947, -0.6837],
         [-0.5909,  0.0514, -0.3278, -0.9018, -0.1598]],

        [[ 0.2511, -1.2888,  0.5206,  1.3232,  0.8089],
         [ 0.5001, -0.4651, -0.5922, -0.1354,  0.4567],
         [ 0.4365,  0.6853,  0.5563,  0.2142,  1.2650]]],
       grad_fn=<EmbeddingBackward0>)


- 顯示嵌入層的起始權重

In [50]:
embeds.weight

Parameter containing:
tensor([[ 0.8112,  0.3393, -1.1498,  0.7866,  0.4077],
        [ 1.4390,  1.3084,  0.8373, -0.9947, -0.6837],
        [-0.5909,  0.0514, -0.3278, -0.9018, -0.1598],
        [ 0.2511, -1.2888,  0.5206,  1.3232,  0.8089],
        [ 0.5001, -0.4651, -0.5922, -0.1354,  0.4567],
        [ 0.4365,  0.6853,  0.5563,  0.2142,  1.2650]], requires_grad=True)

- 輸入改為 1~6:
  - nn.Embedding 第一個參數須改為 7, 即詞彙表應含 0-6, 共 7 個單字, 因為輸入最大索引值為 6

In [51]:
x = torch.LongTensor([[1,2,3], [4,5,6]])
embeds = nn.Embedding(7,5)
print(embeds(x))

tensor([[[-0.1920, -1.8358,  0.2627,  0.0705, -0.1247],
         [-0.3463,  0.3407, -0.5955,  0.3029, -0.6530],
         [-0.1293, -0.5286,  2.5180, -0.1145,  0.6909]],

        [[-0.5631,  0.6595,  0.4995, -1.2066, -0.3440],
         [ 0.1385, -0.0383, -0.0385, -0.2012,  1.9090],
         [ 0.4554, -1.3648,  0.4208,  1.2158, -0.4393]]],
       grad_fn=<EmbeddingBackward0>)


In [52]:
embeds = nn.Embedding(6,5)
x1 = torch.LongTensor([[0,1,2]])
x2 = torch.LongTensor([[3,4]])
print(embeds(x1))
print(embeds(x2))
embeds.weight

tensor([[[-0.2119, -0.3106,  0.2821, -0.5430,  0.0373],
         [-0.6810,  0.1177, -0.3397, -0.6144,  2.2208],
         [ 1.4055, -0.2549,  1.7545,  1.1058, -0.0865]]],
       grad_fn=<EmbeddingBackward0>)
tensor([[[-1.3001,  0.3459,  0.2197, -0.0573, -0.4091],
         [-0.7931,  0.3002,  0.0118, -2.0476, -0.5059]]],
       grad_fn=<EmbeddingBackward0>)


Parameter containing:
tensor([[-0.2119, -0.3106,  0.2821, -0.5430,  0.0373],
        [-0.6810,  0.1177, -0.3397, -0.6144,  2.2208],
        [ 1.4055, -0.2549,  1.7545,  1.1058, -0.0865],
        [-1.3001,  0.3459,  0.2197, -0.0573, -0.4091],
        [-0.7931,  0.3002,  0.0118, -2.0476, -0.5059],
        [-1.2425,  1.1734, -0.7721,  0.7615, -0.7256]], requires_grad=True)

In [53]:
embeds = nn.Embedding(6,5,5)
# embeds = nn.Embedding(6,5)
x1 = torch.LongTensor([[0,1,2]])
x2 = torch.LongTensor([[3,4]])
x3 = torch.LongTensor([[3,4]])
print(embeds(x1))
print(embeds(x2))
print(embeds(x3))
embeds.weight

tensor([[[-0.6861,  1.0274, -0.9390, -0.1321,  2.4225],
         [-0.6270, -0.4294, -0.4379,  0.8927, -0.9003],
         [ 1.3614,  0.0655,  0.5329, -0.9712,  0.2823]]],
       grad_fn=<EmbeddingBackward0>)
tensor([[[ 0.9803, -2.6386,  0.8132,  0.2225, -0.1751],
         [-0.4112,  0.7099,  0.3317, -1.3171,  2.4717]]],
       grad_fn=<EmbeddingBackward0>)
tensor([[[ 0.9803, -2.6386,  0.8132,  0.2225, -0.1751],
         [-0.4112,  0.7099,  0.3317, -1.3171,  2.4717]]],
       grad_fn=<EmbeddingBackward0>)


Parameter containing:
tensor([[-0.6861,  1.0274, -0.9390, -0.1321,  2.4225],
        [-0.6270, -0.4294, -0.4379,  0.8927, -0.9003],
        [ 1.3614,  0.0655,  0.5329, -0.9712,  0.2823],
        [ 0.9803, -2.6386,  0.8132,  0.2225, -0.1751],
        [-0.4112,  0.7099,  0.3317, -1.3171,  2.4717],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]], requires_grad=True)

- 以英文單字輸入:

In [54]:
# 測試資料
word_to_ix = {"hello": 0, "world": 1}

# 詞彙表含 2 個單字, 轉換為 5 維的向量
embeds = nn.Embedding(2, 5)

# 測試 hello
lookup_tensor = torch.LongTensor([word_to_ix["hello"]])
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[ 0.7408,  0.5885,  0.5094, -0.4898,  1.0571]],
       grad_fn=<EmbeddingBackward0>)


## RNN 層測試

- 測試: 輸入為二維 (L, $H_{in}$)

In [55]:
# 測試資料
input = torch.randn(5, 10)

# 建立 RNN 物件
rnn = nn.RNN(10, 20, 2)

# RNN 處理
output, hn = rnn(input)

# 顯示輸出及隱藏層的維度
print(output.shape, hn.shape)

torch.Size([5, 20]) torch.Size([2, 20])


- 測試: 輸入為三維 (L, N, $H_{in}$)

In [56]:
#  測試資料
input = torch.randn(5, 4, 10)

# 建立 RNN 物件
rnn = nn.RNN(10, 20, 2)

# RNN 處理
output, hn = rnn(input)

# 顯示輸出及隱藏層的維度
print(output.shape, hn.shape)

torch.Size([5, 4, 20]) torch.Size([2, 4, 20])


- RNN 的輸入可以有初始的隱藏層狀態 (h0), h0 最後一維需等於 $H_{out}$

In [57]:
# 測試資料
input = torch.randn(5, 3, 10)

# 建立 RNN 物件
rnn = nn.RNN(10, 20, 2)

# 隱藏層的輸入
h0 = torch.randn(2, 3, 20)

# RNN 處理
output, hn = rnn(input, h0)

# 顯示輸出及隱藏層的維度
print(output.shape, hn.shape)

torch.Size([5, 3, 20]) torch.Size([2, 3, 20])


## 分詞

In [58]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")

text = "Could have done better."
tokenizer(text)

['could', 'have', 'done', 'better', '.']

## 詞彙表處理

In [59]:
from torchtext.vocab import vocab
from collections import Counter, OrderedDict

# BOW 統計
counter = Counter(tokenizer(text))

# 依出現次數降冪排列
sorted_by_freq_tuples = sorted(counter.items(),
                                key = lambda x: x[1], reverse = True)

# 建立詞彙字典
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# 建立詞彙表物件, 並加一個未知單字(unknown)的索引值
vocab_object = torchtext.vocab.vocab(ordered_dict, specials = ["<unk>"])

# 設定詞彙表預設值為未知單字(unknown)的索引值
vocab_object.set_default_index(vocab_object["<unk>"])

# 測試
vocab_object["done"]

3

- 取得詞彙表的所有單字

In [60]:
vocab_object.get_itos()

['<unk>', 'could', 'have', 'done', 'better', '.']

- 取得詞彙表的單字個數

In [61]:
vocab_object.__len__()

6

- 字串標點

In [62]:
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

- 資料轉換函數

In [63]:
import string

def create_vocabulary(text_list):
    # 取得標點符號
    stopwords = list(string.punctuation)

    # 去除標點符號
    clean_text_list = []
    clean_tokens_list = []
    for text in text_list:
        tokens = tokenizer(text)
        clean_tokens = []
        for w in tokens:
            if w not in stopwords:
                clean_tokens.append(w)
        clean_tokens_list += clean_tokens
        clean_text_list.append(" ".join(clean_tokens))

    # 建立詞彙表物件
    counter = Counter(clean_tokens_list)
    sorted_by_freq_tuples = sorted(counter.items(),
                                    key = lambda x: x[1], reverse = True)
    ordered_dict = OrderedDict(sorted_by_freq_tuples)
    vocab_object = torchtext.vocab.vocab(ordered_dict, specials = ["<unk>"])
    vocab_object.set_default_index(vocab_object["<unk>"])

    # 將輸入字串轉為索引值: 自詞彙表物件查詢索引值
    clean_index_list = []
    for clean_tokens_list in clean_text_list:
        clean_index_list.append(
            vocab_object.lookup_indices(clean_tokens_list.split(" "))
        )
    
    # 輸出詞彙表物件, 去除標點符號的字串陣列, 字串陣列的索引值
    return vocab_object, clean_text_list, clean_index_list

## 測試

In [64]:
docs = ["Well done!",
        "Good work",
        "Great effort",
        "nice work",
        "Excellent!",
        "Weak",
        "Poor effort!",
        "not good",
        "poor work",
        "Could have done better."
        ]

vocab_object, clean_text_list, clean_index_list = create_vocabulary(docs)
vocab_object.get_itos()

['<unk>',
 'work',
 'done',
 'good',
 'effort',
 'poor',
 'well',
 'great',
 'nice',
 'excellent',
 'weak',
 'not',
 'could',
 'have',
 'better']

In [65]:
clean_text_list

['well done',
 'good work',
 'great effort',
 'nice work',
 'excellent',
 'weak',
 'poor effort',
 'not good',
 'poor work',
 'could have done better']

In [66]:
clean_index_list

[[6, 2],
 [3, 1],
 [7, 4],
 [8, 1],
 [9],
 [10],
 [5, 4],
 [11, 3],
 [5, 1],
 [12, 13, 2, 14]]

## 建立詞彙表: 整理輸入語句, 截長補短, 使語句長度一致

In [67]:
maxlen = 4

# 測試資料
docs = ["Well done!",
        "Good work",
        "Greate effort",
        "nice work",
        "Excellent!",
        "Weak",
        "Poor effort!",
        "not good",
        "poor work",
        "Could have done better"
        ]

vocab_object, clean_text_list, clean_index_list = create_vocabulary(docs)

# 若字串過長, 刪除多餘單字
clean_index_list = torchtext.functional.truncate(clean_index_list, maxlen)

# 若字串長度不足, 後面補 0
while len(clean_index_list[0]) < maxlen:
    clean_index_list[0] += [0]

torchtext.functional.to_tensor(clean_index_list, 0)

tensor([[ 6,  2,  0,  0],
        [ 3,  1,  0,  0],
        [ 7,  4,  0,  0],
        [ 8,  1,  0,  0],
        [ 9,  0,  0,  0],
        [10,  0,  0,  0],
        [ 5,  4,  0,  0],
        [11,  3,  0,  0],
        [ 5,  1,  0,  0],
        [12, 13,  2, 14]])

## 嵌入層轉換

In [68]:
# 測試
embeds = nn.Embedding(vocab_object.__len__(), 5)
X = torchtext.functional.to_tensor(clean_index_list, 0)
embed_output = embeds(X)
print(embed_output.shape)

torch.Size([10, 4, 5])


# 再加上完全連接層 (Linear)

In [70]:
class RecurrentNet(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim * maxlen, num_class)
        self.embed_dim = embed_dim
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text):
        embedded = self.embedding(text)
        out = embedded.reshape(embedded.size(0), -1)
        return self.fc(out)

model = RecurrentNet(vocab_object.__len__(), 10, 1)

## 模型訓練

In [71]:
# 定義 10 個語句的正面 (1) 或負面 (0) 的情緒
y = torch.FloatTensor([1,1,1,1,1,0,0,0,0,0])
X = torchtext.functional.to_tensor(clean_index_list, 0)

# 指定優化器, 損失函數
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# 模型訓練
for epoch in range(1000):
    outputs = model.forward(X)
    optimizer.zero_grad()
    loss = criterion(outputs.reshape(-1), y)
    loss.backward()
    optimizer.step()
    if epoch % 100 == 0:
        print(f"Epoch: {epoch}, loss: {loss.item():1.5f}")

Epoch: 0, loss: 1.00512
Epoch: 100, loss: 0.10893
Epoch: 200, loss: 0.00998
Epoch: 300, loss: 0.00079
Epoch: 400, loss: 0.00016
Epoch: 500, loss: 0.00005
Epoch: 600, loss: 0.00002
Epoch: 700, loss: 0.00000
Epoch: 800, loss: 0.00000
Epoch: 900, loss: 0.00000


## 訓練資料預測

In [72]:
model.eval()
model(X)

tensor([[ 9.9998e-01],
        [ 9.9993e-01],
        [ 1.0001e+00],
        [ 9.9989e-01],
        [ 1.0000e+00],
        [-1.7501e-05],
        [-3.0486e-04],
        [ 1.0513e-05],
        [ 3.7056e-04],
        [ 6.2585e-07]], grad_fn=<AddmmBackward0>)

## 測試資料預測

In [73]:
# 測試資料
test_docs = ["great efforts", "well done", "poor effort"]

# 轉成數值
clean_index_list = []
for text in test_docs:
    clean_index_list.append(vocab_object.lookup_indices(text.split(" ")))
while len(clean_index_list[0]) < maxlen:
    clean_index_list[0] += [0]

clean_index_list = torchtext.functional.truncate(clean_index_list, maxlen)
X = torchtext.functional.to_tensor(clean_index_list, 0)
model(X)

tensor([[ 1.0085e+00],
        [ 9.9998e-01],
        [-3.0486e-04]], grad_fn=<AddmmBackward0>)