# 簡單的 RNN 模型
- 與 RNN_Basic.ipynb 的差異在於模型的建立

## 載入相關套件

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import numpy as np

## 嵌入層測試
- nn.Embedding(num_embedding, embedding_dim)

In [2]:
x = torch.LongTensor([[0, 1, 2], [3, 4, 5]])
embeds = nn.Embedding(6, 5)
print(embeds(x))

tensor([[[-1.3802, -0.8863,  1.5304, -1.1923, -0.5984],
         [ 0.1555,  0.9591, -0.2323,  0.5426,  0.6681],
         [ 0.6925,  1.2024,  0.5174,  0.5054, -0.8329]],

        [[-0.5477, -0.0777,  0.1827,  0.0597, -1.0250],
         [ 2.4584,  0.9517, -1.7216, -0.7901, -0.8735],
         [-1.2764, -0.0849,  0.9628,  0.8394, -0.2454]]],
       grad_fn=<EmbeddingBackward0>)


- 顯示嵌入層的起始權重

In [3]:
embeds.weight

Parameter containing:
tensor([[-1.3802, -0.8863,  1.5304, -1.1923, -0.5984],
        [ 0.1555,  0.9591, -0.2323,  0.5426,  0.6681],
        [ 0.6925,  1.2024,  0.5174,  0.5054, -0.8329],
        [-0.5477, -0.0777,  0.1827,  0.0597, -1.0250],
        [ 2.4584,  0.9517, -1.7216, -0.7901, -0.8735],
        [-1.2764, -0.0849,  0.9628,  0.8394, -0.2454]], requires_grad=True)

- 輸入改為 1 ~ 6

In [4]:
x = torch.LongTensor([[1, 2, 3], [4, 5, 6]])
embeds = nn.Embedding(7, 5)
print(embeds(x))

tensor([[[ 1.1017, -0.4459, -1.2649,  1.1693, -0.4569],
         [-1.2596,  2.0345, -1.0866,  0.0611, -1.1104],
         [-0.5033,  0.4887,  0.8963, -2.0216, -1.7435]],

        [[ 0.9040,  1.1129,  1.0517,  0.1697,  0.6378],
         [-0.8464,  0.3780,  1.4029, -0.6041,  0.4422],
         [-0.2779, -0.3103,  0.9005, -0.5982,  0.3340]]],
       grad_fn=<EmbeddingBackward0>)


In [5]:
embeds = nn.Embedding(6, 5)
x1 = torch.LongTensor([[0, 1, 2]])
x2 = torch.LongTensor([[3, 4]])
print(embeds(x1))
print(embeds(x2))
embeds.weight

tensor([[[ 0.8223,  1.6487, -0.3123,  0.1081,  0.3252],
         [ 0.6250, -0.8517,  0.7489,  0.3979,  0.6393],
         [ 0.2944, -0.0113, -2.0318,  0.9561,  0.9945]]],
       grad_fn=<EmbeddingBackward0>)
tensor([[[ 1.4096,  0.2583, -0.2184, -0.8004, -0.0685],
         [-0.9140, -1.9244,  0.1771,  0.2556,  0.3434]]],
       grad_fn=<EmbeddingBackward0>)


Parameter containing:
tensor([[ 0.8223,  1.6487, -0.3123,  0.1081,  0.3252],
        [ 0.6250, -0.8517,  0.7489,  0.3979,  0.6393],
        [ 0.2944, -0.0113, -2.0318,  0.9561,  0.9945],
        [ 1.4096,  0.2583, -0.2184, -0.8004, -0.0685],
        [-0.9140, -1.9244,  0.1771,  0.2556,  0.3434],
        [ 2.2102, -0.4404,  0.8149,  0.0420, -1.4677]], requires_grad=True)

In [6]:
embeds = nn.Embedding(6, 5, 5)
x1 = torch.LongTensor([[0, 1, 2]])
x2 = torch.LongTensor([[3, 4]])
x3 = torch.LongTensor([[3, 4]])
print(embeds(x1))
print(embeds(x2))
print(embeds(x3))
embeds.weight

tensor([[[-0.3085, -0.1052,  0.7499,  1.2802, -0.5273],
         [ 1.5887,  0.3831,  0.7766,  1.3853,  0.5812],
         [ 1.1188,  2.2343, -0.7616, -1.3601, -0.3304]]],
       grad_fn=<EmbeddingBackward0>)
tensor([[[-2.4371, -1.8138,  0.1802,  0.1987,  0.4644],
         [ 0.2488, -0.9374,  0.1968,  0.0477, -0.0340]]],
       grad_fn=<EmbeddingBackward0>)
tensor([[[-2.4371, -1.8138,  0.1802,  0.1987,  0.4644],
         [ 0.2488, -0.9374,  0.1968,  0.0477, -0.0340]]],
       grad_fn=<EmbeddingBackward0>)


Parameter containing:
tensor([[-0.3085, -0.1052,  0.7499,  1.2802, -0.5273],
        [ 1.5887,  0.3831,  0.7766,  1.3853,  0.5812],
        [ 1.1188,  2.2343, -0.7616, -1.3601, -0.3304],
        [-2.4371, -1.8138,  0.1802,  0.1987,  0.4644],
        [ 0.2488, -0.9374,  0.1968,  0.0477, -0.0340],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]], requires_grad=True)

- 英文單字輸入

In [7]:
# 測試資料
word_to_ix = {"hello": 0, "world": 1}

# 詞彙表 (vocabulary) 含兩個單字, 轉換為 5 維向量
enbeds = nn.Embedding(2, 5)

# 測試 hello
lookup_tensor = torch.LongTensor([word_to_ix["hello"]])
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[-0.3085, -0.1052,  0.7499,  1.2802, -0.5273]],
       grad_fn=<EmbeddingBackward0>)


## RNN 層的測試
- 輸入為二維 (batch_first 是 default, 預設 batch_first = False, => 輸入是二維(不含 batch_size))

In [10]:
# 測試資料
# (seq_len, input_size)
input = torch.randn(5, 10)

# 建立 RNN 物件
# RNN 可以單獨使用 (亦可接在嵌入層後)
# (input_size, hidden_size, num_layers)
rnn = nn.RNN(10, 20, 2)

# RNN 處理
# output : (seq_len, hidden_size)
# hidden : (num_layer, hidden_size)
output, hn = rnn(input)

# 顯示輸出及隱藏層的維度
print("output: ", output.shape)
print("hidden layer dim. :", hn.shape)

output:  torch.Size([5, 20])
hidden layer dim. : torch.Size([2, 20])


- 輸入為三維 (L, batch_size, $H_{in}$)

In [11]:
# 測試資料
# (seq_len, batch_size, input_size)
input = torch.randn(5, 4, 10)

# 建立 RNN 物件
# RNN 可以單獨使用 (亦可接在嵌入層後)
# (input_size, hidden_size, num_layers)
rnn = nn.RNN(10, 20, 2)

# RNN 處理
# output : (seq_len, batch_size, hidden_size)
# hidden : (num_layer, batch_size, hidden_size)
output, hn = rnn(input)

# 顯示輸出及隱藏層維度
print("output: ", output.shape)
print("hidden layer dim. :", hn.shape)

output:  torch.Size([5, 4, 20])
hidden layer dim. : torch.Size([2, 4, 20])


- RNN 的輸入可以有初始的隱藏層狀態 (h0), h0 最後一維需等於 $H_{out}$

In [13]:
# 測試資料
# (seq_len, batch_size, input_size)
input = torch.randn(5, 3, 10)

# 建立 RNN 物件
# RNN 可以單獨使用 (亦可接在嵌入層後)
# (input_size, hidden_size, num_layers)
rnn = nn.RNN(10, 20, 2)

# 隱藏層的輸入
# (num_layers, batch_size, hidden_size)
h0 = torch.randn(2, 3, 20)

# RNN 處理
# output : (seq_len, batch_size, hidden_size)
# hidden : (num_layer, batch_size, hidden_size)
output, hn = rnn(input, h0)

# 顯示輸出及隱藏層的維度
print("output: ", output.shape)
print("hidden layer dim. :", hn.shape)

output:  torch.Size([5, 3, 20])
hidden layer dim. : torch.Size([2, 3, 20])


## 接著介紹 PyTorch 前置處理功能

### 分詞

In [14]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")
text = "Could have done better"
tokenizer(text)


['could', 'have', 'done', 'better']

### 詞彙表處理

In [19]:
from torchtext.vocab import vocab
from collections import Counter, OrderedDict

# BOW 統計
counter = Counter(tokenizer(text))

# 依出現次數降冪排列
sort_by_freq_tuples = sorted(counter.items(),
                            key = lambda x: x[1], reverse = True)

# 建立詞彙字典
ordered_dict = OrderedDict(sort_by_freq_tuples)
print(ordered_dict)

# 建立詞彙表物件, 並加一個未知單字 (unknow) 的索引值
vocab_object = torchtext.vocab.vocab(ordered_dict, specials = ["<unk>"])

# 設定詞彙表預設值為未知單字(unknown)的索引值
vocab_object.set_default_index(vocab_object["<unk>"])

# 測試
vocab_object['done']

OrderedDict([('could', 1), ('have', 1), ('done', 1), ('better', 1)])


3

- 取得詞彙表的所有單字

In [20]:
vocab_object.get_itos()

['<unk>', 'could', 'have', 'done', 'better']

- 取得詞彙表的單字個數

In [21]:
vocab_object.__len__()

5

- 字串標點符號

In [22]:
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'