# RNN 簡單案例1

## 建立資料轉換函數

In [21]:
from sre_parse import Tokenizer
import string
from torchtext.vocab import vocab
from collections import Counter, OrderedDict
from torchtext.data.utils import get_tokenizer

# Module-level tokenizer shared by the cells below (torchtext "basic_english").
# Judging by the recorded outputs further down, it lower-cases the text and
# emits punctuation marks as separate tokens.
tokenizer = get_tokenizer("basic_english")

def create_vocabulary(text_list):
    """Build a frequency-ordered vocabulary and index-encode every sentence.

    Each sentence is split with the module-level ``tokenizer``; tokens that
    are a single ``string.punctuation`` character are discarded.

    Args:
        text_list: Iterable of raw sentences (``str``). It is consumed once.

    Returns:
        A tuple ``(vocab_object, clean_text_list, clean_index_list)``:

        * ``vocab_object`` -- torchtext vocabulary with ``"<unk>"`` as the
          only special token and as the default index for unknown tokens.
          Remaining tokens are ordered by descending frequency; ties keep
          first-occurrence order.
        * ``clean_text_list`` -- each sentence with punctuation removed, its
          tokens re-joined by single spaces.
        * ``clean_index_list`` -- one list of vocabulary indices per
          sentence, aligned with ``clean_text_list``.

    Note:
        A sentence that contains nothing but punctuation is encoded as an
        empty list. Earlier revisions re-split the joined text on ``" "``,
        which turned such a sentence into ``[""]`` and therefore into a
        single spurious ``<unk>`` index.
    """
    # A set gives O(1) membership tests; only exact single-character
    # punctuation tokens are filtered (not substrings).
    punctuation = set(string.punctuation)

    # Keep the cleaned tokens per sentence so they can be reused for both the
    # frequency count and the index lookup without re-splitting any text.
    tokens_per_text = [
        [token for token in tokenizer(text) if token not in punctuation]
        for text in text_list
    ]
    clean_text_list = [" ".join(tokens) for tokens in tokens_per_text]

    # most_common() sorts by count, descending, with a stable sort, so tokens
    # of equal frequency stay in the order they were first seen.
    counter = Counter(
        token for tokens in tokens_per_text for token in tokens
    )
    ordered_dict = OrderedDict(counter.most_common())

    vocab_object = vocab(ordered_dict, specials=["<unk>"])
    vocab_object.set_default_index(vocab_object["<unk>"])

    # Translate every cleaned sentence into its vocabulary indices.
    clean_index_list = [
        vocab_object.lookup_indices(tokens) for tokens in tokens_per_text
    ]

    return vocab_object, clean_text_list, clean_index_list

### 測試

In [26]:
# Sample sentences used to smoke-test create_vocabulary.
# The misspelling in the last sentence is intentional to keep: the recorded
# cell outputs below contain it.
docss = [
    "Well done!",
    "Good work",
    "Great effort",
    "nice work",
    "Excellent!!",
    "Weak",
    "not good",
    "poor work",
    "Could have done beeter.",
]

# Build the vocabulary, then list its tokens in index order.
vocab_objectt, clean_text_listt, clean_index_listt = create_vocabulary(docss)
vocab_objectt.get_itos()

['<unk>',
 'work',
 'done',
 'good',
 'well',
 'great',
 'effort',
 'nice',
 'excellent',
 'weak',
 'not',
 'poor',
 'could',
 'have',
 'beeter']

In [27]:
vocab_objectt.__len__()

15

In [28]:
clean_text_listt

['well done',
 'good work',
 'great effort',
 'nice work',
 'excellent',
 'weak',
 'not good',
 'poor work',
 'could have done beeter']

In [29]:
clean_index_listt

[[4, 2], [3, 1], [5, 6], [7, 1], [8], [9], [10, 3], [11, 1], [12, 13, 2, 14]]

## 建立詞彙表
- 整理輸入語句, 截長補短, 使語句長度一致

In [33]:
import torchtext

# Maximum number of tokens kept per sentence.
maxlen = 4

# Sample sentences.
docs = [
    "Well done!",
    "Good work",
    "Great effort",
    "nice work",
    "Excellent!",
    "Weak",
    "Poor effort!",
    "not good",
    "poor work",
    "Could have done better",
]

vocab_object, clean_text_list, clean_index_list = create_vocabulary(docs)

# Sentences longer than maxlen: drop the surplus tokens.
clean_index_list = torchtext.functional.truncate(clean_index_list, maxlen)

# Sentences shorter than maxlen: right-pad with 0 so every row has exactly
# maxlen entries. Every row is padded explicitly instead of padding only the
# first row and relying on to_tensor to stretch the others, which was hard to
# follow and failed with IndexError on an empty batch.
# NOTE: 0 is also the index of "<unk>" in this vocabulary, so padding and
# unknown tokens share one embedding row.
clean_index_list = [
    row + [0] * (maxlen - len(row)) for row in clean_index_list
]
torchtext.functional.to_tensor(clean_index_list, 0)


tensor([[ 6,  2,  0,  0],
        [ 3,  1,  0,  0],
        [ 7,  4,  0,  0],
        [ 8,  1,  0,  0],
        [ 9,  0,  0,  0],
        [10,  0,  0,  0],
        [ 5,  4,  0,  0],
        [11,  3,  0,  0],
        [ 5,  1,  0,  0],
        [12, 13,  2, 14]])

## 嵌入層轉換

### 先測試看看

In [34]:
# 測試
import torch.nn as nn

# Quick check of the embedding layer: one 5-dimensional vector per vocabulary
# entry, applied to the padded index batch.
embeds = nn.Embedding(num_embeddings=len(vocab_object), embedding_dim=5)
X = torchtext.functional.to_tensor(clean_index_list, padding_value=0)
embed_output = embeds(X)
# The embedding appends one trailing dimension of size 5 to the shape of X.
print(embed_output.shape)

torch.Size([10, 4, 5])


### 再加上完全連接層 (Linear)

In [35]:
from turtle import forward
import torch.nn as nn

class RecurrentNet(nn.Module):
    """Embedding layer followed by one linear layer over the flattened sentence.

    Despite its name, this class contains no recurrent layer: the embeddings
    of all token positions are concatenated and passed to a single
    ``nn.Linear``. The input width is therefore fixed at construction time.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        num_class: Number of output features of the linear layer.
        max_len: Number of token positions in every input row. When omitted,
            the module-level ``maxlen`` is used, which matches the previous
            behaviour.
    """

    def __init__(self, vocab_size, embed_dim, num_class, max_len=None):
        super().__init__()
        # Fall back to the notebook-wide setting only when the caller did not
        # pass an explicit length, so the class can also be used on its own.
        if max_len is None:
            max_len = maxlen
        self.max_len = max_len
        self.embed_dim = embed_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # The linear layer sees all positions at once: embed_dim * max_len.
        self.fc = nn.Linear(embed_dim * max_len, num_class)
        self.init_weights()

    def init_weights(self):
        """Re-initialise weights uniformly in [-0.5, 0.5] and zero the bias."""
        initrange = 0.5
        # nn.init works in-place without touching `.data` directly.
        nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        nn.init.uniform_(self.fc.weight, -initrange, initrange)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text):
        """Embed ``text`` and apply the linear layer.

        Args:
            text: Integer index tensor whose first dimension is the batch;
                the remaining positions must total ``max_len`` per row so
                that the flattened width matches the linear layer.

        Returns:
            Tensor with one row per input row and ``num_class`` columns.
        """
        embedded = self.embedding(text)
        # Collapse everything after the batch dimension into one axis,
        # giving a 2-D (batch, max_len * embed_dim) matrix.
        flattened = embedded.reshape(embedded.size(0), -1)
        return self.fc(flattened)

# One embedding row per vocabulary entry, 10-dimensional embeddings, and a
# single output value per sentence.
model = RecurrentNet(vocab_size=len(vocab_object), embed_dim=10, num_class=1)


## 模型訓練

In [36]:
import torch

# Sentiment labels for the 10 training sentences: 1 = positive, 0 = negative.
y = torch.tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0], dtype=torch.float32)
# Shorter rows are padded with 0.
X = torchtext.functional.to_tensor(clean_index_list, 0)

# Loss and optimizer: the single model output is regressed onto the 0/1
# labels with mean squared error.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# Training loop.
model.train()  # make sure training mode is active if this cell is re-run
for epoch in range(1000):
    optimizer.zero_grad()
    # Call the module itself rather than .forward() so that any registered
    # hooks run as part of the forward pass.
    outputs = model(X)
    # Flatten the (batch, 1) output so it lines up with y.
    loss = criterion(outputs.reshape(-1), y)
    loss.backward()
    optimizer.step()
    if epoch % 100 == 0:
        print(f"Epoch: {epoch}, loss: {loss.item(): 1.5f}")

Epoch: 0, loss:  0.40235
Epoch: 100, loss:  0.04761
Epoch: 200, loss:  0.00702
Epoch: 300, loss:  0.00127
Epoch: 400, loss:  0.00015
Epoch: 500, loss:  0.00001
Epoch: 600, loss:  0.00000
Epoch: 700, loss:  0.00000
Epoch: 800, loss:  0.00000
Epoch: 900, loss:  0.00000


## 訓練資料預測 
- 模型評估

In [37]:
# Switch the model to evaluation mode, then score X as it was last assigned
# (this section's heading says these are predictions on the training data).
# The bare call on the last line is what the notebook displays as cell output.
model.eval()
model(X)

tensor([[ 1.0000e+00],
        [ 1.0000e+00],
        [ 1.0000e+00],
        [ 1.0000e+00],
        [ 1.0000e+00],
        [-1.3784e-07],
        [-7.0408e-07],
        [ 7.0781e-08],
        [ 1.3486e-06],
        [ 0.0000e+00]], grad_fn=<AddmmBackward0>)

## 測試資料預測

In [38]:
# Sentences to score with the trained model.
test_docs = [
    "great effort",
    "well done",
    "poor effort",
]

# Encode the text exactly as create_vocabulary does for training data:
# same tokenizer, same punctuation filter. A bare split(" ") would leave
# capitalised or punctuated words unmatched and map them to "<unk>".
punctuation = set(string.punctuation)
clean_index_list = [
    vocab_object.lookup_indices(
        [token for token in tokenizer(text) if token not in punctuation]
    )
    for text in test_docs
]

# Cut to maxlen, then right-pad every row with 0 so the batch has the fixed
# width the model's linear layer was built for.
clean_index_list = torchtext.functional.truncate(clean_index_list, maxlen)
clean_index_list = [
    row + [0] * (maxlen - len(row)) for row in clean_index_list
]
X = torchtext.functional.to_tensor(clean_index_list, 0)

model(X)

tensor([[ 1.0000e+00],
        [ 1.0000e+00],
        [-6.7800e-07]], grad_fn=<AddmmBackward0>)