# 與案例1差異在訓練模型的不同

## Pytorch 前置處理

### 分詞

In [2]:
from torchtext.data.utils import get_tokenizer

# Build torchtext's "basic_english" tokenizer; later cells reuse `tokenizer` and `text`.
tokenizer = get_tokenizer("basic_english")
text = "Could have done better"
# Last expression of the cell: the notebook displays the resulting token list
# (the recorded output shows lower-cased word tokens).
tokenizer(text)

['could', 'have', 'done', 'better']

### 詞彙表處理

In [3]:
from torchtext.vocab import vocab
from collections import Counter, OrderedDict

# Bag-of-words: count how many times each token occurs in `text`
counter = Counter(tokenizer(text))
print(counter.items())

# (token, count) pairs ordered from most to least frequent; tokens with equal
# counts keep the order in which they were first seen
sorted_by_freq_tuples = counter.most_common()

# Ordered token -> count mapping, the input format accepted by vocab()
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# Build the vocabulary object and register "<unk>" as a special token for
# unknown words
vocab_object = vocab(ordered_dict, specials=["<unk>"])

# Any token missing from the vocabulary resolves to the "<unk>" index
unk_index = vocab_object["<unk>"]
vocab_object.set_default_index(unk_index)

# Lookup of the raw string "Done": per the recorded output this prints the
# default (unknown) index
print(vocab_object["Done"])

# Lookup after tokenizing: the tokenizer returns a list, so take element 0;
# per the recorded output this prints the index of "done"
a = tokenizer("Done")
print(vocab_object[a[0]])

dict_items([('could', 1), ('have', 1), ('done', 1), ('better', 1)])
0
3


- 取得詞彙表的所有單字

In [4]:
# Display the vocabulary's tokens as a list ordered by index
vocab_object.get_itos()

['<unk>', 'could', 'have', 'done', 'better']

- 取得詞彙表單字個數

In [5]:
# Display the number of entries in the vocabulary (the "<unk>" special included)
len(vocab_object)

5

- 字串標點符號

In [6]:
import string

# Display the punctuation characters defined by the standard `string` module
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## 資料轉換函數

In [21]:
import string
import torch
import torchtext
import torch.nn as nn
from torchtext.vocab import vocab
from collections import Counter, OrderedDict

def create_vocabulary(text_list):
    """Build a vocabulary from raw sentences and encode each sentence as indices.

    Every sentence is split with the module-level ``tokenizer``; tokens that
    are exactly one of the characters in ``string.punctuation`` are dropped.
    The remaining tokens are counted over the whole corpus and handed to
    torchtext's ``vocab`` in descending frequency order, with ``"<unk>"``
    registered as a special token and used as the default index for
    out-of-vocabulary lookups.

    Args:
        text_list: Iterable of raw sentence strings.

    Returns:
        A tuple ``(vocab_object, clean_text_list, clean_index_list)``:

        * ``vocab_object`` - the torchtext vocabulary object.
        * ``clean_text_list`` - one string per input sentence, its kept
          tokens joined by single spaces.
        * ``clean_index_list`` - one list of vocabulary indices per input
          sentence. A sentence with no kept tokens yields an empty list.
    """
    # A set gives O(1) membership tests while filtering tokens.
    punctuation = set(string.punctuation)

    # Keep the per-sentence token lists so they can be encoded directly later.
    # Re-splitting the joined text would turn an empty sentence into [""],
    # which would be encoded as a spurious "<unk>" index.
    tokens_per_doc = [
        [token for token in tokenizer(text) if token not in punctuation]
        for text in text_list
    ]
    clean_text_list = [" ".join(tokens) for tokens in tokens_per_doc]

    # Corpus-wide token frequencies, most frequent first. Ties keep
    # first-seen order, exactly like a stable descending sort on the count.
    counter = Counter(token for tokens in tokens_per_doc for token in tokens)
    ordered_dict = OrderedDict(counter.most_common())

    vocab_object = vocab(ordered_dict, specials=["<unk>"])
    vocab_object.set_default_index(vocab_object["<unk>"])

    # Encode each sentence as a list of vocabulary indices.
    clean_index_list = [
        vocab_object.lookup_indices(tokens) for tokens in tokens_per_doc
    ]

    return vocab_object, clean_text_list, clean_index_list
    

### 測試

In [14]:
# Ten short review sentences used to exercise create_vocabulary()
docs = [
    "Well done!!",
    "Good work",
    "Great effort",
    "nice work",
    "Excellent!!",
    "Weak",
    "Poor effort!",
    "not good",
    "poor work",
    "Could have done better.",
]

# Build the vocabulary plus the cleaned and index-encoded sentences
vocab_object, clean_text_list, clean_index_list = create_vocabulary(docs)

# Last expression of the cell: display the vocabulary tokens in index order
vocab_object.get_itos()

['<unk>',
 'work',
 'done',
 'good',
 'effort',
 'poor',
 'well',
 'great',
 'nice',
 'excellent',
 'weak',
 'not',
 'could',
 'have',
 'better']

In [15]:
clean_text_list

['well done',
 'good work',
 'great effort',
 'nice work',
 'excellent',
 'weak',
 'poor effort',
 'not good',
 'poor work',
 'could have done better']

In [16]:
clean_index_list

[[6, 2],
 [3, 1],
 [7, 4],
 [8, 1],
 [9],
 [10],
 [5, 4],
 [11, 3],
 [5, 1],
 [12, 13, 2, 14]]

### 建立詞彙表

In [17]:
# Maximum number of tokens kept per sentence
maxlen = 4

# Sample sentences
docs = [
    "Well done!!",
    "Good work",
    "Great effort",
    "nice work",
    "Excellent!!",
    "Weak",
    "Poor effort!",
    "not good",
    "poor work",
    "Could have done better.",
]

vocab_object, clean_text_list, clean_index_list = create_vocabulary(docs)

# Sentences longer than maxlen lose their extra trailing tokens
clean_index_list = torchtext.functional.truncate(clean_index_list, maxlen)

# Right-pad EVERY sentence with 0 up to maxlen. Padding only the first row and
# relying on to_tensor() to fill in the rest hides the intent and fails on an
# empty batch. Note that 0 is also the index of "<unk>" in this vocabulary.
clean_index_list = [
    indices + [0] * (maxlen - len(indices)) for indices in clean_index_list
]

# Last expression of the cell: display the (num_sentences, maxlen) index tensor.
# `clean_index_list` itself is reused by later cells.
torchtext.functional.to_tensor(clean_index_list, 0)

tensor([[ 6,  2,  0,  0],
        [ 3,  1,  0,  0],
        [ 7,  4,  0,  0],
        [ 8,  1,  0,  0],
        [ 9,  0,  0,  0],
        [10,  0,  0,  0],
        [ 5,  4,  0,  0],
        [11,  3,  0,  0],
        [ 5,  1,  0,  0],
        [12, 13,  2, 14]])

### 嵌入層轉換

In [18]:
# Embedding table with one 5-dimensional vector per vocabulary entry
embeds = nn.Embedding(len(vocab_object), 5)

# Index tensor built from the padded sentences; 0 is the padding value
X = torchtext.functional.to_tensor(clean_index_list, 0)

# Replace every index with its embedding vector and show the resulting shape
embed_output = embeds(X)
print(embed_output.shape)

torch.Size([10, 4, 5])


## 再加上完全連接層
- 採用另一種寫法, 使用 EmbeddingBag

In [22]:
# NOTE(review): this looks like an accidental IDE auto-import. Nothing in the
# visible code uses turtle's `forward` (the model defines its own `forward`
# method), and importing turtle pulls in tkinter. Confirm and remove.
from turtle import forward


class RecurrentNet(nn.Module):
    """Sentence classifier made of an ``nn.EmbeddingBag`` and one linear layer.

    Despite its name, the network contains no recurrent layer: the embedding
    bag turns each row of token indices into a single ``embed_dim`` vector,
    and the linear layer maps that vector to ``num_class`` outputs.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        num_class: Number of output values per sentence.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embed_dim = embed_dim
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices from U(-0.5, 0.5) and zero the linear bias."""
        bound = 0.5
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.fc.weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text):
        """Return the linear layer's output for a batch of token-index rows."""
        pooled = self.embedding(text)
        scores = self.fc(pooled)
        return scores

# One embedding row per vocabulary entry, 10-dimensional embeddings, one output
model = RecurrentNet(len(vocab_object), 10, 1)

## 模型訓練

In [23]:
# Sentiment label for each of the 10 sentences: positive (1) or negative (0)
y = torch.FloatTensor([1,1,1,1,1,0,0,0,0,0])
X = torchtext.functional.to_tensor(clean_index_list, 0)

# Loss function and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# Make sure the model is in training mode, e.g. when this cell is re-run after
# the evaluation cell has called model.eval()
model.train()

# Training loop
for epoch in range(1000):
    optimizer.zero_grad()
    # Call the module itself rather than .forward() so that nn.Module's
    # __call__ machinery (hooks) is not bypassed
    outputs = model(X)
    # Flatten the model output so it lines up with the 1-D label tensor
    loss = criterion(outputs.reshape(-1), y)
    loss.backward()
    optimizer.step()
    if epoch % 100 == 0:
        print(f"Epoch: {epoch}, loss: {loss.item():1.5f}")

Epoch: 0, loss: 0.58175
Epoch: 100, loss: 0.25229
Epoch: 200, loss: 0.13172
Epoch: 300, loss: 0.06933
Epoch: 400, loss: 0.02938
Epoch: 500, loss: 0.00905
Epoch: 600, loss: 0.00235
Epoch: 700, loss: 0.00071
Epoch: 800, loss: 0.00026
Epoch: 900, loss: 0.00011


## 模型評估

In [24]:
# Switch the model to evaluation mode
model.eval()
# Last expression of the cell: display the model's outputs for the training inputs X
model(X)

tensor([[ 1.0001e+00],
        [ 9.8575e-01],
        [ 1.0049e+00],
        [ 1.0056e+00],
        [ 9.9968e-01],
        [-2.1247e-03],
        [-7.9737e-03],
        [ 7.4796e-03],
        [ 8.6606e-03],
        [-7.1809e-05]], grad_fn=<AddmmBackward0>)

## 測試新資料

In [25]:
# New sentences to score
test_docs = [
    "great effort",
    "well done",
    "poor effort",
]

# Encode with the same preprocessing used when the vocabulary was built
# (tokenizer + punctuation removal) so that casing or punctuation in new text
# does not silently map words to "<unk>"
punctuation = set(string.punctuation)
clean_index_list = [
    vocab_object.lookup_indices(
        [token for token in tokenizer(text) if token not in punctuation]
    )
    for text in test_docs
]

# Cut sentences longer than maxlen, then right-pad every sentence with 0 so
# all rows have exactly maxlen entries
clean_index_list = torchtext.functional.truncate(clean_index_list, maxlen)
clean_index_list = [
    indices + [0] * (maxlen - len(indices)) for indices in clean_index_list
]

X = torchtext.functional.to_tensor(clean_index_list, 0)

# Last expression of the cell: display the model's outputs for the new sentences
model(X)


tensor([[ 1.0049],
        [ 1.0001],
        [-0.0080]], grad_fn=<AddmmBackward0>)