# RNNLM

*Recurrent Neural Network Language Model*

In [1]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchtext import transforms
from torchtext.vocab import build_vocab_from_iterator
from torchvision.transforms import Compose
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import List


---

## Dataset

In [2]:
df = pd.read_excel('data/JEC_basic_sentence_v1-3.xls', header=None)
df.columns = ['id', 'japanese', 'english', 'chinese']
print('num of data: ', len(df))
df.head()

num of data:  5304


Unnamed: 0,id,japanese,english,chinese
0,#0001,Xではないかとつくづく疑問に思う,I often wonder if it might be X.,难道不会是X吗，我实在是感到怀疑。
1,#0002,Xがいいなといつも思います,I always think X would be nice.,我总觉得X不错。
2,#0003,それがあるようにいつも思います,It always seems like it is there.,我总觉得那好像是有的。
3,#0004,それが多すぎないかと正直思う,I honestly feel like there is too much.,老实说我觉得那太多了。
4,#0005,山田はみんなに好かれるタイプの人だと思う,I think that Yamada is the type everybody likes.,我想山田是受大家欢迎的那种人。


In [3]:
nlp = spacy.load('ja_core_news_sm')
text = df['japanese']
text =  [[token.text for token in nlp(sentence)] for sentence in text]

In [4]:
text

[['X', 'で', 'は', 'ない', 'か', 'と', 'つくづく', '疑問', 'に', '思う'],
 ['X', 'が', 'いい', 'な', 'と', 'いつも', '思い', 'ます'],
 ['それ', 'が', 'ある', 'よう', 'に', 'いつも', '思い', 'ます'],
 ['それ', 'が', '多', 'すぎ', 'ない', 'か', 'と', '正直', '思う'],
 ['山田', 'は', 'みんな', 'に', '好か', 'れる', 'タイプ', 'の', '人', 'だ', 'と', '思う'],
 ['〜', 'と', '誰', 'か', 'が', '思っ', 'た'],
 ['X', 'は', 'しんどい', 'こと', 'だ', 'と', '思い', 'ます'],
 ['X', 'は', '時間', 'の', '問題', 'と', '思い', 'ます'],
 ['X', 'は', '今後', 'の', '課題', 'と', '思い', 'ます'],
 ['それ', 'は', '桃山', '時代', '前後', 'の', '作品', 'だ', 'と', '思い', 'ます'],
 ['それ', 'は', '昭和', '初期', 'の', '映画', 'だ', 'と', '思い', 'ます'],
 ['勝とう', 'など', 'と', '誰', 'が', '思う', 'か'],
 ['X', 'の', '方', 'が', 'どう', 'か', 'と', '思う'],
 ['彼', 'が', 'でき', 'て', 'なかっ', 'た', 'よう', 'に', '思う'],
 ['彼', 'が', '常', 'に', '上', 'の', '存在', 'で', 'あっ', 'て', '欲しい', 'と', '思い', 'ます'],
 ['過言', 'で', 'は', 'ない', 'と', '思い', 'ます'],
 ['X', 'の', '方', 'が', '納得', 'いく', 'と', '思い', 'ます'],
 ['私',
  'は',
  '、',
  '五十',
  '時間',
  'も',
  'かかる',
  'と',
  '、',
  '時間',
  'が',
  '惜しい',
  '気',
  

#### 前処理

In [5]:
len(max(text, key=len))

27

In [6]:
pad, bos, eos, unk = '<pad>', '<bos>', '<eos>', '<unk>'
max_len = 30
specials = [pad, bos, eos, unk]
vocab = build_vocab_from_iterator(text, specials=specials)
n_vocab = len(vocab)

transform = Compose([
    transforms.Truncate(max_len),
    transforms.AddToken(bos, begin=True),
    transforms.AddToken(eos, begin=False),
    transforms.VocabTransform(vocab),
    transforms.ToTensor(),
    transforms.PadTransform(max_len, vocab[pad])
])

In [7]:
ex = text[0]
ex

['X', 'で', 'は', 'ない', 'か', 'と', 'つくづく', '疑問', 'に', '思う']

In [8]:
transform(ex)

tensor([   1,   17,   14,   11,   29,   34,   18, 1617,  999,    7,  831,    2,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0])

##### Vocab

数値化

In [9]:
ex

['X', 'で', 'は', 'ない', 'か', 'と', 'つくづく', '疑問', 'に', '思う']

In [10]:
v = build_vocab_from_iterator([ex], specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [11]:
v.get_stoi()

{'は': 11,
 '思う': 12,
 'で': 7,
 'に': 10,
 'と': 8,
 'X': 4,
 'ない': 9,
 'つくづく': 6,
 '<eos>': 3,
 '<unk>': 0,
 'か': 5,
 '疑問': 13,
 '<bos>': 2,
 '<pad>': 1}

In [12]:
t = transforms.VocabTransform(v)
tokens = t(ex)
tokens

[4, 7, 11, 9, 5, 8, 6, 13, 10, 12]

##### Padding

長さの調整

In [13]:
ex

['X', 'で', 'は', 'ない', 'か', 'と', 'つくづく', '疑問', 'に', '思う']

In [15]:
t = transforms.PadTransform(20, 0)
tokens = torch.tensor(tokens)
t(tokens)

tensor([ 4,  7, 11,  9,  5,  8,  6, 13, 10, 12,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0])

Dataset

In [16]:
class TextDataset(Dataset):
    def __init__(self, text_data, transform):
        self.data = torch.stack([transform(text) for text in text_data])
        self.n_samples = len(text)

    def __getitem__(self, index):
        in_text = self.data[index, :-1]
        out_text = self.data[index, 1:]
        return in_text, out_text

    def __len__(self):
        return self.n_samples

In [17]:
dataset = TextDataset(text, transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

## Model

In [30]:
class RNNLM(nn.Module):
    def __init__(
        self,
        n_in_vocab: int,
        n_out_vocab: int,
        n_hidden: int,
    ):
        super().__init__()
        self._eye = torch.eye(n_in_vocab)
        self.rnn = nn.RNN(n_in_vocab, n_hidden, batch_first=True)
        self.fc = nn.Linear(n_hidden, n_out_vocab)

    def forward(self, x, h=None):
        x = self._eye[x]
        y, h = self.rnn(x, h)
        y = self.fc(y)
        return y, h

### Train

In [31]:
criterion = nn.CrossEntropyLoss(ignore_index=vocab[pad])
def train(model, optimizer, n_epochs):
    model.train()
    for epoch in range(1, n_epochs + 1):
        for x, y in tqdm(dataloader, desc=f'Epoch {epoch}/{n_epochs}'):
            optimizer.zero_grad()
            y_pred, _ = model(x)
            loss = criterion(y_pred.reshape(-1, n_vocab), y.reshape(-1))
            loss.backward()
            optimizer.step()
        print('loss:', loss.item(), flush=True)

In [32]:
model = RNNLM(n_vocab, n_vocab, 1024)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [33]:
train(model, optimizer, 5)

Epoch 1/5: 100%|██████████| 166/166 [00:24<00:00,  6.69it/s]

loss: 5.946131229400635



Epoch 2/5: 100%|██████████| 166/166 [00:22<00:00,  7.29it/s]

loss: 5.56324577331543



Epoch 3/5: 100%|██████████| 166/166 [00:25<00:00,  6.55it/s]

loss: 5.42683219909668



Epoch 4/5: 100%|██████████| 166/166 [00:25<00:00,  6.39it/s]

loss: 4.808581829071045



Epoch 5/5: 100%|██████████| 166/166 [00:24<00:00,  6.68it/s]

loss: 4.397670269012451





## gen

In [66]:
@torch.no_grad()
def generate_sentence(
    model: nn.Module,
    start: str|None = None,
    max_len: int = 30
) -> str:
    model.eval()
    start = start or ''
    tokens =  [token.text for token in nlp(start)]
    tokens = [vocab[bos]] + [vocab[t] for t in tokens]
    tokens = torch.tensor(tokens).unsqueeze(0)

    y, h = model(tokens)
    next_token = y[0, -1].argmax()

    gen_tokens = [next_token.item()]
    for _ in range(max_len):
        y_pred, h = model(next_token.reshape(1, 1), h)
        next_token = y_pred[0, -1].argmax()
        gen_tokens.append(next_token.item())
        if next_token == vocab[eos]:
            break
    gen_tokens = [vocab.get_itos()[t] for t in gen_tokens[:-1]]
    return start + ''.join(gen_tokens)
    

In [71]:
generate_sentence(model, '昨日')

'昨日の人が、、人を見た'