# RNNLM

*Recurrent Neural Network Language Model*

In [1]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchtext import transforms
from torchtext.vocab import build_vocab_from_iterator
from torchvision.transforms import Compose
import spacy
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')


---

## Dataset

[日英中基本文データ - LANGUAGE MEDIA PROCESSING LAB](https://nlp.ist.i.kyoto-u.ac.jp/?%E6%97%A5%E8%8B%B1%E4%B8%AD%E5%9F%BA%E6%9C%AC%E6%96%87%E3%83%87%E3%83%BC%E3%82%BF)

In [2]:
df = pd.read_excel('data/JEC_basic_sentence_v1-3.xls', header=None)
df.columns = ['id', 'japanese', 'english', 'chinese']
print('num of data:', len(df))
df.head()

num of data: 5304


Unnamed: 0,id,japanese,english,chinese
0,#0001,Xではないかとつくづく疑問に思う,I often wonder if it might be X.,难道不会是X吗，我实在是感到怀疑。
1,#0002,Xがいいなといつも思います,I always think X would be nice.,我总觉得X不错。
2,#0003,それがあるようにいつも思います,It always seems like it is there.,我总觉得那好像是有的。
3,#0004,それが多すぎないかと正直思う,I honestly feel like there is too much.,老实说我觉得那太多了。
4,#0005,山田はみんなに好かれるタイプの人だと思う,I think that Yamada is the type everybody likes.,我想山田是受大家欢迎的那种人。


In [None]:
nlp = spacy.load('ja_core_news_sm') # 形態素解析器
data = df['japanese']
data =  [[token.text for token in nlp(sentence)] for sentence in data]


---

## 前処理

このままでは使えないので、前処理を施す

### 数値化

単語に整数値を割り当てる。

In [4]:
samples = data[:5]
samples

[['X', 'で', 'は', 'ない', 'か', 'と', 'つくづく', '疑問', 'に', '思う'],
 ['X', 'が', 'いい', 'な', 'と', 'いつも', '思い', 'ます'],
 ['それ', 'が', 'ある', 'よう', 'に', 'いつも', '思い', 'ます'],
 ['それ', 'が', '多', 'すぎ', 'ない', 'か', 'と', '正直', '思う'],
 ['山田', 'は', 'みんな', 'に', '好か', 'れる', 'タイプ', 'の', '人', 'だ', 'と', '思う']]

In [5]:
vocab = build_vocab_from_iterator(samples)

In [None]:
vocab.get_stoi()

In [7]:
t = transforms.VocabTransform(vocab)
t(samples[0])

[4, 17, 9, 8, 6, 0, 16, 29, 2, 3]

### Padding

指定した長さに揃える

In [8]:
x = torch.tensor([1, 2, 3])
t = transforms.PadTransform(10, 0)
t(x)

tensor([1, 2, 3, 0, 0, 0, 0, 0, 0, 0])


---

## DataLoader

### Transform

前処理をまとめる

In [9]:
pad, bos, eos, unk = '<pad>', '<bos>', '<eos>', '<unk>'
specials = [pad, bos, eos, unk]
max_len = len(max(data, key=len)) + 2
vocab = build_vocab_from_iterator(data, specials=specials)
n_vocab = len(vocab)

transform = Compose([
    transforms.Truncate(max_len),
    transforms.AddToken(bos, begin=True),
    transforms.AddToken(eos, begin=False),
    transforms.VocabTransform(vocab),
    transforms.ToTensor(),
    transforms.PadTransform(max_len, vocab[pad])
])

In [10]:
sample = samples[0]
sample

['X', 'で', 'は', 'ない', 'か', 'と', 'つくづく', '疑問', 'に', '思う']

In [11]:
transform(sample)

tensor([   1,   17,   14,   11,   29,   34,   18, 1617,  999,    7,  831,    2,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0])

### DataLoader

In [12]:
class TextDataset(Dataset):
    def __init__(self, text_data, transform):
        self._n_samples = len(text_data)
        self.data = torch.stack([transform(text) for text in text_data])

    def __getitem__(self, index):
        in_text = self.data[index, :-1]
        out_text = self.data[index, 1:]
        return in_text, out_text

    def __len__(self):
        return self._n_samples

dataset = TextDataset(data, transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


---

## モデル構築

RNNLMの実装

In [13]:
class RNNLM(nn.Module):
    def __init__(self, n_vocab, n_hidden):
        super().__init__()
        self._eye = torch.eye(n_vocab)
        self.rnn = nn.RNN(n_vocab, n_hidden, batch_first=True)
        self.fc = nn.Linear(n_hidden, n_vocab)

    def forward(self, x, h=None):
        x = self._eye.to(x.device)[x]
        y, h = self.rnn(x, h)
        y = self.fc(y)
        return y, h


---

## 学習

In [14]:
criterion = nn.CrossEntropyLoss(ignore_index=vocab[pad])
def train(model, optimizer, n_epochs):
    model.train()
    for epoch in range(1, n_epochs + 1):
        for x, t in tqdm(dataloader, desc=f'Epoch {epoch}/{n_epochs}', disable=True):
            x, t = x.to(device), t.to(device)
            optimizer.zero_grad()
            y, _ = model(x)
            loss = criterion(y.reshape(-1, n_vocab), t.ravel())
            loss.backward()
            optimizer.step()
        print(f'{epoch}/{n_epochs} loss:', loss.item(), flush=True)

In [15]:
model = RNNLM(n_vocab, 1024).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [16]:
train(model, optimizer, 10)

1/10 loss: 6.108832836151123
2/10 loss: 5.316364288330078
3/10 loss: 4.568809986114502
4/10 loss: 4.36100435256958
5/10 loss: 3.747121810913086
6/10 loss: 4.063011646270752
7/10 loss: 3.583484649658203
8/10 loss: 3.1323118209838867
9/10 loss: 3.0378060340881348
10/10 loss: 2.7572286128997803



---

## 文章生成

In [17]:
@torch.no_grad()
def generate_sentence(
    model: nn.Module,
    start: str|None = None,
    max_len: int = 30
) -> str:
    model.eval()
    start = start or ''
    tokens =  [token.text for token in nlp(start)]
    tokens = [vocab[bos]] + [vocab[t] for t in tokens]
    tokens = torch.tensor(tokens).unsqueeze(0).to(device)

    y, h = model(tokens)
    next_token = y[0, -1].argmax()

    gen_tokens = [next_token.item()]
    for _ in range(max_len):
        y, h = model(next_token.reshape(1, 1), h)
        next_token = y[0, -1].argmax()
        gen_tokens.append(next_token.item())
        if next_token == vocab[eos]:
            break
    gen_tokens = [vocab.get_itos()[t] for t in gen_tokens[:-1]]
    return start + ''.join(gen_tokens)

In [18]:
generate_sentence(model, '私は')

'私はXを時々念頭においています'

In [19]:
generate_sentence(model, '昨日')

'昨日、やっと、子どもの人が多い'