# RNNLM

*Recurrent Neural Network Language Model*

In [1]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext import transforms
from torchtext.vocab import build_vocab_from_iterator
from torchvision.transforms import Compose
import spacy
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import List

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

2023-07-06 18:34:21.002420: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-06 18:34:22.102270: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-06 18:34:22.103188: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-06 18:34:22.103386: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been bu

device(type='cuda')


---

## Dataset

[日英中基本文データ - LANGUAGE MEDIA PROCESSING LAB](https://nlp.ist.i.kyoto-u.ac.jp/?%E6%97%A5%E8%8B%B1%E4%B8%AD%E5%9F%BA%E6%9C%AC%E6%96%87%E3%83%87%E3%83%BC%E3%82%BF)

In [2]:
df = pd.read_excel('data/JEC_basic_sentence_v1-3.xls', header=None)
df.columns = ['id', 'japanese', 'english', 'chinese']
print('num of data:', len(df))
df.head()

num of data: 5304


Unnamed: 0,id,japanese,english,chinese
0,#0001,Xではないかとつくづく疑問に思う,I often wonder if it might be X.,难道不会是X吗，我实在是感到怀疑。
1,#0002,Xがいいなといつも思います,I always think X would be nice.,我总觉得X不错。
2,#0003,それがあるようにいつも思います,It always seems like it is there.,我总觉得那好像是有的。
3,#0004,それが多すぎないかと正直思う,I honestly feel like there is too much.,老实说我觉得那太多了。
4,#0005,山田はみんなに好かれるタイプの人だと思う,I think that Yamada is the type everybody likes.,我想山田是受大家欢迎的那种人。


In [3]:
nlp = spacy.load('ja_core_news_sm') # 形態素解析器
data = df['japanese']
data =  [[token.text for token in nlp(sentence)] for sentence in data]


---

## 前処理

このままでは使えないので、前処理を施す

### 数値化

単語に整数値を割り当てる。

In [4]:
samples = data[:5]
samples

[['X', 'で', 'は', 'ない', 'か', 'と', 'つくづく', '疑問', 'に', '思う'],
 ['X', 'が', 'いい', 'な', 'と', 'いつも', '思い', 'ます'],
 ['それ', 'が', 'ある', 'よう', 'に', 'いつも', '思い', 'ます'],
 ['それ', 'が', '多', 'すぎ', 'ない', 'か', 'と', '正直', '思う'],
 ['山田', 'は', 'みんな', 'に', '好か', 'れる', 'タイプ', 'の', '人', 'だ', 'と', '思う']]

In [5]:
vocab = build_vocab_from_iterator(samples)

In [6]:
vocab.get_stoi()

{'疑問': 29,
 'ある': 12,
 '思い': 11,
 'ない': 8,
 'つくづく': 16,
 'タイプ': 23,
 'X': 4,
 '山田': 27,
 '思う': 3,
 'いつも': 5,
 'に': 2,
 'ます': 10,
 'は': 9,
 'が': 1,
 '人': 24,
 '正直': 28,
 'か': 6,
 'と': 0,
 'いい': 13,
 'それ': 7,
 'で': 17,
 'すぎ': 14,
 'の': 19,
 'よう': 21,
 'れる': 22,
 'みんな': 20,
 'だ': 15,
 '多': 25,
 'な': 18,
 '好か': 26}

In [7]:
t = transforms.VocabTransform(vocab)
t(samples[0])

[4, 17, 9, 8, 6, 0, 16, 29, 2, 3]


---

## DataLoader

### Transform

前処理をまとめる

In [8]:
pad, bos, eos, unk = '<pad>', '<bos>', '<eos>', '<unk>'
specials = [pad, bos, eos, unk]
vocab = build_vocab_from_iterator(data, specials=specials)
n_vocab = len(vocab)

transform = Compose([
    transforms.AddToken(bos, begin=True),
    transforms.AddToken(eos, begin=False),
    transforms.VocabTransform(vocab),
    transforms.ToTensor()
])

In [9]:
sample = samples[0]
sample

['X', 'で', 'は', 'ない', 'か', 'と', 'つくづく', '疑問', 'に', '思う']

In [10]:
transform(sample)

tensor([   1,   17,   14,   11,   29,   34,   18, 1617,  999,    7,  831,    2])

### DataLoader

In [11]:
class TextDataset(Dataset):
    def __init__(self, text_data, transform):
        self._n_samples = len(text_data)
        self.data = [transform(text) for text in text_data]

    def __getitem__(self, index):
        in_text = self.data[index][:-1]
        out_text = self.data[index][1:]
        return in_text, out_text

    def __len__(self):
        return self._n_samples

def to_padded_tensor(text_data: List[int], pad_value: int = 0) -> torch.Tensor:
    data = pad_sequence(text_data, batch_first=True, padding_value=pad_value)
    return data

def collate_fn(batch):
    in_text, out_text = zip(*batch)
    in_text = to_padded_tensor(in_text)
    out_text = to_padded_tensor(out_text)
    return in_text, out_text

dataset = TextDataset(data, transform)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)
sample = next(iter(dataloader))
sample[0][0], sample[1][0]

(tensor([  1, 109, 443,   5, 218,   4, 868,  45,  42,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0]),
 tensor([109, 443,   5, 218,   4, 868,  45,  42,   2,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0]))


---

## モデル構築

RNNLMの実装

In [12]:
class RNNLM(nn.Module):
    def __init__(self, n_vocab, n_hidden):
        super().__init__()
        self._eye = torch.eye(n_vocab)
        self.rnn = nn.RNN(n_vocab, n_hidden, batch_first=True)
        self.fc = nn.Linear(n_hidden, n_vocab)

    def forward(self, x, h=None):
        x = self._eye.to(x.device)[x]
        y, h = self.rnn(x, h)
        y = self.fc(y)
        return y, h


---

## 学習

In [13]:
criterion = nn.CrossEntropyLoss(ignore_index=vocab[pad])
def train(model, optimizer, n_epochs):
    model.train()
    for epoch in range(1, n_epochs + 1):
        for x, t in tqdm(dataloader, desc=f'Epoch {epoch}/{n_epochs}', disable=True):
            x, t = x.to(device), t.to(device)
            optimizer.zero_grad()
            y, _ = model(x)
            loss = criterion(y.reshape(-1, n_vocab), t.ravel())
            loss.backward()
            optimizer.step()
        print(f'{epoch}/{n_epochs} loss:', loss.item(), flush=True)

In [14]:
model = RNNLM(n_vocab, 1024).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [15]:
train(model, optimizer, 20)

1/20 loss: 6.084326267242432
2/20 loss: 5.663784980773926
3/20 loss: 5.520203590393066
4/20 loss: 4.741633892059326
5/20 loss: 4.393838405609131
6/20 loss: 4.088660717010498
7/20 loss: 3.863466262817383
8/20 loss: 3.3202033042907715
9/20 loss: 3.1980020999908447
10/20 loss: 3.0173561573028564
11/20 loss: 2.645954132080078
12/20 loss: 2.491687536239624
13/20 loss: 2.1107072830200195
14/20 loss: 1.9268825054168701
15/20 loss: 1.8158470392227173
16/20 loss: 1.5304409265518188
17/20 loss: 1.4114043712615967
18/20 loss: 1.248309850692749
19/20 loss: 1.2060176134109497
20/20 loss: 1.32364821434021



---

## 文章生成

In [16]:
@torch.no_grad()
def generate_sentence(
    model: nn.Module,
    start: str|None = None,
    max_len: int = 30
) -> str:
    model.eval()
    start = start or ''
    tokens =  [token.text for token in nlp(start)]
    tokens = [vocab[bos]] + [vocab[t] for t in tokens]
    tokens = torch.tensor(tokens).unsqueeze(0).to(device)

    y, h = model(tokens)
    next_token = y[0, -1].argmax()

    gen_tokens = [next_token.item()]
    for _ in range(max_len):
        y, h = model(next_token.reshape(1, 1), h)
        next_token = y[0, -1].argmax()
        gen_tokens.append(next_token.item())
        if next_token == vocab[eos]:
            break
    gen_tokens = [vocab.get_itos()[t] for t in gen_tokens[:-1]]
    return start + ''.join(gen_tokens)

In [17]:
generate_sentence(model, '私は')

'私はXを視野に入れるべきだ'

In [18]:
generate_sentence(model, '昨日')

'昨日、やっと今年の仕事が終わりました'