# RNNLM

*Recurrent Neural Network Language Model*

In [None]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchtext import transforms
from torchtext.vocab import build_vocab_from_iterator
from torchvision.transforms import Compose
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import List


---

## Dataset

In [None]:
# Read the sentence table from the Excel file. header=None makes pandas treat
# the first row as data, so the column names are assigned explicitly below.
df = pd.read_excel('data/JEC_basic_sentence_v1-3.xls', header=None)
df.columns = ['id', 'japanese', 'english', 'chinese']
print('num of data: ', len(df))
# The last expression is rendered as the cell output: the first rows of the table.
df.head()

num of data:  5304


Unnamed: 0,id,japanese,english,chinese
0,#0001,Xではないかとつくづく疑問に思う,I often wonder if it might be X.,难道不会是X吗，我实在是感到怀疑。
1,#0002,Xがいいなといつも思います,I always think X would be nice.,我总觉得X不错。
2,#0003,それがあるようにいつも思います,It always seems like it is there.,我总觉得那好像是有的。
3,#0004,それが多すぎないかと正直思う,I honestly feel like there is too much.,老实说我觉得那太多了。
4,#0005,山田はみんなに好かれるタイプの人だと思う,I think that Yamada is the type everybody likes.,我想山田是受大家欢迎的那种人。


In [None]:
# Load one spaCy pipeline per language. tokenize() selects a pipeline by the
# `nlp_<language code>` naming convention, so keep these variable names.
nlp_en = spacy.load('en_core_web_sm')
nlp_ja = spacy.load('ja_core_news_sm')
def tokenize(data: List[str], l='en') -> List[List[str]]:
    """Split every sentence in ``data`` into a list of token strings.

    Args:
        data: Iterable of sentences (for example a list or a pandas Series of str).
        l: Language code. The pipeline used is the module-level object named
            ``nlp_<l>`` (``nlp_en`` and ``nlp_ja`` in this notebook).

    Returns:
        One list of token texts per input sentence, in input order.

    Raises:
        ValueError: If no module-level ``nlp_<l>`` pipeline exists.
    """
    # Look the pipeline up by name instead of eval(): the naming convention is
    # unchanged, but the argument can never be executed as arbitrary code.
    pipeline_name = f'nlp_{l}'
    nlp = globals().get(pipeline_name)
    if nlp is None:
        raise ValueError(
            f'no pipeline loaded for language {l!r} '
            f'(expected a global named {pipeline_name!r})'
        )
    return [[token.text for token in nlp(sentence)] for sentence in data]

In [None]:
# Tokenize each side of the corpus with the spaCy pipeline for its language.
text_ja = tokenize(df['japanese'], l='ja')
text_en = tokenize(df['english'], l='en')

In [None]:
# Inspect the tokenized English sentences.
# NOTE(review): this renders the entire list; a slice such as text_en[:5]
# would keep the saved output short.
text_en

[['I', 'often', 'wonder', 'if', 'it', 'might', 'be', 'X.'],
 ['I', 'always', 'think', 'X', 'would', 'be', 'nice', '.'],
 ['It', 'always', 'seems', 'like', 'it', 'is', 'there', '.'],
 ['I', 'honestly', 'feel', 'like', 'there', 'is', 'too', 'much', '.'],
 ['I',
  'think',
  'that',
  'Yamada',
  'is',
  'the',
  'type',
  'everybody',
  'likes',
  '.'],
 ['Someone', 'thought', 'that', '〜'],
 ['X', 'seems', 'like', 'it', "'s", 'really', 'tough', '.'],
 ['I', 'think', 'X', 'is', 'just', 'a', 'matter', 'of', 'time', '.'],
 ['I',
  'think',
  'that',
  'X',
  'will',
  'become',
  'an',
  'issue',
  'in',
  'the',
  'future',
  '.'],
 ['I',
  'think',
  'this',
  'was',
  'made',
  'around',
  'the',
  'Momoyama',
  'Period',
  '.'],
 ['I',
  'think',
  'this',
  'movie',
  'is',
  'from',
  'the',
  'early',
  'Showa',
  'Period',
  '.'],
 ['Who', 'thinks', 'they', "'re", 'going', 'to', 'win', '?'],
 ['I', 'wonder', 'about', 'X.'],
 ['I', 'do', "n't", 'think', 'he', 'was', 'able', 'do', 'it

#### 前処理

In [None]:
# Number of tokens in the longest Japanese sentence.
max(len(sentence) for sentence in text_ja)

27

In [None]:
pad, bos, eos, unk = '<pad>', '<bos>', '<eos>', '<unk>'
max_len = 30  # final length of every encoded sequence, <bos>/<eos> included
specials = [pad, bos, eos, unk]


def build_vocab(tokenized_text):
    """Build a vocabulary over ``tokenized_text`` with the shared special tokens.

    Args:
        tokenized_text: Iterable of token lists.

    Returns:
        A torchtext vocabulary whose default index is the ``<unk>`` index.
    """
    vocab = build_vocab_from_iterator(tokenized_text, specials=specials)
    # Without a default index, looking up an out-of-vocabulary token raises
    # RuntimeError instead of falling back to <unk>.
    vocab.set_default_index(vocab[unk])
    return vocab


def build_transform(vocab):
    """Create the tokens -> padded index tensor pipeline for one language.

    Args:
        vocab: Vocabulary used for the index lookup and for the padding index.

    Returns:
        A callable that maps a list of tokens to a 1-D tensor of length ``max_len``.
    """
    return Compose([
        # Reserve two slots for <bos>/<eos>: PadTransform only pads, so truncating
        # to max_len here could yield max_len + 2 items and ragged batches.
        transforms.Truncate(max_len - 2),
        transforms.AddToken(bos, begin=True),
        transforms.AddToken(eos, begin=False),
        transforms.VocabTransform(vocab),
        transforms.ToTensor(),
        transforms.PadTransform(max_len, vocab[pad]),
    ])


vocab_ja = build_vocab(text_ja)
vocab_en = build_vocab(text_en)

transform_ja = build_transform(vocab_ja)
transform_en = build_transform(vocab_en)

In [None]:
# Take the third tokenized Japanese sentence as a small worked example.
ex2 = text_ja[2]
ex2

['それ', 'が', 'ある', 'よう', 'に', 'いつも', '思い', 'ます']

In [None]:
# Run the example sentence through the full Japanese pipeline
# (truncate, add <bos>/<eos>, index lookup, tensor conversion, padding).
transform_ja(ex2)

tensor([  1,  20,   4, 253, 104,   7,  88, 161,   9,   2,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0])

##### Vocab

数値化

In [None]:
# build_vocab_from_iterator expects an iterable of token lists. Passing ex2
# directly makes every token string be iterated character by character, which
# builds a character-level vocabulary in which whole tokens are missing.
v = build_vocab_from_iterator([ex2], specials=['<unk>', '<pad>', '<bos>', '<eos>'])
# Map out-of-vocabulary tokens to <unk> instead of raising RuntimeError.
v.set_default_index(v['<unk>'])

In [None]:
# Show the token -> index mapping of the demo vocabulary.
v.get_stoi()

{'よ': 14,
 '<unk>': 0,
 '<pad>': 1,
 'あ': 5,
 '<eos>': 3,
 'す': 8,
 '<bos>': 2,
 'に': 11,
 'い': 4,
 'う': 6,
 'れ': 16,
 'る': 15,
 'も': 13,
 'が': 7,
 '思': 17,
 'そ': 9,
 'つ': 10,
 'ま': 12}

In [None]:
# Convert the example tokens to their vocabulary indices.
# NOTE(review): the lookup raises RuntimeError when a token is missing from `v`
# and `v` has no default index set.
t = transforms.VocabTransform(v)
tokens = t(ex2)
tokens

RuntimeError: Token それ not found and default index is not set
Exception raised from __getitem__ at /__w/text/text/pytorch/text/torchtext/csrc/vocab.cpp:43 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7f57704784d7 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x7f577044236b in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #2: torchtext::Vocab::__getitem__(c10::basic_string_view<char> const&) const + 0x384 (0x7f5676f0dde4 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/lib/python3.11/site-packages/torchtext/lib/libtorchtext.so)
frame #3: <unknown function> + 0x1e5a0 (0x7f5676df95a0 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/lib/python3.11/site-packages/torchtext/_torchtext.so)
frame #4: <unknown function> + 0x3bd67 (0x7f5676e16d67 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/lib/python3.11/site-packages/torchtext/_torchtext.so)
frame #5: <unknown function> + 0x1fd3c7 (0x5563b14eb3c7 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #6: _PyObject_MakeTpCall + 0x264 (0x5563b14c7d34 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #7: _PyEval_EvalFrameDefault + 0x756 (0x5563b14d4386 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #8: <unknown function> + 0x22c8ed (0x5563b151a8ed in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #9: <unknown function> + 0x22c635 (0x5563b151a635 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #10: _PyEval_EvalFrameDefault + 0x4469 (0x5563b14d8099 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #11: _PyFunction_Vectorcall + 0x173 (0x5563b14facd3 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #12: _PyObject_FastCallDictTstate + 0x65 (0x5563b14cb4b5 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #13: _PyObject_Call_Prepend + 0x69 (0x5563b1502d99 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #14: <unknown function> + 0x2ead79 (0x5563b15d8d79 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #15: _PyObject_MakeTpCall + 0x264 (0x5563b14c7d34 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #16: _PyEval_EvalFrameDefault + 0x756 (0x5563b14d4386 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #17: <unknown function> + 0x2a342e (0x5563b159142e in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #18: PyEval_EvalCode + 0x9f (0x5563b1590a8f in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #19: <unknown function> + 0x2bcade (0x5563b15aaade in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #20: _PyEval_EvalFrameDefault + 0x3969 (0x5563b14d7599 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #21: <unknown function> + 0x2b7b39 (0x5563b15a5b39 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #22: _PyEval_EvalFrameDefault + 0x3274 (0x5563b14d6ea4 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #23: <unknown function> + 0x2b7b39 (0x5563b15a5b39 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #24: _PyEval_EvalFrameDefault + 0x3274 (0x5563b14d6ea4 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #25: <unknown function> + 0x2b7b39 (0x5563b15a5b39 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #26: <unknown function> + 0x2ba3ea (0x5563b15a83ea in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #27: _PyEval_EvalFrameDefault + 0x3528 (0x5563b14d7158 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #28: <unknown function> + 0x22c8ed (0x5563b151a8ed in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #29: <unknown function> + 0x22c683 (0x5563b151a683 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #30: PyObject_Call + 0x9d (0x5563b150503d in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #31: _PyEval_EvalFrameDefault + 0x4469 (0x5563b14d8099 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #32: <unknown function> + 0x2b7b39 (0x5563b15a5b39 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #33: _PyEval_EvalFrameDefault + 0x3274 (0x5563b14d6ea4 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #34: <unknown function> + 0x2b7b39 (0x5563b15a5b39 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #35: _PyEval_EvalFrameDefault + 0x3274 (0x5563b14d6ea4 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #36: <unknown function> + 0x2b7b39 (0x5563b15a5b39 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #37: _PyEval_EvalFrameDefault + 0x3274 (0x5563b14d6ea4 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #38: <unknown function> + 0x2b7b39 (0x5563b15a5b39 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #39: _PyEval_EvalFrameDefault + 0x3274 (0x5563b14d6ea4 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #40: <unknown function> + 0x2b7b39 (0x5563b15a5b39 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #41: <unknown function> + 0x79c7 (0x7f57855679c7 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/lib/python3.11/lib-dynload/_asyncio.cpython-311-x86_64-linux-gnu.so)
frame #42: <unknown function> + 0x1fb7db (0x5563b14e97db in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #43: <unknown function> + 0x194e6f (0x5563b1482e6f in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #44: <unknown function> + 0x196eac (0x5563b1484eac in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #45: <unknown function> + 0x1f376f (0x5563b14e176f in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #46: _PyEval_EvalFrameDefault + 0x85da (0x5563b14dc20a in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #47: <unknown function> + 0x2a342e (0x5563b159142e in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #48: PyEval_EvalCode + 0x9f (0x5563b1590a8f in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #49: <unknown function> + 0x2bcade (0x5563b15aaade in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #50: <unknown function> + 0x1f376f (0x5563b14e176f in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #51: PyObject_Vectorcall + 0x31 (0x5563b14e1651 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #52: _PyEval_EvalFrameDefault + 0x756 (0x5563b14d4386 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #53: _PyFunction_Vectorcall + 0x173 (0x5563b14facd3 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #54: <unknown function> + 0x2cef3b (0x5563b15bcf3b in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #55: Py_RunMain + 0x14a (0x5563b15bc90a in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #56: Py_BytesMain + 0x39 (0x5563b157f179 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)
frame #57: __libc_start_main + 0xf3 (0x7f5786b07083 in /lib/x86_64-linux-gnu/libc.so.6)
frame #58: <unknown function> + 0x291015 (0x5563b157f015 in /home/komiya/.pyenv/versions/miniforge3-4.10.3-10/envs/nlp/bin/python)


##### Padding

長さの調整

In [None]:
# ex1 was displayed without being assigned anywhere in the notebook, so this cell
# fails on a fresh kernel. The recorded output equals the first English sentence.
ex1 = text_en[0]
ex1

['I', 'often', 'wonder', 'if', 'it', 'might', 'be', 'X.']

In [None]:
# Pad the demo index sequence to length 10 with the vocabulary's own <pad> index.
# In `v` index 0 is <unk>, so padding with a literal 0 would insert <unk> ids.
t = transforms.PadTransform(10, v['<pad>'])
# Leave `tokens` untouched so re-running the cell does not wrap a tensor in
# torch.tensor() a second time.
token_ids = torch.tensor(tokens)
t(token_ids)

tensor([ 4, 14, 16, 10, 11, 12,  5,  8,  0,  0])

#### Dataset

In [None]:
class TextDataset(Dataset):
    """Paired-sequence dataset that transforms each side when an item is read.

    Args:
        in_text: Indexable collection of input samples.
        out_text: Indexable collection of output samples, indexed like ``in_text``.
        in_transform: Callable applied to an input sample.
        out_transform: Callable applied to an output sample.
    """

    def __init__(self, in_text, out_text, in_transform, out_transform):
        self.in_text, self.out_text = in_text, out_text
        self.in_transform, self.out_transform = in_transform, out_transform
        # The dataset size is taken from the input side only.
        self.n_samples = len(in_text)

    def __len__(self):
        """Return the number of samples."""
        return self.n_samples

    def __getitem__(self, index):
        """Return the transformed ``(input, output)`` pair stored at ``index``."""
        # Fetch both raw samples before transforming either of them.
        raw_in, raw_out = self.in_text[index], self.out_text[index]
        return self.in_transform(raw_in), self.out_transform(raw_out)

In [None]:
# Pair the Japanese token lists (input) with the English ones (output); each side
# is encoded by its own transform when an item is read.
dataset = TextDataset(text_ja, text_en, transform_ja, transform_en)
# Serve the pairs in shuffled mini-batches of 32.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

### Train

In [None]:
def train(model, optimizer, n_epochs, loss_fn=None, loader=None):
    """Train ``model`` for ``n_epochs`` passes over the batches.

    Args:
        model: Module called as ``model(x)`` on every input batch.
        optimizer: Object whose ``zero_grad()`` and ``step()`` wrap each backward pass.
        n_epochs: Number of passes over the data.
        loss_fn: Callable ``loss_fn(y_pred, y)`` returning a scalar loss tensor.
            Defaults to the notebook-level ``criterion``.
        loader: Iterable of ``(x, y)`` batches. Defaults to the notebook-level
            ``dataloader``.

    Returns:
        A list with the mean batch loss of every epoch (``nan`` for an epoch
        that saw no batches).
    """
    # Fall back to the notebook globals only when nothing was passed in, so
    # existing three-argument calls keep working.
    loss_fn = criterion if loss_fn is None else loss_fn
    loader = dataloader if loader is None else loader

    model.train()
    history = []
    for epoch in range(1, n_epochs + 1):
        total_loss, n_batches = 0.0, 0
        for x, y in tqdm(loader, desc=f'Epoch {epoch}/{n_epochs}'):
            optimizer.zero_grad()
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            n_batches += 1
        # Report the epoch mean rather than the last batch's loss, which is noisy
        # and would be unbound if the loader produced no batches.
        epoch_loss = total_loss / n_batches if n_batches else float('nan')
        history.append(epoch_loss)
        print('loss:', epoch_loss, flush=True)
    return history