# seq2seq

In [None]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchtext import transforms
from torchtext.vocab import build_vocab_from_iterator
from torchvision.transforms import Compose
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import List


---

## Dataset

In [None]:
# Load the sentence corpus from the Excel workbook. header=None tells pandas
# not to treat the first row as column names, so every row is read as data.
df = pd.read_excel('data/JEC_basic_sentence_v1-3.xls', header=None)
# Name the four columns explicitly, since none were taken from the file.
df.columns = ['id', 'japanese', 'english', 'chinese']
print('num of data: ', len(df))
# Last expression of the cell: the notebook renders the first rows as a preview.
df.head()

num of data:  5304


Unnamed: 0,id,japanese,english,chinese
0,#0001,Xではないかとつくづく疑問に思う,I often wonder if it might be X.,难道不会是X吗，我实在是感到怀疑。
1,#0002,Xがいいなといつも思います,I always think X would be nice.,我总觉得X不错。
2,#0003,それがあるようにいつも思います,It always seems like it is there.,我总觉得那好像是有的。
3,#0004,それが多すぎないかと正直思う,I honestly feel like there is too much.,老实说我觉得那太多了。
4,#0005,山田はみんなに好かれるタイプの人だと思う,I think that Yamada is the type everybody likes.,我想山田是受大家欢迎的那种人。


In [None]:
# spaCy pipelines used for tokenization. tokenize() finds them by the
# `nlp_<language code>` naming convention, so these names must keep that form.
nlp_en = spacy.load('en_core_web_sm')
nlp_ja = spacy.load('ja_core_news_sm')
def tokenize(data: List[str], l='en') -> List[List[str]]:
    """Split each sentence in *data* into a list of token strings.

    The pipeline is selected by language code: ``l='en'`` uses the
    module-level ``nlp_en``, ``l='ja'`` uses ``nlp_ja``, and in general
    ``l='xx'`` uses a module-level callable named ``nlp_xx``.

    Args:
        data: Iterable of sentences to tokenize.
        l: Language code suffix of the module-level ``nlp_<code>`` pipeline.

    Returns:
        One list of token texts per input sentence, in input order.

    Raises:
        ValueError: If no module-level ``nlp_<l>`` pipeline is defined.
    """
    # Plain name lookup instead of eval(): same `nlp_<code>` convention, but
    # a crafted or mistyped code can no longer execute arbitrary expressions.
    pipeline = globals().get('nlp_' + l)
    if pipeline is None:
        raise ValueError(
            f"unsupported language {l!r}: no pipeline named 'nlp_{l}' is defined"
        )
    return [[token.text for token in pipeline(sentence)] for sentence in data]

In [None]:
# Tokenize both sides of the parallel corpus: the Japanese column with the
# Japanese pipeline and the English column with the English pipeline.
text_ja = tokenize(df['japanese'], l='ja')
text_en = tokenize(df['english'], l='en')

In [None]:
# Special tokens shared by both vocabularies.
pad, bos, eos, unk = '<pad>', '<bos>', '<eos>', '<unk>'
# Fixed length of every encoded sequence, *including* <bos> and <eos>.
max_len = 30
specials = [pad, bos, eos, unk]
vocab_ja = build_vocab_from_iterator(text_ja, specials=specials)
vocab_en = build_vocab_from_iterator(text_en, specials=specials)
# Map out-of-vocabulary tokens to <unk>. Without a default index the vocab
# raises on any token it has not seen, which makes the <unk> special useless.
vocab_ja.set_default_index(vocab_ja[unk])
vocab_en.set_default_index(vocab_en[unk])

# Pipeline: token list -> truncated -> <bos> ... <eos> -> ids -> tensor -> padded.
# Truncate to max_len - 2 so that the sequence is at most max_len once <bos>
# and <eos> are added. PadTransform only pads shorter sequences and never
# shortens longer ones, so truncating to max_len would let long sentences
# come out at max_len + 2 and break batching of equal-length tensors.
transform_ja = Compose([
    transforms.Truncate(max_len - 2),
    transforms.AddToken(bos, begin=True),
    transforms.AddToken(eos, begin=False),
    transforms.VocabTransform(vocab_ja),
    transforms.ToTensor(),
    transforms.PadTransform(max_len, vocab_ja[pad])
])

# Same pipeline for the English side, using the English vocabulary.
transform_en = Compose([
    transforms.Truncate(max_len - 2),
    transforms.AddToken(bos, begin=True),
    transforms.AddToken(eos, begin=False),
    transforms.VocabTransform(vocab_en),
    transforms.ToTensor(),
    transforms.PadTransform(max_len, vocab_en[pad])
])

In [None]:
class TextDataset(Dataset):
    """Paired dataset that applies one transform per side on access.

    Holds two index-aligned sequences (inputs and outputs) together with the
    callables used to encode each of them. The number of samples is taken
    from the input sequence.
    """

    def __init__(self, in_text, out_text, in_transform, out_transform):
        """Store the raw sequences and their per-item transforms."""
        self.in_text, self.out_text = in_text, out_text
        self.in_transform, self.out_transform = in_transform, out_transform
        self.n_samples = len(in_text)

    def __getitem__(self, index):
        """Return ``(transformed input, transformed output)`` for *index*."""
        # Fetch both raw items before transforming either of them.
        raw_in, raw_out = self.in_text[index], self.out_text[index]
        return self.in_transform(raw_in), self.out_transform(raw_out)

    def __len__(self):
        """Return the number of samples (length of the input sequence)."""
        return self.n_samples

In [None]:
class RNNLM(nn.Module):
    """RNN over one-hot encoded token ids with a per-step linear output layer.

    Args:
        n_in_vocab: Size of the input vocabulary (width of the one-hot vectors).
        n_out_vocab: Size of the output vocabulary (width of the output scores).
        n_hidden: Hidden state size of the RNN.
    """

    def __init__(
        self,
        n_in_vocab: int,
        n_out_vocab: int,
        n_hidden: int,
    ):
        # Must run before any submodule is assigned; otherwise nn.Module
        # raises "cannot assign module before Module.__init__() call".
        super().__init__()
        # Identity matrix used as a one-hot lookup table. It is a buffer so it
        # follows the module across .to()/.cuda(), and non-persistent so it
        # is not written into the state_dict.
        self.register_buffer('_eye', torch.eye(n_in_vocab), persistent=False)
        self.rnn = nn.RNN(n_in_vocab, n_hidden, batch_first=True)
        self.fc = nn.Linear(n_hidden, n_out_vocab)

    def forward(self, x):
        """Compute output-vocabulary scores for every input position.

        Args:
            x: Integer tensor of token ids, typically ``(batch, seq_len)``
                since the RNN is built with ``batch_first=True``.

        Returns:
            Tensor shaped like ``x`` with a trailing ``n_out_vocab`` dimension.
        """
        # Row i of the identity matrix is the one-hot vector for token id i.
        x = self._eye[x]
        # The final hidden state is not needed, only the per-step outputs.
        y, _ = self.rnn(x)
        y = self.fc(y)
        return y