#Drive mount

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Install

In [2]:
# =========Tokenizers=========
# Janome
!pip install janome
# Mecab
#インストール
!pip install mecab-python3
#辞書インストール
!pip install unidic-lite



# Import

In [3]:
import os
import MeCab
import janome
from janome.tokenizer import Tokenizer
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from itertools import chain
from torchtext.vocab import vocab
import torchtext.transforms as T
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, Tensor
import torch.optim as optim
import math
from torch.nn import Transformer

# Configs

In [4]:
DEVICE = "cuda"
BATCH_SIZE = 8
MAX_LEN = 16
EMBEDDING_DIMENSION = 512
ATTENTION_HEADER_NUM = 8

# Load data

In [5]:
os.chdir("/content/drive/MyDrive/git_project/Transformers")
df = pd.read_excel("./data/JEC_basic_sentence_v1-3/JEC_basic_sentence_v1-3.xls", header = None)

train_df, valid_df = train_test_split(df, test_size=0.2)

In [6]:
train_data = train_df[1].tolist()
train_target = train_df[2].tolist()

valid_data = valid_df[1].tolist()
valid_target = valid_df[2].tolist()

print(f"train len:{len(train_data)} valid len:{len(valid_data)}")

train len:4243 valid len:1061


# Define Tokenizers

In [7]:
# Japanese Tokenizers
mecab = MeCab.Tagger("-Owakati")
janome = Tokenizer()

def jp_tokenizer(text, tokenizer="mecab"):
    if tokenizer == "mecab":
        return mecab.parse(text).split()
    elif tokenizer == "janome":
        return [tok for tok in janome.tokenize(text, wakati=True)]
    else:
        raise Exception("Undefined jp tokenizer")

# English Tokenizers
en_spacy = spacy.load('en_core_web_sm')

def en_tokenize(text, tokenizer="spacy"):
    if tokenizer == "spacy":
        return [t.text for t in en_spacy.tokenizer(text)]
    else:
        raise Exception("Undefined en tokenizer")

tokenized_train_data = [jp_tokenizer(data) for data in train_data]
tokenized_target_data = [en_tokenize(data) for data in train_target]

# Define vocab

In [8]:
jp_counter = Counter()
jp_counter.update(chain(*tokenized_train_data))
jp_vocab = vocab(jp_counter, specials=(["<unk>", "<pad>", "<bos>", "<eos>"]))
jp_vocab.set_default_index(jp_vocab["<unk>"])

en_counter = Counter()
en_counter.update(chain(*tokenized_target_data))
en_vocab = vocab(en_counter, specials=(["<unk>", "<pad>", "<bos>", "<eos>"]))
en_vocab.set_default_index(en_vocab["<unk>"])

# Define text transform

In [9]:
text_max_len = MAX_LEN - 2

jp_text_transform = T.Sequential(
  T.VocabTransform(jp_vocab),
  T.Truncate(text_max_len),
  T.AddToken(token=jp_vocab['<bos>'], begin=True),
  T.AddToken(token=jp_vocab['<eos>'], begin=False),
  T.ToTensor(),
  T.PadTransform(MAX_LEN, jp_vocab['<pad>'])
)

en_text_transform = T.Sequential(
  T.VocabTransform(en_vocab),
  T.Truncate(text_max_len),
  T.AddToken(token=en_vocab['<bos>'], begin=True),
  T.AddToken(token=en_vocab['<eos>'], begin=False),
  T.ToTensor(),
  T.PadTransform(MAX_LEN, en_vocab['<pad>'])
)

# Define dataset

In [10]:
class Dataset(Dataset):
    def __init__(
        self,
        texts,
        targets,
        jp_text_transform,
        en_text_transform,
        ):

        self.texts = texts
        self.targets = targets
        self.jp_text_transform = jp_text_transform
        self.en_text_transform = en_text_transform
        self.build()

    def build(self):
        self.texts = [jp_tokenizer(data) for data in self.texts]
        self.targets = [jp_tokenizer(data) for data in self.targets]

    def __getitem__(self, i):
        text = self.texts[i]
        original_text = text
        text = self.jp_text_transform([text]).squeeze()

        target = self.targets[i]
        target = self.en_text_transform([target]).squeeze()


        # dec input should be started by <bos>
        dec_input = target[:-1]
        dec_target = target[1:]
        data = {"text": text, "dec_input": dec_input, "dec_target": dec_target}
        return data

    def __len__(self):
        return len(self.texts)

# Define model architecture

In [11]:
# # https://qiita.com/gensal/items/e1c4a34dbfd0d7449099
class PositionalEncoding(nn.Module):

  def __init__(self, dim, dropout = 0.1, max_len = 5000):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout)
    position = torch.arange(max_len).unsqueeze(1).to(DEVICE)
    div_term = torch.exp(torch.arange(0, dim, 2) * (-math.log(10000.0) / dim)).to(DEVICE)
    pe = torch.zeros(max_len, 1, dim).to(DEVICE)
    pe[:, 0, 0::2] = torch.sin(position * div_term)
    pe[:, 0, 1::2] = torch.cos(position * div_term)
    self.register_buffer("pe", pe)

  def forward(self, x):
    x = x + self.pe[:x.size(0)]
    return self.dropout(x)

class TorchTransformer(nn.Module):
    def __init__(self,
                 emb_size: int,
                 nhead: int,
                 enc_vocab_size: int,
                 dec_vocab_size: int):
        super(TorchTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size, nhead=nhead, batch_first=True)
        self.generator = nn.Linear(emb_size, dec_vocab_size)
        self.enc_tok_emb = nn.Embedding(enc_vocab_size, emb_size)
        self.dec_tok_emb = nn.Embedding(dec_vocab_size, emb_size)
        self.pe = PositionalEncoding(emb_size)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor):
        src_emb = self.pe(self.enc_tok_emb(src))
        tgt_emb = self.pe(self.dec_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, src_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.pe(
                            self.enc_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.pe(
                          self.dec_tok_emb(tgt)), memory,
                          tgt_mask)

# Load Dataset & DataLoader

In [12]:
train_dataset = Dataset(
    train_data,
    train_target,
    jp_text_transform,
    en_text_transform
)
train_data_loader = DataLoader(train_dataset,
                          batch_size=BATCH_SIZE,
                          num_workers=4,
                          drop_last=True,
                          shuffle=False)

# data = next(iter(train_data_loader))
# text, dec_input, target = data["text"], data["dec_input"], data["dec_target"]
# print(text[0], dec_input[0], target[0], sep="\n")

valid_dataset = Dataset(
    valid_data,
    valid_target,
    jp_text_transform,
    en_text_transform
)
valid_data_loader = DataLoader(valid_dataset,
                          batch_size=BATCH_SIZE,
                          num_workers=4,
                          drop_last=True,
                          shuffle=False)

# Define util function

In [13]:
def create_mask(src, tgt):
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]

    tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt_seq_len).to(DEVICE)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == jp_vocab["<pad>"])
    tgt_padding_mask = (tgt == en_vocab["<pad>"])
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

def translate(texts, src_mask):
    max_len = MAX_LEN
    texts = texts.to(DEVICE)
    memory = model.encode(texts, src_mask)
    ys = torch.ones(text.size(0), 1).fill_(en_vocab['<bos>']).type(torch.long).to(DEVICE)

    for i in range(max_len-1):
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(ys.size(1)).to(DEVICE)
        memory = memory.to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        prob = model.generator(out[:, -1])
        pred = prob.argmax(dim=1)
        ys = torch.cat([ys, pred.unsqueeze(1)], dim=1)
    text_outputs = []
    for toks in ys.detach().cpu().numpy().tolist():
        text_outputs.append(" ".join(en_vocab.lookup_tokens(toks)))
    return text_outputs

# Train

In [None]:
model = TorchTransformer(
    emb_size = EMBEDDING_DIMENSION,
    nhead = ATTENTION_HEADER_NUM,
    enc_vocab_size = len(jp_vocab),
    dec_vocab_size = len(en_vocab)
).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

epoch_num = 200
print_coef = 10
train_length = len(train_dataset)

history = {"train_loss": []}
n = 0
train_loss = 0

except_token = [1,2,3]
for epoch in range(epoch_num):

    for i, data in enumerate(train_data_loader):
        model.train()
        optimizer.zero_grad()
        text, dec_input, target = data["text"].to(DEVICE), data["dec_input"].to(DEVICE), data["dec_target"].to(DEVICE)
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(text, dec_input)
        logits = model(text, dec_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask)
        loss = criterion(logits.reshape(-1, logits.shape[-1]), target.reshape(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        history["train_loss"].append(loss.item())
        n += 1
        if i % ((train_length//BATCH_SIZE)//print_coef) == (train_length//BATCH_SIZE)//print_coef - 1:
            print(f"epoch:{epoch+1}  index:{i+1}  loss:{train_loss/n:.10f}")
            train_loss = 0
            n = 0

            # just one sample validation
            model.eval()
            for i, data in enumerate(train_data_loader):
                with torch.no_grad():
                    text, dec_input, target = data["text"].to(DEVICE), data["dec_input"].to(DEVICE), data["dec_target"].to(DEVICE)
                    src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(text, dec_input)
                    outputs = model(text, dec_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask)
                    input_token = text[0].detach().cpu().numpy().tolist()
                    input_token = [t for t in input_token if t not in except_token]
                    input_text = "".join(jp_vocab.lookup_tokens(input_token))

                    target_token = target[0].detach().cpu().numpy().tolist()
                    target_token = [t for t in target_token if t not in except_token]
                    target_text = " ".join(en_vocab.lookup_tokens(target_token))

                    output_text = translate(text, src_mask)[0]

                    print(f"input_text:{input_text},\n target_text:{target_text},\n output_text:{output_text}")
                break

epoch:1  index:53  loss:5.5060817431
input_text:私はまだまだ自分の技術不足を感じます,
 target_text:I still feel I lack skills .,
 output_text:<bos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
epoch:1  index:106  loss:4.0863474945
input_text:私はまだまだ自分の技術不足を感じます,
 target_text:I still feel I lack skills .,
 output_text:<bos> He the the the . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
epoch:1  index:159  loss:3.9569670389
input_text:私はまだまだ自分の技術不足を感じます,
 target_text:I still feel I lack skills .,
 output_text:<bos> He the the the the the the . <eos> <pad> <pad> <pad> <pad> <pad> <pad>
epoch:1  index:212  loss:3.9630156733
input_text:私はまだまだ自分の技術不足を感じます,
 target_text:I still feel I lack skills .,
 output_text:<bos> He the the the the the . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
epoch:1  index:265  loss:3.8534697812
input_text:私はまだまだ自分の技術不足を感じます,
 target_text:I still feel I lack skills .,
 output_text:<bos> He <unk> the <unk> the the the the 