#Drive mount

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Install

In [3]:
# # =========Tokenizers=========
# # Janome
# !pip install janome
# # Mecab
# #インストール
# !pip install mecab-python3
# #辞書インストール
# !pip install unidic-lite

Collecting janome
  Downloading Janome-0.5.0-py2.py3-none-any.whl (19.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.7/19.7 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: janome
Successfully installed janome-0.5.0
Collecting mecab-python3
  Downloading mecab_python3-1.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (581 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m581.7/581.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mecab-python3
Successfully installed mecab-python3-1.0.8
Collecting unidic-lite
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: unidic-lite
  Building wheel for unidic-lite (setup.py) ... [?25l[?25hdone
  Created w

# Import

In [2]:
import os
import MeCab
import janome
from janome.tokenizer import Tokenizer
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from itertools import chain
from torchtext.vocab import vocab
import torchtext.transforms as T
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, Tensor
import torch.optim as optim
import math

# Configs

In [3]:
DEVICE = "cuda"
BATCH_SIZE = 8
MAX_LEN = 16
EMBEDDING_DIMENSION = 512
ATTENTION_HEADER_NUM = 8

# Load data

In [4]:
os.chdir("/content/drive/MyDrive/git_project/Transformers")
df = pd.read_excel("./data/JEC_basic_sentence_v1-3/JEC_basic_sentence_v1-3.xls", header = None)

train_df, valid_df = train_test_split(df, test_size=0.2)

In [7]:
train_data = train_df[1].tolist()
train_target = train_df[2].tolist()

valid_data = valid_df[1].tolist()
valid_target = valid_df[2].tolist()

print(f"train len:{len(train_data)} valid len:{len(valid_data)}")

train len:4243 valid len:1061


# Define Tokenizers

In [8]:
# Japanese Tokenizers
mecab = MeCab.Tagger("-Owakati")
janome = Tokenizer()

def jp_tokenizer(text, tokenizer="mecab"):
    if tokenizer == "mecab":
        return mecab.parse(text).split()
    elif tokenizer == "janome":
        return [tok for tok in janome.tokenize(text, wakati=True)]
    else:
        raise Exception("Undefined jp tokenizer")

# English Tokenizers
en_spacy = spacy.load('en_core_web_sm')

def en_tokenize(text, tokenizer="spacy"):
    if tokenizer == "spacy":
        return [t.text for t in en_spacy.tokenizer(text)]
    else:
        raise Exception("Undefined en tokenizer")

tokenized_train_data = [jp_tokenizer(data) for data in train_data]
tokenized_target_data = [en_tokenize(data) for data in train_target]

# Define vocab

In [9]:
jp_counter = Counter()
jp_counter.update(chain(*tokenized_train_data))
jp_vocab = vocab(jp_counter, specials=(["<unk>", "<pad>", "<bos>", "<eos>"]))
jp_vocab.set_default_index(jp_vocab["<unk>"])

en_counter = Counter()
en_counter.update(chain(*tokenized_target_data))
en_vocab = vocab(en_counter, specials=(["<unk>", "<pad>", "<bos>", "<eos>"]))
en_vocab.set_default_index(en_vocab["<unk>"])

# Define text transformer

In [10]:
text_max_len = MAX_LEN - 2

jp_text_transform = T.Sequential(
  T.VocabTransform(jp_vocab),
  T.Truncate(text_max_len),
  T.AddToken(token=jp_vocab['<bos>'], begin=True),
  T.AddToken(token=jp_vocab['<eos>'], begin=False),
  T.ToTensor(),
  T.PadTransform(MAX_LEN, jp_vocab['<pad>'])
)

en_text_transform = T.Sequential(
  T.VocabTransform(en_vocab),
  T.Truncate(text_max_len),
  T.AddToken(token=en_vocab['<bos>'], begin=True),
  T.AddToken(token=en_vocab['<eos>'], begin=False),
  T.ToTensor(),
  T.PadTransform(MAX_LEN, en_vocab['<pad>'])
)

# Define dataset

In [11]:
class Dataset(Dataset):
    def __init__(
        self,
        texts,
        targets,
        jp_text_transform,
        en_text_transform,
        ):

        self.texts = texts
        self.targets = targets
        self.jp_text_transform = jp_text_transform
        self.en_text_transform = en_text_transform
        self.build()

    def build(self):
        self.texts = [jp_tokenizer(data) for data in self.texts]
        self.targets = [jp_tokenizer(data) for data in self.targets]

    def max_word(self):
        return len(self.j_v), len(self.e_v)

    def __getitem__(self, i):
        text = self.texts[i]
        original_text = text
        text = self.jp_text_transform([text]).squeeze()

        target = self.targets[i]
        target = self.en_text_transform([target]).squeeze()


        # dec input should be started by <bos>
        dec_input = target[:-1]
        dec_target = target[1:]
        data = {"text": text, "dec_input": dec_input, "dec_target": dec_target}
        return data

    def __len__(self):
        return len(self.texts)

In [12]:
train_dataset = Dataset(
    train_data,
    train_target,
    jp_text_transform,
    en_text_transform
)
train_data_loader = DataLoader(train_dataset,
                          batch_size=BATCH_SIZE,
                          num_workers=4,
                          drop_last=True,
                          shuffle=False)

# Define model architecture

In [13]:
# https://qiita.com/gensal/items/e1c4a34dbfd0d7449099
class PositionalEncoding(nn.Module):

  def __init__(self, dim, dropout = 0.1, max_len = 5000):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout)
    position = torch.arange(max_len).unsqueeze(1).to(DEVICE)
    div_term = torch.exp(torch.arange(0, dim, 2) * (-math.log(10000.0) / dim)).to(DEVICE)
    pe = torch.zeros(max_len, 1, dim).to(DEVICE)
    pe[:, 0, 0::2] = torch.sin(position * div_term)
    pe[:, 0, 1::2] = torch.cos(position * div_term)
    self.register_buffer("pe", pe)

  def forward(self, x):
    x = x + self.pe[:x.size(0)]
    return self.dropout(x)

class MultiHeadAttention(nn.Module):

  def __init__(self, dim, head_num, dropout = 0.1):
    super().__init__()
    self.dim = dim
    self.head_num = head_num
    self.linear_Q = nn.Linear(dim, dim, bias = False)
    self.linear_K = nn.Linear(dim, dim, bias = False)
    self.linear_V = nn.Linear(dim, dim, bias = False)
    self.linear = nn.Linear(dim, dim, bias = False)
    self.soft = nn.Softmax(dim = 3)
    self.dropout = nn.Dropout(dropout)

  def split_head(self, x):
    x = torch.tensor_split(x, self.head_num, dim = 2)
    x = torch.stack(x, dim = 1)
    return x

  def concat_head(self, x):
    x = torch.tensor_split(x, x.size()[1], dim = 1)
    x = torch.concat(x, dim = 3).squeeze(dim = 1)
    return x

  def forward(self, Q, K, V, mask = None):
    Q = self.linear_Q(Q)   #(BATCH_SIZE,word_count,dim)
    K = self.linear_K(K)
    V = self.linear_V(V)

    Q = self.split_head(Q)   #(BATCH_SIZE,head_num,word_count,dim//head_num)
    K = self.split_head(K)
    V = self.split_head(V)

    QK = torch.matmul(Q, torch.transpose(K, 3, 2))
    QK = QK/((self.dim//self.head_num)**0.5)

    if mask is not None:
      QK = QK + mask

    softmax_QK = self.soft(QK)
    softmax_QK = self.dropout(softmax_QK)

    QKV = torch.matmul(softmax_QK, V)
    QKV = self.concat_head(QKV)
    QKV = self.linear(QKV)
    return QKV

class FeedForward(nn.Module):

  def __init__(self, dim, hidden_dim = 2048, dropout = 0.1):
    super().__init__()
    self.dropout = nn.Dropout(dropout)
    self.linear_1 = nn.Linear(dim, hidden_dim)
    self.relu = nn.ReLU()
    self.linear_2 = nn.Linear(hidden_dim, dim)

  def forward(self, x):
    x = self.linear_1(x)
    x = self.relu(x)
    x = self.dropout(x)
    x = self.linear_2(x)
    return x

class EncoderBlock(nn.Module):

  def __init__(self, dim, head_num, dropout = 0.1):
    super().__init__()
    self.MHA = MultiHeadAttention(dim, head_num)
    self.layer_norm_1 = nn.LayerNorm([dim])
    self.layer_norm_2 = nn.LayerNorm([dim])
    self.FF = FeedForward(dim)
    self.dropout_1 = nn.Dropout(dropout)
    self.dropout_2 = nn.Dropout(dropout)

  def forward(self, x):
    Q = K = V = x
    x = self.MHA(Q, K, V)
    x = self.dropout_1(x)
    x = x + Q
    x = self.layer_norm_1(x)
    _x = x
    x = self.FF(x)
    x = self.dropout_2(x)
    x = x + _x
    x = self.layer_norm_2(x)
    return x

class Encoder(nn.Module):

  def __init__(self, enc_vocab_size, dim, head_num, dropout = 0.1):
    super().__init__()
    self.dim = dim
    self.embed = nn.Embedding(enc_vocab_size, dim)
    self.PE = PositionalEncoding(dim)
    self.dropout = nn.Dropout(dropout)
    self.EncoderBlocks = nn.ModuleList([EncoderBlock(dim, head_num) for _ in range(6)])

  def forward(self, x):
    x = self.embed(x)
    x = x*(self.dim**0.5)
    x = self.PE(x)
    x = self.dropout(x)
    for i in range(6):
      x = self.EncoderBlocks[i](x)
    return x

class DecoderBlock(nn.Module):

  def __init__(self, dim, head_num, dropout = 0.1):
    super().__init__()
    self.MMHA = MultiHeadAttention(dim, head_num)
    self.MHA = MultiHeadAttention(dim, head_num)
    self.layer_norm_1 = nn.LayerNorm([dim])
    self.layer_norm_2 = nn.LayerNorm([dim])
    self.layer_norm_3 = nn.LayerNorm([dim])
    self.FF = FeedForward(dim)
    self.dropout_1 = nn.Dropout(dropout)
    self.dropout_2 = nn.Dropout(dropout)
    self.dropout_3 = nn.Dropout(dropout)

  def forward(self, x, y, mask):
    Q = K = V = x
    x = self.MMHA(Q, K, V, mask)
    x = self.dropout_1(x)
    x = x + Q
    x = self.layer_norm_1(x)
    Q = x
    K = V = y
    x = self.MHA(Q, K, V)
    x = self.dropout_2(x)
    x = x + Q
    x = self.layer_norm_2(x)
    _x = x
    x = self.FF(x)
    x = self.dropout_3(x)
    x = x + _x
    x = self.layer_norm_3(x)
    return x

class Decoder(nn.Module):

  def __init__(self, dec_vocab_size, dim, head_num, dropout = 0.1):
    super().__init__()
    self.dim = dim
    self.embed = nn.Embedding(dec_vocab_size, dim)
    self.PE = PositionalEncoding(dim)
    self.DecoderBlocks = nn.ModuleList([DecoderBlock(dim, head_num) for _ in range(6)])
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(dim, dec_vocab_size)

  def forward(self, x, y, mask):
    x = self.embed(x)
    x = x*(self.dim**0.5)
    x = self.PE(x)
    x = self.dropout(x)
    for i in range(6):
      x = self.DecoderBlocks[i](x, y, mask)
    x = self.linear(x)
    return x

class Transformer(nn.Module):

  def __init__(self, enc_vocab_size, dec_vocab_size, dim, head_num):
    super().__init__()
    self.encoder = Encoder(enc_vocab_size, dim, head_num)
    self.decoder = Decoder(dec_vocab_size, dim, head_num)

  def forward(self, enc_input, dec_input, mask):
    enc_output = self.encoder(enc_input)
    output = self.decoder(dec_input, enc_output, mask)
    return output

# Load Dataset & DataLoader

In [14]:
train_dataset = Dataset(
    train_data,
    train_target,
    jp_text_transform,
    en_text_transform
)
train_data_loader = DataLoader(train_dataset,
                          batch_size=BATCH_SIZE,
                          num_workers=4,
                          drop_last=True,
                          shuffle=False)

# data = next(iter(train_data_loader))
# text, dec_input, target = data["text"], data["dec_input"], data["dec_target"]
# print(text[0], dec_input[0], target[0], sep="\n")

valid_dataset = Dataset(
    valid_data,
    valid_target,
    jp_text_transform,
    en_text_transform
)
valid_data_loader = DataLoader(valid_dataset,
                          batch_size=BATCH_SIZE,
                          num_workers=4,
                          drop_last=True,
                          shuffle=False)

# Define tranlate function

In [110]:
def translate(texts):
    max_len = MAX_LEN
    texts = texts.to(DEVICE)
    memory = model.encoder(texts)
    ys = torch.ones(len(texts), 1).fill_(jp_vocab['<bos>']).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(ys.size(1)).to(DEVICE)
        out = model.decoder(ys, memory, tgt_mask)

        pred = out[:,-1].argmax(dim=1)
        ys = torch.cat([ys, pred.unsqueeze(1)], dim=1)
    text_outputs = []
    for toks in ys.detach().cpu().numpy().tolist():
        text_outputs.append(" ".join(en_vocab.lookup_tokens(toks)))
    return text_outputs

# Train

In [112]:
model = Transformer(len(jp_vocab), len(en_vocab), dim = EMBEDDING_DIMENSION, head_num = ATTENTION_HEADER_NUM).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

epoch_num = 200
print_coef = 10
train_length = len(train_dataset)

history = {"train_loss": []}
n = 0
train_loss = 0

except_token = [1,2,3]
for epoch in range(epoch_num):

    for i, data in enumerate(train_data_loader):
        model.train()
        optimizer.zero_grad()
        text, dec_input, target = data["text"].to(DEVICE), data["dec_input"].to(DEVICE), data["dec_target"].to(DEVICE)
        mask = nn.Transformer.generate_square_subsequent_mask(MAX_LEN - 2 + 1).to(DEVICE)

        outputs = model(text, dec_input, mask)
        target = nn.functional.one_hot(target, len(en_vocab)).to(torch.float32)

        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        history["train_loss"].append(loss.item())
        n += 1
        if i % ((train_length//BATCH_SIZE)//print_coef) == (train_length//BATCH_SIZE)//print_coef - 1:
            print(f"epoch:{epoch+1}  index:{i+1}  loss:{train_loss/n:.10f}")
            train_loss = 0
            n = 0

            # just one sample validation
            model.eval()
            for i, data in enumerate(train_data_loader):
                with torch.no_grad():
                    text, dec_input, target = data["text"].to(DEVICE), data["dec_input"].to(DEVICE), data["dec_target"].to(DEVICE)
                    mask = nn.Transformer.generate_square_subsequent_mask(MAX_LEN - 2 + 1).to(DEVICE)
                    outputs = model(text, dec_input, mask)
                    pred = torch.argmax(outputs, dim=2)
                    input_token = text[0].detach().cpu().numpy().tolist()
                    input_token = [t for t in input_token if t not in except_token]
                    input_text = "".join(jp_vocab.lookup_tokens(input_token))

                    target_token = target[0].detach().cpu().numpy().tolist()
                    target_token = [t for t in target_token if t not in except_token]
                    target_text = " ".join(en_vocab.lookup_tokens(target_token))

                    output_text = translate(text)[0]

                    print(f"input_text:{input_text},\n target_text:{target_text},\n output_text:{output_text}")
                break

epoch:1  index:53  loss:0.0055686241
input_text:彼がせっかくのチャンスを逃す,
 target_text:He blows the chance of a lifetime,
 output_text:<bos> He will was finally was the consulting . <eos> <pad> <pad> <pad> <pad> <pad> <pad>
epoch:1  index:106  loss:0.0050161803
input_text:彼がせっかくのチャンスを逃す,
 target_text:He blows the chance of a lifetime,
 output_text:<bos> He will a lot of the first . <eos> <pad> <pad> <pad> <pad> <pad> <pad>
epoch:1  index:159  loss:0.0047984165
input_text:彼がせっかくのチャンスを逃す,
 target_text:He blows the chance of a lifetime,
 output_text:<bos> He will be a lot of the first . <eos> <pad> <pad> <pad> <pad> <pad>
epoch:1  index:212  loss:0.0046845753
input_text:彼がせっかくのチャンスを逃す,
 target_text:He blows the chance of a lifetime,
 output_text:<bos> He will be a lot of . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
epoch:1  index:265  loss:0.0044649337
input_text:彼がせっかくのチャンスを逃す,
 target_text:He blows the chance of a lifetime,
 output_text:<bos> He will be a lot of the first . <eos> <pad> <pad>

KeyboardInterrupt: ignored