In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
!pip install janome

Collecting janome
[?25l  Downloading https://files.pythonhosted.org/packages/79/f0/bd7f90806132d7d9d642d418bdc3e870cfdff5947254ea3cab27480983a7/Janome-0.3.10-py2.py3-none-any.whl (21.5MB)
[K     |████████████████████████████████| 21.5MB 29.4MB/s 
[?25hInstalling collected packages: janome
Successfully installed janome-0.3.10


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
import torch
import spacy
from torchtext import data
from torchtext import datasets
from janome.tokenizer import Tokenizer
from collections import defaultdict
from tqdm import tqdm
import csv

In [0]:
j_t = Tokenizer()
def tokenizer_ja(text): 
    return [tok for tok in j_t.tokenize(text, wakati=True)]

spacy_en = spacy.load('en')
def tokenizer_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

TEXT_ja = data.Field(tokenize=tokenizer_ja,
                            lower=True)
TEXT_en = data.Field(tokenize=tokenizer_en,
                            lower=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
#90
def dataloader(path, tokenizer):
    with open(path) as f:
        result=[]
        for line in tqdm(f):
            result.append(tokenizer(line.strip()))
    return result

train_en = dataloader("/content/drive/My Drive/orig/kyoto-train.en", tokenizer=tokenizer_en)
#trainの_jaはtokenizerの処理が遅いため、csvに書き出したものを読み込む
with open('/content/drive/My Drive/orig/kyoto-train-tokenized.csv') as f:
  reader = csv.reader(f)
  train_ja = [row for row in reader]

#train_ja  = dataloader("/content/drive/My Drive/orig/kyoto-train.ja", tokenizer=tokenizer_ja)
valid_en = dataloader("/content/drive/My Drive/orig/kyoto-dev.en", tokenizer=tokenizer_en)
valid_ja  = dataloader("/content/drive/My Drive/orig/kyoto-dev.ja", tokenizer=tokenizer_ja)
test_en = dataloader("/content/drive/My Drive/orig/kyoto-test.en", tokenizer=tokenizer_en)
test_ja  = dataloader("/content/drive/My Drive/orig/kyoto-test.ja", tokenizer=tokenizer_ja)

440288it [02:29, 2939.04it/s]
1166it [00:00, 1488.84it/s]
1166it [00:04, 291.42it/s]
1160it [00:00, 1414.89it/s]
1160it [00:03, 352.17it/s]


train, valid, test = data.TabularDataset.splits(
        path='../data/kftt-data-1.0/data/orig/',
        train          ='kyoto-train.tsv',
        validation ="kyoto-dev.tsv",
        test           = "kyoto-test.tsv", format='tsv',
        fields=[('ja', TEXT_ja),('en', TEXT_en)])

TEXT_ja.build_vocab(train, min_freq=2)
TEXT_en.build_vocab(train, min_freq=2)

In [0]:
def make_dict(df):
    frequency = defaultdict(int)
    for text in df:
        for token in text:
            frequency[token] += 1
    return frequency


def make_id_dict(dic):
    id_dict={}
    for i , (k,v) in enumerate(sorted(dic.items(), key=lambda x : -x[1])):
    #軽量化のためv>=5に。本来はv>=2
        if v>=2:
            id_dict[k]=i+1
        else:
            id_dict[k]=0
    return id_dict

word_dict_en=make_id_dict(make_dict(train_en))
word_dict_ja=make_id_dict(make_dict(train_ja))

In [0]:
def word_to_id(batch, id_dic, dim):
    #バッチごとに通す
    
    result=torch.zeros([len(batch), dim], dtype=torch.long) 
    for i, sentence in enumerate(batch):
        for u, word in enumerate(sentence):
            try:
                result[i, u]=id_dic[word]
            except:
                continue
    return result

#fieldがない行を削除
for i in range(len(train.examples)):
    try:
        train.examples[-i-1].en
        train.examples[-i-1].ja
    except:
        train.examples.pop(-i-1)
        print("delete number.",i)

BATCH_SIZE = 256

train_iterator, valid_iterator, test_iterator= data.Iterator.splits(
    datasets=(train, valid, test),
    batch_size = BATCH_SIZE,
    sort=False,
    device = device)

In [8]:
#91#92#93
import random
from typing import Tuple

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor


class Encoder(nn.Module):
    def __init__(self,
                 input_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)

        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self,
                src: Tensor) -> Tuple[Tensor]:

        embedded = self.dropout(self.embedding(src))

        outputs, hidden = self.rnn(embedded)

        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))

        return outputs, hidden


class Attention(nn.Module):
    def __init__(self,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attn_dim: int):
        super().__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim

        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tensor:

        src_len = encoder_outputs.shape[0]

        repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        energy = torch.tanh(self.attn(torch.cat((
            repeated_decoder_hidden,
            encoder_outputs),
            dim = 2)))

        attention = torch.sum(energy, dim=2)

        return F.softmax(attention, dim=1)


class Decoder(nn.Module):
    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: int,
                 attention: nn.Module):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)

        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)

        self.dropout = nn.Dropout(dropout)


    def _weighted_encoder_rep(self,
                              decoder_hidden: Tensor,
                              encoder_outputs: Tensor) -> Tensor:

        a = self.attention(decoder_hidden, encoder_outputs)

        a = a.unsqueeze(1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        weighted_encoder_rep = torch.bmm(a, encoder_outputs)

        weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)

        return weighted_encoder_rep


    def forward(self,
                input: Tensor,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tuple[Tensor]:

        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))

        weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden,
                                                          encoder_outputs)

        rnn_input = torch.cat((embedded, weighted_encoder_rep), dim = 2)

        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder_rep = weighted_encoder_rep.squeeze(0)

        output = self.out(torch.cat((output,
                                     weighted_encoder_rep,
                                     embedded), dim = 1))

        return output, decoder_hidden.squeeze(0)


class Seq2Seq(nn.Module):
    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 device: torch.device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,
                src: Tensor,
                trg: Tensor,
                teacher_forcing_ratio: float = 0.5) -> Tensor:

        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src)

        # first input to the decoder is the <sos> token
        output = trg[0,:]

        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)

        return outputs


INPUT_DIM = len(word_dict_ja)
OUTPUT_DIM = len(word_dict_en)
# ENC_EMB_DIM = 256
# DEC_EMB_DIM = 256
# ENC_HID_DIM = 512
# DEC_HID_DIM = 512
# ATTN_DIM = 64
# ENC_DROPOUT = 0.5
# DEC_DROPOUT = 0.5

ENC_EMB_DIM = 32
DEC_EMB_DIM = 32
ENC_HID_DIM = 64
DEC_HID_DIM = 64
ATTN_DIM = 8
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)

attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)

dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)


def init_weights(m: nn.Module):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)


model.apply(init_weights)

optimizer = optim.Adam(model.parameters())


def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 50,516,725 trainable parameters


In [0]:
#PAD_IDX = TEXT_ja.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=0)

In [14]:
import time
import math



def train(model,X_train, y_train, optimizer,
          criterion,clip: float):
    
    batch_size= 16
    
    model.train()

    epoch_loss = 0
    
    for idx in tqdm(range(0, len(X_train), batch_size)):
        
        batch_X = X_train[idx : idx+batch_size if idx+batch_size<=len(X_train) else len(X_train)]
        batch_y = y_train[idx : idx+batch_size if idx+batch_size<=len(X_train) else len(X_train)]
        dim= max(max([len(i) for i in batch_X]), max([len(i) for i in batch_y]))
        src = word_to_id(batch_X, word_dict_ja, dim)
        trg = word_to_id(batch_y, word_dict_en, dim)
        optimizer.zero_grad()

        output = model(src.cuda(), trg.cuda())

        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg.cuda())

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / (len(X_train)//batch_size)


def evaluate(model, X_train, y_train,criterion):

    model.eval()

    epoch_loss = 0
    batch_size= 16
    with torch.no_grad():
        
        for idx in tqdm(range(0, len(X_train), batch_size)):
            batch_X = X_train[idx : idx+batch_size if idx+batch_size<=len(X_train) else len(X_train)]
            batch_y = y_train[idx : idx+batch_size if idx+batch_size<=len(X_train) else len(X_train)]
            dim= max(max([len(i) for i in batch_X]), max([len(i) for i in batch_y]))
            src = word_to_id(batch_X, word_dict_ja, dim)
            trg = word_to_id(batch_y, word_dict_en, dim)
            optimizer.zero_grad()


            output = model(src.cuda() , trg.cuda(), 0) #turn off teacher forcing

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            loss = criterion(output, trg.cuda())

            epoch_loss += loss.item()

    return epoch_loss / (len(X_train)//batch_size)



def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_ja,train_en, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_ja, valid_en, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')



test_loss = evaluate(model, test_ja, test_en, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')


  0%|          | 0/625 [00:00<?, ?it/s][A
  0%|          | 1/625 [00:00<10:14,  1.02it/s][A
  0%|          | 2/625 [00:01<09:44,  1.07it/s][A
  0%|          | 3/625 [00:02<09:14,  1.12it/s][A
  1%|          | 4/625 [00:03<08:50,  1.17it/s][A
  1%|          | 5/625 [00:04<08:51,  1.17it/s][A
  1%|          | 6/625 [00:05<08:52,  1.16it/s][A
  1%|          | 7/625 [00:06<09:01,  1.14it/s][A
  1%|▏         | 8/625 [00:07<09:59,  1.03it/s][A
  1%|▏         | 9/625 [00:07<09:25,  1.09it/s][A
  2%|▏         | 10/625 [00:08<09:34,  1.07it/s][A
  2%|▏         | 11/625 [00:10<12:02,  1.18s/it][A
  2%|▏         | 12/625 [00:12<12:58,  1.27s/it][A

RuntimeError: ignored