# Seq2seqの応用：機械翻訳

In [57]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import spacy
import numpy as np
import random
import math
import time
import re
import pandas as pd
from sklearn.model_selection import train_test_split

device = "cuda" if torch.cuda.is_available() else "cpu"
path = './Data/raw'


In [22]:
with open(path, 'r') as f:
  raw_data = f.readlines()
raw_list = [re.sub('\n', '', s).split('\t') for s in raw_data]

In [26]:
raw_df = pd.DataFrame(raw_list,
                  columns=['English', 'Japanese'])
raw_df.head()

Unnamed: 0,English,Japanese
0,"you are back, aren't you, harold?",あなたは戻ったのね ハロルド?
1,my opponent is shark.,俺の相手は シャークだ。
2,this is one thing in exchange for another.,引き換えだ ある事とある物の
3,"yeah, i'm fine.",もういいよ ごちそうさま ううん
4,don't come to the office anymore. don't call m...,もう会社には来ないでくれ 電話もするな


In [27]:
!python3 -m spacy download ja_core_news_md
!python3 -m spacy download en_core_web_md

Collecting ja-core-news-md==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.7.0/ja_core_news_md-3.7.0-py3-none-any.whl (42.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting sudachipy!=0.6.1,>=0.5.2 (from ja-core-news-md==3.7.0)
  Downloading SudachiPy-0.6.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hCollecting sudachidict-core>=20211220 (from ja-core-news-md==3.7.0)
  Downloading SudachiDict_core-20230927-py3-none-any.whl.metadata (2.5 kB)
Downloading SudachiDict_core-20230927-py3-none-any.whl (71.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/71.7 MB[0m [31m63.9 MB/s[0m

In [28]:
JA = spacy.load("ja_core_news_md")
EN = spacy.load("en_core_web_md")

In [37]:
[token.text for token in EN.tokenizer(raw_df["English"][0])]

['you', 'are', 'back', ',', 'are', "n't", 'you', ',', 'harold', '?']

In [38]:
[token.text for token in JA.tokenizer(raw_df["Japanese"][0])]

['あなた', 'は', '戻っ', 'た', 'の', 'ね', 'ハロルド', '?']

In [29]:
def tokenize_ja(sentence):
    return [tok.text for tok in JA.tokenizer(sentence)]

def tokenize_en(sentence):
    return [tok.text for tok in EN.tokenizer(sentence)]

In [34]:
train_val_df, test_df = train_test_split(raw_df, test_size=0.2)
# Split the training plus validation set into separate training and validation sets
train_df, val_df = train_test_split(train_val_df, test_size=0.25)

In [35]:
train_df

Unnamed: 0,English,Japanese
2395933,where did you get this?,− いいから 落ち着け...
2649086,it's not what i wish. it's what you wish.,違う お前が欲しいと言ったものだ
2190506,he brings that experience to others,障害の有無や 健康か否か
891536,their mother and her boyfriend were taking the...,母親が男友達と アビィモアに ゴーカート乗りに連れて行ってた
2167442,"no, it's not me.",待って! 誰も降りないでそこの生徒も降りないで
...,...,...
2294619,i'll always be very grateful.,このご恩は 忘れないわ
289491,"well, that should do it.",彼に伝えるべきだ
876370,red meat for the hood.,フードの内容
1678407,but he was not strong enough to pole all the w...,しかし 彼は湿地帯を横切って全部に 棒をたてるほど強くなかった


In [53]:
def tokenize_example(row, EN, JA, max_length, lower, sos_token, eos_token):
    en_tokens = [token.text for token in EN.tokenizer(row["English"])][:max_length]
    jp_tokens = [token.text for token in JA.tokenizer(row["Japanese"])][:max_length]
    
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
    
    en_tokens = [sos_token] + en_tokens + [eos_token]
    jp_tokens = [sos_token] + jp_tokens + [eos_token]
    
    return {"en_tokens": en_tokens, "jp_tokens": jp_tokens}

In [44]:
max_length = 1_000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

In [45]:
fn_kwargs = {
    "EN": EN, 
    "JA": JA, 
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}


In [54]:
train_df = train_df.apply(lambda row: tokenize_example(row, EN, JA, max_length, lower, sos_token, eos_token), axis=1)
val_df = val_df.apply(lambda row: tokenize_example(row, EN, JA, max_length, lower, sos_token, eos_token), axis=1)
test_df = test_df.apply(lambda row: tokenize_example(row, EN, JA, max_length, lower, sos_token, eos_token), axis=1)

In [68]:
train_df[0]

{'en_tokens': ['<sos>',
  'you',
  'are',
  'back',
  ',',
  'are',
  "n't",
  'you',
  ',',
  'harold',
  '?',
  '<eos>'],
 'jp_tokens': ['<sos>', 'あなた', 'は', '戻っ', 'た', 'の', 'ね', 'ハロルド', '?', '<eos>']}

In [92]:
train_df=pd.DataFrame(train_df.tolist())
val_df=pd.DataFrame(val_df.tolist())
test_df=pd.DataFrame(test_df.tolist())

In [81]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_df_dataframe["en_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

jp_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_df_dataframe["jp_tokens"],
    min_freq=min_freq,
    specials=special_tokens,  
)

In [82]:
en_vocab.get_itos()[:10]


['<unk>', '<pad>', '<sos>', '<eos>', '.', ',', 'the', 'you', 'i', '?']

In [83]:
assert en_vocab[unk_token] == jp_vocab[unk_token]
assert en_vocab[pad_token] == jp_vocab[pad_token]

In [84]:
unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [85]:
en_vocab.set_default_index(unk_index)
jp_vocab.set_default_index(unk_index)

In [96]:
def numericalize_example(example, en_vocab, jp_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    ja_ids = jp_vocab.lookup_indices(example["jp_tokens"])
    return {"en_ids": en_ids, "ja_ids": ja_ids}

In [97]:

def numericalize_dataframe(df, en_vocab, jp_vocab):
    numericalized = df.apply(lambda row: numericalize_example(row, en_vocab, jp_vocab), axis=1)
    return pd.DataFrame(numericalized.tolist())

In [99]:
train_df=numericalize_dataframe(train_df,en_vocab, jp_vocab)
val_df=numericalize_dataframe(val_df,en_vocab, jp_vocab)
test_df=numericalize_dataframe(test_df,en_vocab, jp_vocab)

In [100]:
train_df

Unnamed: 0,en_ids,ja_ids
0,"[2, 96, 53, 7, 55, 23, 9, 3]","[2, 586, 52, 27, 346, 1241, 16, 16, 16, 3]"
1,"[2, 13, 12, 33, 24, 8, 527, 4, 13, 12, 24, 7, ...","[2, 220, 71, 10, 340, 15, 87, 8, 84, 11, 3]"
2,"[2, 31, 1877, 14, 763, 10, 614, 3]","[2, 1301, 4, 11796, 83, 1158, 18, 7704, 18, 3]"
3,"[2, 139, 266, 15, 73, 1131, 83, 404, 88, 57744...","[2, 621, 10, 171, 384, 15, 0, 6, 42346, 1086, ..."
4,"[2, 43, 5, 13, 12, 33, 27, 4, 3]","[2, 201, 7, 25, 81, 19, 1399, 20, 13, 155, 4, ..."
...,...,...
1680827,"[2, 8, 65, 200, 37, 124, 2352, 4, 3]","[2, 41, 196, 5673, 5, 401, 20, 51, 3]"
1680828,"[2, 69, 5, 14, 126, 21, 13, 4, 3]","[2, 47, 6, 2196, 211, 11, 3]"
1680829,"[2, 516, 1656, 28, 6, 3475, 4, 3]","[2, 4137, 4, 1761, 3]"
1680830,"[2, 40, 31, 26, 33, 618, 259, 10, 3765, 46, 6,...","[2, 207, 47, 5, 12995, 6344, 9, 13425, 7, 358,..."
