# Translation with a Sequence to Sequence Network and Attention

- [tutorial](http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html)
- [最近のDeep Learning (NLP) 界隈におけるAttention事情](https://www.google.co.jp/search?q=%E6%9C%80%E8%BF%91%E3%81%AEdeeplearning%E3%80%80attention%E4%BA%8B%E6%83%85&ie=utf-8&oe=utf-8&client=firefox-b-ab&gfe_rd=cr&dcr=0&ei=2PtRWpXvGufZ8Aev54TwCQ)
- [Effective Approaches to Attention-based Neural Machine Translation](http://aclweb.org/anthology/D15-1166)

簡略化のために、10単語未満かつ決まったprefixで始まる文のみを使用する

In [None]:
# Vocabulary indices reserved for the start-of-sentence (SOS) and
# end-of-sentence (EOS) markers; Lang pre-registers them as entries 0 and 1.
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary of one language: word <-> index maps plus word counts.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers, so real
    words are numbered from 2 upward in order of first appearance.
    """

    def __init__(self, name):  # the constructor takes only the language name
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS are already counted

    def addSentence(self, sentence):
        """Register every single-space-separated token of *sentence*."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count *word*, giving it the next free index the first time it is seen."""
        if word in self.word2index:
            # Already known: only the frequency changes.
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [None]:
def readLangs(lang1, lang2, reverse=False):
    """Read the sentence pairs stored in ``data/<lang1>-<lang2>.txt``.

    Each line of the file is split on tab characters and every field is
    passed through ``normalizeString`` (defined elsewhere in this file).
    The sentences are not split into words here.

    Args:
        lang1: Name of the first language; used in the file name.
        lang2: Name of the second language; used in the file name.
        reverse: Set to True to translate in the opposite direction. The
            fields of every pair are reversed and ``lang2`` becomes the
            input language.

    Returns:
        A tuple ``(input_lang, output_lang, pairs)`` where the first two are
        fresh, still-empty ``Lang`` instances and ``pairs`` is a list of
        lists of normalized sentences.
    """
    print("Reading lines...")

    # Read the file and split into lines. The context manager makes sure the
    # file handle is closed even if reading or decoding fails.
    with open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # Split every line into its tab-separated fields and normalize each one.
    pairs = [[normalizeString(s) for s in line.split('\t')] for line in lines]

    # Reverse pairs if requested, then make the Lang instances.
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [None]:
def prepareData(lang1, lang2, reverse=False):
    """Load the sentence pairs, filter them, and build both vocabularies.

    The pairs come from ``readLangs`` and are passed through ``filterPairs``
    (defined elsewhere in this file); every surviving pair is then added to
    the input and output ``Lang`` objects. Progress is printed along the way.

    Returns:
        A tuple ``(input_lang, output_lang, pairs)`` with populated
        vocabularies and the filtered pairs.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))

    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))

    print("Counting words...")
    for pair in pairs:
        # Index rather than unpack so pairs with extra fields still work.
        source_sentence, target_sentence = pair[0], pair[1]
        input_lang.addSentence(source_sentence)
        output_lang.addSentence(target_sentence)

    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)

    return input_lang, output_lang, pairs

# Note: these module-level names are used later in the file
# (variablesFromPair reads input_lang / output_lang, trainIters samples from pairs).
# reverse=True, so the pairs from data/eng-fra.txt are flipped and 'fra' is the input language.
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))

## The Seq2Seq Model

### The Encoder

![](http://pytorch.org/tutorials/_images/encoder-network.png)

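encoderのoutputは、Attentionを使わないSimple Decoderでは使われない (最後のhiddenだけがdecoderに渡される) ので、活性化関数にかけていないと思われる。ただしAttention Decoderでは、各tのoutputをencoder_outputsとして使用する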

In [None]:
class EncoderRNN(nn.Module):
    """Encoder that embeds one token index and passes it through a GRU.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Width of both the embedding and the GRU hidden state.
        n_layers: How many times ``forward`` applies the GRU to one token.
    """

    def __init__(self, input_size, hidden_size, n_layers=1):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        # Two sub-modules are needed: an embedding and a GRU.
        # The embedding width is also hidden_size.
        self.embedding = nn.Embedding(input_size, hidden_size)
        # num_layers is not passed, so nn.GRU uses its default of one layer.
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode a single token.

        Args:
            input: Tensor holding the index of one word.
            hidden: Previous hidden state, as produced by ``initHidden`` or
                by an earlier call.

        Returns:
            ``(output, hidden)`` as returned by the last GRU application.
        """
        # Reshape the embedding to (1, 1, hidden_size) before the GRU.
        step_output = self.embedding(input).view(1, 1, -1)
        # The GRU itself has a single layer, so depth is emulated by applying
        # the same GRU module n_layers times in a row.
        for _ in range(self.n_layers):
            step_output, hidden = self.gru(step_output, hidden)
        return step_output, hidden

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state.

        The state is moved to the GPU when the module-level ``use_cuda``
        flag is truthy.
        """
        zeros = Variable(torch.zeros(1, 1, self.hidden_size))
        return zeros.cuda() if use_cuda else zeros

### The Decoder

#### Simple Decoder

![](http://pytorch.org/tutorials/_images/decoder-network.png)

In [None]:
class DecoderRNN(nn.Module):
    """Plain (attention-free) decoder: embedding -> ReLU -> GRU -> log-softmax.

    Args:
        hidden_size: Width of the embedding and of the GRU hidden state.
        output_size: Number of embedding rows and number of output scores.
        n_layers: How many times ``forward`` applies ReLU + GRU per token.
    """

    def __init__(self, hidden_size, output_size, n_layers=1):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        # Projects the GRU output onto one score per output word.
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        Args:
            input: Tensor holding the index of one word.
            hidden: Previous hidden state.

        Returns:
            ``(log_probs, hidden)``; ``log_probs`` is the log-softmax over
            the ``output_size`` scores.
        """
        rnn_io = self.embedding(input).view(1, 1, -1)
        # The same GRU module is reused n_layers times, with a ReLU before each pass.
        for _ in range(self.n_layers):
            rnn_io, hidden = self.gru(F.relu(rnn_io), hidden)
        scores = self.out(rnn_io[0])
        return self.softmax(scores), hidden

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state.

        The state is moved to the GPU when the module-level ``use_cuda``
        flag is truthy.
        """
        zeros = Variable(torch.zeros(1, 1, self.hidden_size))
        return zeros.cuda() if use_cuda else zeros

#### Attention Decoder

今回はglobal attention

![](http://pytorch.org/tutorials/_images/attention-decoder-network.png)

- attn_combineをreluに掛けたものがc_tに相当する
- attention_weightsを計算する
  - another feed-forward layer *attn* によって計算される
    - decoderの現在のtにおける入力とprev hidden stateを入力として用いる
- attention weightsをencoderの各tの出力vectorに掛け合わせることで、重み付きのcombinationが作成される
  - その結果が *attn_applied* で、input sequenceの特定のパートの情報を含む
    - decoderが正しい出力の単語を選ぶのに役立つ
    
bmmは、バッチについてmatmulを行うということ

``` python
>>> batch1 = torch.randn(10, 3, 4)
>>> batch2 = torch.randn(10, 4, 5)
>>> res = torch.bmm(batch1, batch2)
>>> res.size()
torch.Size([10, 3, 5])
```

In [None]:
class AttnDecoderRNN(nn.Module):
    """Decoder that attends over the encoder outputs at every step.

    Args:
        hidden_size: Width of the embedding and of the GRU hidden state.
        output_size: Number of embedding rows and number of output scores.
        n_layers: How many times ``forward`` applies ReLU + GRU per token.
        dropout_p: Dropout probability applied to the embedded input.
        max_length: Number of attention weights produced per step.
    """

    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Layers (created in the same order as before so seeded
        # initialisation is unchanged).
        self.embedding = nn.Embedding(output_size, hidden_size)
        # Maps [embedded input ; hidden state] to one weight per source position.
        self.attn = nn.Linear(hidden_size * 2, max_length)
        # Merges [embedded input ; attention context] back down to hidden_size.
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)

        # This layer decides the output word, hence output_size outputs.
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step with attention.

        Args:
            input: Index of one word (the decoder's previous output, or the
                ground-truth target word under teacher forcing).
            hidden: Previous decoder hidden state.
            encoder_outputs: Encoder outputs, one row per source position
                (``train`` passes a ``(max_length, hidden_size)`` tensor).

        Returns:
            ``(log_probs, hidden, attn_weights)``.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        # The weights are computed from the embedded input and the previous
        # hidden state only; the encoder outputs are not consulted here.
        # NOTE(review): the original author identifies this with the
        # location-based alignment (eq. 9) of Luong et al. -- confirm.
        attn_input = torch.cat((embedded[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(attn_input), dim=1)
        # Weighted combination of the encoder outputs (batched matmul).
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        combined = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(combined).unsqueeze(0)

        # The same GRU module is reused n_layers times, with a ReLU before each pass.
        for _ in range(self.n_layers):
            output, hidden = self.gru(F.relu(output), hidden)

        # Log-softmax over the vocabulary scores decides the output word.
        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state.

        ``forward`` needs a hidden state, so this supplies one for the very
        first call. It is moved to the GPU when the module-level
        ``use_cuda`` flag is truthy.
        """
        zeros = Variable(torch.zeros(1, 1, self.hidden_size))
        return zeros.cuda() if use_cuda else zeros

## Training

### Preparing Training Data

In [None]:
def indexesFromSentence(lang, sentence):
    """Return the vocabulary index of each single-space-separated word.

    Indices are looked up in ``lang.word2index``; a word that is missing
    from that mapping raises ``KeyError``.
    """
    words = sentence.split(' ')
    indexes = []
    for word in words:
        indexes.append(lang.word2index[word])
    return indexes


def variableFromSentence(lang, sentence):
    """Turn *sentence* into a column of word indices terminated by EOS.

    Returns:
        A ``Variable`` wrapping a LongTensor with one index per row
        (shape ``(number of words + 1, 1)``), moved to the GPU when the
        module-level ``use_cuda`` flag is truthy.
    """
    # Append EOS so the sequence carries an explicit end marker.
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    # view(-1, 1) turns the 1-D index list into a 2-D column.
    column = torch.LongTensor(token_ids).view(-1, 1)
    result = Variable(column)
    return result.cuda() if use_cuda else result

# Called from trainIters for every sampled training pair.
# pair has the form [input sentence, target sentence].
def variablesFromPair(pair):
    """Convert a sentence pair into ``(input_variable, target_variable)``.

    The first sentence is indexed with the module-level ``input_lang`` and
    the second with ``output_lang``.
    """
    return (
        variableFromSentence(input_lang, pair[0]),
        variableFromSentence(output_lang, pair[1]),
    )

### Training the Model

“Teacher forcing” is the concept of using the real target outputs as each next input, instead of using the decoder’s guess as the next input. Using teacher forcing causes it to converge faster but when the trained network is exploited, it may exhibit instability.

In [None]:
# Probability that train() uses teacher forcing for a given sentence
# (compared against random.random() once per call).
teacher_forcing_ratio = 0.5

# Called from trainIters with the tensors of a single sentence pair.
def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) sentence pair.

    The input is fed to the encoder one token at a time, the decoder is then
    unrolled over the target (with or without teacher forcing, chosen at
    random per call), the summed loss is back-propagated and both optimizers
    take a step.

    Args:
        input_variable: Source token indices, one per row.
        target_variable: Target token indices, one per row.
        encoder: Encoder exposing ``initHidden()``, ``hidden_size`` and
            ``encoder(token, hidden) -> (output, hidden)``.
        decoder: Decoder called as
            ``decoder(token, hidden, encoder_outputs)`` and returning
            ``(output, hidden, attention)``.
        encoder_optimizer: Optimizer for the encoder parameters.
        decoder_optimizer: Optimizer for the decoder parameters.
        criterion: Loss applied to each decoder output and target token.
        max_length: Number of rows allocated for the encoder outputs; the
            input must not contain more tokens than this.

    Returns:
        The summed loss divided by the target length, as a Python number.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Size of the first dimension, i.e. the number of tokens.
    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    # One row per source position; rows beyond input_length stay zero.
    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs

    loss = 0

    # Feed the input to the encoder one token at a time.
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_variable[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0][0]

    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    # The decoder starts from the encoder's final hidden state.
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Teacher forcing: feed the target as the next input.
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_variable[di])
            decoder_input = target_variable[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input.
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.data.topk(1)
            # Force a plain int: newer PyTorch returns a 0-dim tensor here,
            # older versions already return a Python number.
            ni = int(topi[0][0])

            decoder_input = Variable(torch.LongTensor([[ni]]))
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input

            loss += criterion(decoder_output, target_variable[di])
            if ni == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    # `loss.data[0]` fails on PyTorch >= 0.4, where the loss is a 0-dim
    # tensor; prefer `.item()` and fall back to indexing on older versions.
    loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
    return loss_value / target_length

#### 時間計測

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration of *s* seconds as ``'<minutes>m <seconds>s'``.

    Both numbers are rendered with ``%d``, so fractional seconds are
    truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

# percent: fraction of the work completed so far
def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at *since*.

    Args:
        since: Start time as returned by ``time.time()``.
        percent: Completed fraction of the job; must be non-zero because
            the elapsed time is divided by it.
    """
    elapsed = time.time() - since
    # Extrapolate the total run time from the completed fraction.
    estimated_total = elapsed / percent
    # Estimated time still to go.
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

以下を実行して学習する

- Start a timer
- Initialize optimizers and criterion
- Create set of training pairs
- Start empty losses array for plotting

In [None]:
# n_iters is less a number of epochs than the amount of training data drawn.
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train the encoder/decoder on *n_iters* randomly sampled sentence pairs.

    Args:
        encoder: Encoder module passed through to ``train``.
        decoder: Decoder module passed through to ``train``.
        n_iters: Number of training steps (one sampled pair per step).
        print_every: Print the average loss every this many steps.
        plot_every: Record the average loss every this many steps.
        learning_rate: Learning rate of both SGD optimizers.

    The recorded averages are handed to ``showPlot`` at the end; nothing is
    returned.
    """
    # 1. Start the timer.
    started_at = time.time()
    # 4. Running loss totals and the list of averages kept for plotting.
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    # 2. Optimizers and criterion.
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # 3. Draw n_iters pairs at random (with replacement) from the
    #    module-level `pairs` and convert each to tensors up front.
    training_pairs = [variablesFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_variable, target_variable) in enumerate(training_pairs, start=1):
        # One training step on this pair.
        loss = train(input_variable, target_variable, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(started_at, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)
