# Finding Maximum Contiguous Subsequence Sum Using Deep Learning
## 问题描述

设计一个网络结构，解决如下问题：输入一个序列（sequence）和一个标量 query，在所有长度为 query 的连续子序列中，找出元素之和最大的那个子序列的起始位置（下标从 0 开始）。

Constraints：
1. batch = 32, lr=0.01, optimizer=Adam，只能跑一个epoch
2. 为避免直接利用数字信息，将sequence和query统一embed到8维空间作为输入

考察指标
最后100个batch的平均准确率大于96%

### Step 1. Label数据

In [35]:
import pandas as pd
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter

#### 1.1 Load Data

In [2]:
# Load the input sequences; header=None makes pandas treat the first CSV row as data.
sequences = pd.read_csv('data/task3_passage.csv', header=None)
sequences.head(3)  # preview the first three rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,6,7,5,9,1,2,8,0,4,3
1,2,0,3,7,8,5,9,1,6,4
2,9,3,2,0,7,1,6,5,8,4


In [3]:
# Load the queries (one per row); header=None makes pandas treat the first CSV row as data.
queries = pd.read_csv('data/task3_query.csv', header=None)
queries.head(3)  # preview the first three rows

Unnamed: 0,0
0,3
1,5
2,9


In [4]:
# Append the query as the last column of each sequence row, so a single row
# carries everything needed for labelling and for the model input.
seqs_concat = pd.concat([sequences, queries], axis=1)  # concat data for easy processing
seqs_concat.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,0.1
0,6,7,5,9,1,2,8,0,4,3,3
1,2,0,3,7,8,5,9,1,6,4,5
2,9,3,2,0,7,1,6,5,8,4,9


#### 1.2 Label Data

In [5]:
def find_max_subseq_i(row):
    '''
    Find the start of the contiguous window with the maximum sum.

    Args:
        row: pandas Series whose last element is the query (the window length)
            and whose remaining elements are the sequence values.

    Returns:
        int: 0-based start index of the length-``query`` window with the
        largest sum, as selected by ``argmax`` over all window sums.

    Raises:
        ValueError: if the query is smaller than 1 or longer than the sequence.
    '''
    seq = torch.tensor(row.iloc[:-1].values, dtype=torch.float)
    query = int(row.iloc[-1])  # window length as a plain Python int
    if not 1 <= query <= seq.numel():
        raise ValueError(
            "query must be between 1 and the sequence length {}, got {}".format(
                seq.numel(), query))

    # unfold yields every length-`query` sliding window (stride 1). Summing each
    # window is the same as convolving with an all-ones kernel, but avoids
    # building an nn.Conv1d module (random init + autograd) for every row.
    window_sums = seq.unfold(0, query, 1).sum(dim=1)
    return window_sums.argmax().item()

# Label every row (axis=1 passes one row at a time) with the start index of its
# best window, then persist the labels without the index column.
labels = seqs_concat.apply(find_max_subseq_i, axis=1)
labels.to_csv("data/task3_labels.csv", index=False)

In [19]:
labels.head(3)  # preview the first three computed labels

0    1
1    2
2    0
dtype: int64

In [181]:
def oneHot(x, length=11):
    """Encode an index as a 1-D float tensor with a single 1.

    The 1 is written at position ``x + 1`` (the index is shifted by one);
    every other entry of the ``length``-element tensor is 0.
    """
    hot_position = x + 1
    encoded = torch.zeros(length)
    encoded[hot_position] = 1
    return encoded

# one_hot_labels = labels.apply(oneHot)

#### 1.3 Prepare Training Data

In [20]:
# Pair each input row (sequence values followed by the query) with its label.
training_data = list(zip(seqs_concat.values, labels.values))
BATCH_SIZE = 32
# NOTE: drop_last is left at its default (False), so the final batch is smaller
# than BATCH_SIZE whenever the dataset size is not a multiple of it.
trainloader=DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=8)

### Step 2. 设计网络结构

#### 2.1 LSTM

In [17]:
# define hyperparameters
EMBEDDING_DIM = 8  # embedding size shared by sequence tokens and the query
HIDDEN_DIM = 64  # hidden-state size passed to the LSTM models below

In [428]:
class LSTMIdxer(nn.Module):
    """Embed a token sequence, run a single-layer LSTM, and map the final
    hidden state to ``idx_size`` outputs.

    The recurrent state is kept on ``self.hidden`` so callers can reset it
    between batches with ``model.hidden = model.init_hidden()``.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, idx_size, batch_size):
        super(LSTMIdxer, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size

        # The LSTM takes token embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # The linear layer that maps from hidden state space to index space
        self.hidden2idx = nn.Linear(hidden_dim, idx_size)
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=None):
        """Return a zeroed ``(h_0, c_0)`` pair, each of shape (1, batch, hidden_dim).

        ``batch_size`` defaults to the size given at construction time.
        """
        if batch_size is None:
            batch_size = self.batch_size
        return (torch.zeros(1, batch_size, self.hidden_dim),
                torch.zeros(1, batch_size, self.hidden_dim))

    def forward(self, seq, target_loss="CrossEntropyLoss"):
        """Score a batch of index sequences.

        Args:
            seq: integer tensor indexed as (batch, seq_len).
            target_loss: "CrossEntropyLoss" to get log-probabilities over the
                ``idx_size`` classes, or "MSELoss" to get the raw linear output.

        Returns:
            Tensor of shape (batch, idx_size).

        Raises:
            ValueError: if ``target_loss`` is not one of the two supported names.
        """
        if target_loss not in ("CrossEntropyLoss", "MSELoss"):
            # Previously an unknown name silently returned None.
            raise ValueError("unsupported target_loss: {!r}".format(target_loss))

        embeds = self.embeddings(seq)
        # The stored state is sized for a fixed batch; rebuild it when the
        # incoming batch differs (e.g. a smaller final batch).
        if self.hidden[0].size(1) != seq.size(0):
            self.hidden = self.init_hidden(seq.size(0))
        _, self.hidden = self.lstm(embeds, self.hidden)

        # self.hidden[0] is h_n with shape (num_layers, batch, hidden_dim);
        # take the last layer so the result is (batch, idx_size). The previous
        # unsqueeze(0) produced a 4-D tensor, and log_softmax(dim=1) then ran
        # over a singleton dimension.
        idx_space = self.hidden2idx(self.hidden[0][-1])
        if target_loss == "CrossEntropyLoss":
            return F.log_softmax(idx_space, dim=1)
        return idx_space

Train the model:

In [463]:
def train_model(model, loss_function, name_scope, epochs=1):
    """Train ``model`` on the module-level ``trainloader`` with Adam (lr=0.01).

    Every 100 batches the average loss and accuracy of those batches are
    printed and written to TensorBoard under ``name_scope``.

    Args:
        model: module exposing ``init_hidden()``, a ``hidden`` attribute and a
            ``forward(inputs, target_loss)`` method.
        loss_function: an ``nn.CrossEntropyLoss`` (classification over start
            indices) or ``nn.MSELoss`` (regression on the start index) instance.
        name_scope: prefix for the TensorBoard scalar tags.
        epochs: number of passes over ``trainloader``.

    Raises:
        TypeError: if ``loss_function`` is neither of the supported losses.
    """
    use_cross_entropy = isinstance(loss_function, torch.nn.CrossEntropyLoss)
    use_mse = isinstance(loss_function, torch.nn.MSELoss)
    if not (use_cross_entropy or use_mse):
        raise TypeError(
            "loss_function must be an nn.CrossEntropyLoss or nn.MSELoss instance")

    optimizer = optim.Adam(model.parameters(), lr=0.01)
    writer = SummaryWriter()
    try:
        for epoch in range(epochs):
            total_loss = total_correct = total_sample = 0
            for i, data in enumerate(trainloader):
                # PyTorch accumulates gradients, so clear them for each batch.
                model.zero_grad()

                # Reset the recurrent state so it is detached from the
                # previous batch's history.
                model.hidden = model.init_hidden()

                inputs, idx = data
                if use_cross_entropy:
                    idx_scores = model(inputs, "CrossEntropyLoss")
                    predicted = torch.argmax(idx_scores, 1)
                else:
                    idx = idx.float()  # MSELoss needs floating-point targets
                    idx_scores = model(inputs, "MSELoss").view(idx.size())
                    predicted = torch.round(idx_scores)
                total_correct += (predicted == idx).sum().item()
                total_sample += idx_scores.size(0)

                loss = loss_function(idx_scores, idx)
                loss.backward()
                optimizer.step()

                # Accumulate a Python float: adding the tensor itself would
                # keep every batch's autograd history referenced.
                total_loss += loss.item()
                if i % 100 == 99:
                    ith = i // 100
                    avg_loss = total_loss / 100
                    avg_acc = total_correct / total_sample
                    print("{}th 100 batches, average loss: {}, average accuracy: {}".format(ith, avg_loss, avg_acc))
                    writer.add_scalar(name_scope+'/avg_loss', avg_loss, ith)
                    writer.add_scalar(name_scope+'/avg_acc', avg_acc, ith)
                    total_loss = total_correct = total_sample = 0
    finally:
        # Flush and close the event file even if training raises.
        writer.close()

LSTM with cross entropy Loss:

In [429]:
# Classification variant: vocabulary of 11 tokens, 10 output classes
# (one per candidate start index).
lstm_cr = LSTMIdxer(EMBEDDING_DIM, HIDDEN_DIM, 11, 10, BATCH_SIZE)
cr_loss = nn.CrossEntropyLoss()
print(lstm_cr)

LSTMIdxer(
  (embeddings): Embedding(11, 8)
  (lstm): LSTM(8, 64, batch_first=True)
  (hidden2idx): Linear(in_features=64, out_features=10, bias=True)
)


In [462]:
# Single epoch; scalars are logged under the "lstm_cr" tag prefix.
train_model(lstm_cr, cr_loss, "lstm_cr", epochs=1)

0th 100 batches, average loss: 0.7154936790466309, average accuracy: 0.718125
1th 100 batches, average loss: 0.679318904876709, average accuracy: 0.7296875
2th 100 batches, average loss: 0.650575578212738, average accuracy: 0.735625
3th 100 batches, average loss: 0.6050158739089966, average accuracy: 0.77125
4th 100 batches, average loss: 0.6724005341529846, average accuracy: 0.7340625
5th 100 batches, average loss: 0.6295040249824524, average accuracy: 0.7540625
6th 100 batches, average loss: 0.5945451855659485, average accuracy: 0.770625
7th 100 batches, average loss: 0.632266104221344, average accuracy: 0.7575
8th 100 batches, average loss: 0.6017310619354248, average accuracy: 0.7615625
9th 100 batches, average loss: 0.5905725955963135, average accuracy: 0.7671875
10th 100 batches, average loss: 0.5930705666542053, average accuracy: 0.768125
11th 100 batches, average loss: 0.5778584480285645, average accuracy: 0.7753125
12th 100 batches, average loss: 0.5924740433692932, average ac

LSTM with mean squared error Loss:

In [431]:
# Regression variant: a single output unit that predicts the start index directly.
lstm_mse = LSTMIdxer(EMBEDDING_DIM, HIDDEN_DIM, 11, 1, BATCH_SIZE)
mse_loss = nn.MSELoss()
print(lstm_mse)

LSTMIdxer(
  (embeddings): Embedding(11, 8)
  (lstm): LSTM(8, 64, batch_first=True)
  (hidden2idx): Linear(in_features=64, out_features=1, bias=True)
)


In [454]:
# Single epoch; scalars are logged under the "lstm_mse" tag prefix.
train_model(lstm_mse, mse_loss, "lstm_mse", epochs=1)

0th 100 batches, average loss: 1.0872138738632202, average accuracy: 0.5240625
1th 100 batches, average loss: 1.1440348625183105, average accuracy: 0.4890625
2th 100 batches, average loss: 1.1651543378829956, average accuracy: 0.5115625
3th 100 batches, average loss: 0.9776139259338379, average accuracy: 0.524375
4th 100 batches, average loss: 0.9620973467826843, average accuracy: 0.555625
5th 100 batches, average loss: 0.9998934864997864, average accuracy: 0.5465625
6th 100 batches, average loss: 0.9990928173065186, average accuracy: 0.539375
7th 100 batches, average loss: 0.975043773651123, average accuracy: 0.5496875
8th 100 batches, average loss: 0.9823878407478333, average accuracy: 0.555
9th 100 batches, average loss: 0.9894894361495972, average accuracy: 0.5721875
10th 100 batches, average loss: 0.8785991668701172, average accuracy: 0.5784375
11th 100 batches, average loss: 0.867185115814209, average accuracy: 0.599375
12th 100 batches, average loss: 0.8007153272628784, average 

In [385]:
# See what the scores are after training
# NOTE(review): `inputs` and `idx` are not defined at top level in the visible
# code (they are locals of train_model) -- presumably left over from an earlier
# cell; confirm before re-running. The pasted output below also appears to come
# from an earlier version of the model.
with torch.no_grad():
    predicted = lstm_cr(inputs)
    print("Predicted:\n{}\n\nActual:\n{}".format(predicted, idx))

torch.Size([32, 10])
Predicted:
None

Actual:
tensor([ 3,  4,  0,  4,  6,  0,  0,  0,  2,  8,  2,  0,  2,  1,
         3,  0,  0,  3,  3,  2,  3,  0,  2,  2,  4,  2,  2,  2,
         0,  3,  5,  1])


#### 2.2 Sequence to Sequence Network and Attention

In [53]:
class EncoderRNN(nn.Module):
    """GRU encoder: embeds integer tokens and runs them through a batch-first GRU."""

    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)

    def forward(self, seq, hidden):
        """Return ``(output, hidden)`` exactly as produced by the GRU."""
        token_vectors = self.embedding(seq)
        return self.gru(token_vectors, hidden)

    def initHidden(self, batch_size):
        """Return an all-zero initial state of shape (1, batch_size, hidden_size)."""
        state_shape = (1, batch_size, self.hidden_size)
        return torch.zeros(*state_shape)

In [182]:
MAX_LENGTH = 11  # default max_length of AttnDecoderRNN and train() below

In [258]:
class AttnDecoderRNN(nn.Module):
    """Attention decoder that emits one prediction per decoding step.

    Each step receives a scalar input per sample (cast to float), combines it
    with row ``di`` of the attention-weighted encoder outputs, and runs a
    single GRU step followed by a log-softmax over ``output_size`` classes.

    ``dropout_p`` is accepted for interface compatibility but is not used.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length

        # Scores max_length attention rows from [encoder output ; decoder hidden].
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # +1 for the scalar decoder input concatenated to the attended vector.
        self.attn_combine = nn.Linear(self.hidden_size+1, self.hidden_size)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size, batch_first=True)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs, di):
        """Run one decoding step.

        Args:
            input: per-sample scalar input, indexed as (batch,).
            hidden: GRU state of shape (1, batch, hidden_size).
            encoder_outputs: encoder outputs, used as (batch, seq_len, hidden_size).
            di: index of the attention row (decoding position) to use.

        Returns:
            (log_probs, hidden, attn_weights) where log_probs has shape
            (batch, 1, output_size) and attn_weights (batch, max_length, seq_len).
        """
        seq_len = encoder_outputs.size(1)
        # Pair the decoder state with every encoder position.
        tiled_hidden = hidden.transpose(0, 1).repeat(1, seq_len, 1)
        attn_scores = self.attn(torch.cat((encoder_outputs, tiled_hidden), -1))
        # NOTE: the softmax is taken over the max_length axis before the
        # transpose; this is kept as in the original design.
        attn_weights = F.softmax(attn_scores, dim=-1).transpose(1, 2)
        attn_applied = torch.bmm(attn_weights, encoder_outputs)

        input = input.unsqueeze(1).float()
        output = torch.cat((input, attn_applied[:, di, :]), 1)
        output = self.attn_combine(output).unsqueeze(1)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = F.log_softmax(self.out(output), dim=-1)
        return output, hidden, attn_weights

    def initHidden(self, batch_size):
        """Return a zero state of shape (1, batch_size, hidden_size).

        The state is created on the same device as the module's parameters;
        the previous code referenced a name ``device`` that is not defined in
        the visible code.
        """
        return torch.zeros(1, batch_size, self.hidden_size,
                           device=self.out.weight.device)

In [260]:
teacher_forcing_ratio = 0.5  # probability of feeding the targets as decoder inputs


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step of the encoder/decoder pair on a single batch.

    Args:
        input_tensor: batch of input index sequences, batch dimension first.
        target_tensor: (batch, target_length) integer tensor of per-position
            class labels; position ``di`` is the target of decoding step ``di``.
        encoder: module with ``initHidden(batch_size)`` returning its initial state.
        decoder: module called as ``decoder(input, hidden, encoder_outputs, di)``.
        encoder_optimizer, decoder_optimizer: optimizers for the two modules.
        criterion: loss applied to each step's (batch, classes) scores.
        max_length: unused; kept for interface compatibility.

    Returns:
        (loss averaged over the decoding steps as a float,
         number of positions where prediction and target are both 1).
    """
    # Derive sizes from the data rather than from global constants so a
    # smaller final batch still works.
    batch_size = input_tensor.size(0)
    target_length = target_tensor.size(1)

    encoder_hidden = encoder.initHidden(batch_size)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_outputs, encoder_hidden = encoder(input_tensor, encoder_hidden)

    decoder_input = torch.zeros(batch_size, dtype=torch.long)
    decoder_hidden = encoder_hidden

    # One draw per batch: either every step is teacher-forced or none is.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    current_correct = 0
    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs, di)
        # Drop only the step axis; a bare squeeze() would also remove the
        # batch axis when the batch has a single sample.
        step_scores = decoder_output.squeeze(1)
        step_target = target_tensor[:, di]
        loss += criterion(step_scores, step_target)

        topi = step_scores.argmax(dim=-1)
        if use_teacher_forcing:
            decoder_input = step_target  # Teacher forcing
        else:
            decoder_input = topi.detach()  # detach from history as input

        # True positives for this position: prediction and target are both 1.
        tp = (topi + step_target) == 2
        current_correct += tp.sum()

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()
    return (loss.item() / target_length, current_correct)

def trainIters(encoder, decoder, epochs=1, learning_rate=0.01):
    """Train the encoder/decoder pair over ``trainloader`` with per-position targets.

    Each label is expanded with ``oneHot`` into an 11-element 0/1 target row,
    ``train`` performs the optimisation step, and the average loss and
    accuracy are printed after every 100 batches.
    """
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = total_correct = total_sample = 0
        for batch_no, (input_tensor, idx) in enumerate(trainloader):
            n_rows = idx.size(0)

            # One 0/1 target row per sample.
            target_tensor = torch.zeros(n_rows, 11, dtype=torch.long)
            for row, label in enumerate(idx):
                target_tensor[row] = oneHot(label)

            loss, current_correct = train(
                input_tensor, target_tensor, encoder, decoder,
                encoder_optimizer, decoder_optimizer, criterion)
            total_loss += loss
            total_correct += current_correct
            total_sample += n_rows

            if batch_no % 100 != 99:
                continue

            # Report on, then reset, the statistics of the last 100 batches.
            ith = batch_no // 100
            avg_loss = total_loss / 100
            avg_acc = total_correct.item() / total_sample
            print("{}th 100 batches, average loss: {}, average accuracy: {}".format(ith, avg_loss, avg_acc))
            total_loss = total_correct = total_sample = 0

hidden_size = 32
# Vocabulary of 11 tokens; the decoder makes a 2-class decision at each position.
encoder1 = EncoderRNN(11, EMBEDDING_DIM, hidden_size)
attn_decoder1 = AttnDecoderRNN(hidden_size, 2, dropout_p=0.1)

trainIters(encoder1, attn_decoder1)

0th 100 batches, average loss: 0.25839762449264514, average accuracy: 0.1040625
1th 100 batches, average loss: 0.1723626926812258, average accuracy: 0.3828125
2th 100 batches, average loss: 0.16276010545817285, average accuracy: 0.4203125
3th 100 batches, average loss: 0.15036865223537793, average accuracy: 0.473125
4th 100 batches, average loss: 0.12807980076833203, average accuracy: 0.576875
5th 100 batches, average loss: 0.1276687277988955, average accuracy: 0.5834375
6th 100 batches, average loss: 0.12831343618306248, average accuracy: 0.580625
7th 100 batches, average loss: 0.11694440717046911, average accuracy: 0.6440625
8th 100 batches, average loss: 0.11716491756114097, average accuracy: 0.62875
9th 100 batches, average loss: 0.11737928368828511, average accuracy: 0.631875
10th 100 batches, average loss: 0.11287084183909675, average accuracy: 0.6475
11th 100 batches, average loss: 0.10700342032042419, average accuracy: 0.6821875
12th 100 batches, average loss: 0.112205465923656

In [274]:
class AttnDecoderRNN1(nn.Module):
    """Single-pass attention decoder: builds ``max_length`` attended vectors
    from the encoder outputs and runs them through a GRU in one call."""

    def __init__(self, hidden_size, output_size, max_length=10):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length

        self.attn = nn.Linear(2 * hidden_size, max_length)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, hidden, encoder_outputs):
        """Return ``(log_probs, hidden, attn_weights)`` for the whole batch."""
        seq_len = encoder_outputs.size(1)
        # Pair the decoder state with every encoder position.
        tiled_hidden = hidden.transpose(0, 1).repeat(1, seq_len, 1)
        scores = self.attn(torch.cat((encoder_outputs, tiled_hidden), -1))
        attn_weights = F.softmax(scores, dim=-1).transpose(1, 2)

        context = torch.bmm(attn_weights, encoder_outputs)
        output, hidden = self.gru(F.relu(context), hidden)
        log_probs = F.log_softmax(self.out(output), dim=-1)
        return log_probs, hidden, attn_weights

    def initHidden(self, batch_size):
        """Return an all-zero state of shape (1, batch_size, hidden_size)."""
        state_shape = (1, batch_size, self.hidden_size)
        return torch.zeros(*state_shape)

def trainIters1(encoder, decoder, epochs=1, learning_rate=0.01):
    """Train the encoder and single-pass attention decoder over ``trainloader``.

    The decoder is initialised with the encoder's final hidden state, and the
    scores at its last output position are used for classification. The
    average loss and accuracy are printed after every 100 batches.

    Args:
        encoder: module with ``initHidden(batch_size)``; called as
            ``encoder(inputs, hidden)`` and returning ``(outputs, hidden)``.
        decoder: module called as ``decoder(hidden, encoder_outputs)``.
        epochs: number of passes over ``trainloader``.
        learning_rate: Adam learning rate for both optimizers.
    """
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = total_correct = total_sample = 0
        for i, data in enumerate(trainloader):
            input_tensor, target_tensor = data

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            # Size the state from the batch itself so a smaller final batch works.
            encoder_hidden = encoder.initHidden(input_tensor.size(0))
            encoder_outputs, encoder_hidden = encoder(
                    input_tensor, encoder_hidden)

            decoder_hidden = encoder_hidden
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_hidden, encoder_outputs)

            # Only the last decoder position is used for the prediction.
            final_scores = decoder_output[:, -1, :]
            predicted = torch.argmax(final_scores, -1)
            total_correct += (predicted == target_tensor).sum().item()
            total_sample += target_tensor.size(0)

            loss = criterion(final_scores, target_tensor)
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            # Accumulate a Python float: adding the tensor itself would keep
            # every batch's autograd history referenced.
            total_loss += loss.item()
            if i % 100 == 99:
                ith = i // 100
                avg_loss = total_loss / 100
                avg_acc = total_correct/total_sample
                print("{}th 100 batches, average loss: {}, average accuracy: {}".format(ith, avg_loss, avg_acc))
                total_loss = total_correct = total_sample = 0
    

hidden_size = 32
# Vocabulary of 11 tokens; the decoder classifies over 10 start indices.
encoder1 = EncoderRNN(11, EMBEDDING_DIM, hidden_size)
attn_decoder2 = AttnDecoderRNN1(hidden_size, 10)

trainIters1(encoder1, attn_decoder2)

0th 100 batches, average loss: 1.7152857780456543, average accuracy: 0.3646875
1th 100 batches, average loss: 1.2777975797653198, average accuracy: 0.491875
2th 100 batches, average loss: 1.1254957914352417, average accuracy: 0.551875
3th 100 batches, average loss: 1.0514452457427979, average accuracy: 0.5746875
4th 100 batches, average loss: 0.9812842011451721, average accuracy: 0.6065625
5th 100 batches, average loss: 0.9451970458030701, average accuracy: 0.62625
6th 100 batches, average loss: 0.9167865514755249, average accuracy: 0.6471875
7th 100 batches, average loss: 0.881909966468811, average accuracy: 0.6575
8th 100 batches, average loss: 0.8493098616600037, average accuracy: 0.6721875
9th 100 batches, average loss: 0.8256438970565796, average accuracy: 0.676875
10th 100 batches, average loss: 0.7834854125976562, average accuracy: 0.6925
11th 100 batches, average loss: 0.768293559551239, average accuracy: 0.69875
12th 100 batches, average loss: 0.7726684808731079, average accur

In [277]:
class AttnDecoderRNN1(nn.Module):
    """Single-pass attention decoder without the ReLU on the attended vectors.

    The attention-weighted encoder outputs are fed straight into the GRU.
    """

    def __init__(self, hidden_size, output_size, max_length=10):
        super(AttnDecoderRNN1, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length

        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size, batch_first=True)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, hidden, encoder_outputs):
        """Return ``(log_probs, hidden, attn_weights)``.

        Args:
            hidden: GRU state of shape (1, batch, hidden_size).
            encoder_outputs: encoder outputs, used as (batch, seq_len, hidden_size).
        """
        seq_len = encoder_outputs.size(1)
        # Pair the decoder state with every encoder position.
        tiled_hidden = hidden.transpose(0, 1).repeat(1, seq_len, 1)
        attn_weights = F.softmax(
            self.attn(torch.cat((encoder_outputs, tiled_hidden), -1)), dim=-1).transpose(1, 2)
        attn_applied = torch.bmm(attn_weights,
                                 encoder_outputs)
        # The ReLU was deliberately dropped in this variant, so the GRU must
        # consume attn_applied directly. The previous code still passed
        # `output`, which was no longer assigned before use.
        output, hidden = self.gru(attn_applied, hidden)
        output = F.log_softmax(self.out(output), dim=-1)
        return output, hidden, attn_weights

    def initHidden(self, batch_size):
        """Return an all-zero state of shape (1, batch_size, hidden_size)."""
        return torch.zeros(1, batch_size, self.hidden_size)

def trainIters1(encoder, decoder, epochs=1, learning_rate=0.01):
    """Train the encoder and single-pass attention decoder over ``trainloader``.

    In this variant the decoder starts from its own zero state
    (``decoder.initHidden``) instead of the encoder's final hidden state. The
    scores at the last decoder position are used for classification, and the
    average loss and accuracy are printed after every 100 batches.

    Args:
        encoder: module with ``initHidden(batch_size)``; called as
            ``encoder(inputs, hidden)`` and returning ``(outputs, hidden)``.
        decoder: module with ``initHidden(batch_size)``; called as
            ``decoder(hidden, encoder_outputs)``.
        epochs: number of passes over ``trainloader``.
        learning_rate: Adam learning rate for both optimizers.
    """
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = total_correct = total_sample = 0
        for i, data in enumerate(trainloader):
            input_tensor, target_tensor = data
            # Size the states from the batch itself so a smaller final batch works.
            batch_size = input_tensor.size(0)

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            encoder_hidden = encoder.initHidden(batch_size)
            encoder_outputs, encoder_hidden = encoder(
                    input_tensor, encoder_hidden)

            decoder_hidden = decoder.initHidden(batch_size)  # fresh zero state
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_hidden, encoder_outputs)

            # Only the last decoder position is used for the prediction.
            final_scores = decoder_output[:, -1, :]
            predicted = torch.argmax(final_scores, -1)
            total_correct += (predicted == target_tensor).sum().item()
            total_sample += target_tensor.size(0)

            loss = criterion(final_scores, target_tensor)
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            # Accumulate a Python float: adding the tensor itself would keep
            # every batch's autograd history referenced.
            total_loss += loss.item()
            if i % 100 == 99:
                ith = i // 100
                avg_loss = total_loss / 100
                avg_acc = total_correct/total_sample
                print("{}th 100 batches, average loss: {}, average accuracy: {}".format(ith, avg_loss, avg_acc))
                total_loss = total_correct = total_sample = 0

hidden_size = 32
# Fresh encoder/decoder for the variant whose decoder starts from a zero state.
encoder1 = EncoderRNN(11, EMBEDDING_DIM, hidden_size)
attn_decoder2 = AttnDecoderRNN1(hidden_size, 10)

trainIters1(encoder1, attn_decoder2)

0th 100 batches, average loss: 1.9289747476577759, average accuracy: 0.3309375
1th 100 batches, average loss: 1.5320793390274048, average accuracy: 0.43125
2th 100 batches, average loss: 1.3366148471832275, average accuracy: 0.5021875
3th 100 batches, average loss: 1.208532452583313, average accuracy: 0.5378125
4th 100 batches, average loss: 1.1551700830459595, average accuracy: 0.5378125
5th 100 batches, average loss: 1.1031537055969238, average accuracy: 0.5578125
6th 100 batches, average loss: 1.0327820777893066, average accuracy: 0.590625
7th 100 batches, average loss: 1.0193594694137573, average accuracy: 0.595
8th 100 batches, average loss: 0.9857317209243774, average accuracy: 0.6278125
9th 100 batches, average loss: 0.9393985271453857, average accuracy: 0.630625
10th 100 batches, average loss: 0.9076352119445801, average accuracy: 0.6528125
11th 100 batches, average loss: 0.8802374005317688, average accuracy: 0.65625
12th 100 batches, average loss: 0.8838822245597839, average a

In [292]:
class EncoderLSTM(nn.Module):
    """LSTM encoder that returns the per-step outputs and the final hidden state h_n.

    The cell state is discarded, so the result can seed a GRU-based decoder.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, idx_size, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Consumes the token embeddings and emits hidden_dim-sized states.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Projection from hidden space to index space (not used by forward).
        self.hidden2idx = nn.Linear(hidden_dim, idx_size)

    def init_hidden(self):
        """Return zeroed ``(h_0, c_0)``, each of shape (1, batch_size, hidden_dim)."""
        state_shape = (1, self.batch_size, self.hidden_dim)
        h_0 = torch.zeros(*state_shape)
        c_0 = torch.zeros(*state_shape)
        return (h_0, c_0)

    def forward(self, seq, hidden):
        """Return ``(outputs, h_n)`` for the embedded sequence."""
        out, (h_n, _c_n) = self.lstm(self.embeddings(seq), hidden)
        return out, h_n

class AttnDecoderRNN2(nn.Module):
    """Single-pass GRU attention decoder: attends over the encoder outputs,
    applies a ReLU, and decodes all ``max_length`` positions in one GRU call."""

    def __init__(self, hidden_size, output_size, max_length=10):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length

        self.attn = nn.Linear(2 * hidden_size, max_length)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, hidden, encoder_outputs):
        """Return ``(log_probs, hidden, attn_weights)``."""
        # Repeat the decoder state along the sequence axis so it can be
        # concatenated with every encoder output.
        steps = encoder_outputs.size(1)
        state_per_step = hidden.transpose(0, 1).repeat(1, steps, 1)
        attn_in = torch.cat((encoder_outputs, state_per_step), -1)

        attn_weights = F.softmax(self.attn(attn_in), dim=-1).transpose(1, 2)
        attended = F.relu(torch.bmm(attn_weights, encoder_outputs))

        gru_out, hidden = self.gru(attended, hidden)
        return F.log_softmax(self.out(gru_out), dim=-1), hidden, attn_weights

    def initHidden(self, batch_size):
        """Return an all-zero state of shape (1, batch_size, hidden_size)."""
        state_shape = (1, batch_size, self.hidden_size)
        return torch.zeros(*state_shape)

def trainIters1(encoder, decoder, epochs=1, learning_rate=0.01):
    """Train an LSTM encoder and attention decoder over ``trainloader``.

    The decoder is initialised with the hidden state returned by the encoder,
    and the scores at its last output position are used for classification.
    The average loss and accuracy are printed after every 100 batches.

    Args:
        encoder: module with a no-argument ``init_hidden()``; called as
            ``encoder(inputs, hidden)`` and returning ``(outputs, hidden)``.
        decoder: module called as ``decoder(hidden, encoder_outputs)``.
        epochs: number of passes over ``trainloader``.
        learning_rate: Adam learning rate for both optimizers.
    """
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = total_correct = total_sample = 0
        for i, data in enumerate(trainloader):
            input_tensor, target_tensor = data

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            # NOTE: init_hidden() takes no batch size here, so the state has
            # whatever batch size the encoder was constructed with.
            encoder_hidden = encoder.init_hidden()
            encoder_outputs, encoder_hidden = encoder(
                    input_tensor, encoder_hidden)

            decoder_hidden = encoder_hidden
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_hidden, encoder_outputs)

            # Only the last decoder position is used for the prediction.
            final_scores = decoder_output[:, -1, :]
            predicted = torch.argmax(final_scores, -1)
            total_correct += (predicted == target_tensor).sum().item()
            total_sample += target_tensor.size(0)

            loss = criterion(final_scores, target_tensor)
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            # Accumulate a Python float: adding the tensor itself would keep
            # every batch's autograd history referenced.
            total_loss += loss.item()
            if i % 100 == 99:
                ith = i // 100
                avg_loss = total_loss / 100
                avg_acc = total_correct/total_sample
                print("{}th 100 batches, average loss: {}, average accuracy: {}".format(ith, avg_loss, avg_acc))
                total_loss = total_correct = total_sample = 0

hidden_size = 32
# LSTM encoder (vocabulary 11, idx_size 10) paired with the GRU attention decoder.
encoder2 = EncoderLSTM(EMBEDDING_DIM, hidden_size, 11, 10, BATCH_SIZE)
attn_decoder2 = AttnDecoderRNN2(hidden_size, 10)

trainIters1(encoder2, attn_decoder2)

0th 100 batches, average loss: 1.6825774908065796, average accuracy: 0.3715625
1th 100 batches, average loss: 1.3043633699417114, average accuracy: 0.46625
2th 100 batches, average loss: 1.1780871152877808, average accuracy: 0.5334375
3th 100 batches, average loss: 1.0982396602630615, average accuracy: 0.5675
4th 100 batches, average loss: 1.0089439153671265, average accuracy: 0.61125
5th 100 batches, average loss: 0.9695562124252319, average accuracy: 0.6215625
6th 100 batches, average loss: 0.9329012036323547, average accuracy: 0.635
7th 100 batches, average loss: 0.8980787396430969, average accuracy: 0.6578125
8th 100 batches, average loss: 0.8861342072486877, average accuracy: 0.6503125
9th 100 batches, average loss: 0.8497655987739563, average accuracy: 0.6675
10th 100 batches, average loss: 0.8327069878578186, average accuracy: 0.6753125
11th 100 batches, average loss: 0.7890624403953552, average accuracy: 0.6896875
12th 100 batches, average loss: 0.7894924283027649, average accu

In [298]:
class EncoderLSTM1(nn.Module):
    """LSTM encoder that returns the per-step outputs and the full ``(h_n, c_n)`` state."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, idx_size, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Consumes the token embeddings and emits hidden_dim-sized states.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Projection from hidden space to index space (not used by forward).
        self.hidden2idx = nn.Linear(hidden_dim, idx_size)

    def init_hidden(self):
        """Return zeroed ``(h_0, c_0)``, each of shape (1, batch_size, hidden_dim)."""
        state_shape = (1, self.batch_size, self.hidden_dim)
        h_0 = torch.zeros(*state_shape)
        c_0 = torch.zeros(*state_shape)
        return (h_0, c_0)

    def forward(self, seq, hidden):
        """Return ``(outputs, (h_n, c_n))`` exactly as produced by the LSTM."""
        token_vectors = self.embeddings(seq)
        return self.lstm(token_vectors, hidden)

class AttnDecoderRNN3(nn.Module):
    """Attention decoder that pools encoder outputs into ``max_length`` slots.

    A linear layer scores every encoder step against the current hidden
    state, the scores mix the encoder outputs into ``max_length`` context
    vectors, and an LSTM followed by a linear layer turns those into
    log-probabilities over ``output_size`` classes.
    """

    def __init__(self, hidden_size, output_size, max_length=10):
        super().__init__()
        self.hidden_size, self.output_size, self.max_length = (
            hidden_size, output_size, max_length)

        # Creation order (attn, lstm, out) is kept so that seeded parameter
        # initialisation is unchanged.
        self.attn = nn.Linear(2 * hidden_size, max_length)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, hidden, encoder_outputs):
        """Run one decoding pass.

        Args:
            hidden: ``(h, c)`` pair handed to the decoder LSTM as its initial
                state; ``h`` is also used as the attention query.
            encoder_outputs: Batch-first encoder outputs.

        Returns:
            ``(log_probs, h_n, attn_weights)``. Only the hidden part of the
            LSTM's final state is returned, not the cell state.
        """
        steps = encoder_outputs.size(1)
        # Swap the first two axes of h and tile it along the step axis so it
        # can be concatenated with every encoder output.
        query = hidden[0].transpose(0, 1).repeat(1, steps, 1)
        scores = self.attn(torch.cat((encoder_outputs, query), dim=-1))
        # Softmax is taken over the max_length axis *before* the transpose,
        # so for each encoder step the weights sum to 1 across slots.
        attn_weights = F.softmax(scores, dim=-1).transpose(1, 2)
        context = torch.bmm(attn_weights, encoder_outputs)
        lstm_out, next_state = self.lstm(F.relu(context), hidden)
        log_probs = F.log_softmax(self.out(lstm_out), dim=-1)
        return log_probs, next_state[0], attn_weights

    def initHidden(self, batch_size):
        """Return a single zero tensor of shape ``(1, batch_size, hidden_size)``."""
        shape = (1, batch_size, self.hidden_size)
        return torch.zeros(*shape)

def trainIters1(encoder, decoder, epochs=1, learning_rate=0.01):
    """Train an encoder/decoder pair and log progress every 100 batches.

    Batches are read from the module-level ``trainloader``; each item is
    unpacked as ``(input_tensor, target_tensor)``.

    Args:
        encoder: Module exposing ``init_hidden()`` and callable as
            ``encoder(inputs, hidden) -> (outputs, hidden)``.
        decoder: Module callable as ``decoder(hidden, encoder_outputs)``
            returning ``(output, hidden, attention)``; the last step of
            ``output`` (``output[:, -1, :]``) is used as the class scores.
        epochs: Number of passes over ``trainloader``.
        learning_rate: Learning rate for both Adam optimizers.
    """
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    log_every = 100

    for _ in range(epochs):
        total_loss = 0.0
        total_correct = total_sample = 0
        for i, (input_tensor, target_tensor) in enumerate(trainloader):
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            encoder_hidden = encoder.init_hidden()
            encoder_outputs, encoder_hidden = encoder(
                input_tensor, encoder_hidden)

            # The encoder's final state seeds the decoder.
            decoder_output, _, _ = decoder(encoder_hidden, encoder_outputs)

            scores = decoder_output[:, -1, :]
            loss = criterion(scores, target_tensor)
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            # Accumulate plain Python numbers: adding the loss tensor itself
            # would keep every batch's autograd graph alive until the next
            # reset of the running totals.
            total_loss += loss.item()
            predicted = torch.argmax(scores, -1)
            total_correct += (predicted == target_tensor).sum().item()
            total_sample += target_tensor.size(0)

            if i % log_every == log_every - 1:
                print("{}th 100 batches, average loss: {}, average accuracy: {}".format(
                    i // log_every,
                    total_loss / log_every,
                    total_correct / total_sample))
                total_loss = 0.0
                total_correct = total_sample = 0

# Experiment: single-layer encoder + attention decoder, 128 hidden units.
hidden_size = 128
encoder3 = EncoderLSTM1(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=hidden_size,
    vocab_size=11,
    idx_size=10,
    batch_size=BATCH_SIZE,
)
attn_decoder3 = AttnDecoderRNN3(hidden_size=hidden_size, output_size=10)

trainIters1(encoder=encoder3, decoder=attn_decoder3)

0th 100 batches, average loss: 1.9334077835083008, average accuracy: 0.3490625
1th 100 batches, average loss: 1.4360328912734985, average accuracy: 0.448125
2th 100 batches, average loss: 1.1527937650680542, average accuracy: 0.545
3th 100 batches, average loss: 1.0687216520309448, average accuracy: 0.584375
4th 100 batches, average loss: 0.9297465682029724, average accuracy: 0.6390625
5th 100 batches, average loss: 0.8858238458633423, average accuracy: 0.6665625
6th 100 batches, average loss: 0.8168993592262268, average accuracy: 0.67375
7th 100 batches, average loss: 0.769267201423645, average accuracy: 0.7065625
8th 100 batches, average loss: 0.7238100171089172, average accuracy: 0.7203125
9th 100 batches, average loss: 0.7250379323959351, average accuracy: 0.7303125
10th 100 batches, average loss: 0.670809268951416, average accuracy: 0.7525
11th 100 batches, average loss: 0.611447274684906, average accuracy: 0.761875
12th 100 batches, average loss: 0.633238673210144, average accura

In [299]:
# Experiment: same architecture as the previous cell, hidden size doubled to 256.
hidden_size = 256
encoder3 = EncoderLSTM1(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=hidden_size,
    vocab_size=11,
    idx_size=10,
    batch_size=BATCH_SIZE,
)
attn_decoder3 = AttnDecoderRNN3(hidden_size=hidden_size, output_size=10)

trainIters1(encoder=encoder3, decoder=attn_decoder3)

0th 100 batches, average loss: 1.8035305738449097, average accuracy: 0.3596875
1th 100 batches, average loss: 1.4236353635787964, average accuracy: 0.4425
2th 100 batches, average loss: 1.2316842079162598, average accuracy: 0.5128125
3th 100 batches, average loss: 1.0613094568252563, average accuracy: 0.5803125
4th 100 batches, average loss: 0.9940977692604065, average accuracy: 0.6040625
5th 100 batches, average loss: 0.8811402916908264, average accuracy: 0.66375
6th 100 batches, average loss: 0.8457968831062317, average accuracy: 0.6790625
7th 100 batches, average loss: 0.7843658328056335, average accuracy: 0.701875
8th 100 batches, average loss: 0.7652029991149902, average accuracy: 0.7090625
9th 100 batches, average loss: 0.709677517414093, average accuracy: 0.731875
10th 100 batches, average loss: 0.6592293381690979, average accuracy: 0.74625
11th 100 batches, average loss: 0.6432557702064514, average accuracy: 0.7484375
12th 100 batches, average loss: 0.6190547347068787, average 

In [306]:
class EncoderLSTM1(nn.Module):
    """Embed a sequence of token ids and encode it with a stacked LSTM.

    Args:
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        vocab_size: Number of distinct token ids accepted by the embedding.
        idx_size: Output size of the ``hidden2idx`` projection.
        batch_size: Default batch size used by ``init_hidden``.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, idx_size, batch_size, num_layers):
        super(EncoderLSTM1, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers

        # The LSTM takes token embeddings as inputs and outputs hidden states
        # with dimensionality hidden_dim (batch is the first input axis).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)

        # Maps hidden state space to index space. forward() does not call it;
        # it is kept so the module's parameter set stays unchanged.
        self.hidden2idx = nn.Linear(hidden_dim, idx_size)

    def init_hidden(self, batch_size=None):
        """Return zero-filled ``(h_0, c_0)`` states for the LSTM.

        Args:
            batch_size: Number of sequences in the upcoming batch. Defaults
                to the ``batch_size`` given at construction time, so a short
                final batch can be handled by passing its real size.

        Returns:
            Tuple of two tensors shaped
            ``(num_layers, batch_size, hidden_dim)`` that share the dtype
            and device of the embedding weights.
        """
        if batch_size is None:
            batch_size = self.batch_size
        shape = (self.num_layers, batch_size, self.hidden_dim)
        # new_zeros follows the model's dtype/device instead of assuming
        # float32 on CPU.
        reference = self.embeddings.weight
        return reference.new_zeros(shape), reference.new_zeros(shape)

    def forward(self, seq, hidden=None):
        """Encode ``seq``.

        Args:
            seq: Integer tensor of token ids, batch first.
            hidden: Optional ``(h_0, c_0)`` tuple; ``nn.LSTM`` falls back to
                zero states when it is ``None``.

        Returns:
            ``(out, hidden)`` exactly as produced by ``nn.LSTM``: the
            per-step outputs and the final ``(h_n, c_n)`` tuple.
        """
        embeds = self.embeddings(seq)
        out, hidden = self.lstm(embeds, hidden)
        return out, hidden

class AttnDecoderRNN3(nn.Module):
    """Single-layer attention decoder over ``max_length`` context slots.

    Encoder outputs are scored against the hidden state, mixed into
    ``max_length`` context vectors, passed through an LSTM and projected to
    log-probabilities over ``output_size`` classes.
    """

    def __init__(self, hidden_size, output_size, max_length=10):
        super().__init__()
        self.hidden_size, self.output_size, self.max_length = (
            hidden_size, output_size, max_length)

        # Creation order (attn, lstm, out) is kept so that seeded parameter
        # initialisation is unchanged.
        self.attn = nn.Linear(2 * hidden_size, max_length)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, hidden, encoder_outputs):
        """Run one decoding pass.

        Args:
            hidden: ``(h, c)`` pair handed to the decoder LSTM as its initial
                state; ``h`` is also used as the attention query.
            encoder_outputs: Batch-first encoder outputs.

        Returns:
            ``(log_probs, h_n, attn_weights)``. Only the hidden part of the
            LSTM's final state is returned, not the cell state.
        """
        n_steps = encoder_outputs.size(1)
        h_batch_first = hidden[0].transpose(0, 1)
        tiled_query = h_batch_first.repeat(1, n_steps, 1)
        attn_input = torch.cat((encoder_outputs, tiled_query), dim=-1)
        # Softmax runs over the max_length axis, then the axes are swapped so
        # the weights can left-multiply the encoder outputs.
        attn_weights = F.softmax(self.attn(attn_input), dim=-1).transpose(1, 2)
        mixed = F.relu(torch.bmm(attn_weights, encoder_outputs))
        decoded, state = self.lstm(mixed, hidden)
        return F.log_softmax(self.out(decoded), dim=-1), state[0], attn_weights

    def initHidden(self, batch_size):
        """Return a single zero tensor of shape ``(1, batch_size, hidden_size)``."""
        shape = (1, batch_size, self.hidden_size)
        return torch.zeros(*shape)

def trainIters1(encoder, decoder, epochs=1, learning_rate=0.01):
    """Train a stacked encoder with a single-layer decoder, logging every 100 batches.

    Batches are read from the module-level ``trainloader``; each item is
    unpacked as ``(input_tensor, target_tensor)``. Only the last layer of
    the encoder's final state is handed to the decoder.

    Args:
        encoder: Module exposing ``init_hidden()`` and callable as
            ``encoder(inputs, hidden) -> (outputs, (h_n, c_n))``.
        decoder: Module callable as ``decoder(hidden, encoder_outputs)``
            returning ``(output, hidden, attention)``; the last step of
            ``output`` (``output[:, -1, :]``) is used as the class scores.
        epochs: Number of passes over ``trainloader``.
        learning_rate: Learning rate for both Adam optimizers.
    """
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    log_every = 100

    for _ in range(epochs):
        total_loss = 0.0
        total_correct = total_sample = 0
        for i, (input_tensor, target_tensor) in enumerate(trainloader):
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            encoder_hidden = encoder.init_hidden()
            encoder_outputs, encoder_hidden = encoder(
                input_tensor, encoder_hidden)

            # Keep only the last encoder layer, with a leading axis of size
            # 1. For the 2-layer encoder used with this function this is the
            # same slice as index 1, and it no longer raises IndexError for
            # a 1-layer encoder. NOTE(review): for encoders with 3+ layers
            # this selects the last layer rather than layer 1.
            decoder_hidden = tuple(state[-1:] for state in encoder_hidden)

            decoder_output, _, _ = decoder(decoder_hidden, encoder_outputs)

            scores = decoder_output[:, -1, :]
            loss = criterion(scores, target_tensor)
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            # Accumulate plain Python numbers: adding the loss tensor itself
            # would keep every batch's autograd graph alive until the next
            # reset of the running totals.
            total_loss += loss.item()
            predicted = torch.argmax(scores, -1)
            total_correct += (predicted == target_tensor).sum().item()
            total_sample += target_tensor.size(0)

            if i % log_every == log_every - 1:
                print("{}th 100 batches, average loss: {}, average accuracy: {}".format(
                    i // log_every,
                    total_loss / log_every,
                    total_correct / total_sample))
                total_loss = 0.0
                total_correct = total_sample = 0

# Experiment: 2-layer encoder feeding a single-layer attention decoder.
hidden_size = 128
encoder3 = EncoderLSTM1(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=hidden_size,
    vocab_size=11,
    idx_size=10,
    batch_size=BATCH_SIZE,
    num_layers=2,
)
attn_decoder3 = AttnDecoderRNN3(hidden_size=hidden_size, output_size=10)

trainIters1(encoder=encoder3, decoder=attn_decoder3)

0th 100 batches, average loss: 1.9609668254852295, average accuracy: 0.3334375
1th 100 batches, average loss: 1.5360926389694214, average accuracy: 0.4009375
2th 100 batches, average loss: 1.2768477201461792, average accuracy: 0.48875
3th 100 batches, average loss: 1.1215893030166626, average accuracy: 0.561875
4th 100 batches, average loss: 0.9726362824440002, average accuracy: 0.62625
5th 100 batches, average loss: 0.9060349464416504, average accuracy: 0.64875
6th 100 batches, average loss: 0.84651780128479, average accuracy: 0.67125
7th 100 batches, average loss: 0.8034326434135437, average accuracy: 0.6853125
8th 100 batches, average loss: 0.7657501697540283, average accuracy: 0.7040625
9th 100 batches, average loss: 0.7156890034675598, average accuracy: 0.724375
10th 100 batches, average loss: 0.7101845741271973, average accuracy: 0.735625
11th 100 batches, average loss: 0.6642599701881409, average accuracy: 0.7503125
12th 100 batches, average loss: 0.6326753497123718, average acc

In [311]:
class EncoderLSTM1(nn.Module):
    """Embed a sequence of token ids and encode it with a stacked LSTM.

    Args:
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        vocab_size: Number of distinct token ids accepted by the embedding.
        idx_size: Output size of the ``hidden2idx`` projection.
        batch_size: Default batch size used by ``init_hidden``.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, idx_size, batch_size, num_layers):
        super(EncoderLSTM1, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers

        # The LSTM takes token embeddings as inputs and outputs hidden states
        # with dimensionality hidden_dim (batch is the first input axis).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)

        # Maps hidden state space to index space. forward() does not call it;
        # it is kept so the module's parameter set stays unchanged.
        self.hidden2idx = nn.Linear(hidden_dim, idx_size)

    def init_hidden(self, batch_size=None):
        """Return zero-filled ``(h_0, c_0)`` states for the LSTM.

        Args:
            batch_size: Number of sequences in the upcoming batch. Defaults
                to the ``batch_size`` given at construction time, so a short
                final batch can be handled by passing its real size.

        Returns:
            Tuple of two tensors shaped
            ``(num_layers, batch_size, hidden_dim)`` that share the dtype
            and device of the embedding weights.
        """
        if batch_size is None:
            batch_size = self.batch_size
        shape = (self.num_layers, batch_size, self.hidden_dim)
        # new_zeros follows the model's dtype/device instead of assuming
        # float32 on CPU.
        reference = self.embeddings.weight
        return reference.new_zeros(shape), reference.new_zeros(shape)

    def forward(self, seq, hidden=None):
        """Encode ``seq``.

        Args:
            seq: Integer tensor of token ids, batch first.
            hidden: Optional ``(h_0, c_0)`` tuple; ``nn.LSTM`` falls back to
                zero states when it is ``None``.

        Returns:
            ``(out, hidden)`` exactly as produced by ``nn.LSTM``: the
            per-step outputs and the final ``(h_n, c_n)`` tuple.
        """
        embeds = self.embeddings(seq)
        out, hidden = self.lstm(embeds, hidden)
        return out, hidden

class AttnDecoderRNN3(nn.Module):
    """Stacked-LSTM attention decoder over ``max_length`` context slots.

    The last layer of the incoming hidden state is used as the attention
    query; the full ``(h, c)`` state initialises the decoder LSTM.
    """

    def __init__(self, hidden_size, output_size, num_layers, max_length=10):
        super().__init__()
        self.hidden_size, self.output_size = hidden_size, output_size
        self.max_length = max_length
        self.num_layers = num_layers

        # Creation order (attn, lstm, out) is kept so that seeded parameter
        # initialisation is unchanged.
        self.attn = nn.Linear(2 * hidden_size, max_length)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, hidden, encoder_outputs):
        """Run one decoding pass.

        Args:
            hidden: ``(h, c)`` pair handed to the decoder LSTM as its initial
                state; the last entry of ``h`` along its first axis is the
                attention query.
            encoder_outputs: Batch-first encoder outputs.

        Returns:
            ``(log_probs, h_n, attn_weights)``. Only the hidden part of the
            LSTM's final state is returned, not the cell state.
        """
        n_steps = encoder_outputs.size(1)
        top_h = hidden[0][-1]
        tiled_query = top_h.unsqueeze(1).repeat(1, n_steps, 1)
        attn_input = torch.cat((encoder_outputs, tiled_query), dim=-1)
        # Softmax runs over the max_length axis, then the axes are swapped so
        # the weights can left-multiply the encoder outputs.
        attn_weights = F.softmax(self.attn(attn_input), dim=-1).transpose(1, 2)
        mixed = F.relu(torch.bmm(attn_weights, encoder_outputs))
        decoded, state = self.lstm(mixed, hidden)
        return F.log_softmax(self.out(decoded), dim=-1), state[0], attn_weights

    def initHidden(self, batch_size):
        """Return a single zero tensor of shape ``(1, batch_size, hidden_size)``."""
        shape = (1, batch_size, self.hidden_size)
        return torch.zeros(*shape)

def trainIters1(encoder, decoder, epochs=1, learning_rate=0.01):
    """Train an encoder/decoder pair and log progress every 100 batches.

    Batches are read from the module-level ``trainloader``; each item is
    unpacked as ``(input_tensor, target_tensor)``. The encoder's complete
    final state is handed to the decoder unchanged.

    Args:
        encoder: Module exposing ``init_hidden()`` and callable as
            ``encoder(inputs, hidden) -> (outputs, hidden)``.
        decoder: Module callable as ``decoder(hidden, encoder_outputs)``
            returning ``(output, hidden, attention)``; the last step of
            ``output`` (``output[:, -1, :]``) is used as the class scores.
        epochs: Number of passes over ``trainloader``.
        learning_rate: Learning rate for both Adam optimizers.
    """
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    log_every = 100

    for _ in range(epochs):
        total_loss = 0.0
        total_correct = total_sample = 0
        for i, (input_tensor, target_tensor) in enumerate(trainloader):
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            encoder_hidden = encoder.init_hidden()
            encoder_outputs, encoder_hidden = encoder(
                input_tensor, encoder_hidden)

            decoder_output, _, _ = decoder(encoder_hidden, encoder_outputs)

            scores = decoder_output[:, -1, :]
            loss = criterion(scores, target_tensor)
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            # Accumulate plain Python numbers: adding the loss tensor itself
            # would keep every batch's autograd graph alive until the next
            # reset of the running totals.
            total_loss += loss.item()
            predicted = torch.argmax(scores, -1)
            total_correct += (predicted == target_tensor).sum().item()
            total_sample += target_tensor.size(0)

            if i % log_every == log_every - 1:
                print("{}th 100 batches, average loss: {}, average accuracy: {}".format(
                    i // log_every,
                    total_loss / log_every,
                    total_correct / total_sample))
                total_loss = 0.0
                total_correct = total_sample = 0

# Experiment: 2-layer encoder and 2-layer attention decoder, 128 hidden units.
hidden_size = 128
encoder3 = EncoderLSTM1(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=hidden_size,
    vocab_size=11,
    idx_size=10,
    batch_size=BATCH_SIZE,
    num_layers=2,
)
attn_decoder3 = AttnDecoderRNN3(hidden_size=hidden_size, output_size=10, num_layers=2)

trainIters1(encoder=encoder3, decoder=attn_decoder3)

0th 100 batches, average loss: 1.8175301551818848, average accuracy: 0.3578125
1th 100 batches, average loss: 1.4232324361801147, average accuracy: 0.4371875
2th 100 batches, average loss: 1.257949948310852, average accuracy: 0.5065625
3th 100 batches, average loss: 1.116391658782959, average accuracy: 0.55625
4th 100 batches, average loss: 0.9897517561912537, average accuracy: 0.630625
5th 100 batches, average loss: 0.9154134392738342, average accuracy: 0.658125
6th 100 batches, average loss: 0.8813705444335938, average accuracy: 0.66125
7th 100 batches, average loss: 0.850842297077179, average accuracy: 0.6790625
8th 100 batches, average loss: 0.7693002223968506, average accuracy: 0.7096875
9th 100 batches, average loss: 0.7447303533554077, average accuracy: 0.72125
10th 100 batches, average loss: 0.702324628829956, average accuracy: 0.7275
11th 100 batches, average loss: 0.6373677849769592, average accuracy: 0.761875
12th 100 batches, average loss: 0.6514743566513062, average accura

In [328]:
class EncoderLSTM1(nn.Module):
    """Embed a sequence of token ids and encode it with a stacked LSTM.

    Args:
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        vocab_size: Number of distinct token ids accepted by the embedding.
        idx_size: Output size of the ``hidden2idx`` projection.
        batch_size: Default batch size used by ``init_hidden``.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, idx_size, batch_size, num_layers):
        super(EncoderLSTM1, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers

        # The LSTM takes token embeddings as inputs and outputs hidden states
        # with dimensionality hidden_dim (batch is the first input axis).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)

        # Maps hidden state space to index space. forward() does not call it;
        # it is kept so the module's parameter set stays unchanged.
        self.hidden2idx = nn.Linear(hidden_dim, idx_size)

    def init_hidden(self, batch_size=None):
        """Return zero-filled ``(h_0, c_0)`` states for the LSTM.

        Args:
            batch_size: Number of sequences in the upcoming batch. Defaults
                to the ``batch_size`` given at construction time, so a short
                final batch can be handled by passing its real size.

        Returns:
            Tuple of two tensors shaped
            ``(num_layers, batch_size, hidden_dim)`` that share the dtype
            and device of the embedding weights.
        """
        if batch_size is None:
            batch_size = self.batch_size
        shape = (self.num_layers, batch_size, self.hidden_dim)
        # new_zeros follows the model's dtype/device instead of assuming
        # float32 on CPU.
        reference = self.embeddings.weight
        return reference.new_zeros(shape), reference.new_zeros(shape)

    def forward(self, seq, hidden=None):
        """Encode ``seq``.

        Args:
            seq: Integer tensor of token ids, batch first.
            hidden: Optional ``(h_0, c_0)`` tuple; ``nn.LSTM`` falls back to
                zero states when it is ``None``.

        Returns:
            ``(out, hidden)`` exactly as produced by ``nn.LSTM``: the
            per-step outputs and the final ``(h_n, c_n)`` tuple.
        """
        embeds = self.embeddings(seq)
        out, hidden = self.lstm(embeds, hidden)
        return out, hidden

class AttnDecoderRNN3(nn.Module):
    """Stacked-LSTM attention decoder over ``max_length`` context slots.

    The last layer of the incoming hidden state is used as the attention
    query; the full ``(h, c)`` state initialises the decoder LSTM.
    """

    def __init__(self, hidden_size, output_size, num_layers, max_length=10):
        super().__init__()
        self.hidden_size, self.output_size = hidden_size, output_size
        self.max_length = max_length
        self.num_layers = num_layers

        # Creation order (attn, lstm, out) is kept so that seeded parameter
        # initialisation is unchanged.
        self.attn = nn.Linear(2 * hidden_size, max_length)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, hidden, encoder_outputs):
        """Run one decoding pass.

        Args:
            hidden: ``(h, c)`` pair handed to the decoder LSTM as its initial
                state; the last entry of ``h`` along its first axis is the
                attention query.
            encoder_outputs: Batch-first encoder outputs.

        Returns:
            ``(log_probs, h_n, attn_weights)``. Only the hidden part of the
            LSTM's final state is returned, not the cell state.
        """
        seq_len = encoder_outputs.size(1)
        query = hidden[0][-1].unsqueeze(1)
        query = query.repeat(1, seq_len, 1)
        scores = self.attn(torch.cat((encoder_outputs, query), dim=-1))
        # Normalise across the max_length axis, then swap axes so the
        # weights can left-multiply the encoder outputs.
        attn_weights = F.softmax(scores, dim=-1).transpose(1, 2)
        context = torch.bmm(attn_weights, encoder_outputs)
        rnn_out, rnn_state = self.lstm(F.relu(context), hidden)
        log_probs = F.log_softmax(self.out(rnn_out), dim=-1)
        return log_probs, rnn_state[0], attn_weights

    def initHidden(self, batch_size):
        """Return a single zero tensor of shape ``(1, batch_size, hidden_size)``."""
        shape = (1, batch_size, self.hidden_size)
        return torch.zeros(*shape)

def trainIters1(encoder, decoder, epochs=1, learning_rate=0.01):
    """Train an encoder/decoder pair and log progress every 100 batches.

    Batches are read from the module-level ``trainloader``; each item is
    unpacked as ``(input_tensor, target_tensor)``. The encoder's complete
    final state is handed to the decoder unchanged.

    Args:
        encoder: Module exposing ``init_hidden()`` and callable as
            ``encoder(inputs, hidden) -> (outputs, hidden)``.
        decoder: Module callable as ``decoder(hidden, encoder_outputs)``
            returning ``(output, hidden, attention)``; the last step of
            ``output`` (``output[:, -1, :]``) is used as the class scores.
        epochs: Number of passes over ``trainloader``.
        learning_rate: Learning rate for both Adam optimizers.
    """
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    log_every = 100

    for _ in range(epochs):
        running_loss = 0.0
        n_correct = n_seen = 0
        for i, (input_tensor, target_tensor) in enumerate(trainloader):
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            encoder_hidden = encoder.init_hidden()
            encoder_outputs, encoder_hidden = encoder(
                input_tensor, encoder_hidden)

            decoder_output, _, _ = decoder(encoder_hidden, encoder_outputs)

            scores = decoder_output[:, -1, :]
            loss = criterion(scores, target_tensor)
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            # Accumulate plain Python numbers: adding the loss tensor itself
            # would keep every batch's autograd graph alive until the next
            # reset of the running totals.
            running_loss += loss.item()
            predicted = torch.argmax(scores, -1)
            n_correct += (predicted == target_tensor).sum().item()
            n_seen += target_tensor.size(0)

            if i % log_every == log_every - 1:
                print("{}th 100 batches, average loss: {}, average accuracy: {}".format(
                    i // log_every,
                    running_loss / log_every,
                    n_correct / n_seen))
                running_loss = 0.0
                n_correct = n_seen = 0

# Experiment: 2-layer encoder and 2-layer attention decoder, 128 hidden units.
hidden_size = 128
encoder3 = EncoderLSTM1(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=hidden_size,
    vocab_size=11,
    idx_size=10,
    batch_size=BATCH_SIZE,
    num_layers=2,
)
attn_decoder3 = AttnDecoderRNN3(hidden_size=hidden_size, output_size=10, num_layers=2)

trainIters1(encoder=encoder3, decoder=attn_decoder3)

0th 100 batches, average loss: 1.9746804237365723, average accuracy: 0.326875
1th 100 batches, average loss: 1.8767398595809937, average accuracy: 0.3390625
2th 100 batches, average loss: 1.7952020168304443, average accuracy: 0.3634375
3th 100 batches, average loss: 1.8201353549957275, average accuracy: 0.3471875
4th 100 batches, average loss: 1.7820048332214355, average accuracy: 0.3465625
5th 100 batches, average loss: 1.535702109336853, average accuracy: 0.4103125
6th 100 batches, average loss: 1.3702223300933838, average accuracy: 0.4715625
7th 100 batches, average loss: 1.3061786890029907, average accuracy: 0.49875
8th 100 batches, average loss: 1.2352449893951416, average accuracy: 0.52
9th 100 batches, average loss: 1.1664303541183472, average accuracy: 0.5353125
10th 100 batches, average loss: 1.134343147277832, average accuracy: 0.5615625
11th 100 batches, average loss: 1.1096116304397583, average accuracy: 0.5721875
12th 100 batches, average loss: 1.0959386825561523, average 

In [None]:
# Compare the trained model's outputs with the expected indices.
# Gradients are not needed for inspection, so the forward pass runs under no_grad.
with torch.no_grad():
    predicted = lstm_cr(inputs)

print("Predicted:\n{}\n\nActual:\n{}".format(predicted, idx))