In [2]:
import numpy as np
import matplotlib.pyplot as plt
import torch

In [3]:
# Load the raw training corpus, keeping one list entry per line of each file
# (the trailing newline of every line is still attached at this point).
with open("train.sources") as sources_file:
    train_sources = list(sources_file)
with open("train.targets") as targets_file:
    train_targets = list(targets_file)

# Tokenize each line: remove its newline characters, then split on single spaces.
train_sources_tokenized = [
    source_line.replace("\n", "").split(" ") for source_line in train_sources
]

train_targets_tokenized = [
    target_line.replace("\n", "").split(" ") for target_line in train_targets
]

# Special tokens reserved at fixed positions in every vocabulary.
# Index 0 has to be the padding token: the padded id matrices are
# zero-initialised and the loss is created with ignore_index=0, so without this
# reservation the first real token of each corpus would share id 0 with padding
# and be silently excluded from the loss.  "<OOV>" is reserved as well so that
# the out-of-vocabulary fallback used when encoding sentences can be resolved.
PAD_TOKEN = "<PAD>"
OOV_TOKEN = "<OOV>"


def build_vocabulary(tokenized_sentences, special_tokens=(PAD_TOKEN, OOV_TOKEN)):
    """Return the distinct tokens of a corpus as a list, in first-seen order.

    Args:
        tokenized_sentences: iterable of token lists.
        special_tokens: tokens placed at the start of the vocabulary, in the
            given order, before any corpus token.

    Returns:
        A list whose first entries are ``special_tokens`` followed by every
        distinct corpus token in order of first appearance.  A token's position
        in the list is its integer id.
    """
    # A dict keeps insertion order and gives constant-time membership tests;
    # the previous list-based ``not in`` check scanned the whole vocabulary for
    # every token of the corpus.
    ordered_tokens = dict.fromkeys(special_tokens)
    for sentence in tokenized_sentences:
        for token in sentence:
            ordered_tokens.setdefault(token)
    return list(ordered_tokens)


train_sources_vocabulary = build_vocabulary(train_sources_tokenized)
train_targets_vocabulary = build_vocabulary(train_targets_tokenized)

In [4]:
MAX_SEQUENCE_LENGTH = 500


def encode_and_pad(tokenized_sentences, vocabulary, max_length, oov_token="<OOV>"):
    """Map token lists to a zero-padded integer id matrix.

    Args:
        tokenized_sentences: sequence of token lists.
        vocabulary: list of tokens; a token's id is its first position in it.
        max_length: number of columns; longer sentences are truncated and
            shorter ones are left zero-filled on the right.
        oov_token: vocabulary entry whose id replaces unknown tokens.

    Returns:
        ``np.ndarray`` of shape ``(len(tokenized_sentences), max_length)`` and
        dtype ``int64``.

    Raises:
        ValueError: if a token is missing from ``vocabulary`` and
            ``oov_token`` is missing too (the same error type the previous
            ``list.index`` lookup produced).
    """
    # One dict built up front replaces an ``in`` scan plus a ``list.index``
    # scan of the whole vocabulary for every single token.
    token_to_index = {}
    for index, token in enumerate(vocabulary):
        token_to_index.setdefault(token, index)  # first occurrence wins, like list.index
    oov_index = token_to_index.get(oov_token)

    # int64 from the start: the ids are later converted to long tensors, and a
    # float64 staging matrix of the same size would only cost memory.
    padded = np.zeros((len(tokenized_sentences), max_length), dtype=np.int64)
    for row, sentence in enumerate(tokenized_sentences):
        ids = []
        for token in sentence[:max_length]:
            index = token_to_index.get(token, oov_index)
            if index is None:
                raise ValueError(
                    "{!r} is not in the vocabulary and no {!r} entry exists".format(token, oov_token)
                )
            ids.append(index)
        if ids:
            padded[row, :len(ids)] = ids
    return padded


train_sources_integer_sequences_padded = encode_and_pad(
    train_sources_tokenized, train_sources_vocabulary, MAX_SEQUENCE_LENGTH
)
train_targets_integer_sequences_padded = encode_and_pad(
    train_targets_tokenized, train_targets_vocabulary, MAX_SEQUENCE_LENGTH
)


In [5]:
# Turn both padded id matrices into long (int64) tensors in one pass.
X, Y = (
    torch.from_numpy(padded_ids).long()
    for padded_ids in (
        train_sources_integer_sequences_padded,
        train_targets_integer_sequences_padded,
    )
)

In [6]:
# Show the shape of the source tensor, then of the target tensor.
for id_tensor in (X, Y):
    print(id_tensor.shape)

torch.Size([172719, 500])
torch.Size([172719, 500])


In [7]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 partition is identical after a kernel restart;
# without random_state a different split is drawn on every run, which makes
# losses and any later evaluation impossible to compare between runs.
SPLIT_SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=SPLIT_SEED
)



In [8]:
from torch.utils.data import TensorDataset, DataLoader

# Pair each source row with its target row so a batch yields (source, target).
train_data = TensorDataset(X_train, Y_train)
test_data = TensorDataset(X_test, Y_test)
batch_size = 32
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# Held-out batches keep their dataset order: shuffling brings nothing to
# evaluation and would make per-example outputs impossible to line up with
# X_test / Y_test.
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [12]:
import random

class Encoder(torch.nn.Module):
    """Embedding layer followed by a bidirectional multi-layer LSTM.

    Args:
        input_size: number of rows in the embedding table (source vocabulary size).
        embedding_size: width of each token embedding.
        hidden_size: LSTM hidden width per direction.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability handed to ``torch.nn.LSTM``.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.embedding = torch.nn.Embedding(input_size, embedding_size)
        # batch_first is left at its default, so inputs are (seq_length, batch_size, ...).
        self.lstm = torch.nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers,
            dropout=dropout,
            bidirectional=True,
        )

    @staticmethod
    def _average_directions(state):
        """Average the even-indexed and odd-indexed slices along dim 0.

        For a bidirectional LSTM these are the per-layer forward and backward
        states, so the result has one slice per layer.
        """
        even_slices = state[0::2]
        odd_slices = state[1::2]
        return (even_slices + odd_slices) / 2

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: integer tensor of shape (seq_length, batch_size).

        Returns:
            Tuple ``(outputs, hidden, cell)`` where ``outputs`` is the LSTM
            output of shape (seq_length, batch_size, hidden_size*2) and
            ``hidden`` / ``cell`` are the direction-averaged final states of
            shape (num_layers, batch_size, hidden_size).
        """
        outputs, (final_hidden, final_cell) = self.lstm(self.embedding(x))
        hidden = self._average_directions(final_hidden)
        cell = self._average_directions(final_cell)
        return outputs, hidden, cell

class Attention(torch.nn.Module):
    """Additive attention scoring a decoder state against every encoder output.

    Args:
        hidden_size: decoder hidden width; encoder outputs are expected to be
            twice as wide (the linear layer takes ``hidden_size * 3`` features).
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.attention = torch.nn.Linear(hidden_size * 3, hidden_size)
        self.v = torch.nn.Parameter(torch.rand(hidden_size))
        # Overwrite the uniform draw with N(0, 1/hidden_size) values.
        scale = 1. / np.sqrt(self.v.size(0))
        self.v.data.normal_(mean=0, std=scale)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the source positions.

        Args:
            hidden: (batch_size, hidden_size) decoder state.
            encoder_outputs: (seq_length, batch_size, hidden_size*2).

        Returns:
            Tensor of shape (batch_size, 1, seq_length) whose last dimension
            sums to 1.
        """
        seq_length = encoder_outputs.shape[0]
        # Copy the decoder state once per source position: (batch_size, seq_length, hidden_size).
        tiled_hidden = hidden.repeat(seq_length, 1, 1).permute(1, 0, 2)
        # Batch-major view of the encoder outputs: (batch_size, seq_length, hidden_size*2).
        batch_major_outputs = encoder_outputs.permute(1, 0, 2)
        features = torch.cat((tiled_hidden, batch_major_outputs), dim=2)
        energy = torch.tanh(self.attention(features))  # (batch_size, seq_length, hidden_size)
        scores = torch.sum(self.v * energy, dim=2)  # (batch_size, seq_length)
        weights = torch.softmax(scores, dim=1)
        return weights.unsqueeze(1)  # (batch_size, 1, seq_length)
    
# Decoder
class Decoder(torch.nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Args:
        input_size: target vocabulary size; used both for the embedding table
            and for the width of the output projection.
        embedding_size: width of each token embedding.
        hidden_size: LSTM hidden width.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability handed to ``torch.nn.LSTM``.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.embedding = torch.nn.Embedding(input_size, embedding_size)
        # The LSTM input is the token embedding concatenated with the context vector.
        lstm_input_size = embedding_size + hidden_size * 2
        self.lstm = torch.nn.LSTM(lstm_input_size, hidden_size, num_layers, dropout=dropout)
        self.attention = Attention(hidden_size)
        self.fc = torch.nn.Linear(hidden_size, input_size)

    def forward(self, x, hidden, cell, encoder_outputs):
        """Decode one time step.

        Args:
            x: (batch_size) ids of the current input tokens.
            hidden: (num_layers, batch_size, hidden_size).
            cell: (num_layers, batch_size, hidden_size).
            encoder_outputs: (seq_length, batch_size, hidden_size*2).

        Returns:
            Tuple ``(prediction, hidden, cell)`` with ``prediction`` of shape
            (batch_size, input_size) and the updated LSTM states.
        """
        step_embedding = self.embedding(x.unsqueeze(0))  # (1, batch_size, embedding_size)
        # The attention query is the hidden state of the last LSTM layer.
        weights = self.attention(hidden[-1], encoder_outputs)  # (batch_size, 1, seq_length)
        batch_major_outputs = encoder_outputs.transpose(0, 1)  # (batch_size, seq_length, hidden_size*2)
        context = torch.bmm(weights, batch_major_outputs)  # (batch_size, 1, hidden_size*2)
        context = context.transpose(0, 1)  # (1, batch_size, hidden_size*2)
        lstm_input = torch.cat((step_embedding, context), dim=2)
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))  # output: (1, batch_size, hidden_size)
        prediction = self.fc(output.squeeze(0))  # (batch_size, input_size)
        return prediction, hidden, cell
    
# Seq2Seq
class Seq2Seq(torch.nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: module returning ``(encoder_outputs, hidden, cell)`` for a source batch.
        decoder: module exposing ``input_size`` and called as
            ``decoder(x, hidden, cell, encoder_outputs)``.
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Produce decoder logits for every target position after the first.

        Args:
            source: (seq_length, batch_size) source token ids.
            target: (seq_length, batch_size) target token ids.
            teacher_forcing_ratio: probability, drawn independently per step,
                of feeding the ground-truth token instead of the model's own
                argmax prediction as the next decoder input.

        Returns:
            Tensor of shape (seq_length, batch_size, input_size); position 0 is
            never written and stays all zeros.
        """
        num_steps = target.shape[0]
        batch_size = source.shape[1]
        vocab_size = self.decoder.input_size
        outputs = torch.zeros(num_steps, batch_size, vocab_size, device=self.device)

        encoder_outputs, hidden, cell = self.encoder(source)

        # The first target row seeds the decoder.
        x = target[0]  # (batch_size)
        for step in range(1, num_steps):
            output, hidden, cell = self.decoder(x, hidden, cell, encoder_outputs)
            outputs[step] = output
            use_ground_truth = random.random() < teacher_forcing_ratio
            x = target[step] if use_ground_truth else output.argmax(1)
        return outputs
    
# Hyperparameters
# Vocabulary sizes fix the embedding tables and the decoder's output layer.
input_size_encoder = len(train_sources_vocabulary)
input_size_decoder = output_size = len(train_targets_vocabulary)
# Layer widths, depth and regularisation shared by encoder and decoder.
encoder_embedding_size = decoder_embedding_size = 512
hidden_size = 512
num_layers = 2
dropout = 0.5

# Model
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, dropout
).to(device)
decoder_net = Decoder(
    input_size_decoder, decoder_embedding_size, hidden_size, num_layers, dropout
).to(device)
model = Seq2Seq(encoder_net, decoder_net, device).to(device)

In [10]:
# Optimizer: Adam over all parameters registered on the model.
learning_rate = 1e-4
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# Loss function: cross-entropy that skips every position whose label id is 0.
IGNORED_LABEL_ID = 0
criterion = torch.nn.CrossEntropyLoss(ignore_index=IGNORED_LABEL_ID)

# train
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module called as ``model(source, target)`` with time-major
            ``(seq_length, batch_size)`` id tensors and returning logits of
            shape ``(seq_length, batch_size, vocab_size)``.
        iterator: iterable of batches whose first two items are the source and
            target id tensors laid out batch-first, ``(batch_size, seq_length)``.
        optimizer: optimizer holding ``model``'s parameters.
        criterion: loss called as ``criterion(logits, labels)`` on flattened
            ``(N, vocab_size)`` logits and ``(N,)`` labels.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The average of the per-batch loss values as a float.

    Raises:
        ValueError: if ``iterator`` yields no batches.
    """
    model.train()
    # Follow the model's own device rather than a notebook-level global, so the
    # function keeps working whatever order the cells were executed in.
    model_device = next(model.parameters()).device
    epoch_loss = 0.0
    num_batches = 0
    for batch in iterator:
        # Batches arrive as (batch_size, seq_length) but the model indexes
        # dim 0 as time, so both tensors are transposed to time-major first.
        # Without this, batch and time axes are swapped inside the model and
        # the [1:] slices below drop a sample instead of the first time step.
        source = batch[0].to(model_device).t().contiguous()
        target = batch[1].to(model_device).t().contiguous()
        optimizer.zero_grad()
        output = model(source, target)
        output_dim = output.shape[-1]
        # Position 0 of the output is never predicted by the model; skip it and
        # the matching first target token.
        logits = output[1:].reshape(-1, output_dim)
        labels = target[1:].reshape(-1)
        loss = criterion(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        num_batches += 1
    if num_batches == 0:
        raise ValueError("iterator yielded no batches; cannot compute an epoch loss")
    return epoch_loss / num_batches

In [13]:
# Train for a fixed number of epochs, reporting the mean batch loss after each.
num_epochs, clip = 5, 1
epoch_report = "Epoch: {}, Loss: {}"
for epoch in range(num_epochs):
    loss = train(model, train_loader, optimizer, criterion, clip)
    print(epoch_report.format(epoch, loss))

: 