# LING 334 Final Project: Machine Translation using LSTM

## Imports

In [1]:
import torch
import math
import time
from torch import nn, utils, Tensor
import numpy as np
from linecache import getline
from sklearn.model_selection import train_test_split
import pickle
import spacy

# spacy.prefer_gpu()
# Choose the compute backend in order of preference: CUDA, then Apple MPS,
# and finally the CPU when neither accelerator is available.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

## Dataset considerations

### File path definitions

In [2]:
# Raw parallel corpus: the English and French files are paired line by line.
EN_PATH = "data/fr-en/europarl-v7.fr-en.en"
FR_PATH = "data/fr-en/europarl-v7.fr-en.fr"

# Pickled list of {'en': ..., 'fr': ...} sentence pairs built from the corpus.
FR_EN_PAIRS_PATH = "data/fr-en_pairs.pickle"

# Per-language train/test files written by the split step.
EN_TRAIN_PATH = "data/train.en"
FR_TRAIN_PATH = "data/train.fr"
EN_TEST_PATH = "data/test.en"
FR_TEST_PATH = "data/test.fr"

### Combining data into one file

In [None]:
# Build one list of {'en': ..., 'fr': ...} dicts from the two line-aligned
# corpus files and pickle it to FR_EN_PAIRS_PATH.
#
# The files are decoded explicitly as UTF-8 so the result does not depend on
# the platform's default encoding.
with open(EN_PATH, encoding="utf-8") as en_file, \
        open(FR_PATH, encoding="utf-8") as fr_file:
    en_lines = en_file.readlines()
    fr_lines = fr_file.readlines()

# The corpus is only meaningful when line i of one file translates line i of
# the other; fail loudly instead of raising a bare IndexError (French file
# shorter) or silently dropping sentences (French file longer).
if len(en_lines) != len(fr_lines):
    raise ValueError(
        f"Parallel corpus is misaligned: {len(en_lines)} English lines "
        f"vs {len(fr_lines)} French lines"
    )

sentence_pairs = [
    {'en': en_line.strip(), 'fr': fr_line.strip()}
    for en_line, fr_line in zip(en_lines, fr_lines)
]

with open(FR_EN_PAIRS_PATH, 'wb') as pairs_file:
    pickle.dump(sentence_pairs, pairs_file)

In [3]:
# Reload the pickled sentence pairs written by the previous cell.
# NOTE: pickle executes arbitrary code on load; only open files this notebook
# produced itself.
with open(FR_EN_PAIRS_PATH, mode='rb') as fr_en_file:
    pairs = pickle.loads(fr_en_file.read())

### Initializing spaCy languages

In [3]:
# Load the large English and French spaCy pipelines (English first), both
# with an empty `enable` list.
# NOTE(review): `enable=[]` is presumably meant to switch off every pipeline
# component so only tokenization and word vectors are used — confirm that the
# installed spaCy version interprets an empty list that way.
en_nlp, fr_nlp = (
    spacy.load(model_name, enable=[])
    for model_name in ("en_core_web_lg", "fr_core_news_lg")
)

### Training and testing data split

In [5]:
# Split the line-aligned corpus 80/20 into train and test files, keeping each
# English line paired with its French counterpart.
#
# Everything is read and written as UTF-8 explicitly: the platform default
# encoding may differ, and the dataset class later reads these files through
# linecache, which decodes source files as UTF-8 by default.
with open(EN_PATH, encoding="utf-8") as en, \
        open(FR_PATH, encoding="utf-8") as fr:
    en_lines = en.readlines()
    fr_lines = fr.readlines()

# A final line without a trailing newline would be glued onto the following
# sentence once the lines are shuffled, shifting every later line of that one
# file and silently breaking the pairing. Terminate every line first.
en_lines = [line if line.endswith("\n") else line + "\n" for line in en_lines]
fr_lines = [line if line.endswith("\n") else line + "\n" for line in fr_lines]

# train_test_split applies the same split to both lists, so pairs stay
# aligned. No random_state is passed, so the split differs between runs.
en_train, en_test, fr_train, fr_test = train_test_split(
    en_lines, fr_lines, train_size=0.8, test_size=0.2
)

for split_path, split_lines in (
    (EN_TRAIN_PATH, en_train),
    (EN_TEST_PATH, en_test),
    (FR_TRAIN_PATH, fr_train),
    (FR_TEST_PATH, fr_test),
):
    with open(split_path, 'w', encoding="utf-8") as f:
        f.writelines(split_lines)

### Dataset class definition

In [4]:
class SentencePairDataset(utils.data.Dataset):
    """Dataset of (source, target) sentence pairs stored in two aligned files.

    Line ``i`` of ``source_path`` is paired with line ``i`` of ``target_path``.
    Items are returned exactly as ``linecache.getline`` yields them, i.e.
    including the terminating newline.

    Args:
        source_path: Path of the text file holding the source sentences.
        target_path: Path of the text file holding the target sentences.
    """

    def __init__(self, source_path, target_path):
        self.source_path = source_path
        self.target_path = target_path
        # Line count of the source file, computed on first use. The files are
        # treated as immutable for the lifetime of the dataset.
        self._length = None

    def __len__(self):
        """Return the number of lines in the source file (cached)."""
        if self._length is None:
            # Counting lazily avoids re-reading the whole file on every call;
            # iterating the handle avoids holding all lines in memory.
            with open(self.source_path, encoding="utf-8") as f:
                self._length = sum(1 for _ in f)
        return self._length

    def __getitem__(self, idx):
        """Return the ``idx``-th (source, target) pair, 0-based.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        length = len(self)
        if idx < 0:
            idx += length
        if not 0 <= idx < length:
            # Without this, plain iteration over the dataset would never stop,
            # because getline returns '' for missing lines instead of raising.
            raise IndexError("SentencePairDataset index out of range")
        # linecache line numbers start at 1, dataset indices at 0.
        line_number = idx + 1
        return (getline(self.source_path, line_number),
                getline(self.target_path, line_number))

## Model

### LSTM node definition

In [5]:
class LSTM(nn.Module):
    """A single LSTM cell that advances the state by one time step.

    The four gates share one input weight matrix ``W``, one recurrent weight
    matrix ``U`` and one bias ``b``; their columns are laid out in the order
    input gate, forget gate, cell candidate, output gate.

    Args:
        input_size: Width of the input vector at each step.
        hidden_size: Width of the hidden and cell states.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        gate_width = hidden_size * 4  # four gates side by side
        self.W = nn.Parameter(Tensor(input_size, gate_width))
        self.U = nn.Parameter(Tensor(hidden_size, gate_width))
        self.b = nn.Parameter(torch.zeros(gate_width))
        self.initialize_parameters()

    def initialize_parameters(self):
        """Fill ``W`` and ``U`` uniformly in ``[-1/sqrt(hidden), 1/sqrt(hidden)]``."""
        bound = 1.0 / math.sqrt(self.hidden_size)
        for weight in (self.W, self.U):
            nn.init.uniform_(weight, -bound, bound)

    def forward(self, input, hidden):
        """Compute the next ``(hidden, cell)`` state.

        Args:
            input: Input for this step; multiplied with ``W``, so its last
                dimension must be ``input_size``.
            hidden: Tuple ``(h_t, c_t)`` holding the previous hidden and cell
                states.

        Returns:
            Tuple ``(h_next, c_next)``.
        """
        prev_h, prev_c = hidden

        pre_activations = (
            torch.matmul(input, self.W) + torch.matmul(prev_h, self.U) + self.b
        )
        raw_in, raw_forget, raw_cell, raw_out = torch.chunk(pre_activations, 4, dim=1)

        input_gate = torch.sigmoid(raw_in)
        forget_gate = torch.sigmoid(raw_forget)
        candidate = torch.tanh(raw_cell)
        output_gate = torch.sigmoid(raw_out)

        next_c = (forget_gate * prev_c) + (input_gate * candidate)
        next_h = output_gate * torch.tanh(next_c)
        return next_h, next_c

### Encoder module

In [6]:
class Encoder(nn.Module):
    """Runs a recurrent cell over an input sequence and returns the final state.

    Args:
        node: Cell class, called as ``node(input_size, hidden_size)``. Its
            instances are called as ``cell(step_input, hidden)`` and must
            return the next hidden state.
        input_size: Width of each input vector.
        hidden_size: Width of the hidden and cell states.
        depth: Requested number of layers. Stored, but only one layer is
            built at present.
        dropout: Dropout probability applied to the input sequence.
    """

    def __init__(self, node, input_size, hidden_size, depth, dropout):
        super().__init__()
        self.node = node
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.depth = depth
        # The cells must live in an nn.ModuleList: parameters of modules held
        # in a plain Python list are invisible to parameters(), state_dict()
        # and .to(), so the optimizer would never update them.
        self.layers = nn.ModuleList([node(input_size, hidden_size)])
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_sequence):
        """Encode ``input_sequence`` and return the last ``(hidden, cell)`` state.

        Args:
            input_sequence: Tensor of shape
                ``(batch_size, sequence_len, input_size)``.

        Returns:
            The value returned by the last cell call; the initial state is a
            pair of zero tensors of shape ``(batch_size, hidden_size)``.
        """
        batch_size, sequence_len, input_size = input_sequence.size()
        assert input_size == self.input_size

        input_sequence = self.dropout(input_sequence)

        # Create the initial state next to the input (same device and dtype)
        # so the module does not depend on a module-level device global.
        hidden = (
            input_sequence.new_zeros(batch_size, self.hidden_size),
            input_sequence.new_zeros(batch_size, self.hidden_size),
        )

        for layer in self.layers:
            # unbind(dim=1) yields the (batch_size, input_size) slice of each
            # time step in order.
            for step_input in input_sequence.unbind(dim=1):
                hidden = layer(step_input, hidden)

        return hidden


### Decoder module

In [7]:
class Decoder(nn.Module):
    """One-step decoder: a recurrent cell followed by a linear projection.

    Args:
        node: Cell class, called as ``node(output_size, hidden_size)``.
        output_size: Width of both the input token vector and the output.
        hidden_size: Width of the recurrent state.
        depth: Requested number of layers. Stored, but a single cell is built.
        dropout: Dropout probability applied to the input token.
    """

    def __init__(self, node, output_size, hidden_size, depth, dropout):
        super().__init__()
        self.node = node
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.depth = depth
        # Both submodules are moved to the module-level `device` on creation.
        recurrent_cell = node(output_size, hidden_size)
        self.layer = recurrent_cell.to(device)
        projection = nn.Linear(hidden_size, output_size)
        self.out_layer = projection.to(device)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_token, hidden):
        """Advance the decoder by one step.

        Args:
            input_token: 2-D tensor whose second dimension is ``output_size``.
            hidden: ``(hidden, cell)`` state passed through to the cell.

        Returns:
            ``(output, (next_hidden, next_cell))`` where ``output`` is the
            linear projection of ``next_hidden``.
        """
        _, token_width = input_token.size()
        assert token_width == self.output_size

        next_hidden, next_cell = self.layer(self.dropout(input_token), hidden)
        projected = self.out_layer(next_hidden)
        return projected, (next_hidden, next_cell)

### Translator module

In [28]:
class Translator(nn.Module):
    """Encoder-decoder translator mapping source sentences to target text.

    Source sentences are tokenized by ``source``, embedded with its word
    vectors, encoded, decoded step by step into output vectors, and every
    output vector is mapped to the nearest entry of ``target``'s vector table.

    Args:
        encoder: Module called as ``encoder(tensor)``; exposes ``input_size``.
        decoder: Module called as ``decoder(token, hidden)``; exposes
            ``output_size``.
        device: Device on which the tensors built in ``forward`` are placed.
        source: spaCy pipeline for the source language.
        target: spaCy pipeline for the target language.
        max_length: Number of token positions per sentence; longer sentences
            are truncated and shorter ones zero-padded.
    """

    def __init__(self, encoder, decoder, device, source, target, max_length):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.source = source
        self.target = target
        self.max_length = max_length

    def _embed_sentences(self, sentences):
        """Return a (batch, max_length, input_size) tensor of word vectors."""
        documents = list(self.source.pipe(sentences))
        # Fill on the CPU and transfer once. Positions past the end of a
        # sentence keep their initial zeros, which serve as padding.
        embedded = torch.zeros(
            len(documents), self.max_length, self.encoder.input_size
        )
        for row, doc in enumerate(documents):
            # zip stops at max_length, truncating over-long sentences.
            for col, token in zip(range(self.max_length), doc):
                embedded[row, col] = torch.as_tensor(token.vector)
        return embedded.to(self.device)

    def _vectors_to_text(self, output_tokens):
        """Map each output vector to its nearest target word; join per sentence."""
        batch_size, length, width = output_tokens.size()
        if batch_size == 0 or length == 0:
            return ["" for _ in range(batch_size)]

        # One batched nearest-neighbour lookup instead of one full scan of the
        # vector table per token. The vector table takes array input, so hand
        # it a NumPy array rather than a torch tensor.
        queries = output_tokens.detach().reshape(-1, width).cpu().numpy()
        keys = self.target.vocab.vectors.most_similar(queries)[0]

        vocab = self.target.vocab
        sentences = []
        for row in range(batch_size):
            row_keys = keys[row * length:(row + 1) * length]
            # Each entry of `keys` holds the n best matches for one query
            # (n defaults to 1), so the single best key is element 0.
            words = [vocab[int(best[0])].text for best in row_keys]
            sentences.append(' '.join(words))
        return sentences

    def forward(self, input_sentence):
        """Translate a batch of sentences.

        Args:
            input_sentence: Iterable of source-language strings.

        Returns:
            List with one space-joined string of ``max_length`` target words
            per input sentence.
        """
        # Use the device handed to the constructor rather than a module-level
        # global, so the model works wherever it is instantiated.
        input_tokens = self._embed_sentences(input_sentence)
        batch_size = input_tokens.size(0)

        hidden = self.encoder(input_tokens)

        decoder_input = torch.zeros(
            batch_size, self.decoder.output_size, device=self.device
        )
        output_tokens = torch.zeros(
            batch_size, self.max_length, self.decoder.output_size,
            device=self.device,
        )

        # NOTE(review): position 0 is never written and therefore decodes from
        # an all-zero vector — presumably a start-of-sentence slot; confirm
        # whether it should be excluded from the returned text.
        for t in range(1, self.max_length):
            output, hidden = self.decoder(decoder_input, hidden)
            output_tokens[:, t, :] = output
            # Feed the prediction back in as the next input.
            decoder_input = output

        return self._vectors_to_text(output_tokens)

## Training experiments

In [29]:
# Hyperparameters for the training experiment.
HIDDEN_SIZE = 64      # width of the recurrent state
N_LAYERS = 2          # forwarded to the encoder/decoder as `depth`
MAX_LENGTH = 50       # token positions per sentence
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
N_EPOCHS = 10
LR = 0.001
GRAD_CLIP = 1         # max gradient norm
BATCH_SIZE = 32

# Model: the encoder consumes English word vectors, the decoder emits vectors
# of the French vector width.
encoder = Encoder(
    node=LSTM,
    input_size=en_nlp.vocab.vectors_length,
    hidden_size=HIDDEN_SIZE,
    depth=N_LAYERS,
    dropout=ENC_DROPOUT,
).to(device)
decoder = Decoder(
    node=LSTM,
    output_size=fr_nlp.vocab.vectors_length,
    hidden_size=HIDDEN_SIZE,
    depth=N_LAYERS,
    dropout=DEC_DROPOUT,
).to(device)
model = Translator(
    encoder=encoder,
    decoder=decoder,
    device=device,
    source=en_nlp,
    target=fr_nlp,
    max_length=MAX_LENGTH,
).to(device)

# Optimization objects.
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)

# Data: English is the source language, French the target.
train_set = SentencePairDataset(EN_TRAIN_PATH, FR_TRAIN_PATH)
test_set = SentencePairDataset(EN_TEST_PATH, FR_TEST_PATH)
train_loader = utils.data.DataLoader(dataset=train_set, batch_size=BATCH_SIZE)
test_loader = utils.data.DataLoader(dataset=test_set, batch_size=BATCH_SIZE)

# Per-epoch loss history, appended to by the epoch loop.
training_loss, testing_loss = [], []

def train(model, loader, optimizer, criterion, grad_clip):
    """Run one training pass over ``loader`` and return the mean batch loss.

    Args:
        model: Callable model exposing ``train()``, ``parameters()`` and a
            ``target`` pipeline with a ``pipe`` method.
        loader: Sized iterable of ``(source, target)`` batches.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
        criterion: Callable returning a loss with ``backward()`` and ``item()``.
        grad_clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the batch losses divided by ``len(loader)``.
    """
    def orth_ids(sentences):
        # Token ids (`orth`) of every sentence, tokenized by the target pipeline.
        return [[token.orth for token in doc] for doc in model.target.pipe(sentences)]

    model.train()
    running_loss = 0

    for batch_index, (source, target) in enumerate(loader):
        optimizer.zero_grad()
        output = model(source)

        # Show the first batch's references and predictions as a sanity check.
        if batch_index == 0:
            print(target)
            print(output)

        # NOTE(review): the criterion receives nested Python lists here, not
        # tensors — confirm this is compatible with the loss in use.
        loss = criterion(orth_ids(output), orth_ids(target))
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(loader)

def evaluate(model, loader, criterion):
    """Compute the mean batch loss over ``loader`` without tracking gradients.

    Args:
        model: Callable model exposing ``eval()`` and a ``target`` pipeline
            with a ``pipe`` method.
        loader: Sized iterable of ``(source, target)`` batches.
        criterion: Callable returning a loss with ``item()``.

    Returns:
        Sum of the batch losses divided by ``len(loader)``.
    """
    def orth_ids(sentences):
        # Token ids (`orth`) of every sentence, tokenized by the target pipeline.
        return [[token.orth for token in doc] for doc in model.target.pipe(sentences)]

    model.eval()
    running_loss = 0

    with torch.no_grad():
        for source, target in loader:
            output = model(source)
            # NOTE(review): the criterion receives nested Python lists here,
            # not tensors — confirm this is compatible with the loss in use.
            loss = criterion(orth_ids(output), orth_ids(target))
            running_loss += loss.item()

    return running_loss / len(loader)

def epoch_time(start, end):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start: Start timestamp in seconds.
        end: End timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, both truncated toward zero.
    """
    duration = end - start
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

# Main experiment loop: for each of the N_EPOCHS epochs, train on the training
# loader, evaluate on the test loader, record both losses, and report them
# together with the wall-clock time the epoch took.
start_time = time.time()
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, GRAD_CLIP)
    test_loss = evaluate(model, test_loader, criterion)
    # Keep the per-epoch history in the module-level lists.
    training_loss.append(train_loss)
    testing_loss.append(test_loss)
    
    # The measured interval covers both the training and the evaluation pass.
    end_time = time.time()
    mins, secs = epoch_time(start_time, end_time)
    print(f'Epoch: {epoch+1:02} | Time: {mins}m {secs}s') 
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Test Loss: {test_loss:.3f}')
    # Restart the clock so the next epoch is timed on its own.
    start_time = time.time()