In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import random
import gc
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from collections import Counter
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
try:
    from bert_score import score as bert_scorer
except ImportError:
    bert_scorer = None

In [2]:
class Vocab:
    def __init__(self, max_size=10000):
        self.max_size = max_size
        self.word2idx = {"<pad>": 0, "<start>": 1, "<end>": 2, "<unk>": 3}
        self.idx2word = {0: "<pad>", 1: "<start>", 2: "<end>", 3: "<unk>"}
        self.counter = Counter()

    def build_vocab(self, sentences):
        for s in sentences:
            self.counter.update(str(s).split())
        for word, _ in self.counter.most_common(self.max_size - 4):
            if word not in self.word2idx:
                idx = len(self.word2idx)
                self.word2idx[word] = idx
                self.idx2word[idx] = word
        self.vocab_size = len(self.word2idx)

    def tokenize(self, sentence):
        return np.array([self.word2idx.get(w, 3) for w in str(sentence).split()], dtype=np.int32)

class TranslationDataset(Dataset):
    def __init__(self, src_data, trg_data):
        self.pairs = sorted(zip(src_data, trg_data), key=lambda x: len(x[0]))
        
    def __len__(self): return len(self.pairs)
    
    def __getitem__(self, idx): 
        return torch.from_numpy(self.pairs[idx][0]).long(), \
               torch.from_numpy(self.pairs[idx][1]).long()

def collate_fn(batch):
    src, trg = zip(*batch)
    return pad_sequence(src, padding_value=0), pad_sequence(trg, padding_value=0)

In [3]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, bidirectional=True)
        self.fc = nn.Linear(hid_dim * 2, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        _, hidden = self.rnn(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.rnn(embedded, hidden.unsqueeze(0))
        return self.fc_out(output.squeeze(0)), hidden.squeeze(0)

In [4]:
print("Initialing the Seq2Seq encoder model")
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder, self.decoder, self.device = encoder, decoder, device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        trg_len, batch_size = trg.shape
        outputs = torch.zeros(trg_len, batch_size, self.decoder.output_dim).to(self.device)
        hidden = self.encoder(src)
        
        input = trg[0,:]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            input = trg[t] if teacher_force else output.argmax(1)
        return outputs

Initialing the Seq2Seq encoder model


In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
df = pd.read_csv('clean_data.csv').dropna()

def simple_clean(s): return str(s).lower().replace('.', ' .').strip()

en_vocab, sp_vocab = Vocab(10000), Vocab(10000)
en_vocab.build_vocab(df['english'].apply(simple_clean))
sp_vocab.build_vocab(df['spanish'].apply(lambda x: f"<start> {simple_clean(x)} <end>"))

en_tokens = [en_vocab.tokenize(simple_clean(s)) for s in df['english']]
sp_tokens = [sp_vocab.tokenize(f"<start> {simple_clean(s)} <end>") for s in df['spanish']]

del df
gc.collect()

x_train, x_test, y_train, y_test = train_test_split(en_tokens, sp_tokens, test_size=0.2)

train_loader = DataLoader(TranslationDataset(x_train, y_train), batch_size=32, collate_fn=collate_fn, shuffle=True)
test_loader = DataLoader(TranslationDataset(x_test, y_test), batch_size=32, collate_fn=collate_fn)

enc = Encoder(en_vocab.vocab_size, 128, 256, 0.5)
dec = Decoder(sp_vocab.vocab_size, 128, 256, 0.5)
model = Seq2Seq(enc, dec, device).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=0)

print(f"Training on {device}...")

Training on cuda...


In [6]:
print("Training on database")
for epoch in range(20):
    model.train()
    total_loss = 0
    for src, trg in train_loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)
        loss = criterion(output[1:].view(-1, sp_vocab.vocab_size), trg[1:].view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")

print("Test evaluation.")
model.eval()
raw_preds, raw_refs = [], []
with torch.no_grad():
    for src, trg in test_loader:
        src, trg = src.to(device), trg.to(device)
        output = model(src, trg, teacher_forcing_ratio=0)
        preds = output.argmax(2).t().cpu().numpy()
        for i, p in enumerate(preds):
            p_words = [sp_vocab.idx2word[idx] for idx in p if idx > 3]
            t_words = [sp_vocab.idx2word[idx] for idx in trg.t()[i].cpu().numpy() if idx > 3]
            raw_preds.append(p_words)
            raw_refs.append([t_words])

Training on database
Epoch 1 Loss: 4.3061
Epoch 2 Loss: 3.4025
Epoch 3 Loss: 3.0381
Epoch 4 Loss: 2.8106
Epoch 5 Loss: 2.6513
Epoch 6 Loss: 2.5317
Epoch 7 Loss: 2.4316
Epoch 8 Loss: 2.3497
Epoch 9 Loss: 2.2773
Epoch 10 Loss: 2.2160
Epoch 11 Loss: 2.1692
Epoch 12 Loss: 2.1264
Epoch 13 Loss: 2.0777
Epoch 14 Loss: 2.0359
Epoch 15 Loss: 2.0018
Epoch 16 Loss: 1.9703
Epoch 17 Loss: 1.9380
Epoch 18 Loss: 1.9110
Epoch 19 Loss: 1.8827
Epoch 20 Loss: 1.8612
Test evaluation.


In [7]:
def get_eval_metrics(model, test_loader, sp_vocab, device):
    model.eval()
    hypotheses, references = [], []
    raw_hyp, raw_ref = [], []       
    
    with torch.no_grad():
        for src, trg in test_loader:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, teacher_forcing_ratio=0)
            
            preds = output.argmax(2).t().cpu().numpy()
            targets = trg.t().cpu().numpy()

            for i in range(preds.shape[0]):
                p_tokens = []
                for idx in preds[i]:
                    if idx == 2: break 
                    if idx > 3: p_tokens.append(sp_vocab.idx2word[idx])
                
                t_tokens = [sp_vocab.idx2word[idx] for idx in targets[i] if idx > 3]

                if len(t_tokens) > 0:
                    final_p_tokens = p_tokens if len(p_tokens) > 0 else ["<empty>"]
                    
                    hypotheses.append(final_p_tokens)
                    references.append([t_tokens]) 
                    raw_hyp.append(" ".join(final_p_tokens))
                    raw_ref.append(" ".join(t_tokens))

    if not references:
        print("No valid reference sentences found.")
        return
    #since there was some empty lines within the rows, i needed to clear out the output
    chencherry = SmoothingFunction()
    avg_bleu = np.mean([sentence_bleu(ref, hyp, smoothing_function=chencherry.method1) 
                       for ref, hyp in zip(references, hypotheses)])
    
    avg_meteor = np.mean([meteor_score(ref, hyp) for ref, hyp in zip(references, hypotheses)])

    avg_bert = 0.0
    if bert_scorer:
        P, R, F1 = bert_scorer(raw_hyp, raw_ref, lang="es", verbose=False)
        avg_bert = F1.mean().item()

    print(f"BLEU Score: {avg_bleu:.4f}")
    print(f"METEOR Score: {avg_meteor:.4f}")
    if bert_scorer:
        print(f"BERTScore F1: {avg_bert:.4f}")
    
    return None

get_eval_metrics(model, test_loader, sp_vocab, device)

BLEU Score: 0.2328
METEOR Score: 0.5164
BERTScore F1: 0.8490
