Code source: PyTorch tutorial on Seq-to-Seq Networks (https://docs.pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html)

Imports and Preprocessing
===========

In [1]:
!pip install matplotlib
!pip install torch
!pip install torchvision
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install nltk
!pip install spacy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1

In [2]:
%matplotlib inline

In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/naalamleboye/Downloads/rnn_translation/.venv/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/naalamleboye/Downloads/rnn_translation/.venv/lib/pytho

In [4]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

In [6]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [7]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [8]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 11445 sentence pairs
Counting words...
Counted words:
fra 4601
eng 2991
['c est toi le prisonnier', 'you re the prisoner']


Encoder Setup
=============



In [9]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

Decoder Setup
=============


In [10]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden



Training Setup
==============


In [11]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

def get_dataloader(batch_size):
    input_lang, output_lang, pairs = prepareData('eng', 'fra', True)

    n = len(pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return input_lang, output_lang, train_dataloader

In [12]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [13]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [14]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

Plotting Setup
================

In [15]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

Evaluation Setup
================



In [None]:
# import sacrebleu
# from jiwer import wer

# def evaluate(encoder, decoder, test_dataset, input_lang, output_lang):
#     references = []
#     hypotheses = []
#     all_attentions = []

#     with torch.no_grad():
#         for data_entry in test_dataset:
#             sentence = data_entry['source_sentence']  # Adjust key as per your dataset (e.g., 'en' or source lang)
#             reference_sentence = data_entry['target_sentence']  # Adjust for reference text

#             # Prepare input tensor
#             input_tensor = tensorFromSentence(input_lang, sentence)

#             # Model forward pass
#             encoder_outputs, encoder_hidden = encoder(input_tensor)
#             decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

#             # Decode output ids
#             _, topi = decoder_outputs.topk(1)
#             decoded_ids = topi.squeeze()

#             decoded_words = []
#             for idx in decoded_ids:
#                 if idx.item() == EOS_token:
#                     decoded_words.append('<EOS>')
#                     break
#                 decoded_words.append(output_lang.index2word[idx.item()])

#             # Remove EOS for metric calculation
#             if '<EOS>' in decoded_words:
#                 decoded_words = decoded_words[:decoded_words.index('<EOS>')]

#             predicted_sentence = ' '.join(decoded_words)

#             # Append for corpus metrics
#             references.append(reference_sentence.strip())
#             hypotheses.append(predicted_sentence.strip())
#             all_attentions.append(decoder_attn)

#     # Calculate aggregate metrics
#     overall_wer = wer(references, hypotheses)
#     bleu = sacrebleu.corpus_bleu(hypotheses, [references])

#     return hypotheses, all_attentions, overall_wer, bleu.score


In [24]:
import sacrebleu
from jiwer import wer

def evaluate(encoder, decoder, test_input, input_lang, output_lang):
    references = []
    hypotheses = []
    all_attentions = []

    with torch.no_grad():
        # Check if test_input is a dataset (iterable) or single sentence (string)
        if isinstance(test_input, str):
            # Single sentence evaluation
            sentence = test_input
            reference_sentence = ""  # No reference for single sentence evaluation
            
            # Prepare input tensor
            input_tensor = tensorFromSentence(input_lang, sentence)

            # Model forward pass
            encoder_outputs, encoder_hidden = encoder(input_tensor)
            decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

            # Decode output ids
            _, topi = decoder_outputs.topk(1)
            decoded_ids = topi.squeeze()

            decoded_words = []
            for idx in decoded_ids:
                if idx.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                decoded_words.append(output_lang.index2word[idx.item()])

            # Remove EOS for final output
            if '<EOS>' in decoded_words:
                decoded_words = decoded_words[:decoded_words.index('<EOS>')]

            predicted_sentence = ' '.join(decoded_words)
            
            # For single sentence, return just the words and attention
            return decoded_words, decoder_attn
            
        else:
            # Dataset evaluation (original functionality)
            for data_entry in test_input:
                sentence = data_entry['source_sentence']  # Adjust key as per your dataset (e.g., 'en' or source lang)
                reference_sentence = data_entry['target_sentence']  # Adjust for reference text

                # Prepare input tensor
                input_tensor = tensorFromSentence(input_lang, sentence)

                # Model forward pass
                encoder_outputs, encoder_hidden = encoder(input_tensor)
                decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

                # Decode output ids
                _, topi = decoder_outputs.topk(1)
                decoded_ids = topi.squeeze()

                decoded_words = []
                for idx in decoded_ids:
                    if idx.item() == EOS_token:
                        decoded_words.append('<EOS>')
                        break
                    decoded_words.append(output_lang.index2word[idx.item()])

                # Remove EOS for metric calculation
                if '<EOS>' in decoded_words:
                    decoded_words = decoded_words[:decoded_words.index('<EOS>')]

                predicted_sentence = ' '.join(decoded_words)

                # Append for corpus metrics
                references.append(reference_sentence.strip())
                hypotheses.append(predicted_sentence.strip())
                all_attentions.append(decoder_attn)

            # Calculate aggregate metrics
            overall_wer = wer(references, hypotheses)
            bleu = sacrebleu.corpus_bleu(hypotheses, [references])

            return hypotheses, all_attentions, overall_wer, bleu.score

In [17]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

Training and Evaluating
=======================

In [18]:
hidden_size = 128
batch_size = 32

input_lang, output_lang, train_dataloader = get_dataloader(batch_size)

encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)

Reading lines...
Read 135842 sentence pairs
Trimmed to 11445 sentence pairs
Counting words...
Counted words:
fra 4601
eng 2991
3m 35s (- 53m 53s) (5 6%) 1.7020
12m 4s (- 84m 28s) (10 12%) 0.8874
15m 34s (- 67m 31s) (15 18%) 0.5785
18m 46s (- 56m 20s) (20 25%) 0.3974
21m 44s (- 47m 50s) (25 31%) 0.2819
25m 32s (- 42m 34s) (30 37%) 0.2064
30m 8s (- 38m 44s) (35 43%) 0.1545
33m 22s (- 33m 22s) (40 50%) 0.1196
37m 23s (- 29m 5s) (45 56%) 0.0965
40m 36s (- 24m 21s) (50 62%) 0.0782
44m 30s (- 20m 13s) (55 68%) 0.0666
47m 37s (- 15m 52s) (60 75%) 0.0579
51m 2s (- 11m 46s) (65 81%) 0.0515
53m 27s (- 7m 38s) (70 87%) 0.0473
56m 49s (- 3m 47s) (75 93%) 0.0433
60m 12s (- 0m 0s) (80 100%) 0.0405


In [25]:
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

> vous n etes pas aussi maligne que moi
= you re not as smart as me
< you re not as smart as me

> elle est incapable de faire face a la tension
= she is unable to cope with stress
< she is unable to cope with stress

> je n y vais pas
= i m not going
< i m not going to want to study

> j ai peur qu il echoue
= i am afraid he will fail
< i am afraid it s too late

> elle est toujours gentille avec tout le monde
= she is always kind to everyone
< she s always kind of hungry

> nous sommes enfin seuls
= we re finally alone
< we re about to your opinion

> elles sont toutes les deux amoureuses du meme type
= they re both in love with the same guy
< they re both in love with the same guy

> ils sont onereux
= they re expensive
< they re mad at you

> moi je n en ai pas fini avec vous
= i m not done with you
< i m not done with you

> je suis en train de dormir
= i m sleeping
< i m using that cup



In [20]:
# Install metrics packages
!pip install sacrebleu jiwer


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [21]:
import sacrebleu
from jiwer import wer

encoder.eval()
decoder.eval()

references = []
hypotheses = []

for input_sentence, reference_sentence in pairs:
    try:
        # Forward pass through encoder and decoder
        input_tensor = tensorFromSentence(input_lang, input_sentence)
        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)
        
        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                break
            decoded_words.append(output_lang.index2word[idx.item()])

        predicted_sentence = " ".join(decoded_words)
        references.append(reference_sentence.strip())
        hypotheses.append(predicted_sentence.strip())
    except Exception as e:
        print(f"Sentence skipped due to error: {input_sentence} -- {e}")

# Compute corpus BLEU
bleu = sacrebleu.corpus_bleu(hypotheses, [references])
print(f"Corpus BLEU score: {bleu.score:.2f}")

# Compute corpus WER
corpus_wer = wer(references, hypotheses)
print(f"Corpus Word Error Rate (WER): {corpus_wer:.3f}")




Corpus BLEU score: 47.42
Corpus Word Error Rate (WER): 0.520
