# Reference: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html#exercises

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random
import os
import time

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

import gc
from tqdm import tqdm
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device('cpu')
device

device(type='cuda')

## Load Data

In [2]:
start = time.time()

if os.path.isfile('./data/valid_subset.csv'):
    # cleaned, filtered by length, 10% dataset
    df = pd.read_csv('./data/valid_subset.csv', index_col=False)
    
elif os.path.isfile('./data/valid_cleaned_data.csv'):
    # cleaned, filtered by length dataset
    df = pd.read_csv('./data/valid_cleaned_data.csv', index_col=False)
    
elif os.path.isfile('./data/cleaned_data.csv'):
    # cleaned dataset
    df = pd.read_csv('./data/cleaned_data.csv', index_col=False)
else:

    df = pd.read_csv('./data/en-fr.csv')

end = time.time()
display(end - start)

df.head()

0.4302253723144531

Unnamed: 0,en,fr
0,another easily recognizable form of ar technol...,une autre forme connue de ra est l ecran de vi...
1,however since sao paulo is a big and scattered...,cependant comme sao paulo est une grande ville...
2,this provision provides among other things tha...,cette provision prevoit notamment qu une deduc...
3,where would you expect to find a document that...,ou crois tu que tu trouveras un document qui t...
4,at the request of agency staff additional comm...,a la demande du personnel de l office royal a ...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146993 entries, 0 to 146992
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   en      146993 non-null  object
 1   fr      146993 non-null  object
dtypes: object(2)
memory usage: 2.2+ MB


## Clean the data

In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

In [5]:
# Clean data only if not available    

start = time.time()
df.dropna(inplace=True)

if not os.path.isfile('./data/cleaned_data.csv'):
    df['en'] = df['en'].apply(lambda x: normalizeString(str(x)))
    df['fr'] = df['fr'].apply(lambda x: normalizeString(str(x)))
    df.to_csv('./data/cleaned_data.csv', index=False)
    
end = time.time()
display(end-start)

0.02099299430847168

In [6]:
### Filter dataset by length
MAX_LENGTH = 35

if not os.path.isfile('./data/valid_cleaned_data.csv'):
    df['en_len'] = df['en'].apply(lambda sent: len(sent.split(" ")))
    df['fr_len'] = df['fr'].apply(lambda sent: len(sent.split(" ")))

    df = df[df['en_len'] < MAX_LENGTH]
    df = df[df['fr_len'] < MAX_LENGTH]
    
    df = df[['en', 'fr']]
    
    df.to_csv('./data/valid_cleaned_data.csv', index=False)


df

Unnamed: 0,en,fr
0,another easily recognizable form of ar technol...,une autre forme connue de ra est l ecran de vi...
1,however since sao paulo is a big and scattered...,cependant comme sao paulo est une grande ville...
2,this provision provides among other things tha...,cette provision prevoit notamment qu une deduc...
3,where would you expect to find a document that...,ou crois tu que tu trouveras un document qui t...
4,at the request of agency staff additional comm...,a la demande du personnel de l office royal a ...
...,...,...
146988,it would be most beneficial and effective for ...,il serait tres avantageux et efficace pour le ...
146989,cost reductions particular importance shall be...,reductions des couts une importance particulie...
146990,this inconsistent approach to marking had sign...,cette methode de cotation non uniforme a eu de...
146991,testimony of lgen gervais transcripts vol,temoignage du lgne gervais transcriptions vol


In [7]:
### Prepare only subset of data
frac = 0.1

if not os.path.isfile('./data/valid_subset.csv'):
    df_subset = df.sample(frac=frac)
    df_subset.to_csv('./data/valid_subset.csv', index=False)
else:
    df_subset = df

df_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146993 entries, 0 to 146992
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   en      146993 non-null  object
 1   fr      146993 non-null  object
dtypes: object(2)
memory usage: 2.2+ MB


In [8]:
df_subset[df_subset.isna().any(axis=1)]

Unnamed: 0,en,fr


## Create helpers to construct vocabulary

In [9]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [10]:
len(df_subset)

146993

## Preparing Data for training

In [11]:
gc.collect()
#df_small = df_subset.sample(frac=0.1)


def prepareData(df):
    en_lang = Lang('en')
    fr_lang = Lang('fr')
    
    en_vocab = []
    fr_vocab = []
    
    for index, row in tqdm(df.iterrows(), total=df.shape[0], position=0, leave=True):
        en_sent = row['en']
        fr_sent = row['fr']
        
        en_vocab += en_sent.split(" ")
        fr_vocab += fr_sent.split(" ")
        
    
    # Construct word2index and index2word dicts for the two languages
    en_vocab = set(en_vocab)
    fr_vocab = set(fr_vocab)
    
    en_word2index = dict([(word, i+2) for i, word in enumerate(en_vocab)])
    fr_word2index = dict([(word, i+2) for i, word in enumerate(fr_vocab)])
    
    en_index2word = {v: k for k, v in en_word2index.items()}
    fr_index2word = {v: k for k, v in fr_word2index.items()}
    
    en_lang.word2index = en_word2index
    fr_lang.word2index = fr_word2index
    
    en_lang.index2word.update(en_index2word)
    fr_lang.index2word.update(fr_index2word)
    
    en_lang.n_words = len(en_lang.index2word.keys())
    fr_lang.n_words = len(fr_lang.index2word.keys())
    
    return en_lang, fr_lang
        

start = time.time()
en_lang, fr_lang = prepareData(df_subset)
end = time.time() 
display(end - start)



100%|██████████| 146993/146993 [00:05<00:00, 26945.28it/s]


6.003048419952393

In [12]:
fr_lang.word2index

{'attarder': 2,
 'humble': 3,
 'cpmite': 4,
 'surevaluees': 5,
 'prefere': 6,
 'photochimie': 7,
 'aylward': 8,
 'parus': 9,
 'ali': 10,
 'netcom': 11,
 'deniers': 12,
 'souterrains': 13,
 'sugars': 14,
 'ecoulees': 15,
 'padano': 16,
 'strobiles': 17,
 'strategijas': 18,
 'paysagere': 19,
 'valorisees': 20,
 'prevedendo': 21,
 'ipa': 22,
 'cpa': 23,
 'buy': 24,
 'maugerville': 25,
 'hematological': 26,
 'preventable': 27,
 'conjugales': 28,
 'proud': 29,
 'zielarskie': 30,
 'biotechnologie': 31,
 'larepe': 32,
 'rhs': 33,
 'verifiez': 34,
 'delgado': 35,
 'macronutriments': 36,
 'carrol': 37,
 'neoformees': 38,
 'batteries': 39,
 'lakefield': 40,
 'emil': 41,
 'costache': 42,
 'eternelle': 43,
 'aministration': 44,
 'intramoleculaires': 45,
 'tomodensitometrie': 46,
 'deflates': 47,
 'penh': 48,
 'imprimeur': 49,
 'kw': 50,
 'lanieres': 51,
 'dubia': 52,
 'terribles': 53,
 'incitait': 54,
 'litterales': 55,
 'nettoyeurs': 56,
 'fournissaient': 57,
 'pieges': 58,
 'pathologiqe': 59,
 '

In [13]:
df_train = df_subset.sample(frac=0.75)

df_test = pd.concat([df_subset, df_train])
df_test.drop_duplicates(keep=False, inplace=True)
df_train.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)

df_test

Unnamed: 0,en,fr
0,however since sao paulo is a big and scattered...,cependant comme sao paulo est une grande ville...
1,at the request of agency staff additional comm...,a la demande du personnel de l office royal a ...
2,i firmly believe firefighters police paramedic...,je crois fermement que les pompiers policiers ...
3,a substantial amount of effort is devoted to m...,des efforts considerables sont consacres a l e...
4,link between apf and profitability strongly en...,l attention portee au csa pourrait se faire au...
...,...,...
36708,date and place of next meeting monday th may f...,date et lieu de la prochaine reunion lundi mai...
36709,number of poor children on the rise,le nombre d enfants pauvres est a la hausse
36710,wildlife must move about to find food water an...,les especes sauvages doivent se deplacer a la ...
36711,he could not handle the pressure brought on by...,il n a pas resiste a la pression qu a apportee...


In [14]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def get_dataloader(batch_size, en_lang, fr_lang, df):
    
    n = len(df)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    
    
    for idx, row in tqdm(df.iterrows(), total=n, position=0, leave=True):
        en_sent = row[en_lang.name]
        fr_sent = row[fr_lang.name]
        
        en_ids = indexesFromSentence(en_lang, en_sent)
        fr_ids = indexesFromSentence(fr_lang, fr_sent)
                
        en_ids.append(EOS_token)
        fr_ids.append(EOS_token)
        
        input_ids[idx, :len(en_ids)] = en_ids
        target_ids[idx, :len(fr_ids)] = fr_ids
        
    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) #, num_workers=8)
    return train_dataloader

In [15]:
start = time.time()
train_loader = get_dataloader(8, en_lang, fr_lang, df_train)
end = time.time()

display(end - start)

next(iter(train_loader))

100%|██████████| 110245/110245 [00:05<00:00, 19092.33it/s]


5.945578336715698

[tensor([[64008, 41462, 10746, 64008,   565, 35914, 45824, 35297,     1,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0],
         [45125, 43209, 45672, 27450, 30756, 44908, 10492, 65116, 44285, 23784,
          30756, 38577, 49772, 30625, 29549, 45015, 39804,  4560,     1,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0],
         [62836, 53086, 22024, 26417, 10492, 10907, 35914,  2652, 65527, 37091,
          43512, 49772, 24968,  9984, 53190, 23376, 19564, 22853, 19947, 26417,
           6713, 50112, 49772, 22853, 19947, 26417, 61623, 50112,     1,     0,
              0,     0,     0,     0,     0],
         [41688, 37468, 12684, 32010, 11380, 43209, 55291, 23784, 30756, 12298,
           3561, 51646, 27195, 66439, 54223, 35914, 38192, 356

## Building LSTM Model

In [16]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.LSTM = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.LSTM(embedded)
        return output, hidden

In [17]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.LSTM = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.LSTM(output, hidden)
        output = self.out(output)
        return output, hidden

## Training the model

In [18]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):
    
    encoder.train()
    decoder.train()

    total_loss = 0
    batch_bar   = tqdm(total=len(dataloader), dynamic_ncols=True, leave=True, position=0, desc='Train')
    
    for i, data in enumerate(dataloader):
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()
        
        batch_bar.set_postfix(train_loss="{:.04f}".format(float(total_loss / (i + 1))))
        batch_bar.update()

    return total_loss / len(dataloader)

In [19]:
def test_epoch(dataloader, encoder, decoder, criterion):
    encoder.eval()
    decoder.eval()
    total_loss = 0
    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True, leave=True, position=0, desc='Test')
    
    for i, data in enumerate(dataloader):
        with torch.no_grad():
            input_tensor, target_tensor = data

            encoder_outputs, encoder_hidden = encoder(input_tensor)
            decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

            loss = criterion(
                decoder_outputs.view(-1, decoder_outputs.size(-1)),
                target_tensor.view(-1)
            )

        total_loss += loss.item()
        
        batch_bar.set_postfix(test_loss="{:.04f}".format(float(total_loss / (i + 1))))
        batch_bar.update()

    return total_loss / len(dataloader)

In [20]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [50]:
def train(train_dataloader, test_dataloader, encoder, decoder, n_epochs,
          encoder_optimizer, decoder_optimizer, encoder_scheduler, decoder_scheduler,
          criterion):
    
    start = time.time()
    train_losses = []
    test_losses = []


    for epoch in range(1, n_epochs + 1):
        print(f"Epoch {epoch} / {n_epochs}")
        
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        test_loss = test_epoch(test_dataloader, encoder, decoder, criterion)
        
        train_losses.append(loss)
        test_losses.append(test_loss)
        print(f"encoder lr = {encoder_scheduler.get_lr()}, decoder lr = {decoder_scheduler.get_lr()}")
        

        encoder_scheduler.step()
        decoder_scheduler.step()

        if epoch % 5 == 0:
            print('%s (%d %d%%)' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100))

    showPlot(train_losses)
    showPlot(test_losses)
    
    return train_losses, test_losses

In [22]:
device

device(type='cuda')

In [23]:
batch_size = 32

train_loader = get_dataloader(batch_size, en_lang, fr_lang, df_train)
test_loader = get_dataloader(batch_size, en_lang, fr_lang, df_test)




100%|██████████| 110245/110245 [00:05<00:00, 19157.46it/s]
100%|██████████| 36713/36713 [00:01<00:00, 19223.77it/s]


In [32]:
hidden_size = 128
learning_rate = 0.01

gc.collect()
torch.cuda.empty_cache()

encoder = EncoderRNN(en_lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, fr_lang.n_words).to(device)

encoder_optimizer = optim.AdamW(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.AdamW(decoder.parameters(), lr=learning_rate)

encoder_scheduler = optim.lr_scheduler.StepLR(encoder_optimizer, step_size=2, gamma=0.5)
decoder_scheduler = optim.lr_scheduler.StepLR(decoder_optimizer, step_size=2, gamma=0.5)

criterion = nn.NLLLoss()




In [33]:
epochs = 5

train_losses, test_losses = train(train_loader, test_loader, encoder, decoder, epochs,
                                 encoder_optimizer, decoder_optimizer,
                                 encoder_scheduler, decoder_scheduler,
                                 criterion)

Epoch 1 / 5


Train: 100%|██████████| 3446/3446 [11:17<00:00,  5.09it/s, train_loss=3.4034]
Test: 100%|██████████| 1148/1148 [01:06<00:00, 17.24it/s, test_loss=3.1413]


Epoch 2 / 5


Train: 100%|██████████| 3446/3446 [11:18<00:00,  5.08it/s, train_loss=2.9427]
Test: 100%|██████████| 1148/1148 [01:06<00:00, 17.22it/s, test_loss=3.0235]


Epoch 3 / 5


Train: 100%|██████████| 3446/3446 [11:18<00:00,  5.08it/s, train_loss=2.6794]
Test: 100%|██████████| 1148/1148 [01:06<00:00, 17.19it/s, test_loss=2.9686]


Epoch 4 / 5


Train: 100%|██████████| 3446/3446 [11:18<00:00,  5.08it/s, train_loss=2.5416]
Test: 100%|██████████| 1148/1148 [01:06<00:00, 17.22it/s, test_loss=2.9654]


Epoch 5 / 5


Train: 100%|██████████| 3446/3446 [11:18<00:00,  5.08it/s, train_loss=2.3838]
Test: 100%|██████████| 1148/1148 [01:06<00:00, 17.20it/s, test_loss=2.9590]

62m 4s (- 0m 0s) (5 100%)





In [43]:
test_loss = sum(test_losses) / len(test_losses)

torch.save({
            'epoch': epochs,
            'encoder_state_dict': encoder.state_dict(),
            'decoder_state_dict': decoder.state_dict(),
            'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
            'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
            'criterion': criterion
            }, f'./checkpoints/checkpoint_testloss-{test_loss:.4f}')

In [35]:
import datetime


x = datetime.datetime.now()
print(f"Finished at {x}")

Finished at 2023-11-15 12:10:21.009473


## Evaluate the model

In [36]:
def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

In [45]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, _ = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words

In [48]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        print(f"Testing {i+1} / {n}")
        pair = df_test.sample(1).values.tolist()[0]
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0], en_lang, fr_lang)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [49]:
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

Testing 1 / 10
> corporate income tax rate deductible percentage of existing resource allowance deductible percentage of royalties and mining taxes new tax credit for mineral exploration
= taux d imposition des benefices des societes pourcentage deductible de la deduction relative a des ressources actuelle de pourcentage deductible des redevances et des impots miniers nouveau credit d impot pour exploration miniere
< agence de sante canada et les services gouvernementaux canada et les premieres nations et les canadiennes en matiere de sante et de la sante <EOS>

Testing 2 / 10
> plants infected at the primary leaf stage display typical damping off disease symptoms
= les pieds infectes au stade de la premiere feuille presentent habituellement les symptomes typiques de la fonte des semis
< on a egalement fait l objet d une augmentation de la croissance des emissions de ges <EOS>

Testing 3 / 10
> this type of product is already emerging on the market
= en fait ce genre de produit fait de