In [1]:
# Load IPython's autoreload extension and use mode 2, which re-imports all
# modules before each cell executes, so edits to the imported project code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Turn off Jedi for IPython's tab completer.
%config Completer.use_jedi = False

In [2]:
import warnings
from pathlib import Path

import torch
from torch import nn
import pandas as pd

from attention import models
from attention import utils
from attention.vectorizer import Vectorizer
from attention.constants import ENGLISH, FRENCH, SEQ_SIZE, DECODER_INPUT, ENCODER_INPUT, SOS_token


# Silence every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Filesystem layout: the data directory sits one level above the notebook's
# working directory, and the translation pairs live in a single text file.
SOURCE_DIR = Path('../')
DATA_DIR = SOURCE_DIR.joinpath('data')
translation_fp = DATA_DIR.joinpath('eng-fra.txt')

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Data

In [3]:
# Load the sentence pairs, assign splits, filter by English prefix, and build the dataset
from attention.data import load_sentences_dataframe, assign_rows_to_split, TranslationDataset, generate_batches
from attention.vectorizer import Vectorizer

# Read the sentence pairs from disk and tag each row with a split, using the
# train/valid/test ratios given below.
df = load_sentences_dataframe(translation_fp)
df = assign_rows_to_split(df, train_ratio=0.9, valid_ratio=0.05, test_ratio=0.05)

# Only sentences that open with one of these prefixes are kept. The spaced
# variants ("i m ", "he s ", ...) are contractions with the apostrophe replaced
# by a space.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)
df[ENGLISH] = df[ENGLISH].str.lower()

# The column is only lower-cased at this point, so a raw "i'm ..." would never
# start with "i m ". Match against a copy whose apostrophes (straight or curly)
# are turned into spaces; the text stored in `df` is left unchanged.
# NOTE(review): assumes load_sentences_dataframe does not already strip
# apostrophes -- if it does, this substitution is a harmless no-op.
prefix_key = df[ENGLISH].str.replace(r"['’]", " ", regex=True)
df = df[prefix_key.str.startswith(eng_prefixes)]

dataset = TranslationDataset.from_dataframe(df)

# Model Construction

In [4]:
# Model params
from attention.models import EncoderRNN, DecoderRNN
from attention.embeddings import create_spacy_embeddings

# Model dimensions. `embedding_size` is only passed to the models when the
# embeddings are not initialised from a pretrained matrix.
hidden_size = 256
embedding_size = 300
english_vocab_size = len(dataset.vectorizer.english_vocab)
french_vocab_size = len(dataset.vectorizer.french_vocab)

# Toggle: initialise the embedding layers from spaCy-derived matrices?
use_pretrained_embeddings = False

if use_pretrained_embeddings:
    # Build the embedding matrices only when they are actually consumed; the
    # other branch never reads them.
    english_embedding_matrix, french_embedding_matrix = create_spacy_embeddings(dataset.vectorizer)
    encoder = EncoderRNN(english_vocab_size, hidden_size, embedding_matrix=english_embedding_matrix)
    decoder = DecoderRNN(hidden_size, french_vocab_size, embedding_matrix=french_embedding_matrix)
else:
    # Keep both names bound so later cells can still reference them.
    english_embedding_matrix = french_embedding_matrix = None
    encoder = EncoderRNN(english_vocab_size, hidden_size, embedding_size)
    decoder = DecoderRNN(hidden_size, french_vocab_size, embedding_size)

# Train

In [7]:
from attention.train import train_simpleRNN_batch

# Training hyper-parameters.
nb_epochs = 15
learning_rate = 0.001
batch_size = 32

# The decoder is built with `french_vocab_size` outputs, so the class indices
# seen by the loss are French-vocabulary indices. The padding index to ignore
# must therefore come from the French vocabulary; the English one only works
# if both vocabularies happen to assign the same index to the pad token.
# NOTE(review): assumes french_vocab exposes the same `pad` / `lookup_token`
# API as english_vocab -- confirm against the vocabulary class.
target_vocab = dataset.vectorizer.french_vocab
ignore_index = target_vocab.lookup_token(target_vocab.pad)
criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)

# One Adam optimizer per network, both using the same learning rate.
encoder_optim = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optim = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

# Run the epochs: train on every batch, then report the mean batch loss.
for epoch in range(nb_epochs):
    epoch_losses = []
    for batch in generate_batches(dataset, batch_size=batch_size):
        input_batch, target_batch = batch[ENCODER_INPUT], batch[DECODER_INPUT]
        loss = train_simpleRNN_batch(
            input_batch,
            target_batch,
            encoder,
            decoder,
            encoder_optim,
            decoder_optim,
            criterion,
            device
        )
        epoch_losses.append(loss)
    avg_loss = sum(epoch_losses) / len(epoch_losses)
    print(f'loss at {epoch+1} epoch is {avg_loss:.2f}')

loss at 1 epoch is 1.08
loss at 2 epoch is 0.86
loss at 3 epoch is 0.86
loss at 4 epoch is 0.79
loss at 5 epoch is 0.84
loss at 6 epoch is 0.80
loss at 7 epoch is 0.79
loss at 8 epoch is 0.83
loss at 9 epoch is 0.75
loss at 10 epoch is 0.75
loss at 11 epoch is 0.79
loss at 12 epoch is 0.76
loss at 13 epoch is 0.75
loss at 14 epoch is 0.76
loss at 15 epoch is 0.72


# Inference (Translation)

In [16]:
from attention.utils import translate_simple_rnn

# English sentences to run through the trained encoder/decoder pair.
sentences = [
    "i am only warming up now.",
    "you are both in the wrong.",
    "he is said to have died",
    "i am bored out of my mind.",
    "i am going to stay here for a couple of days.",
    "they are out shopping.",
    "i am afraid he will make a mistake.",
    "we are worried about you.",
    "he likes to go to work",
    "he is not at all foolish",
]

# Translate one sentence at a time and print it next to the model's output.
for sent in sentences:
    translation = translate_simple_rnn(
        sent,
        encoder,
        decoder,
        dataset.vectorizer,
        device,
    )
    print(sent, '=>', translation)

i am only warming up now. => pour l instant je sejourne a un poisson
you are both in the wrong. => il est dans le pays entier .
he is said to have died => on dit qu il est mon parent
i am bored out of my mind. => je suis a present a l heroine .
i am going to stay here for a couple of days. => je suis desolee est ce que je ne le
they are out shopping. => elles sont sorties faire des deux
i am afraid he will make a mistake. => j ai peur de ce que j ai des
we are worried about you. => nous sommes inquiets pour leur securite .
he likes to go to work => il travaille dans la recherche sur le
he is not at all foolish => il n est pas du tout un


In [13]:
# Show five randomly drawn rows of the filtered frame (unseeded, so the rows
# differ from run to run).
df.sample(n=5)

Unnamed: 0,english,french,split
32577,he is sure to succeed.,Il est sûr de son succès.,train
53211,i am bored out of my mind.,Je m'ennuie tellement.,train
47651,he is not at all foolish.,Il n'est vraiment pas fou.,valid
100980,we are going to climb that mountain.,Nous allons gravir cette montagne.,train
51149,we are worried about you.,Nous sommes inquiets à votre sujet.,train
