In [1]:
# Load IPython's autoreload extension so edited project modules are re-imported
# without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2
# Turn off Jedi-based completion in the IPython completer.
%config Completer.use_jedi = False

In [2]:
import warnings
from pathlib import Path

import torch
from torch import nn
import pandas as pd

from attention import models
from attention import utils
from attention.vectorizer import Vectorizer
from attention.constants import ENGLISH, FRENCH, SEQ_SIZE, DECODER_INPUT, ENCODER_INPUT, SOS_token


# Suppress every Python warning for the rest of the session to keep cell output compact.
warnings.filterwarnings('ignore')

# File locations, expressed relative to the notebook's working directory.
SOURCE_DIR = Path('../')
DATA_DIR = SOURCE_DIR.joinpath('data')
translation_fp = DATA_DIR.joinpath('eng-fra.txt')

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Data

In [3]:
# vectorizer code
from attention.data import load_sentences_dataframe, assign_rows_to_split, TranslationDataset, generate_batches
from attention.vectorizer import Vectorizer

# Only sentence pairs whose (lower-cased) English side begins with one of these
# prefixes are kept.
eng_prefixes = (
    "i am ",
    "i m ",
    "he is",
    "he s ",
    "she is",
    "she s ",
    "you are",
    "you re ",
    "we are",
    "we re ",
    "they are",
    "they re ",
)

# Load the sentence pairs and label each row with a train/valid/test split
# before any filtering takes place.
df = assign_rows_to_split(
    load_sentences_dataframe(translation_fp),
    train_ratio=0.9,
    valid_ratio=0.05,
    test_ratio=0.05,
)

# Lower-case the English column so the prefix match is case-insensitive,
# then keep only the rows that start with one of the prefixes.
df[ENGLISH] = df[ENGLISH].str.lower()
df = df.loc[lambda frame: frame[ENGLISH].str.startswith(eng_prefixes)]

# Wrap the filtered frame in the project's dataset class.
dataset = TranslationDataset.from_dataframe(df)

# Model Construction

In [14]:
# Model params
from attention.models import EncoderRNN, DecoderRNN
from attention.embeddings import create_spacy_embeddings

# Model hyper-parameters.
hidden_size = 256
embedding_size = 300  # only used when embeddings are learned from scratch
english_vocab_size = len(dataset.vectorizer.english_vocab)
french_vocab_size = len(dataset.vectorizer.french_vocab)

# Switch between spaCy-derived embedding matrices and embeddings learned from scratch.
use_pretrained_embeddings = False

if use_pretrained_embeddings:
    # Build the spaCy embedding matrices only on the branch that consumes them,
    # so the default (non-pretrained) path does not pay for them.
    english_embedding_matrix, french_embedding_matrix = create_spacy_embeddings(dataset.vectorizer)
    encoder = EncoderRNN(english_vocab_size, hidden_size, embedding_matrix=english_embedding_matrix)
    decoder = DecoderRNN(hidden_size, french_vocab_size, embedding_matrix=french_embedding_matrix)
else:
    encoder = EncoderRNN(english_vocab_size, hidden_size, embedding_size)
    decoder = DecoderRNN(hidden_size, french_vocab_size, embedding_size)

# Place both models on the selected device before any optimizer is created from
# their parameters. `nn.Module.to` returns the module itself, so this is harmless
# if the training helper also moves them.
# NOTE(review): assumes EncoderRNN/DecoderRNN are nn.Module subclasses — confirm.
encoder = encoder.to(device)
decoder = decoder.to(device)

# Train

In [17]:
from attention.train import train_simpleRNN_batch

# Training hyper-parameters.
nb_epochs = 15
learning_rate = 0.001
batch_size = 32

# The targets handed to the criterion come from batch[DECODER_INPUT], i.e. the
# decoder (French) side, so the padding index to ignore must be looked up in the
# French vocabulary rather than the English one.
# NOTE(review): assumes french_vocab exposes the same lookup_token/pad API as
# english_vocab — confirm.
ignore_index = dataset.vectorizer.french_vocab.lookup_token(dataset.vectorizer.french_vocab.pad)
criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)

# One Adam optimizer per network, sharing the same learning rate.
encoder_optim = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optim = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

# Run the epochs, reporting the mean per-batch loss after each one.
for epoch in range(nb_epochs):
    epoch_losses = []
    for batch in generate_batches(dataset, batch_size=batch_size):
        input_batch, target_batch = batch[ENCODER_INPUT], batch[DECODER_INPUT]
        loss = train_simpleRNN_batch(
            input_batch,
            target_batch,
            encoder,
            decoder,
            encoder_optim,
            decoder_optim,
            criterion,
            device,
            use_teacher_forcing=True
        )
        epoch_losses.append(loss)
    avg_loss = sum(epoch_losses) / len(epoch_losses)
    print(f'loss at {epoch+1} epoch is {avg_loss:.2f}')

loss at 1 epoch is 3.35
loss at 2 epoch is 2.94
loss at 3 epoch is 2.49
loss at 4 epoch is 2.60
loss at 5 epoch is 2.74
loss at 6 epoch is 2.30
loss at 7 epoch is 2.49
loss at 8 epoch is 2.43
loss at 9 epoch is 2.27
loss at 10 epoch is 2.43
loss at 11 epoch is 2.22
loss at 12 epoch is 2.40
loss at 13 epoch is 2.07
loss at 14 epoch is 2.27
loss at 15 epoch is 2.25


# Inference (Translation)

In [18]:
from attention.utils import translate_simple_rnn

# English sample sentences used to eyeball translation quality.
sentences = [
    "i am only warming up now.",
    "you are both in the wrong.",
    "he is said to have died",
    "i am bored out of my mind.",
    "i am going to stay here for a couple of days.",
    "they are out shopping.",
    "i am afraid he will make a mistake.",
    "we are worried about you.",
    "he likes to go to work",
    "he is not at all foolish",
]

# Translate each sample with the trained encoder/decoder and print "source => translation".
for sent in sentences:
    translation = translate_simple_rnn(
        sent,
        encoder,
        decoder,
        dataset.vectorizer,
        device,
    )
    print(f'{sent} => {translation}')

i am only warming up now. => pour l instant je m entraine seulement .
you are both in the wrong. => vous etes en train de s .
he is said to have died => on le dit qu il n est
i am bored out of my mind. => je suis en train de ma parole .
i am going to stay here for a couple of days. => je suis etonnee que vous que vous etes la
they are out shopping. => elles sont sorties faire les enfants
i am afraid he will make a mistake. => je crains qu il commette commette une erreur .
we are worried about you. => ce n est pas infirmiere mais docteur
he likes to go to work => je suis interesse par ton audace .
he is not at all foolish => il n est pas vraiment en difficulte


In [30]:
# Spot-check the trained models on a single short sentence; the returned translation is the cell output.
translate_simple_rnn('i am interested.', encoder, decoder, dataset.vectorizer, device)

'je suis interesse par l'

In [31]:
# Display the prefix-filtered sentence dataframe (english, french and split columns).
df

Unnamed: 0,english,french,split
40263,they are great friends.,Elles sont de grandes amies.,train
68831,he is our teacher of english.,Il est notre professeur d'anglais.,train
41358,you are hearing things.,Vous entendez des choses.,train
5278,she is french.,Elle est française.,train
46335,you are blinded by love.,Vous êtes aveuglé par l'amour.,train
...,...,...,...
62648,you are taller than she is.,Tu es plus grand qu'elle.,test
23459,i am like my mother.,Je suis comme ma mère.,test
107563,i am much obliged to you for your help.,J'apprécie beaucoup ton aide.,test
29833,she is angry with me.,Elle est en colère après moi.,test


In [32]:
# Number of sentence pairs left after the prefix filter.
len(df)

3203