In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
from pathlib import Path

import torch
import pandas as pd

from attention import models
from attention import utils
from attention.vectorizer import Vectorizer
from attention.constants import ENGLISH, FRENCH


# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# All paths are expressed relative to the parent of the working directory.
SOURCE_DIR = Path('../')
DATA_DIR = SOURCE_DIR.joinpath('data')
translation_fp = DATA_DIR.joinpath('eng-fra.txt')

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Data & Vectorizer

In [3]:
# Read the translation file into a dataframe, assign its rows to the
# train/valid/test splits, and wrap the result in a TranslationDataset.
from attention.data import load_sentences_dataframe, assign_rows_to_split, TranslationDataset, generate_batches
from attention.vectorizer import Vectorizer

df = load_sentences_dataframe(translation_fp)
df = assign_rows_to_split(
    df,
    train_ratio=0.3,
    valid_ratio=0.05,
    test_ratio=0.65,
)

dataset = TranslationDataset.from_dataframe(df)

# Embeddings

In [4]:
import fasttext.util

# Fetch the pretrained English and French fastText models.
# NOTE(review): `if_exists='ignore'` presumably skips the download when the
# file is already present -- confirm against the fasttext documentation.
fasttext.util.download_model('en', if_exists='ignore')
fasttext.util.download_model('fr', if_exists='ignore')

# Load both models; the file names are relative, so they are resolved against
# the current working directory.
ft_english = fasttext.load_model('cc.en.300.bin')
ft_french = fasttext.load_model('cc.fr.300.bin')

# Build one embedding matrix per language through the dataset's vectorizer.
# These matrices are handed to the encoder (English) and the decoder (French)
# in the model-construction cell.
vectorizer = dataset.vectorizer
english_embedding_matrix = vectorizer.build_embedding_matrix_from_fasttext(ft_english, lang=ENGLISH)
french_embedding_matrix = vectorizer.build_embedding_matrix_from_fasttext(ft_french, lang=FRENCH)



# Model Construction

In [5]:
import torch
from attention.models import Encoder, DecoderRNN, TranslationModel
from attention.constants import SEQ_SIZE, SOS_token
from attention.utils import normalize_string

# Sizes shared by the components below: the hidden-state width, and the number
# of French tokens the decoder can emit.
hidden_size = 100
output_vocab_size = len(vectorizer.french_vocab)

# Encoder reads English embeddings; decoder works over French embeddings.
encoder = Encoder(english_embedding_matrix, hidden_size)
decoder = DecoderRNN(
    french_embedding_matrix,
    hidden_size,
    output_vocab_size,
)

# Combine both halves into the full translation model.
model = TranslationModel(encoder, decoder, output_vocab_size)

# Training

In [6]:
import torch
from attention.train import Translation_Trainer
from attention.utils import handle_dirs
from attention.constants import PAD_token

# Adam over every parameter of the translation model.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

def loss_func(decoder_outputs, decoder_input):
    """Average per-sequence cross-entropy between predictions and French targets.

    Each sequence in the batch is scored on its own with ``CrossEntropyLoss``
    (mean over its non-padding tokens), and the per-sequence losses are then
    averaged over the batch.

    Args:
        decoder_outputs: iterable of per-sequence score tensors; each item is
            assumed to be (seq_len, french_vocab_size) -- TODO confirm.
        decoder_input: iterable of per-sequence target index tensors; each item
            is assumed to be (seq_len,) of French token indices -- TODO confirm.

    Returns:
        The mean of the per-sequence losses.

    Raises:
        ZeroDivisionError: if the batch contains no sequences.
    """
    # The targets are French token indices, so padding must be identified with
    # the French vocabulary's PAD index (not the English one).
    pad_index = vectorizer.french_vocab.lookup_token(PAD_token)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_index)
    sequence_losses = [
        criterion(predicted_seq, actual_seq)
        for predicted_seq, actual_seq in zip(decoder_outputs, decoder_input)
    ]
    return sum(sequence_losses) / len(sequence_losses)
        
# Directory handed to the trainer for this run (presumably where checkpoints
# are written, since `checkpoint=True` is passed below -- confirm in trainer).
model_dir = SOURCE_DIR.joinpath('models', 'simple_rnn')
handle_dirs(model_dir)

trainer = Translation_Trainer(
    generate_batches,
    optimizer,
    model,
    model_dir,
    loss_func,
    device,
)

nb_epochs = 20
trainer.run(
    nb_epochs,
    dataset,
    batch_size=32,
    checkpoint=True,
)

Completed Epoch 0 with average training loss of 9.86
Completed Epoch 0 with average validation loss of 9.90
Completed Epoch 1 with average training loss of 9.85
Completed Epoch 1 with average validation loss of 9.90
Completed Epoch 2 with average training loss of 9.85
Completed Epoch 2 with average validation loss of 9.90
Completed Epoch 3 with average training loss of 9.85
Completed Epoch 3 with average validation loss of 9.90
Completed Epoch 4 with average training loss of 9.85
Completed Epoch 4 with average validation loss of 9.90
Completed Epoch 5 with average training loss of 9.85
Completed Epoch 5 with average validation loss of 9.90


KeyboardInterrupt: 

# Evaluation

In [7]:
from attention.utils import translate

# Translate a single English sentence with the current model and display it.
# NOTE(review): the recorded output is almost entirely `<eos>` tokens; the
# training run above was interrupted with a flat loss, so this is not a
# meaningful translation.
english_sent = "I went to the hospital"
res = translate(english_sent, model, vectorizer)
res

'<eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> cinematographique'

In [8]:
# Spot-check the French vocabulary: show the token stored at index 80.
french_vocab = vectorizer.french_vocab
french_vocab.lookup_index(80)

'moment'

In [9]:
# Number of entries in the French vocabulary (used as the decoder output size).
len(vectorizer.french_vocab)

21336

# Scratchpad

In [None]:
# Smoke test: run the model on two hand-written English sentences and show the
# shape of the prediction tensor.
model.eval()

sentences = ['I worked on a farm', 'I went to the beach']
vectorized_sentences = [vectorizer.vectorize_sentence(sent, language=ENGLISH) for sent in sentences]
encoder_inputs = torch.stack(vectorized_sentences)

french_vocab = vectorizer.french_vocab
# Index of the start-of-sequence token in the French vocabulary.
sos_token = french_vocab.lookup_token(french_vocab.sos)
# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    prediction = model(encoder_inputs, sos_token=sos_token)
prediction.shape

In [None]:
from attention.constants import VALID, ENCODER_INPUT, DECODER_INPUT

# Shape check on one validation batch: compare the prediction tensor's shape
# with the target tensor's shape.
french_vocab = vectorizer.french_vocab
sos_token = french_vocab.lookup_token(french_vocab.sos)

# NOTE(review): this switches the shared `dataset` object to the VALID split
# and does not switch it back.
dataset.set_split(VALID)
gen = generate_batches(dataset, batch_size=32)
model.eval()
for batch in gen:
    encoder_inputs, y_true = batch[ENCODER_INPUT], batch[DECODER_INPUT]
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        prediction = model(encoder_inputs, sos_token=sos_token)
    print(prediction.shape, y_true.shape)
    # Only the first batch is inspected.
    break

In [None]:
from torch import nn

from attention.models import Encoder, DecoderRNN

# Scratch walk-through of the encoder/decoder pipeline without TranslationModel:
# encode two sentences, then decode greedily one token at a time.
# NOTE(review): this cell rebinds the notebook-level names `encoder`, `decoder`
# and `batch_size`; `model` was built from the earlier encoder/decoder objects.
batch_size = 2  # equals len(sentences) below
# presumably the padded length produced by vectorize_sentence -- TODO confirm
# (attention.constants also exposes SEQ_SIZE)
seq_size = 10

encoder = Encoder(english_embedding_matrix, hidden_size)
# NOTE(review): the initial hidden state is created on `device`, but neither
# `encoder` nor `encoder_inputs` is moved there in this cell -- this may raise
# a device-mismatch error when CUDA is available; confirm.
init_hidden = encoder.init_hidden(batch_size=batch_size, device=device)
sentences = ['I worked on a farm', 'I went to the beach']
vectorized_sentences = [vectorizer.vectorize_sentence(sent, language=ENGLISH) for sent in sentences]
encoder_inputs = torch.stack(vectorized_sentences)
encoder_outputs, encoder_hidden = encoder(encoder_inputs, init_hidden)


french_vocab = vectorizer.french_vocab
decoder = DecoderRNN(
    french_embedding_matrix, 
    hidden_size=hidden_size, 
    output_size=len(french_vocab)
)

# The decoder is fed one token index per sequence at each step, starting from
# the French start-of-sequence index.
sos_token_index = french_vocab.lookup_token(french_vocab.sos)

output_size = len(french_vocab)
# Buffer for the per-step decoder outputs: (batch_size, seq_size, output_size).
decoder_outputs = torch.zeros(batch_size, seq_size, output_size)
# 0-dim tensor expanded to shape (batch_size,): every sequence starts from SOS.
next_tokens = torch.tensor(sos_token_index).expand(batch_size)

# The encoder's final hidden state seeds the decoder.
hidden = encoder_hidden
# Only seq_size - 1 steps are run, so the last slot of `decoder_outputs` keeps
# its initial zeros.
for token in range(seq_size-1):
    decoder_output, hidden = decoder(next_tokens, hidden)
    decoder_outputs[:, token] = decoder_output
    # Greedy decoding: the highest-scoring index along dim 1 becomes the next input.
    next_tokens = torch.argmax(decoder_output, dim=1)