In [1]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

In [2]:
import warnings
from pathlib import Path

import torch
from torch import nn
import pandas as pd

from attention import models
from attention import utils
from attention.vectorizer import Vectorizer
from attention.constants import ENGLISH, FRENCH, SEQ_SIZE, DECODER_INPUT, ENCODER_INPUT


warnings.filterwarnings('ignore')

SOURCE_DIR = Path('../')
DATA_DIR = SOURCE_DIR / 'data'
translation_fp = DATA_DIR / 'eng-fra.txt'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data

In [3]:
# vectorizer code
from attention.data import load_sentences_dataframe, assign_rows_to_split, TranslationDataset, generate_batches
from attention.vectorizer import Vectorizer

df = load_sentences_dataframe(translation_fp)
df = assign_rows_to_split(df, train_ratio=0.9, valid_ratio=0.05, test_ratio=0.05)

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)
df[ENGLISH] = df[ENGLISH].str.lower()
df = df[df[ENGLISH].str.startswith(eng_prefixes)]

dataset = TranslationDataset.from_dataframe(df)

# Source code

In [55]:
# model
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_size, embedding_matrix=None):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        # embedding layer
        if embedding_matrix is None:
            self.embedding = nn.Embedding(input_size, embedding_size)
        else:
            embedding_size = embedding_matrix.shape[-1]
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
            
        
        # rnn layer
        self.gru = nn.GRU(embedding_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
    

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, embedding_size, embedding_matrix=None):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size

        # embedding layer
        if embedding_matrix is None:
            self.embedding = nn.Embedding(output_size, embedding_size)
        else:
            embedding_size = embedding_matrix.shape[-1]
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix, )
            
        
        # rnn layer
        self.gru = nn.GRU(embedding_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = torch.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

    
class SimpleRnn(nn.Module):
    def __init__(self, encoder, decoder):
        super(SimpleRnn, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, input_sentence, output_sentence):
        hidden = self.encoder.initHidden()
        encoder_outputs = torch.zeros(1, SEQ_SIZE, self.encoder.hidden_size)
        
        for idx in range(SEQ_SIZE-1):
            token = input_sentence[idx]
            output, hidden = self.encoder(token, hidden)
            encoder_outputs[0, idx] = output[-1]

        decoder_outputs = torch.zeros(1, SEQ_SIZE-1, self.decoder.output_size)
        for idx in range(SEQ_SIZE-1):
            token = output_sentence[idx]
            output, hidden = self.decoder(token, hidden)
            decoder_outputs[0, idx] = output[-1]
            
        return decoder_outputs, hidden
    
def train_one_sent(sent, translation, model, optim):
    optim.zero_grad()
    decoder_outputs, hidden = model(sent, translation)
    criterion = nn.CrossEntropyLoss()
    loss = criterion(decoder_outputs.squeeze(0), translation_sentence[1:])
    loss.backward()
    optim.step()
    return loss.item()


# translate
def translate(sentence, model, vectorizer):
    vectorized = vectorizer.vectorize_sentence(sentence, language=ENGLISH)
    french_vocab = vectorizer.french_vocab
    
    encoder_hidden = encoder.initHidden()
    for idx in range(SEQ_SIZE):
        token = vectorized[idx]
        _, encoder_hidden = encoder_output, encoder_hidden = encoder(token, encoder_hidden)

    decoder_outputs = torch.zeros(1, SEQ_SIZE-1, decoder.output_size)

    for idx in range(SEQ_SIZE-1):
        token = vectorized[idx]
        output, hidden =  decoder(token, encoder_hidden)
        decoder_outputs[0, idx] = output[-1]

    
    indices = torch.argmax(decoder_outputs, dim=-1).squeeze(0)
    words = [french_vocab.lookup_index(idx) for idx in indices]
    print(' '.join(words))

## run

In [5]:
# run
hidden_size = 256
vectorizer = dataset.vectorizer
english_vocab_size = len(vectorizer.english_vocab)
french_vocab_size = len(vectorizer.french_vocab)

encoder = EncoderRNN(english_vocab_size, hidden_size)
decoder = DecoderRNN(hidden_size, french_vocab_size)
model = SimpleRnn(encoder, decoder)

optim = torch.optim.Adam(model.parameters(), 0.001)


nb_epochs = 10
for epoch in range(nb_epochs):
    for batch in generate_batches(dataset, batch_size=1):
        single_sentence, translation_sentence = batch[ENCODER_INPUT], batch[DECODER_INPUT]
        single_sentence = single_sentence.squeeze(0)
        translation_sentence = translation_sentence.squeeze(0)
        loss = train_one_sent(
            single_sentence, 
            translation_sentence, 
            model,
            optim
        )
    print(f'loss at {epoch+1} epoch is {loss:.2f}')

loss at 1 epoch is 3.87
loss at 2 epoch is 0.44
loss at 3 epoch is 1.67
loss at 4 epoch is 0.73
loss at 5 epoch is 0.91
loss at 6 epoch is 0.82
loss at 7 epoch is 0.53
loss at 8 epoch is 0.31
loss at 9 epoch is 0.53
loss at 10 epoch is 0.14


In [47]:
sentence = "he went to school by bus"
translate(sentence, model, vectorizer)

il ce de il il il de <pad> <pad>


# Embeddings

In [48]:
import fasttext.util

fasttext.util.download_model('en', if_exists='ignore')
fasttext.util.download_model('fr', if_exists='ignore')

ft_english = fasttext.load_model('cc.en.300.bin')
ft_french = fasttext.load_model('cc.fr.300.bin')

vectorizer = dataset.vectorizer
english_embedding_matrix = vectorizer.build_embedding_matrix_from_fasttext(ft_english, lang=ENGLISH)
french_embedding_matrix = vectorizer.build_embedding_matrix_from_fasttext(ft_french, lang=FRENCH)



In [None]:
# run
hidden_size = 256
vectorizer = dataset.vectorizer
english_vocab_size = len(vectorizer.english_vocab)
french_vocab_size = len(vectorizer.french_vocab)
embedding_size = 100

encoder = EncoderRNN(english_vocab_size, hidden_size, embedding_size, english_embedding_matrix)
decoder = DecoderRNN(hidden_size, french_vocab_size, embedding_size, french_embedding_matrix)
model = SimpleRnn(encoder, decoder)

optim = torch.optim.Adam(model.parameters(), 0.001)


nb_epochs = 10
for epoch in range(nb_epochs):
    epoch_losses = []
    for batch in generate_batches(dataset, batch_size=1):
        single_sentence, translation_sentence = batch[ENCODER_INPUT], batch[DECODER_INPUT]
        single_sentence = single_sentence.squeeze(0)
        translation_sentence = translation_sentence.squeeze(0)
        loss = train_one_sent(
            single_sentence, 
            translation_sentence, 
            model,
            optim
        )
        epoch_losses.append(loss)
    avg_loss = sum(epoch_losses) / len(epoch_losses)
    print(f'loss at {epoch+1} epoch is {avg_loss:.2f}')

loss at 1 epoch is 3.27
loss at 2 epoch is 2.42
loss at 3 epoch is 1.97
loss at 4 epoch is 1.63
loss at 5 epoch is 1.34
loss at 6 epoch is 1.10
loss at 7 epoch is 0.91


In [62]:
sentence = "he likes school"
translate(sentence, model, vectorizer)

il il il il il il il il il
