In [1]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before each
# cell executes, so edits to the local `attention` package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# Turn off Jedi-based tab completion in the IPython completer.
%config Completer.use_jedi = False

In [2]:
import warnings
from pathlib import Path

import torch
from torch import nn
import pandas as pd

from attention import models
from attention import utils
from attention.vectorizer import Vectorizer
from attention.constants import ENGLISH, FRENCH, SEQ_SIZE, DECODER_INPUT, ENCODER_INPUT, SOS_token


# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Filesystem layout: everything is addressed relative to the parent directory.
SOURCE_DIR = Path('../')
DATA_DIR = SOURCE_DIR.joinpath('data')
translation_fp = DATA_DIR.joinpath('eng-fra.txt')

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Data

In [3]:
# data loading, preprocessing and dataset utilities
from attention.data import (
    load_sentences_dataframe,
    filter_by_prefixes, 
    assign_rows_to_split,
     NMTDataset, 
     generate_batches
     )
from attention.constants import ENGLISH
from attention.text_preprocessing import preprocess_text
from attention.vectorizer import Vectorizer

# Load the sentence pairs from disk and hand them straight to the split
# assignment, using a 70 / 15 / 15 train / valid / test ratio.
split_ratios = dict(train_ratio=0.7, valid_ratio=0.15, test_ratio=0.15)
df = assign_rows_to_split(load_sentences_dataframe(translation_fp), **split_ratios)

# Apply the same text preprocessing to both language columns.
for language_column in (ENGLISH, FRENCH):
    df[language_column] = df[language_column].apply(preprocess_text)

# Restrict the corpus via a fixed set of English prefixes to simplify the NMT task.
eng_prefixes = ("i am ", "i m ", "he is", "she is", "she s ", "you are", "you re ")
df = filter_by_prefixes(df, eng_prefixes, column=ENGLISH)

# Build the dataset object from the filtered frame and report its length.
dataset = NMTDataset.from_dataframe(df)
print(f'dataset has {len(dataset)} records')

dataset has 7297 records


In [10]:
# Switch the dataset's active split to 'valid' and print how many records it holds.
# NOTE(review): no visible cell switches the split back afterwards, so later cells
# receive the dataset with 'valid' selected — presumably the trainer selects the
# split it needs on its own; confirm. This cell's execution count (10) is also out
# of sequence with its neighbours, so it was run out of order.
dataset.set_split('valid')
print(len(dataset))
# Bare trailing expression: the notebook displays the object's default repr.
dataset

1551


<attention.data.NMTDataset at 0x7fb1c948a730>

# Model Construction

In [4]:
from attention.models import AttentionModel

# Vocabularies exposed by the dataset's vectorizer, and their sizes.
source_vocab, target_vocab = (
    dataset.vectorizer.source_vocab,
    dataset.vectorizer.target_vocab,
)
source_vocab_size, target_vocab_size = len(source_vocab), len(target_vocab)

# Model hyper-parameters: both embedding tables share the same width.
source_embedding_size = target_embedding_size = 300
encoding_size = 200

# Index of the start-of-sequence entry in the target vocabulary.
target_sos_index = target_vocab.sos_index

# Positional constructor arguments, in the order AttentionModel is called with.
model_args = (
    source_vocab_size,
    source_embedding_size,
    target_vocab_size,
    target_embedding_size,
    encoding_size,
    target_sos_index,
)
model = AttentionModel(*model_args)

In [6]:
from attention.train import Translation_Trainer
from attention.utils import handle_dirs
from attention.losses import average_loss
from attention.data import generate_nmt_batches

# Training hyper-parameters, gathered in one place so they are easy to tune.
nb_epochs = 10
batch_size = 32
learning_rate = 0.001

# Adam over all model parameters; the learning rate is passed by keyword so the
# meaning of the number is explicit at the call site.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Where this run's model artefacts are stored: <SOURCE_DIR>/models_store/<model_id>.
model_id = 'attention_1'
model_dir = SOURCE_DIR / f'models_store/{model_id}'
# Prepare the storage directory (presumably creates it if missing — see
# attention.utils.handle_dirs).
handle_dirs(model_dir)

# Trainer wiring: batch generator, optimizer, model, storage location, loss and device.
trainer = Translation_Trainer(
    data_loader=generate_nmt_batches,
    optimizer=optimizer,
    model=model,
    model_dir=model_dir,
    loss_func=average_loss,
    device=device,
)
# Backward-compatible alias for the previously misspelled variable name, in case
# another cell still refers to it.
tainer = trainer

# Run training for nb_epochs epochs with checkpointing enabled.
trainer.run(nb_epochs, dataset, batch_size, checkpoint=True)

Completed Epoch 0 with average training loss of 3.06
Completed Epoch 0 with average validation loss of 3.99
Completed Epoch 1 with average training loss of 1.73
Completed Epoch 1 with average validation loss of 3.61
Completed Epoch 2 with average training loss of 1.13
Completed Epoch 2 with average validation loss of 3.52
Completed Epoch 3 with average training loss of 0.79
Completed Epoch 3 with average validation loss of 3.58
Completed Epoch 4 with average training loss of 0.60
Completed Epoch 4 with average validation loss of 3.64
Completed Epoch 5 with average training loss of 0.48
Completed Epoch 5 with average validation loss of 3.60
Completed Epoch 6 with average training loss of 0.40
Completed Epoch 6 with average validation loss of 3.57
Completed Epoch 7 with average training loss of 0.35
Completed Epoch 7 with average validation loss of 3.65


KeyboardInterrupt: 

# Inference

In [None]:
def translate_attention_rnn(sent, model, vectorizer, device):
    """Translate a single source sentence with the attention model.

    The sentence is vectorized together with an empty target, fed through the
    model in evaluation mode, and the highest-scoring token at every output
    position is looked up in the target vocabulary. Decoding stops at the first
    ``'<eos>'`` token, which is not included in the result.

    Args:
        sent: Source sentence, passed as-is to ``vectorizer.vectorize``.
        model: ``torch.nn.Module`` called as
            ``model(source_vector, source_length, target_seq)``. It is moved to
            ``device`` and left in evaluation mode.
        vectorizer: Object providing ``vectorize(source, target)`` (returning a
            mapping with ``'source_vector'``, ``'source_length'`` and
            ``'target_x_vector'``) and ``target_vocab.lookup_index(index)``.
        device: ``torch.device`` on which the forward pass is executed.

    Returns:
        str: The predicted target tokens joined by single spaces.
    """
    data_dict = vectorizer.vectorize(sent, '')

    def _as_batch(key):
        # Add a leading batch dimension of size 1 and place the tensor on `device`.
        return torch.tensor(data_dict[key]).unsqueeze(0).to(device)

    source_vector = _as_batch('source_vector')
    # NOTE(review): all inputs are moved to `device`; if the encoder requires the
    # lengths tensor to live on the CPU (e.g. for sequence packing), keep this
    # one on the CPU instead — confirm against the model implementation.
    source_length = _as_batch('source_length')
    target_seq = _as_batch('target_x_vector')

    # Make sure the model and its inputs live on the same device, then switch
    # to evaluation mode.
    model = model.to(device)
    model.eval()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(source_vector, source_length, target_seq)

    # Best-scoring vocabulary index per output position for the single batch item.
    predicted_indices = torch.argmax(output[:, 0, :], dim=-1)

    tokens = []
    # .cpu().tolist() yields plain Python ints and also works for CUDA tensors.
    for index in predicted_indices.cpu().tolist():
        token = vectorizer.target_vocab.lookup_index(index)
        if token == '<eos>':
            break
        tokens.append(token)

    return ' '.join(tokens)
    

# Example English sentences to translate with the trained model.
sentences = [
    "i am only warming up now.",
    "you are both in the wrong.",
    "he is said to have died",
    "i am bored out of my mind.",
    "i am going to stay here for a couple of days.",
    "they are out shopping.",
    "i am afraid he will make a mistake.",
    "we are worried about you.",
    "he likes to go to work",
    "he is not at all foolish",
    "he went to school by bus",
    "he likes to go shopping",
    "he loves to play music",
    "i am happy."
]
for sent in sentences:
    # The training text was run through preprocess_text before the dataset (and
    # its vectorizer) was built, so apply the same normalization here; otherwise
    # raw input such as a word with a trailing period may not match the vocabulary.
    translation = translate_attention_rnn(
        preprocess_text(sent), model, dataset.vectorizer, device
    )
    # Show the sentence as originally written next to its translation.
    print(sent, '=>', translation)