In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules (e.g. the local `attention` package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd

from attention import models
from attention import utils
from attention.vectorizer import Vectorizer
from attention.constants import ENGLISH, FRENCH

# All paths are relative to the directory the notebook is executed from.
SOURCE_DIR = Path('../')
# Data folder sits directly under SOURCE_DIR.
DATA_DIR = SOURCE_DIR.joinpath('data')
# Translation corpus file consumed by the data-loading cell.
translation_fp = DATA_DIR.joinpath('eng-fra.txt')

# Data & Vectorizer

In [65]:
# Build the translation dataset (which carries its own vectorizer) from the
# raw translation file.
# NOTE(review): `Vectorizer` is already imported in the notebook's first
# import cell and is not referenced by name in this cell.
from attention.data import load_sentences_dataframe, assign_rows_to_split, TranslationDataset, generate_batches
from attention.vectorizer import Vectorizer

# Read the file at `translation_fp`; presumably one row per sentence pair —
# confirm against attention.data.load_sentences_dataframe.
df = load_sentences_dataframe(translation_fp)
# Assign rows to splits. Only 1% of rows are requested for training here,
# presumably to keep iteration fast — TODO confirm before a real training run.
# NOTE(review): the saved output of this cell shows pandas SettingWithCopy
# warnings raised at `train_rows[SPLIT] = TRAIN` (and the valid/test
# equivalents), presumably inside assign_rows_to_split; that helper should
# assign via .loc or operate on an explicit .copy().
df = assign_rows_to_split(df, train_ratio=0.01, valid_ratio=0.5, test_ratio=0.49)

# `dataset.vectorizer` is read by the embeddings cell further down.
dataset = TranslationDataset.from_dataframe(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_rows[SPLIT] = TRAIN
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_rows[SPLIT] = VALID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_rows[SPLIT] = TEST


# Embeddings

In [4]:
import fasttext.util

# Fetch the pretrained English and French fastText models.
# if_exists='ignore' is meant to skip the download when the file is already
# present (per the fastText docs — confirm for the installed version).
fasttext.util.download_model('en', if_exists='ignore')
fasttext.util.download_model('fr', if_exists='ignore')

# The .bin files are referenced by bare filename, i.e. they are resolved
# relative to the current working directory.
ft_english = fasttext.load_model('cc.en.300.bin')
ft_french = fasttext.load_model('cc.fr.300.bin')

# Reuse the vectorizer attached to the dataset so the embedding matrices are
# built from the same vocabularies the dataset uses.
vectorizer = dataset.vectorizer
# presumably one embedding row per vocabulary token of the given language —
# verify against Vectorizer.build_embedding_matrix_from_fasttext
english_embedding_matrix = vectorizer.build_embedding_matrix_from_fasttext(ft_english, lang=ENGLISH)
french_embedding_matrix = vectorizer.build_embedding_matrix_from_fasttext(ft_french, lang=FRENCH)



# Model Construction

In [68]:
import torch
from attention.models import Encoder, DecoderRNN, TranslationModel
# NOTE(review): SEQ_SIZE, SOS_token and normalize_string are imported but not
# used anywhere in this cell.
from attention.constants import SEQ_SIZE, SOS_token
from attention.utils import normalize_string

# build encoder (source side: English embedding matrix)
# NOTE(review): `hidden_size` is not assigned in any cell visible in this
# notebook export — it must be defined before this cell for a clean
# Restart & Run All.
encoder = Encoder(english_embedding_matrix, hidden_size)

# build decoder (target side: French embedding matrix); the output layer size
# is the number of entries in the French vocabulary
output_vocab_size = len(vectorizer.french_vocab)
decoder = DecoderRNN(french_embedding_matrix, hidden_size, output_vocab_size)

# build model: wraps the encoder and decoder into a single module
model = TranslationModel(encoder, decoder, output_vocab_size)

# forward pass (smoke test of the assembled model)
# NOTE(review): `encoder_inputs` and `decoder_inputs` are not assigned in any
# visible cell; presumably they came from a batch produced in a cell that is
# no longer in the notebook — TODO restore or remove this call.
decoder_outputs = model(encoder_inputs, decoder_inputs)

# Training

In [75]:
import torch
from attention.train import Translation_Trainer
from attention.utils import handle_dirs
from attention.constants import PAD_token

# Adam over every parameter of the translation model, learning rate 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def loss_func(decoder_outputs, decoder_input, pad_index=None):
    """Return the mean of the per-sequence cross-entropy losses of a batch.

    Each predicted sequence is paired with its target sequence, a
    cross-entropy loss is computed per pair (padding targets ignored), and
    the per-sequence losses are averaged.

    Args:
        decoder_outputs: iterable of per-sequence predictions; assumed to be
            a (batch, seq_len, vocab) tensor of logits — TODO confirm.
        decoder_input: iterable of per-sequence target token ids; assumed to
            be a (batch, seq_len) integer tensor — TODO confirm.
        pad_index: target id to exclude from the loss. When None (default),
            the padding id is looked up in the French vocabulary.

    Returns:
        The mean of the per-sequence losses.

    Raises:
        ZeroDivisionError: if the batch contains no sequences.
    """
    if pad_index is None:
        # The targets are French token ids (the decoder predicts over the
        # French vocabulary), so the ignored index must come from the French
        # vocabulary, not the English one.
        # Assumes french_vocab exposes lookup_token like english_vocab does.
        pad_index = vectorizer.french_vocab.lookup_token(PAD_token)

    criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_index)
    sequence_losses = [
        criterion(predicted_seq, actual_seq)
        for predicted_seq, actual_seq in zip(decoder_outputs, decoder_input)
    ]
    return sum(sequence_losses) / len(sequence_losses)
        
# Artifacts for this experiment go under <SOURCE_DIR>/models/simple_rnn.
model_dir = SOURCE_DIR / 'models' / 'simple_rnn'
# presumably creates model_dir if it does not exist — confirm against
# attention.utils.handle_dirs
handle_dirs(model_dir)
# NOTE(review): `device` is not assigned in any cell visible in this notebook
# export; it must be defined earlier for this cell to run on a fresh kernel.
trainer = Translation_Trainer(generate_batches, optimizer, model, model_dir, loss_func, device)

# NOTE(review): the saved output shows one completed epoch followed by a
# KeyboardInterrupt, so the 20-epoch run below was not completed.
nb_epochs = 20
trainer.run(nb_epochs, dataset, batch_size=32, checkpoint=True)

Completed Epoch 0 with average loss of 5.91


KeyboardInterrupt: 