## Setting Up Your Drive for Development
This section will cover importing a private git repository to a folder in your drive and setting up project environment variables. If you've already downloaded the repository, feel free to skip steps labeled with #SKIP.

In [None]:
from google.colab import drive

# Mount point for Google Drive inside the Colab VM; later cells build their paths from it
ROOT = "/content/gdrive"

# Mount Google Drive at ROOT
drive.mount(ROOT) 

# Sanity check: list the team's shared drive to confirm that it is accessible
!ls "{ROOT}/Shareddrives/CS 175 Project"

Mounted at /content/gdrive
 Datasets   'Papers Other Resources'	    token.txt
 Diagrams   'Progress Reports'		   'Yanqi making it repo public.gdoc'
 models      Proposal
 Notebooks  'Team AElfrictoAlbert.gsheet'


In [None]:
# Clone github repository setup
# `join` is used to build the Drive paths below from ROOT
from os.path import join

# path to your project on Google Drive
MY_GOOGLE_DRIVE_PATH = 'My Drive/cs175-Aelfric-to-Albert'
GIT_USERNAME = "mayaschwarz"

# The access token is read from a file on the shared drive so that it never has
# to be typed into the notebook. Do not save it to the repo!
GIT_TOKEN_PATH = join(ROOT, "Shareddrives/CS 175 Project/token.txt")

with open(GIT_TOKEN_PATH, 'r') as f:
  # only the first line of the file is used
  GIT_TOKEN = f.readline().strip()

if not GIT_TOKEN:
  raise ValueError("GIT_TOKEN MISSING")

GIT_REPOSITORY = "cs175--lfric-to-Albert"

PROJECT_PATH = join(ROOT, MY_GOOGLE_DRIVE_PATH)

# It's good to print out the value if you are not sure
print("PROJECT_PATH: ", PROJECT_PATH)

# Plain concatenation is used on purpose: an earlier attempt wrote the URL as a
# "{GIT_TOKEN}..." literal without the f-prefix, so the braces were sent verbatim
# and the request was rejected (400 Bad Request).
GIT_PATH = "https://" + GIT_TOKEN + "@github.com/" + GIT_USERNAME + "/" + GIT_REPOSITORY + ".git"

# Cell output is stored inside the notebook file, so never echo the token itself
print("GIT_PATH: ", GIT_PATH.replace(GIT_TOKEN, "<token hidden>"))

PROJECT_PATH:  /content/gdrive/My Drive/cs175-Aelfric-to-Albert
GIT_PATH:  https://<token redacted>@github.com/mayaschwarz/cs175--lfric-to-Albert.git


In [None]:
# Answer input query for downloading git repository
while True:
    response = input("Are you sure you want to download the repo? Doing so will delete all unpush work. [y|N] ").lower().strip()
    if not response or response[0] == 'n':
        break
    elif response[0] == "y":
        !if test -d "{PROJECT_PATH}"; then rm -rv "{PROJECT_PATH}"
        !mkdir -p "{PROJECT_PATH}" 
        !git clone "{GIT_PATH}" "{PROJECT_PATH}"
        break

# cd into the repository
%cd "{PROJECT_PATH}"

Are you sure you want to download the repo? Doing so will delete all unpush work. [y|N] N
/content/gdrive/My Drive/cs175-Aelfric-to-Albert


In [None]:
# Pull the latest commits so that the working copy is up to date
!git pull 
# List the local branches to check which branch you're on
!git branch

# Development
Here begins coding

## Libraries

In [None]:
# Download the required third-party libraries (-q keeps pip's output quiet)
# NOTE(review): no versions are pinned; the imports used further down
# (cltk.corpus..., torchtext.data.Field) look like they need older cltk/torchtext
# releases -- confirm the working versions and pin them here
!pip install cltk contractions tqdm -q

In [None]:
# Standard library
from __future__ import unicode_literals, print_function, division
import math
import random
import re
import string
import time
import unicodedata
from queue import PriorityQueue
from typing import Callable

# additional libraries (pip install ..)
import numpy as np
# change to just tqdm if not running in notebook
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# nltk and nltk related libraries
import nltk
import cltk

# pytorch and torch libraries
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torchtext.data import Dataset, Example, Field, BucketIterator, Iterator

# local libraries
from src.data_manager import *

In [None]:
# Download the NLTK 'punkt' tokenizer data
# (presumably what word_tokenize in the tokenizers below relies on -- confirm)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Select the device: the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Data Preprocessing

In [None]:
def reset_random(seed: int = 1234):
    """
    Seed every random number generator used in this notebook so that runs
    are repeatable.

    Arguments:
        seed{int} -- seed shared by `random`, NumPy and PyTorch (default: 1234)
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # seed every CUDA device, not only the current one (harmless without a GPU)
    torch.cuda.manual_seed_all(seed)
    # Ask cuDNN for deterministic kernels and switch off its auto-tuner, which
    # can otherwise pick different algorithms from one run to the next.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

reset_random()

In [None]:
from nltk.tokenize import word_tokenize
from cltk.corpus.middle_english.alphabet import normalize_middle_english
from cltk.phonology.old_english.phonology import Word


def old_english_tokenizer(text: str) -> [str]:
    """
    Tokenize Old English text.

    The text is lowercased and split with word_tokenize; every token is then
    replaced by its cltk `Word(...).ascii_encoding()` form (romanization and
    removal of diacritical marks).
    """
    return [Word(token).ascii_encoding() for token in word_tokenize(text.lower())]

def middle_english_tokenizer(text: str) -> [str]:
    """
    Tokenize Middle English text.

    The text is first lowercased and converted to its canonical form with
    modernized characters by normalize_middle_english (þ and ð become th,
    3 becomes y), then split with word_tokenize.
    """
    normalized = normalize_middle_english(text, to_lower=True, alpha_conv=True)
    return word_tokenize(normalized)

def modern_english_tokenizer(text: str) -> [str]:
    """Lowercase `text` and split it into tokens with word_tokenize."""
    lowered = text.lower()
    return word_tokenize(lowered)

In [None]:
# DATA PREPROCESSING GLOBALS
# word limit handed to preprocess_filter_num_words; also the default max_length of Seq2Seq
MAX_SEQUENCE_LENGTH = 60
# min_freq handed to build_vocab
MIN_WORD_FREQ = 2
# batch size of the BucketIterators
BATCH_SIZE = 64
# batch_first setting of both Fields (the models below work on [seq len, batch size] tensors)
BATCH_FIRST = False

# special tokens: start of sequence, end of sequence, padding
INIT_TOKEN = '<sos>'
EOS_TOKEN = '<eos>'
PAD_TOKEN = '<pad>'

In [None]:
class BibleDataset(Dataset):
    """
    Parallel corpus of Bible verses: every example pairs a source verse
    (field 'src') with the matching target verse (field 'trg').
    """

    def __init__(self, source: [str], target: [str], fields: (Field, ), **kwargs):
        """
        Create a Dataset of Bible verses given samples from source and target.

        Arguments:
            source{[str]} -- A list of sample verses to be translated from
            target{[str]} -- A list of sample verses to be translated to
            fields{(Field,)} -- A tuple containing the fields that will be used
                                for data in each language; a bare
                                (source_field, target_field) pair is named
                                'src' / 'trg' automatically
            **kwargs -- remaining keyword arguments passed to Dataset
        """
        if not isinstance(fields[0], (tuple, list)):
            fields = [('src', fields[0]), ('trg', fields[1])]

        examples = [Example.fromlist([src, trg], fields) for src, trg in zip(source, target)]
        super().__init__(examples, fields, **kwargs)

    @classmethod
    def split(cls, source: str, target: str, fields: (Field,), train_split=0.8, preprocess_operations: [Callable[[dict], dict]] = None, **kwargs):
        """
        Create dataset objects for splits of a BibleDataset.

        Arguments:
            source{str} -- key name of source version
            target{str} -- key name of target version
            fields{(Field,)} -- A tuple containing the fields that will be used
                                for data in each language
            train_split{float} -- training validation split (default: 0.8)
            preprocess_operations{[Callable]} -- preprocessing operations
                                forwarded to create_datasets (default: None,
                                meaning no preprocessing)
            **kwargs -- remaining keyword arguments passed to Dataset

        Returns:
            (train, validation, test) -- one BibleDataset per split
        """
        # A fresh list per call: a mutable default ([]) would be shared by all calls
        if preprocess_operations is None:
            preprocess_operations = []

        # Generate splits
        versions = get_bible_versions_by_file_name([source, target])

        datasets = create_datasets(versions, train_split,
                        preprocess_operations=preprocess_operations,
                        write_files=False, verbose=True)

        # one dataset per split, built from the same pair of versions
        train, validation, test = (
            cls(datasets[part][source], datasets[part][target], fields, **kwargs)
            for part in ('training', 'validation', 'test')
        )

        return train, validation, test

In [None]:
# Pre and post-processing functions to call when generating Datasets
def pad_to_max_len(batch, vocab, max_length=MAX_SEQUENCE_LENGTH+2):
    """
    Right-pad every example of `batch` that is shorter than `max_length`.

    Arguments:
        batch -- list of examples (lists of token indices); modified in place
        vocab -- vocabulary whose `stoi` maps PAD_TOKEN to the padding index
        max_length{int} -- length to pad to (default: MAX_SEQUENCE_LENGTH + 2)

    Returns:
        the same `batch` object; longer examples are left untouched
    """
    pad_idx = vocab.stoi[PAD_TOKEN]
    for position, example in enumerate(batch):
        missing = max_length - len(example)
        if missing > 0:
            batch[position] = example + [pad_idx] * missing
    return batch

In [None]:
# Source-language field: tokenized with modern_english_tokenizer and wrapped in
# <sos>/<eos>. include_lengths is enabled for the source side only
# (presumably so the encoder can pack the padded sequences -- confirm).
# NOTE(review): the source version selected below is 't_wyc', yet the modern
# tokenizer is used here rather than middle_english_tokenizer -- confirm this is intended
SRC = Field(tokenize=modern_english_tokenizer,
            init_token=INIT_TOKEN,
            eos_token=EOS_TOKEN,
            batch_first=BATCH_FIRST,
            include_lengths = True)

# Target-language field: same tokenizer and special tokens, without lengths
TRG = Field(tokenize=modern_english_tokenizer,
            init_token=INIT_TOKEN,
            eos_token=EOS_TOKEN,
            batch_first=BATCH_FIRST)

In this notebook, we attempt to translate the Wycliffe Bible (`t_wyc`) into the King James Version (`t_kjv`).


In [None]:
# key names of the Bible versions to translate from / to
SOURCE_VER = 't_wyc'
TARGET_VER = 't_kjv'

# NOTE(review): this list is built but never used -- the split below is called
# with preprocess_operations=[]; confirm whether these operations were meant to be applied
preprocessing = [preprocess_filter_num_words(MAX_SEQUENCE_LENGTH), 
                 preprocess_expand_contractions(),
                 preprocess_filter_num_sentences()]

# build the train / validation / test datasets for the chosen pair of versions
train_data, validation_data, test_data = BibleDataset.split(SOURCE_VER, TARGET_VER, fields=(SRC, TRG),
                                                            preprocess_operations=[], train_split=0.85)

# print dataset lengths
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(validation_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

# print a sample from the training data
print(train_data.examples[0].src)
print(train_data.examples[0].trg)

Finding shared verses between 2 versions...        done in 1.647 seconds
Separate test verses...                            done in 1.088 seconds
Separate validation verses...                      done in 0.029 seconds
Zip together verses (shuffle = True)...            done in 0.038 seconds

# training verses:    20,327 (71%)
# validation verses:   3,588 (13%)
# test verses:         4,599 (16%)
Number of training examples: 20327
Number of validation examples: 3588
Number of testing examples: 4599
['`therfor', 'sche', 'was', 'ioyned', 'to', 'the', 'damesels', 'of', 'booz', ';', 'and', 'so', 'longe', 'sche', 'rap', 'with', 'hem', ',', 'til', 'bothe', 'barli', 'and', 'wheete', 'weren', 'closid', 'in', 'the', 'bernys', '.']
['so', 'she', 'kept', 'fast', 'by', 'the', 'maidens', 'of', 'boaz', 'to', 'glean', 'unto', 'the', 'end', 'of', 'barley', 'harvest', 'and', 'of', 'wheat', 'harvest', ';', 'and', 'dwelt', 'with', 'her', 'mother', 'in', 'law', '.']


In [None]:
# Build both vocabularies from the training data only, with MIN_WORD_FREQ as
# the minimum token frequency
SRC.build_vocab(train_data, min_freq=MIN_WORD_FREQ)
TRG.build_vocab(train_data, min_freq=MIN_WORD_FREQ)

# report the vocabulary sizes
print(f"Unique tokens in source (me) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (me) vocabulary: 8777
Unique tokens in target (en) vocabulary: 7302


In [None]:
# To use packed padded sequences, every batch needs to be sorted by non-padded
# length in descending order (the first sentence is the longest); this is why
# sort_within_batch is enabled with the source length as sort key
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, validation_data, test_data), 
                                                                      batch_size = BATCH_SIZE, 
                                                                      sort_within_batch=True,
                                                                      sort_key=lambda x: len(x.src),
                                                                      device = device)

## Model Design

The encoder of a seq2seq network is an RNN that outputs some value for every word from the input sentence. For every input word the encoder outputs a vector and a hidden state, and uses the hidden state for the next input word.

In [None]:
class EncoderRNN(nn.Module):
    """
    GRU encoder (optionally bidirectional and multi-layer) over embedded
    source tokens. It returns every time step's output together with a single
    hidden vector per sample that initialises the decoder.
    """

    def __init__(self, input_size: int, embedding_size: int, hidden_size: int, num_layers: int, bidirectional: bool, dropout: float):
        """
        Arguments:
            input_size{int} -- size of the source vocabulary
            embedding_size{int} -- dimension of the token embeddings
            hidden_size{int} -- hidden dimension of the GRU and of the returned hidden vector
            num_layers{int} -- number of stacked GRU layers
            bidirectional{bool} -- whether the GRU reads the sequence in both directions
            dropout{float} -- dropout probability applied to the embeddings
        """
        super().__init__()

        self.input_size = input_size
        self.embedded_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.is_bidirectional = bidirectional

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size, num_layers=num_layers,
                          bidirectional=bidirectional)
        # maps the (concatenated) final hidden state(s) to the decoder's initial hidden state
        self.fc = nn.Linear(hidden_size * (2 if bidirectional else 1), hidden_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_seq, seq_len):
        """
        Arguments:
            input_seq{tensor[seq len, batch size]} -- padded source token indices
            seq_len{tensor[batch size]} -- non-padded length of every sequence,
                                           sorted in descending order

        Returns:
            outputs{tensor[seq len, batch size, hid dim * directions]} -- last-layer
                outputs; positions that correspond to padding are all zeros
            hidden{tensor[batch size, hid dim]} -- initial hidden state for the decoder
        """
        # embedded = [seq len, batch size, embedded size]
        embedded = self.dropout(self.embedding(input_seq))

        # need to explicitly put lengths on cpu!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, seq_len.to('cpu'))

        # The GRU must be fed the *packed* sequence so that padding is skipped
        # and `hidden` comes from the final non-padded element of each sample
        packed_outputs, hidden = self.rnn(packed_embedded)

        # back to a regular padded tensor
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)

        # hidden = [n layers * directions, batch size, hid dim], stacked as
        # [forward_1, backward_1, forward_2, backward_2, ...]:
        #   hidden[-2] is the last forward state, hidden[-1] the last backward state
        if self.is_bidirectional:
            final_hidden_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_hidden_state = hidden[-1, :, :]

        # initial decoder hidden state: final encoder state(s) fed through a linear layer
        hidden = torch.tanh(self.fc(final_hidden_state))

        return outputs, hidden

In [None]:
class Attention(nn.Module):
    """
    Additive attention: scores every encoder output against the current
    decoder hidden state and returns a distribution over source positions.
    """

    def __init__(self, encoder_hidden_size, decoder_hidden_size):
        """
        Arguments:
            encoder_hidden_size{int} -- feature size of the encoder outputs
                                        (enc hid dim * directions)
            decoder_hidden_size{int} -- hidden size of the decoder
        """
        super().__init__()

        self.attn = nn.Linear(encoder_hidden_size + decoder_hidden_size, decoder_hidden_size)
        self.v = nn.Linear(decoder_hidden_size, 1, bias = False)

    def forward(self, hidden, encoder_outputs, mask):
        """
        Arguments:
            hidden{tensor[batch size, dec hid dim]} -- current decoder hidden state
            encoder_outputs{tensor[src len, batch size, enc hid dim * directions]}
            mask{tensor[batch size, src len]} -- zero / False at padded positions

        Returns:
            tensor[batch size, src len] -- attention weights; every row sums to 1
        """
        # the source length is read from the encoder outputs
        src_len = encoder_outputs.shape[0]

        # repeat decoder hidden state src_len times
        # hidden = [batch size, src len, dec hid dim]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)

        # encoder_outputs = [batch size, src len, enc hid dim * directions]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        # energy = [batch size, src len, dec hid dim]
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2)))

        # attention = [batch size, src len]
        attention = self.v(energy).squeeze(2)

        # padded positions get a very negative score so softmax gives them ~0 weight
        attention = attention.masked_fill(mask == 0, -1e10)

        return F.softmax(attention, dim=1)

In [None]:
class DecoderWithAttention(nn.Module):
    """
    Single-step GRU decoder: attends over the encoder outputs, feeds the
    attended context together with the embedded input token to the GRU and
    predicts log-probabilities over the target vocabulary.
    """

    def __init__(self, output_size: int, embedding_size: int, encoder_hidden_size: int, decoder_hidden_size: int, num_layers: int, dropout: float, attention: "Attention"):
        """
        Arguments:
            output_size{int} -- size of the target vocabulary
            embedding_size{int} -- dimension of the token embeddings
            encoder_hidden_size{int} -- feature size of the encoder outputs
                                        (enc hid dim * directions)
            decoder_hidden_size{int} -- hidden size of the decoder GRU
            num_layers{int} -- number of GRU layers; forward() currently only
                               supports a single layer (see the TODO there)
            dropout{float} -- dropout probability applied to the embeddings
            attention{Attention} -- module computing the attention weights
        """
        super().__init__()

        self.output_size = output_size
        self.hidden_size = decoder_hidden_size
        self.num_layers = num_layers
        self.attention = attention

        self.embedding = nn.Embedding(output_size, embedding_size)

        self.rnn = nn.GRU(encoder_hidden_size + embedding_size, decoder_hidden_size, num_layers=num_layers)

        self.fc_out = nn.Linear(encoder_hidden_size + decoder_hidden_size + embedding_size, output_size)

        self.dropout = nn.Dropout(dropout)

    def forward(self, decoder_input, hidden, encoder_outputs, mask):
        """
        Decode one time step.

        Arguments:
            decoder_input{tensor[batch size]} -- previous target token indices
            hidden{tensor[batch size, dec hid dim]} -- previous decoder hidden state
            encoder_outputs{tensor[src len, batch size, enc hid dim * directions]}
            mask{tensor[batch size, src len]} -- zero / False at padded source positions

        Returns:
            prediction{tensor[batch size, output dim]} -- log-probabilities of the next token
            hidden{tensor[batch size, dec hid dim]} -- updated decoder hidden state
        """
        # decoder_input = [1, batch size]
        decoder_input = decoder_input.unsqueeze(0)

        # embedded = [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(decoder_input))

        # a = [batch size, 1, src len]
        a = self.attention(hidden, encoder_outputs, mask).unsqueeze(1)

        # encoder_outputs = [batch size, src len, enc hid dim * directions]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        # weighted = [1, batch size, enc hid dim * directions]
        weighted = torch.bmm(a, encoder_outputs).permute(1, 0, 2)

        # rnn_input = [1, batch size, (enc hid dim * directions) + emb dim]
        rnn_input = torch.cat((embedded, weighted), dim = 2)

        # output = [1, batch size, dec hid dim]
        # hidden = [1, batch size, dec hid dim]
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        # seq len, n layers and n directions are all 1 in this decoder,
        # which means that output == hidden
        # TODO : Restructure so that the decoder can be multilayer
        assert (output == hidden).all()

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)

        # prediction = [batch size, output dim]
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))

        return F.log_softmax(prediction, dim=1) , hidden.squeeze(0)

In [None]:
class BeamState:
    def __init__(self, hidden, prev, word_idx, log_prob, length):
        '''
        BeamNode class to help with Beam Search

        Arguments:
          hidden -- decoder hidden state stored with this node
          prev -- parent BeamState, or None for the first node of a sequence
          word_idx -- index of the token chosen at this node
          log_prob -- log-probability of that token
          length -- length value stored with this node
        '''
        self.h = hidden
        self.prev = prev
        # Log-probabilities along a path are summed, so an empty history
        # contributes 0 (multiplying them would flip the sign at every step)
        self.prev_log_sum = 0.0 if prev is None else prev.eval()
        self.logp = log_prob
        self.word_idx = word_idx
        self.length = length

    def eval(self):
        '''Return the cumulative log-probability of the path ending at this node.'''
        return self.logp + self.prev_log_sum

    def construct_seq(self):
        '''Return the token indices from the first node of the path up to this one.'''
        seq = []
        node = self
        # walk back through the parents, then restore chronological order
        while node is not None:
            seq.append(node.word_idx)
            node = node.prev
        seq.reverse()
        return seq

### Non-working code

The predict function does not perform beam search, nor does it return effective results. It needs to be implemented properly before it will work. The training code works fine; predictions on test data are greedy only (beam size of 1).

This attempt was abandoned in favor of the working OpenNMT-py framework for PyTorch.

In [None]:
# The module is named `queue` on Python 3 (`Queue` was its Python 2 name).
# The import is kept from the original cell; the beam search below ranks a
# short list instead of using a priority queue.
from queue import PriorityQueue

class Seq2Seq(nn.Module):
    """
    Encoder-decoder wrapper providing the training forward pass as well as
    greedy (beam_size == 1) and beam-search (beam_size > 1) prediction.
    """

    def __init__(self, encoder, decoder, device, beam_size=1, 
                 src_sos_idx=SRC.vocab.stoi[INIT_TOKEN], src_eos_idx=SRC.vocab.stoi[EOS_TOKEN], src_pad_idx=SRC.vocab.stoi[PAD_TOKEN],
                 trg_sos_idx=TRG.vocab.stoi[INIT_TOKEN], trg_eos_idx=TRG.vocab.stoi[EOS_TOKEN], trg_pad_idx=TRG.vocab.stoi[PAD_TOKEN],
                 max_length=MAX_SEQUENCE_LENGTH):
        """
        Arguments:
            encoder -- module called as encoder(src, src_len) -> (encoder_outputs, hidden)
            decoder -- module called as decoder(input, hidden, encoder_outputs, mask)
                       -> (log-probabilities, hidden); must expose `output_size`
            device -- device on which new tensors are created
            beam_size{int} -- number of beams kept by predict(); 1 means greedy search
            src_*_idx / trg_*_idx{int} -- indices of the <sos>, <eos> and <pad>
                       tokens in the source / target vocabulary
            max_length{int} -- number of decoding steps when no target is given
        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.beam_size = beam_size

        self.src_sos_idx = src_sos_idx
        self.src_eos_idx = src_eos_idx
        self.src_pad_idx = src_pad_idx

        self.trg_sos_idx = trg_sos_idx
        self.trg_eos_idx = trg_eos_idx
        self.trg_pad_idx = trg_pad_idx

        self.max_length = max_length
        self.device = device

    def create_mask(self, src):
        """Return a [batch size, src len] mask that is False at padded source positions."""
        mask = (src != self.src_pad_idx).permute(1,0)
        return mask

    def _source_lengths(self, src):
        """Return the number of non-padding tokens of every source sequence ([batch size])."""
        return (src != self.src_pad_idx).sum(dim=0)

    def forward(self, src, trg = None, teacher_forcing_ratio = 0.5, src_len = None):
        """
        Run the encoder once and the decoder step by step.

        Arguments:
            src{tensor[src len, batch size]} -- padded source token indices
            trg{tensor[trg len, batch size]} -- target token indices; when None,
                max_length steps are decoded from the model's own predictions
            teacher_forcing_ratio{float} -- probability of feeding the actual
                target token (instead of the prediction) as the next input,
                e.g. 0.75 uses teacher forcing 75% of the time
            src_len{tensor[batch size]} -- non-padded source lengths; derived
                from the padding in `src` when omitted

        Returns:
            outputs{tensor[seq len, batch size, trg vocab size]} -- decoder
                predictions for every step; outputs[0] (the <sos> position)
                is left at zero
        """
        batch_size = src.shape[1]
        seq_len = trg.shape[0] if trg is not None else self.max_length
        trg_vocab_size = self.decoder.output_size

        # tensor to store decoder outputs
        outputs = torch.zeros(seq_len, batch_size, trg_vocab_size).to(self.device)

        # the encoder packs the padded sequences and therefore needs their lengths
        if src_len is None:
            src_len = self._source_lengths(src)

        # encoder_outputs is all hidden states of the input sequence, back and forwards
        # hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src, src_len)

        # first input to the decoder is the <sos> tokens
        decoder_input = torch.full((batch_size,), self.trg_sos_idx, dtype=torch.long).to(self.device)

        # create mask to ignore padding for attention mechanism
        mask = self.create_mask(src)

        for t in range(1, seq_len):
            # prediction = [batch size, output dim]
            prediction, hidden = self.decoder(decoder_input, hidden, encoder_outputs, mask)

            # place predictions in a tensor holding predictions for each token
            outputs[t] = prediction

            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio

            # get the highest predicted token from our predictions
            top1 = prediction.argmax(1)

            # if teacher forcing (and a target is available), use actual next
            # token as next input; if not, use predicted token
            decoder_input = trg[t] if (teacher_force and trg is not None) else top1

        return outputs

    def predict(self, src):
        """
        Predict target tokens for a batch of source sequences.

        With beam_size == 1 this is a greedy search over the whole batch; with
        a larger beam, every sample is decoded on its own by
        topk_decode_single. Gradients are not tracked. The train/eval mode of
        the module is left as it is.

        Arguments:
            src{tensor[src len, batch size]} -- tensor of the source tokens

        Return:
            prediction{tensor[max seq len, batch size]} -- predicted token values
        """
        with torch.no_grad():
            if self.beam_size == 1:
                # beam size 1 is greedy search
                # outputs = [max seq len, batch size, trg vocab len]
                outputs = self.forward(src, teacher_forcing_ratio=0)
                # the vocabulary is the last dimension: tokens = [max seq len, batch size]
                tokens = outputs.argmax(2)
                # position 0 is never predicted; mark it as <sos> like the beam path does
                tokens[0] = self.trg_sos_idx
                return tokens

            # beam search
            batch_size = src.shape[1]

            # encoder_outputs = [src len, batch size, enc hid dim * directions]
            # hidden = [batch size, dec hid dim]
            encoder_outputs, hidden = self.encoder(src, self._source_lengths(src))

            # create mask to ignore padding for attention mechanism
            mask = self.create_mask(src)

            # start from all-<pad> so shorter hypotheses stay padded
            predictions = torch.full((self.max_length, batch_size), self.trg_pad_idx,
                                     dtype=torch.long).to(self.device)

            for b in range(batch_size):
                predictions[:, b] = self.topk_decode_single(encoder_outputs[:, b], hidden[b], mask[b])

            return predictions

    def _token_bonus(self, prev_token, token):
        """
        Score adjustment for `token` given the token decoded just before it:
        a large penalty for an immediately repeated token (excluding <eos>
        and <pad>) and a small penalty for index 0 (presumably <unk> -- confirm).
        """
        bonus = 0.0
        if token == prev_token and token not in (self.trg_eos_idx, self.trg_pad_idx):
            bonus -= 1000.0
        if token == 0:
            bonus -= 5.0
        return bonus

    def topk_decode_single(self, encoder_outputs, hidden, mask):
        """
        Beam search decoding of a single sample.

        Arguments:
            encoder_outputs{tensor[src len, enc hid dim * directions]}
            hidden{tensor[dec hid dim]} -- initial decoder hidden state
            mask{tensor[src len]} -- False at padded source positions

        Return:
            tensor[max length] -- tokens of the best hypothesis (starting with
                <sos>), right-padded with the target <pad> index
        """
        # add the batch dimension (size 1) that the decoder expects
        encoder_outputs = encoder_outputs.unsqueeze(1)   # [src len, 1, enc hid dim * directions]
        hidden = hidden.unsqueeze(0)                     # [1, dec hid dim]
        mask = mask.unsqueeze(0)                         # [1, src len]

        # a beam is (score, decoded tokens, decoder hidden state)
        beams = [(0.0, [self.trg_sos_idx], hidden)]
        # beams that have reached <eos>
        finished = []

        for _ in range(1, self.max_length):
            candidates = []
            for score, tokens, state in beams:
                decoder_input = torch.tensor([tokens[-1]], dtype=torch.long).to(self.device)
                # log_probs = [1, output dim]
                log_probs, next_state = self.decoder(decoder_input, state, encoder_outputs, mask)
                width = min(self.beam_size, log_probs.shape[1])
                top_log_probs, top_indices = log_probs.squeeze(0).topk(width)
                for log_prob, token_idx in zip(top_log_probs.tolist(), top_indices.tolist()):
                    step_score = log_prob + self._token_bonus(tokens[-1], token_idx)
                    candidates.append((score + step_score, tokens + [token_idx], next_state))

            # keep the best `beam_size` expansions; those ending in <eos> are set aside
            candidates.sort(key=lambda candidate: candidate[0], reverse=True)
            beams = []
            for candidate in candidates[:self.beam_size]:
                if candidate[1][-1] == self.trg_eos_idx:
                    finished.append(candidate)
                else:
                    beams.append(candidate)

            if not beams:
                break

        # hypotheses that never produced <eos> still compete for the best score
        finished.extend(beams)
        best_tokens = max(finished, key=lambda candidate: candidate[0])[1][:self.max_length]
        padded = best_tokens + [self.trg_pad_idx] * (self.max_length - len(best_tokens))
        return torch.tensor(padded, dtype=torch.long).to(self.device)

In [None]:
def init_weights(m):
    """
    Initialise the parameters of module `m` in place: parameters whose name
    contains 'weight' are drawn from N(0, 0.01^2), all others are set to 0.
    """
    for name, param in m.named_parameters():
        values = param.data
        if 'weight' not in name:
            nn.init.constant_(values, 0)
            continue
        nn.init.normal_(values, mean=0, std=0.01)

In [None]:
def count_parameters(model):
    """Return the total number of elements in the parameters of `model` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

## Training and Evaluation

In [None]:
def create_checkpoint(name, model, source, target, score):
    """
    Save a checkpoint to '<name>.pth'.

    The file holds the model's state dict, the vocabularies of the `source`
    and `target` fields, and the score reached by the model.
    """
    destination = f'{name}.pth'
    torch.save(
        dict(
            model_state_dict=model.state_dict(),
            source=source.vocab,
            target=target.vocab,
            score=score,
        ),
        destination,
    )

In [None]:
class ModelTrainer:
    """Train/validate loop for a seq2seq model that checkpoints the best epoch.

    Attributes:
        model: network being trained; called as ``model(src, trg, teacher_forcing_ratio)``.
        optim: optimizer stepped once per training batch.
        crit: loss function applied to the flattened outputs and targets.
        src, trg: source/target field objects; only forwarded to ``create_checkpoint``.
        path: directory in which checkpoints are written.
        early_stopping: when True, ``fit`` stops after ``patience`` epochs
            without improvement; when False, ``patience`` has no effect.
    """

    def __init__(self, model, optimizer, criterion, source, target, path='.', early_stopping=False):
        self.model = model
        self.optim = optimizer
        self.crit = criterion
        self.src = source
        self.trg = target
        self.path = path
        self.early_stopping = early_stopping

    def fit(self, train_iterator, valid_iterator, num_epochs=10, run_name='train', patience=7, delta=0,
            verbose=True, clip=1, teacher_forcing_ratio=0.2):
        """Train for up to ``num_epochs`` epochs, saving a checkpoint on each improvement.

        Args:
            train_iterator: iterable of training batches (must support ``len``).
            valid_iterator: iterable of validation batches (must support ``len``).
            num_epochs: maximum number of epochs to run.
            run_name: checkpoint file stem, joined onto ``self.path``.
            patience: epochs without improvement tolerated before stopping
                (only honoured when ``self.early_stopping`` is True).
            delta: minimum decrease in validation loss that counts as an improvement.
            verbose: show one tqdm progress bar per epoch.
            clip: max gradient norm used for clipping.
            teacher_forcing_ratio: teacher-forcing ratio passed to the model while training.

        Returns:
            ``(epoch_train_loss, epoch_valid_loss)``: two lists with one mean loss per epoch run.
        """
        epoch_train_loss = []
        epoch_valid_loss = []
        best_valid_loss = float('inf')

        # Number of consecutive epochs without a (delta-sized) improvement.
        stale_epochs = 0
        for epoch in range(1, num_epochs + 1):
            pbar = None
            if verbose:
                pbar = tqdm(total=len(train_iterator), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}', unit=' batches', ncols=200)

            try:
                tr_loss = self._train(train_iterator, epoch, clip, pbar, teacher_forcing_ratio)
                epoch_train_loss.append(tr_loss)

                val_loss = self.evaluate(valid_iterator)
                epoch_valid_loss.append(val_loss)

                if pbar is not None:
                    pbar.set_postfix(epoch=f" {epoch}, train loss= {round(tr_loss, 4)}, val loss= {round(val_loss, 4)}", refresh=False)
            finally:
                # Close the bar even if the epoch is interrupted (e.g. KeyboardInterrupt).
                if pbar is not None:
                    pbar.close()

            # An epoch only counts as an improvement when it beats the best loss by
            # at least `delta`; otherwise a slightly worse model would overwrite the
            # best checkpoint and reset the patience counter.
            if val_loss < best_valid_loss - delta:
                best_valid_loss = val_loss
                stale_epochs = 0
                create_checkpoint(join(self.path, run_name), self.model, self.src, self.trg, best_valid_loss)
            else:
                stale_epochs += 1

            # If we've seen no improvement for [patience] epochs, stop training
            if self.early_stopping and stale_epochs >= patience:
                break

        return epoch_train_loss, epoch_valid_loss

    def _batch_loss(self, batch, teacher_forcing_ratio):
        """Run the model on one batch and return the loss tensor.

        Reads ``batch.src`` and ``batch.trg``. Per the original notes these are
        assumed to be ``[sentence length, batch size]`` and the model output
        ``[trg len, batch size, output dim]`` (TODO confirm against the model).
        """
        src = batch.src
        trg = batch.trg

        output = self.model(src, trg, teacher_forcing_ratio)
        output_dim = output.shape[-1]

        # Discard the first position (always <sos>) and flatten for the criterion.
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        return self.crit(output, trg)

    def _train(self, iterator, epoch, clip, pbar=None, teacher_forcing_ratio=0.2):
        """Run one training epoch and return the mean batch loss.

        Args:
            iterator: iterable of batches (must support ``len``).
            epoch: epoch number, used only for the progress-bar label.
            clip: max gradient norm used for clipping.
            pbar: optional tqdm bar updated once per batch.
            teacher_forcing_ratio: teacher-forcing ratio passed to the model.
        """
        self.model.train()

        epoch_loss = 0

        for i, batch in enumerate(iterator):
            self.optim.zero_grad()

            loss = self._batch_loss(batch, teacher_forcing_ratio)
            loss.backward()

            # Gradient clipping for stability
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip)

            self.optim.step()

            epoch_loss += loss.item()

            # Explicit None check: tqdm's truthiness depends on its `total`.
            if pbar is not None:
                pbar.set_postfix(epoch=f" {epoch}, train loss= {round(epoch_loss / (i+1), 4)}", refresh=True)
                pbar.update()

        return epoch_loss / len(iterator)

    def evaluate(self, iterator):
        """Return the mean batch loss over ``iterator`` without updating the model.

        The model is switched to eval mode and called with a teacher-forcing
        ratio of 0 under ``torch.no_grad()``.
        """
        self.model.eval()

        epoch_loss = 0

        with torch.no_grad():
            for batch in iterator:
                epoch_loss += self._batch_loss(batch, 0).item()

        return epoch_loss / len(iterator)

    def predict(self, iterator):
        """Placeholder; not implemented yet (returns None)."""
        pass

In [None]:
def create_encoder_decoder_model(source, target, embedding_dim, hidden_dim, max_seq_len, num_layers, dropout=0.0, bidirectional=False):
    """Assemble an attention-based encoder/decoder ``Seq2Seq`` model on the available device.

    Args:
        source: source-side field; ``len(source.vocab)`` is the encoder's input dimension.
        target: target-side field; ``len(target.vocab)`` is the decoder's output dimension
            and ``target.vocab.stoi[INIT_TOKEN]`` supplies the start-token index.
        embedding_dim: forwarded to both the encoder and decoder constructors.
        hidden_dim: forwarded to the encoder, decoder and attention constructors.
        max_seq_len: forwarded to ``Seq2Seq``.
        num_layers: forwarded to the encoder only; the decoder is always built with 1.
        dropout: forwarded to both the encoder and decoder constructors.
        bidirectional: forwarded to the encoder; when True the encoder-side hidden
            width handed to the attention and decoder is ``2 * hidden_dim``.

    Returns:
        The ``Seq2Seq`` instance moved to CUDA when available, otherwise CPU.
    """
    input_dim = len(source.vocab)
    output_dim = len(target.vocab)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # A bidirectional encoder exposes forward and backward states side by side.
    encoder_hidden_dims = hidden_dim * (2 if bidirectional else 1)

    # Instantiate models
    attention = Attention(encoder_hidden_dims, hidden_dim)
    enc = EncoderRNN(input_dim, embedding_dim, hidden_dim, num_layers, bidirectional, dropout)
    dec = DecoderWithAttention(output_dim, embedding_dim, encoder_hidden_dims, hidden_dim, 1, dropout, attention)

    sos_token_idx = target.vocab.stoi[INIT_TOKEN]

    return Seq2Seq(enc, dec, device, sos_token_idx, max_seq_len).to(device)

In [None]:
# Build the seq2seq model. Positional arguments, in order of the factory's
# signature: embedding_dim=512, hidden_dim=1024, max_seq_len=MAX_SEQUENCE_LENGTH+2,
# num_layers=2, dropout=0.5, bidirectional=True.
# NOTE(review): the "+2" presumably leaves room for the start/end tokens; confirm.
model = create_encoder_decoder_model(SRC, TRG, 512, 1024, MAX_SEQUENCE_LENGTH+2, 2, 0.5, True)

# Initialize Adam with default parameters
optimizer = optim.Adam(model.parameters())

# Make the loss ignore padding positions in the target.
# The criterion is NLLLoss (not CrossEntropyLoss), which expects log-probabilities
# as input; assumes the decoder already applies log_softmax, TODO confirm.
TARGET_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.NLLLoss(ignore_index=TARGET_PAD_IDX)

# Initialize the weight values.
# Module.apply visits every submodule and init_weights itself walks
# named_parameters(), so nested parameters are re-initialized more than once;
# the final values still follow the same initialization scheme.
model.apply(init_weights)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 80,788,002 trainable parameters


In [None]:
# begin training
# Checkpoints are written to SAVE_PATH (relative to the current working directory).
SAVE_PATH = "models"
!mkdir -p "{SAVE_PATH}" 
# NOTE(review): the last argument (early_stopping) is False, so the patience=10
# passed to fit() below is never consulted and training runs for up to 1000
# epochs unless interrupted. Pass True here if early stopping is intended.
trainer = ModelTrainer(model, optimizer, criterion, SRC, TRG, SAVE_PATH, False)


# Returns the per-epoch mean training and validation losses; the best model
# (lowest validation loss) is saved as "bbe-kjv.pth" under SAVE_PATH.
tr_loss, val_loss = trainer.fit(train_iterator, valid_iterator, 1000, "bbe-kjv", patience=10)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=305.0), HTML(value='')), layout=Layout(di…

KeyboardInterrupt: ignored

In [None]:
# Manually snapshot the model's *current* weights (as left after the interrupted
# run above) to models/bbe-kjv-early-stop.pth.
# NOTE(review): 1.29 is a hand-entered score, presumably the last observed
# validation loss; confirm before relying on it.
create_checkpoint("models/bbe-kjv-early-stop", model, SRC, TRG, 1.29)