In [1]:
## Imports

from hw import load_data, Encoder, Decoder, Seq2Seq
from transformer import Transformer
import string
import random
from typing import Tuple
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor
import math
import time
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [2]:
# Data and Constants
X_train, X_test, X_val, Y_train, Y_val, Y_test = load_data(100)

INPUT_DIM = len(X_train[0][0])
OUTPUT_DIM = len(Y_train[0][0])
ENC_EMB_DIM = 16
DEC_EMB_DIM = 16
HID_DIM = 32
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
NUM_LAYERS = 2
PAD_IDX = 0

LEARNING_RATE = 3e-4
BATCH = 32
EMB_DIM = 128
NUM_HEADS = 8
DROPOUT = 0.10
FORWARD_EXPANSION = 4
MAX_LEN = 40
criterion_lstm = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
criterion_transformer = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
N_EPOCHS = 10
CLIP = 1

Processed


In [4]:
## LSTM Stuff
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM,
                  NUM_LAYERS, ENC_DROPOUT)

dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM,
                  NUM_LAYERS, DEC_DROPOUT)

lstm = Seq2Seq(enc, dec).to(device)



In [5]:
## Transformer
transformer = Transformer(
    EMB_DIM,
    INPUT_DIM,
    INPUT_DIM,
    PAD_IDX,
    NUM_HEADS,
    NUM_LAYERS,
    NUM_LAYERS,
    FORWARD_EXPANSION,
    DROPOUT,
    MAX_LEN,
    device,
).to(device)

In [6]:
def init_weights(m: nn.Module):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

In [13]:
## LSTM Training
def train(model: nn.Module,
          X: list,
          Y: list,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):

    model.train()

    epoch_loss = 0

    for i, batch in enumerate(X):

        src = X[i]
        trg = Y[i]

        optimizer.zero_grad()

        output = model(src, trg)

        output = output.view(-1, output.shape[-1])
        
        trg = trg.view(-1)
        
        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(X)

In [8]:
# LSTM Evaluate
def evaluate(model: nn.Module,
             X: list,
             Y: list,
             criterion: nn.Module):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for i, batch in enumerate(X):

            src = X[i]
            trg = Y[i]

            output = model(src, trg, 0)  

            output = output.view(-1, output.shape[-1])
            trg = trg.view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(X)


def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [14]:
# Optimizers
lstm.apply(init_weights)

optimizer_lstm = optim.Adam(lstm.parameters(),lr=LEARNING_RATE)
optimizer_transformer = optim.Adam(transformer.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer_transformer, factor=0.1, patience=10, verbose=True
)

best_valid_loss = float('inf')

In [15]:
# Train LSTM
for epoch in range(N_EPOCHS):

        start_time = time.time()

        train_loss = train(lstm, X_train, Y_train, optimizer_lstm, criterion_lstm, CLIP)
        valid_loss = evaluate(lstm, X_val, Y_val, criterion_lstm)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

test_loss = evaluate(lstm, X_test, Y_test, criterion_lstm)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')


Epoch: 01 | Time: 0m 5s
	Train Loss: 3.228 | Train PPL:  25.226
	 Val. Loss: 2.902 |  Val. PPL:  18.207
Epoch: 02 | Time: 0m 6s
	Train Loss: 1.268 | Train PPL:   3.553
	 Val. Loss: 0.526 |  Val. PPL:   1.692
Epoch: 03 | Time: 0m 4s
	Train Loss: 0.374 | Train PPL:   1.453
	 Val. Loss: 0.294 |  Val. PPL:   1.342
Epoch: 04 | Time: 0m 4s
	Train Loss: 0.273 | Train PPL:   1.314
	 Val. Loss: 0.258 |  Val. PPL:   1.295
Epoch: 05 | Time: 0m 5s
	Train Loss: 0.251 | Train PPL:   1.286
	 Val. Loss: 0.245 |  Val. PPL:   1.277
Epoch: 06 | Time: 0m 4s
	Train Loss: 0.241 | Train PPL:   1.273
	 Val. Loss: 0.238 |  Val. PPL:   1.269
Epoch: 07 | Time: 0m 4s
	Train Loss: 0.236 | Train PPL:   1.266
	 Val. Loss: 0.234 |  Val. PPL:   1.263
Epoch: 08 | Time: 0m 4s
	Train Loss: 0.232 | Train PPL:   1.261
	 Val. Loss: 0.231 |  Val. PPL:   1.260
Epoch: 09 | Time: 0m 4s
	Train Loss: 0.230 | Train PPL:   1.258
	 Val. Loss: 0.229 |  Val. PPL:   1.257
Epoch: 10 | Time: 0m 4s
	Train Loss: 0.228 | Train PPL:   1.256


In [11]:
# Train Transformer

for epoch in range(N_EPOCHS):
    print(f"[Epoch {epoch} / {N_EPOCHS}]")

    transformer.eval()

    transformer.train()
    losses = []

    for batch_idx, batch in enumerate(X_train):
        
        inp_data = batch
        target = Y_train[batch_idx]

        output = transformer(inp_data, target)

        output = output.view(-1, output.shape[-1])
        
        target = target.view(-1) 

        loss = criterion_transformer(output, target)
        losses.append(loss.item())

        loss.backward()

        torch.nn.utils.clip_grad_norm_(transformer.parameters(), max_norm=1)

        # Gradient descent step
        optimizer_transformer.step()

    mean_loss = sum(losses) / len(losses)
    
    scheduler.step(mean_loss)

[Epoch 0 / 10]
[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
[Epoch 1 / 10]
[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
[Epoch 2 / 10]
[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

In [25]:
# translator
def translate(model, english, pig_latin, device, max_length=50):
    alphabet = list(string.ascii_lowercase)
    alphadict = {}
    numdict = {}
    for i, a in enumerate(alphabet):
        alphadict.update({a: i+1})
        numdict.update({i+1: a})
    english = torch.as_tensor(to_vector(english, alphadict,len(english)))
    pig_latin = to_vector(pig_latin, alphadict, len(pig_latin))
    with torch.no_grad():
        hidden, cell = model.encoder(english.unsqueeze(1).to(device))
    outputs = [0]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)
        
        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
        best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == 0:
            break
    print(outputs)
    translated_sentence = [to_letter(oh, numdict) for oh in outputs]

    # remove start token
    return translated_sentence

In [17]:
def to_vector(word, alphadict, max_length):
    vec = []
    for letter in word:
        idx = alphadict[letter]
        vec.append(one_hot(idx))
    while len(vec) < max_length:
        vec.append(one_hot(0))
    return vec

def one_hot(index):
    vec = 27 * [0]
    vec[index] = 1
    return vec

def to_letter(oh, numdict):
    for i, on in enumerate(oh):
        if on == 1:
            return numdict[i]
    return ''

In [27]:
# Example
# 
translate(lstm, 'apple', 'appleway', device, max_length=50)

[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


TypeError: 'int' object is not iterable