In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from Utils import plot_graphs, plot_attention_heatmaps
import wandb
from SequenceLearning import *

In [None]:
# Settings shared by the vanilla and the attention experiments.
MAX_LEN = 35  # used below as padding_upper_bound / max_length in the attention cells
batch_size = 128

# Vocabulary-derived sizes. Both counts are presumably provided by the
# `SequenceLearning` star import -- TODO confirm.
input_size_encoder = english_char_count
input_size_decoder = output_size = target_char_count

# Without Attention

In [None]:
# Load the three splits for the model without attention (no padding bound given).
# NOTE(review): num_batches=1 looks like a quick-run setting -- confirm before a full experiment.
train, valid, test = (
    data.load_data(split, batch_size, num_batches=1)
    for split in ("train", "valid", "test")
)

## Sweep

In [None]:
wandb.login()

# Hyperparameters sampled from a fixed list of options.
categorical_choices = {
    "learning_rate": [1e-2, 1e-3],
    "encoder_layers": [1, 2, 3],
    "decoder_layers": [1, 2, 3],
    "hidden_size": [64, 128, 256],
    "rnn": ["LSTM", "GRU", "RNN"],
    "bi_directional": [True, False],
    "dropout": [0.2, 0.3],
    "embedding_size": [32, 64, 128, 256],
}

# Bayesian search that maximizes the "accuracy" value logged by `tune_rnn`.
sweep_config = {
    "method": "bayes",
    "metric": {"name": "accuracy", "goal": "maximize"},
    "parameters": {
        # The epoch count is the only range-valued parameter.
        "num_epochs": {"min": 7, "max": 18},
        **{key: {"values": options} for key, options in categorical_choices.items()},
    },
}

def tune_rnn():
    """Run one sweep trial for the seq2seq model without attention.

    Builds an ``Encoder``/``Decoder`` pair from ``wandb.config``, trains it with
    ``Seq2Seq.learn`` on the notebook-level ``train`` and ``valid`` data, names
    the run after its main hyperparameters and logs the per-epoch curves.
    The sweep metric ``accuracy`` (the last validation accuracy) is logged once,
    after the per-epoch history.

    Raises:
        ValueError: if ``wandb.config.rnn`` is not "LSTM", "GRU" or "RNN".
    """
    wandb.init()
    cfg = wandb.config

    # Map the sweep's string choice to the recurrent layer class; fail loudly on
    # an unknown value instead of hitting an unbound local further down.
    rnn_classes = {"LSTM": nn.LSTM, "GRU": nn.GRU, "RNN": nn.RNN}
    if cfg.rnn not in rnn_classes:
        raise ValueError(f"Unsupported rnn type: {cfg.rnn!r}")
    rnn = rnn_classes[cfg.rnn]

    enc = Encoder(english_char_count, cfg.embedding_size, cfg.hidden_size,
                  num_layers=cfg.encoder_layers,
                  bi_dir=cfg.bi_directional,
                  p=cfg.dropout,
                  rnn_class=rnn).to(device)

    dec = Decoder(target_char_count, cfg.embedding_size, cfg.hidden_size, target_char_count,
                  num_layers=cfg.decoder_layers,
                  bi_dir=cfg.bi_directional,
                  p=cfg.dropout,
                  rnn_class=rnn).to(device)

    mod = Seq2Seq(enc, dec).to(device)

    optimizer = optim.Adam(mod.parameters(), lr=cfg.learning_rate)
    tr_loss, val_loss, tr_acc, val_acc = mod.learn(train, valid, cfg.num_epochs, optimizer)

    # Human-readable run name: "<enc>_enc_<dec>_dec_<hidden>_hs_[bidir_][attn_][early_stop]".
    name = f"{cfg.encoder_layers}_enc_{cfg.decoder_layers}_dec_{cfg.hidden_size}_hs_"
    if cfg.bi_directional:
        name += "bidir_"
    if dec.used_attn:
        name += "attn_"
    # Fewer recorded epochs than requested is treated as an early stop.
    if len(tr_loss) != cfg.num_epochs:
        name += "early_stop"
    wandb.run.name = name

    for i in range(len(tr_loss)):
        wandb.log({"tr_loss": tr_loss[i],
                   "tr_acc": tr_acc[i],
                   "val_loss": val_loss[i],
                   "val_acc": val_acc[i],
                   "epoch": (i + 1)})

    # Log the sweep objective once, after the history, rather than once per epoch.
    if len(val_acc) > 0:
        wandb.log({"accuracy": val_acc[-1]})

# Register the sweep, then let an agent in this kernel call `tune_rnn` for each trial.
# NOTE(review): no `count` is passed to the agent -- confirm how this sweep is meant to stop.
sweep_id = wandb.sweep(sweep_config, project="CS6910_Assignment_3")
wandb.agent(sweep_id, function=tune_rnn)
wandb.finish()

## Best Model

In [None]:
# Training schedule for the vanilla model.
# NOTE(review): 2 epochs is below the sweep's 7-18 range -- confirm this is intended.
num_epochs, learning_rate = 2, 0.001

# Architecture of the vanilla model.
rnn = nn.LSTM
bi_directional = True
hidden_size = 256
embedding_size = 32
encoder_layers, decoder_layers = 3, 2
enc_dropout = dec_dropout = 0.3

In [None]:
# Build the vanilla encoder/decoder from the hyperparameters chosen above
# and move each module to `device`.
enc = Encoder(
    english_char_count,
    embedding_size,
    hidden_size,
    rnn_class=rnn,
    num_layers=encoder_layers,
    bi_dir=bi_directional,
    p=enc_dropout,
).to(device)

dec = Decoder(
    target_char_count,
    embedding_size,
    hidden_size,
    target_char_count,
    rnn_class=rnn,
    num_layers=decoder_layers,
    bi_dir=bi_directional,
    p=dec_dropout,
).to(device)

# Wrap both halves in the seq2seq model and optimize all of its parameters with Adam.
model = Seq2Seq(enc, dec).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Train the vanilla model, then plot the four histories returned by `learn`.
history = model.learn(train, valid, num_epochs, optimizer)
loss, val_loss, acc, val_acc = history
fig = plot_graphs(loss, val_loss, acc, val_acc)

## Testing

In [None]:
# Evaluate the vanilla model on the test split and store its predictions.
# NOTE(review): this is the model WITHOUT attention, yet the predictions go to
# "predictions_attention/" -- confirm the target folder is the intended one.
predictions_csv = "predictions_attention/results.csv"
test_loss, test_acc = model.calc_evaluation_metrics(
    test,
    path_to_store_predictions=predictions_csv,
)
print(f"Test dataset loss: {test_loss:.2f} \nAccuracy: {test_acc}")

# With Attention

In [None]:
# Reload the three splits for the attention model, this time passing MAX_LEN
# as the padding upper bound.
# NOTE(review): num_batches=1 looks like a quick-run setting -- confirm before a full experiment.
train, valid, test = (
    data.load_data(split, batch_size, padding_upper_bound=MAX_LEN, num_batches=1)
    for split in ("train", "valid", "test")
)

## Sweep

In [None]:
wandb.login()

# Hyperparameters sampled from a fixed list of options.
categorical_choices = {
    "learning_rate": [1e-2, 1e-3],
    "encoder_layers": [1, 2, 3],
    "decoder_layers": [1, 2, 3],
    "hidden_size": [64, 128, 256],
    "rnn": ["LSTM", "GRU", "RNN"],
    "bi_directional": [True, False],
    "dropout": [0.2, 0.3],
    "embedding_size": [32, 64, 128, 256],
}

# Bayesian search that maximizes the "accuracy" value logged by `tune_rnn`.
sweep_config = {
    "method": "bayes",
    "metric": {"name": "accuracy", "goal": "maximize"},
    "parameters": {
        # The epoch count is the only range-valued parameter.
        "num_epochs": {"min": 7, "max": 20},
        **{key: {"values": options} for key, options in categorical_choices.items()},
    },
}

def tune_rnn():
    """Run one sweep trial for the seq2seq model with an attention decoder.

    Builds an ``Encoder``/``AttnDecoder`` pair from ``wandb.config``, trains it
    with ``Seq2Seq.learn`` on the notebook-level ``train`` and ``valid`` data,
    names the run after its main hyperparameters and logs the per-epoch curves.
    The sweep metric ``accuracy`` (the last validation accuracy) is logged once,
    after the per-epoch history.

    Note: this definition replaces the ``tune_rnn`` defined in the earlier
    (vanilla) sweep cell once this cell has been executed.

    Raises:
        ValueError: if ``wandb.config.rnn`` is not "LSTM", "GRU" or "RNN".
    """
    wandb.init()
    cfg = wandb.config

    # Map the sweep's string choice to the recurrent layer class; fail loudly on
    # an unknown value instead of hitting an unbound local further down.
    rnn_classes = {"LSTM": nn.LSTM, "GRU": nn.GRU, "RNN": nn.RNN}
    if cfg.rnn not in rnn_classes:
        raise ValueError(f"Unsupported rnn type: {cfg.rnn!r}")
    rnn = rnn_classes[cfg.rnn]

    enc = Encoder(english_char_count, cfg.embedding_size, cfg.hidden_size,
                  num_layers=cfg.encoder_layers,
                  bi_dir=cfg.bi_directional,
                  p=cfg.dropout,
                  rnn_class=rnn).to(device)

    dec = AttnDecoder(cfg.embedding_size, cfg.hidden_size, output_size=target_char_count,
                      num_layers=cfg.decoder_layers,
                      dropout_p=cfg.dropout,
                      bi_dir=cfg.bi_directional,
                      rnn_class=rnn,
                      max_length=MAX_LEN).to(device)

    mod = Seq2Seq(enc, dec).to(device)

    optimizer = optim.Adam(mod.parameters(), lr=cfg.learning_rate)
    tr_loss, val_loss, tr_acc, val_acc = mod.learn(train, valid, cfg.num_epochs, optimizer)

    # Human-readable run name: "<enc>_enc_<dec>_dec_<hidden>_hs_[bidir_][attn_][early_stop]".
    name = f"{cfg.encoder_layers}_enc_{cfg.decoder_layers}_dec_{cfg.hidden_size}_hs_"
    if cfg.bi_directional:
        name += "bidir_"
    if dec.used_attn:
        name += "attn_"
    # Fewer recorded epochs than requested is treated as an early stop.
    if len(tr_loss) != cfg.num_epochs:
        name += "early_stop"
    wandb.run.name = name

    for i in range(len(tr_loss)):
        wandb.log({"tr_loss": tr_loss[i],
                   "tr_acc": tr_acc[i],
                   "val_loss": val_loss[i],
                   "val_acc": val_acc[i],
                   "epoch": (i + 1)})

    # Log the sweep objective once, after the history, rather than once per epoch.
    if len(val_acc) > 0:
        wandb.log({"accuracy": val_acc[-1]})

# Register the attention sweep, then let an agent in this kernel call `tune_rnn` for each trial.
# NOTE(review): no `count` is passed to the agent -- confirm how this sweep is meant to stop.
sweep_id = wandb.sweep(sweep_config, project="CS6910_Assignment_3_Attn")
wandb.agent(sweep_id, function=tune_rnn)
wandb.finish()

## Best Model

In [None]:
# Training schedule for the attention model.
# NOTE(review): 3 epochs is below the sweep's 7-20 range -- confirm this is intended.
num_epochs, learning_rate = 3, 0.001

# Architecture of the attention model.
rnn = nn.LSTM
bi_directional = True
hidden_size = 256
embedding_size = 128
encoder_layers, decoder_layers = 3, 2
enc_dropout = dec_dropout = 0.2

In [None]:
# Build the attention model from the hyperparameters chosen above and move
# each module to `device`.
enc = Encoder(english_char_count,
              embedding_size, hidden_size,
              num_layers=encoder_layers,
              bi_dir=bi_directional,
              p=enc_dropout,
              rnn_class=rnn).to(device)

# `dropout_p` is passed explicitly so the decoder uses the configured
# `dec_dropout` (as the sweep's `tune_rnn` does) instead of AttnDecoder's default.
dec = AttnDecoder(embedding_size, hidden_size, output_size=target_char_count,
                  num_layers=decoder_layers,
                  dropout_p=dec_dropout,
                  bi_dir=bi_directional,
                  rnn_class=rnn,
                  max_length=MAX_LEN).to(device)

attn_model = Seq2Seq(enc, dec).to(device)

optimizer = optim.Adam(attn_model.parameters(), lr=learning_rate)

In [None]:
# Train the attention model and keep the four histories returned by `learn`.
attn_history = attn_model.learn(train, valid, num_epochs, optimizer)
loss, val_loss, acc, val_acc = attn_history

## Visualizing attention heatmaps

In [None]:
# Take the first (source, target) pair from a small test load and move both to `device`.
sample_test_data = data.load_data(
    "test",
    batch_size=12,
    padding_upper_bound=MAX_LEN,
    num_batches=1,
)
src, tar = (item.to(device) for item in sample_test_data[0])

# Decode both sides back to strings, then fetch the attention matrix for the pair.
src_strings = data.tensor_to_string(src, string_type="source")
tar_strings = data.tensor_to_string(tar, string_type="target")
attn = attn_model.get_attention_matrix(src, tar)

fig = plot_attention_heatmaps(attn, src_strings, tar_strings)

## Testing

In [None]:
# Evaluate the attention model on the test split.
# No `path_to_store_predictions` is passed in this call.
attn_metrics = attn_model.calc_evaluation_metrics(test)
test_loss, test_acc = attn_metrics
print(f"Test dataset loss: {test_loss:.2f} \nAccuracy: {test_acc}")