# Model 1: Attention TRAIN

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle

import help_functions
import data_processor
import attention_models
import better_attention_models

## STRUCTURE
3. Build the model with embedding and Attention.
5. Train on the data.

## 3. Build the model

### Get the data

In [2]:
def load_pickle_data(filename):
    """Return the object stored in the pickle file at `filename`.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filename, mode="rb") as pickle_stream:
        loaded_object = pickle.load(pickle_stream)
    return loaded_object

In [3]:
# Load the pre-split train/test inputs, labels and the vocabulary from saved_data/.
# NOTE(review): the following cell assigns X_train, X_test, y_train, y_test and
# voc again from attention_data/, so when the notebook is run top to bottom the
# values loaded here are overwritten before they are used.
X_train = load_pickle_data("saved_data/splitted_X_train_eq.pickle")
X_test = load_pickle_data("saved_data/splitted_X_test_eq.pickle")
y_train = load_pickle_data("saved_data/splitted_y_train_eq.pickle")
y_test = load_pickle_data("saved_data/splitted_y_test_eq.pickle")
voc = load_pickle_data("saved_data/all_voc.pickle")

In [3]:
# Load the train/test inputs, labels, per-sample masks and the vocabulary from
# attention_data/. mask_train / mask_test are the masks later handed to
# train_with_mask alongside the inputs.
X_train = load_pickle_data("attention_data/splitted_with_mask_X_train.pickle")
X_test = load_pickle_data("attention_data/splitted_with_mask_X_test.pickle")
y_train = load_pickle_data("attention_data/splitted_with_mask_y_train.pickle")
y_test = load_pickle_data("attention_data/splitted_with_mask_y_test.pickle")
mask_train = load_pickle_data("attention_data/splitted_with_mask_mask_train.pickle")
mask_test = load_pickle_data("attention_data/splitted_with_mask_mask_test.pickle")
voc = load_pickle_data("attention_data/splitted_with_mask_voc.pickle")

In [4]:
# Sequence length is read off the first training sample only.
# assumes every sample is already padded to this same length — TODO confirm
max_sequence_length = len(X_train[0])

In [5]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [6]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print('Using', device)

Using cpu


In [7]:
# Summarise the dataset sizes and the per-sample sequence length in one call;
# the empty string produces the blank separator line.
print(
    f'Number of training samples: {len(y_train)}',
    f'Number of test samples: {len(y_test)}',
    "",
    f'Sequence length per sample: {max_sequence_length}',
    sep="\n",
)

Number of training samples: 101905
Number of test samples: 50193

Sequence length per sample: 283


In [8]:
def batchify(x, y, batch_size, mask=None):
    """Yield shuffled mini-batches of `x` and `y` (and `mask`, if given).

    A fresh random permutation of the sample indices is drawn on every call,
    and every yielded tensor is moved to the notebook-level `device`.

    Only full batches are produced: when `len(x)` is not a multiple of
    `batch_size` the leftover samples are skipped for this pass, and nothing
    is yielded at all when `len(x) < batch_size`.

    Args:
        x: Tensor of inputs, indexed along its first dimension by sample.
        y: Tensor of targets, indexed with the same sample indices as `x`.
        batch_size: Number of samples in each yielded batch.
        mask: Optional tensor indexed like `x`. When provided, each yielded
            item is a 3-tuple `(x_batch, y_batch, mask_batch)` instead of a
            2-tuple.

    Yields:
        `(x_batch, y_batch)` or `(x_batch, y_batch, mask_batch)`.
    """
    random_indices = torch.randperm(len(x))
    for start in range(0, len(x) - batch_size + 1, batch_size):
        indices = random_indices[start:start + batch_size]
        batch = (x[indices].to(device), y[indices].to(device))
        if mask is not None:
            batch += (mask[indices].to(device),)
        yield batch

## Train the transformer!

In [10]:
def train(model, X_train, X_test, y_train, y_test, n_epochs=1, batch_size=100, lr=0.001, max_samples=None, weight_true=0.5):
    """Fit `model` as a binary classifier and print metrics after every epoch.

    Each epoch makes one pass over the training data (Adam updates on a
    class-weighted binary cross-entropy loss), then one pass over the test
    data with gradients disabled, and prints loss and accuracy for both.
    Batches are drawn with `batchify`.

    Args:
        model: Module called as `model(batch_x)`. Its output is passed straight
            to `F.binary_cross_entropy`, so it must be probabilities in [0, 1]
            with the same shape as the label batch.
        X_train, X_test: Input tensors for training and evaluation.
        y_train, y_test: Label tensors. Entries equal to 1 are weighted by
            `weight_true`, entries equal to 0 by `1 - weight_true`.
        n_epochs: Number of train + evaluate passes.
        batch_size: Samples per batch.
        lr: Adam learning rate.
        max_samples: If truthy, each training epoch stops once at least this
            many training samples have been used for parameter updates.
        weight_true: Loss weight of the positive class.

    Returns:
        The same `model` object, after training.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fun = F.binary_cross_entropy

    def run_epoch(x, y, training):
        """Make one pass over (x, y) and return (reported_loss, accuracy)."""
        loss_sum = 0.0
        n_correct = 0
        n_batches = 0
        n_seen = 0
        for bx, by in batchify(x, y, batch_size):
            n_batches += 1
            if training:
                optimizer.zero_grad()
            output = model(bx)
            # Per-sample weights: `weight_true` for label 1, the complement for label 0.
            sample_weight = by.eq(1) * weight_true + by.eq(0) * (1 - weight_true)
            # `.float()` keeps the target on the same device as the batch
            # (`.type(torch.FloatTensor)` would force it onto the CPU).
            loss = loss_fun(output, by.float(), weight=sample_weight)
            loss_sum += loss.item()
            # Round the probabilities to 0/1 before comparing with the integer
            # labels; comparing raw probabilities would almost never match.
            n_correct += output.round().eq(by).sum().item()
            if training:
                loss.backward()
                optimizer.step()
                n_seen += len(by)
                if max_samples and n_seen >= max_samples:
                    break
        n_samples = n_batches * batch_size
        if n_samples == 0:
            # No full batch was available (fewer samples than batch_size).
            return float('nan'), float('nan')
        # NOTE: the loss function already averages within a batch (default
        # reduction), so this reported loss is the mean batch loss divided by
        # batch_size. Kept as-is so numbers stay comparable with earlier runs.
        return loss_sum / n_samples, n_correct / n_samples

    for t in range(n_epochs):
        model.train()
        train_loss, train_acc = run_epoch(X_train, y_train, training=True)

        model.eval()
        with torch.no_grad():
            test_loss, test_acc = run_epoch(X_test, y_test, training=False)

        print(f'epoch {t} | train loss {train_loss} | train acc {train_acc} | validation loss {test_loss} | validation acc {test_acc}')

    return model

In [150]:
def train_with_mask(model, X_train, X_test, y_train, y_test, mask_train, mask_test, n_epochs=1, batch_size=100, lr=0.001, max_samples=None, weight_true=0.5, padding_index=None):
    """Fit a mask-aware `model` as a binary classifier and print metrics per epoch.

    Same procedure as a plain training loop (Adam, class-weighted binary
    cross-entropy, one training pass and one gradient-free evaluation pass per
    epoch), except that every batch also carries a mask that is handed to the
    model as its second positional argument. Batches are drawn with `batchify`.

    Args:
        model: Module called as `model(bx, bm)`, or, when `padding_index` is
            given, as `model(bx, bm, has_mask=True, padding_mask=...)`. Its
            output is passed straight to `F.binary_cross_entropy`, so it must
            be probabilities in [0, 1] with the same shape as the label batch.
        X_train, X_test: Input tensors for training and evaluation.
        y_train, y_test: Label tensors. Entries equal to 1 are weighted by
            `weight_true`, entries equal to 0 by `1 - weight_true`.
        mask_train, mask_test: Mask tensors batched together with the inputs.
            Presumably these mark the word position of interest in each
            sequence — verify against the model's forward().
        n_epochs: Number of train + evaluate passes.
        batch_size: Samples per batch.
        lr: Adam learning rate.
        max_samples: If truthy, each training epoch stops once at least this
            many training samples have been used for parameter updates.
        weight_true: Loss weight of the positive class.
        padding_index: If not None, a boolean `padding_mask` that is True where
            the input equals this index is built for each batch and passed to
            the model.

    Returns:
        The same `model` object, after training.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fun = F.binary_cross_entropy

    def forward_batch(bx, bm):
        """Call the model with or without a padding mask, per `padding_index`."""
        if padding_index is not None:
            padding_mask = (bx == padding_index)
            return model(bx, bm, has_mask=True, padding_mask=padding_mask)
        return model(bx, bm)

    def run_epoch(x, y, mask, training):
        """Make one pass over (x, y, mask) and return (reported_loss, accuracy)."""
        loss_sum = 0.0
        n_correct = 0
        n_batches = 0
        n_seen = 0
        for bx, by, bm in batchify(x, y, batch_size, mask):
            n_batches += 1
            if training:
                optimizer.zero_grad()
            output = forward_batch(bx, bm)
            # Per-sample weights: `weight_true` for label 1, the complement for label 0.
            sample_weight = by.eq(1) * weight_true + by.eq(0) * (1 - weight_true)
            # `.float()` keeps the target on the same device as the batch
            # (`.type(torch.FloatTensor)` would force it onto the CPU).
            loss = loss_fun(output, by.float(), weight=sample_weight)
            loss_sum += loss.item()
            # Round the probabilities to 0/1 before comparing with the labels.
            n_correct += output.round().eq(by).sum().item()
            if training:
                loss.backward()
                optimizer.step()
                n_seen += len(by)
                if max_samples and n_seen >= max_samples:
                    break
        n_samples = n_batches * batch_size
        if n_samples == 0:
            # No full batch was available (fewer samples than batch_size).
            return float('nan'), float('nan')
        # NOTE: the loss function already averages within a batch (default
        # reduction), so this reported loss is the mean batch loss divided by
        # batch_size. Kept as-is so numbers stay comparable with earlier runs.
        return loss_sum / n_samples, n_correct / n_samples

    for t in range(n_epochs):
        model.train()
        train_loss, train_acc = run_epoch(X_train, y_train, mask_train, training=True)

        model.eval()
        with torch.no_grad():
            test_loss, test_acc = run_epoch(X_test, y_test, mask_test, training=False)

        print(f'epoch {t} | train loss {train_loss} | train acc {train_acc} | validation loss {test_loss} | validation acc {test_acc}')

    return model

In [13]:
def get_data_subset(sub_percentage, X_train, X_test, y_train, y_test):
    """Return the leading slice of each train/test collection.

    `sub_percentage` is used as a fraction: the number of samples kept is
    `int(sub_percentage * len(y_train))` for the training pair and
    `int(sub_percentage * len(y_test))` for the test pair.

    Returns:
        `(X_train_sub, X_test_sub, y_train_sub, y_test_sub)`.
    """
    n_train = int(sub_percentage * len(y_train))
    n_test = int(sub_percentage * len(y_test))
    return X_train[:n_train], X_test[:n_test], y_train[:n_train], y_test[:n_test]

## TODO: handle masking of the padded input positions? (pad_token)

In [211]:
# Re-import the two local model modules so that edits made to their source
# files are picked up without restarting the kernel.
import importlib
importlib.reload(attention_models)
importlib.reload(better_attention_models)

<module 'attention_models' from '/Users/lovhag/Projects/dl4nlp_assignment_1/attention_models.py'>

In [216]:
# Pooling model from better_attention_models (embedding 16, 2 heads,
# dropout 0.2), trained for 20 epochs with a padding mask built from the
# ___PAD___ vocabulary index.
model = better_attention_models.MyAttentionModelWithPooling(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=2,
    dim_feedforward=8,
    num_layers=1,
    dropout=0.2,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=20,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
    padding_index=voc.stoi["___PAD___"],
)

epoch 0 | train loss 0.004919881224439213 | train acc 0.6069507598876953 | validation loss 0.004498981557102228 | validation acc 0.680086076259613
epoch 1 | train loss 0.004431075947979306 | train acc 0.6808358430862427 | validation loss 0.004439097912019898 | validation acc 0.6851283311843872
epoch 2 | train loss 0.004348251864199419 | train acc 0.6897966265678406 | validation loss 0.004372248389254495 | validation acc 0.6886559128761292
epoch 3 | train loss 0.00427969527298037 | train acc 0.6963430643081665 | validation loss 0.004361356068069914 | validation acc 0.6876395344734192
epoch 4 | train loss 0.004213247361333208 | train acc 0.7022809386253357 | validation loss 0.004348757522054758 | validation acc 0.6879384517669678
epoch 5 | train loss 0.0041504913783660296 | train acc 0.7061970233917236 | validation loss 0.004410814515276983 | validation acc 0.6881775856018066
epoch 6 | train loss 0.004082272804006419 | train acc 0.7113202810287476 | validation loss 0.004372191947423948 |

# PICKED 2 (train acc 0.76, val acc 0.68)

In [212]:
# "BothWaysTransposed" padding-mask variant (embedding 8, 2 heads,
# dropout 0.1): first 5 epochs, with the padding mask enabled.
model = attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndMaskedSkipCORRECTEDwithMaskOnPaddingBothWaysTransposed(
    vocab_size=len(voc),
    embedding_dim=8,
    max_seq_len=max_sequence_length,
    num_heads=2,
    dim_feedforward=8,
    num_layers=1,
    dropout=0.1,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=5,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
    padding_index=voc.stoi["___PAD___"],
)

epoch 0 | train loss 0.005084941087901387 | train acc 0.5771238803863525 | validation loss 0.004573788645627376 | validation acc 0.6524633169174194
epoch 1 | train loss 0.004545036815960971 | train acc 0.6644158363342285 | validation loss 0.004549594325778473 | validation acc 0.6423788070678711
epoch 2 | train loss 0.004422851363881421 | train acc 0.6838587522506714 | validation loss 0.004453614128430431 | validation acc 0.6788703799247742
epoch 3 | train loss 0.004319671781741822 | train acc 0.6950376629829407 | validation loss 0.00441636916485197 | validation acc 0.6827965378761292
epoch 4 | train loss 0.004250477465049799 | train acc 0.7027912735939026 | validation loss 0.0044119875840676415 | validation acc 0.6815210580825806


In [213]:
# Continue training the model currently bound to `trained_model` for 5 more
# epochs (padding mask enabled). Which model that is depends on the cell that
# was executed last.
trained_model = train_with_mask(
    trained_model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=5,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
    padding_index=voc.stoi["___PAD___"],
)

epoch 0 | train loss 0.004185397648275947 | train acc 0.7107608318328857 | validation loss 0.004489231606403708 | validation acc 0.6811822652816772
epoch 1 | train loss 0.004139450121607781 | train acc 0.7143334150314331 | validation loss 0.004438301369581106 | validation acc 0.68359375
epoch 2 | train loss 0.004084396385032001 | train acc 0.7202614545822144 | validation loss 0.004566717815967942 | validation acc 0.6885363459587097
epoch 3 | train loss 0.0040362114674673795 | train acc 0.7254239916801453 | validation loss 0.004452898099065797 | validation acc 0.6892139911651611
epoch 4 | train loss 0.003989203225958147 | train acc 0.7280248999595642 | validation loss 0.004495862365655164 | validation acc 0.6903300285339355


In [214]:
# Continue training the model currently bound to `trained_model` for 10 more
# epochs (padding mask enabled). Which model that is depends on the cell that
# was executed last.
trained_model = train_with_mask(
    trained_model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=10,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
    padding_index=voc.stoi["___PAD___"],
)

epoch 0 | train loss 0.003936590820617609 | train acc 0.7330107688903809 | validation loss 0.004581964254612103 | validation acc 0.6922632455825806
epoch 1 | train loss 0.003888784655622539 | train acc 0.7381536364555359 | validation loss 0.004579380857259301 | validation acc 0.6929010152816772
epoch 2 | train loss 0.003840530986035355 | train acc 0.7428058385848999 | validation loss 0.004635081197341847 | validation acc 0.6863042116165161
epoch 3 | train loss 0.0037981283749470105 | train acc 0.7464765310287476 | validation loss 0.004614710099805071 | validation acc 0.6934789419174194
epoch 4 | train loss 0.0037670996595564857 | train acc 0.7508244514465332 | validation loss 0.004577731046640333 | validation acc 0.6854870915412903
epoch 5 | train loss 0.0037207996163250477 | train acc 0.754475474357605 | validation loss 0.004702297632394796 | validation acc 0.6855867505073547
epoch 6 | train loss 0.0036821636743083054 | train acc 0.7591375112533569 | validation loss 0.0047113517716845

In [199]:
# Smaller "BothWaysTransposed" variant (embedding 4, 1 head, feed-forward 4).
# The recorded run was interrupted after two epochs with accuracy near 0.5.
model = attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndMaskedSkipCORRECTEDwithMaskOnPaddingBothWaysTransposed(
    vocab_size=len(voc),
    embedding_dim=4,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=4,
    num_layers=1,
    dropout=0.1,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=5,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
    padding_index=voc.stoi["___PAD___"],
)

epoch 0 | train loss 0.005421715691432956 | train acc 0.5005103349685669 | validation loss 0.005415573998292604 | validation acc 0.49705037474632263
epoch 1 | train loss 0.00541595330206799 | train acc 0.49923446774482727 | validation loss 0.005415838692878962 | validation acc 0.49968111515045166


KeyboardInterrupt: 

In [152]:
# "CORRECTEDwithMaskOnPadding" variant with 2 heads (embedding 8,
# dropout 0.1), 5 epochs, padding mask enabled.
model = attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndMaskedSkipCORRECTEDwithMaskOnPadding(
    vocab_size=len(voc),
    embedding_dim=8,
    max_seq_len=max_sequence_length,
    num_heads=2,
    dim_feedforward=8,
    num_layers=1,
    dropout=0.1,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=5,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
    padding_index=voc.stoi["___PAD___"],
)

epoch 0 | train loss 0.004632107675983453 | train acc 0.6567112803459167 | validation loss 0.0044574606358857675 | validation acc 0.6833147406578064
epoch 1 | train loss 0.004452804778585557 | train acc 0.6796678900718689 | validation loss 0.00444137543227229 | validation acc 0.6841318607330322
epoch 2 | train loss 0.004441115043000856 | train acc 0.681002676486969 | validation loss 0.004445498577575675 | validation acc 0.6828563213348389
epoch 3 | train loss 0.004436282550415315 | train acc 0.6821215748786926 | validation loss 0.004445788503995127 | validation acc 0.6831552982330322
epoch 4 | train loss 0.004435591586723914 | train acc 0.6818271279335022 | validation loss 0.004444650366631508 | validation acc 0.6842514276504517


In [151]:
# "CORRECTEDwithMaskOnPadding" variant with a single head (embedding 8,
# dropout 0.1), 5 epochs, padding mask enabled.
model = attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndMaskedSkipCORRECTEDwithMaskOnPadding(
    vocab_size=len(voc),
    embedding_dim=8,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=8,
    num_layers=1,
    dropout=0.1,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=5,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
    padding_index=voc.stoi["___PAD___"],
)

epoch 0 | train loss 0.004629464534839063 | train acc 0.6592042446136475 | validation loss 0.004451352655970757 | validation acc 0.6830357313156128
epoch 1 | train loss 0.004449036289427673 | train acc 0.6809830665588379 | validation loss 0.0044433740805121785 | validation acc 0.6823979616165161
epoch 2 | train loss 0.004441115954433376 | train acc 0.6824159622192383 | validation loss 0.004444128343697676 | validation acc 0.6827168464660645
epoch 3 | train loss 0.004436607050958637 | train acc 0.6819449067115784 | validation loss 0.004440417011358718 | validation acc 0.6838129758834839
epoch 4 | train loss 0.004435394586844313 | train acc 0.6823570728302002 | validation loss 0.0044453041111261644 | validation acc 0.6810028553009033


In [142]:
# "CORRECTED" masked-skip variant (embedding 8, 1 head, dropout 0.1),
# 5 epochs. No padding_index is passed, so no padding mask is built.
model = attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndMaskedSkipCORRECTED(
    vocab_size=len(voc),
    embedding_dim=8,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=8,
    num_layers=1,
    dropout=0.1,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=5,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.0046243099137455055 | train acc 0.6585564613342285 | validation loss 0.004455494706884825 | validation acc 0.6824178695678711
epoch 1 | train loss 0.004453588319660385 | train acc 0.6789121627807617 | validation loss 0.004444048215506827 | validation acc 0.6846300959587097
epoch 2 | train loss 0.0044435603499070895 | train acc 0.681002676486969 | validation loss 0.0044433617722646965 | validation acc 0.6837332844734192
epoch 3 | train loss 0.004436718196585896 | train acc 0.6821706295013428 | validation loss 0.004442322553475197 | validation acc 0.683015763759613
epoch 4 | train loss 0.004436661298304853 | train acc 0.6819350719451904 | validation loss 0.0044413141137682735 | validation acc 0.683195173740387


In [111]:
# Continue training the model currently bound to `trained_model` for 5 more
# epochs (no padding mask). Which model that is depends on the cell that was
# executed last.
trained_model = train_with_mask(
    trained_model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=5,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.004438100981873568 | train acc 0.6818663477897644 | validation loss 0.004450808284561891 | validation acc 0.6803252696990967
epoch 1 | train loss 0.004435096486400223 | train acc 0.6823669075965881 | validation loss 0.00444878395694327 | validation acc 0.6819595098495483
epoch 2 | train loss 0.004433769227126052 | train acc 0.6820822954177856 | validation loss 0.00444030267399099 | validation acc 0.6828961968421936
epoch 3 | train loss 0.004434878244526899 | train acc 0.6826024651527405 | validation loss 0.00443860646830254 | validation acc 0.6841916441917419
epoch 4 | train loss 0.004432957457968188 | train acc 0.682916522026062 | validation loss 0.004439978006507308 | validation acc 0.6829958558082581


In [103]:
# "MaskedSkipPlusMaskOnPadding" variant (embedding 16, 2 heads,
# feed-forward 32, dropout 0.5), padding mask enabled.
# NOTE(review): the recorded run of this cell ended in an AssertionError.
model = attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndMaskedSkipPlusMaskOnPadding(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=2,
    dim_feedforward=32,
    num_layers=1,
    dropout=0.5,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=10,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
    padding_index=voc.stoi["___PAD___"],
)

using padding
padding_mask shape: torch.Size([64, 283])
bx shape: torch.Size([64, 283])
Positional encoding shape: torch.Size([64, 283, 16])
Padding mask shape: torch.Size([64, 283])


AssertionError: 

In [None]:
# Masked-skip model (embedding 16, 2 heads, feed-forward 32, dropout 0.5),
# 10 epochs, no padding mask.
# NOTE(review): this cell has no execution count and repeats the
# configuration of the cell that follows it.
model = attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndMaskedSkip(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=2,
    dim_feedforward=32,
    num_layers=1,
    dropout=0.5,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=10,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

In [62]:
# Masked-skip model (embedding 16, 2 heads, feed-forward 32, dropout 0.5),
# 10 epochs, no padding mask.
model = attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndMaskedSkip(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=2,
    dim_feedforward=32,
    num_layers=1,
    dropout=0.5,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=10,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.004606983171461207 | train acc 0.6592827439308167 | validation loss 0.004448186092100068 | validation acc 0.683573842048645
epoch 1 | train loss 0.0044541016240644 | train acc 0.6798936128616333 | validation loss 0.004445147721396227 | validation acc 0.6831752061843872
epoch 2 | train loss 0.00444555999746256 | train acc 0.6803450584411621 | validation loss 0.004443273654478431 | validation acc 0.6834343075752258
epoch 3 | train loss 0.0044407088295778906 | train acc 0.681670069694519 | validation loss 0.004441302975019611 | validation acc 0.6839525103569031
epoch 4 | train loss 0.004437167623347074 | train acc 0.680933952331543 | validation loss 0.004444432289848028 | validation acc 0.6830357313156128
epoch 5 | train loss 0.0044356451003318515 | train acc 0.6824257969856262 | validation loss 0.00444732807554086 | validation acc 0.6842713356018066
epoch 6 | train loss 0.004433529202717511 | train acc 0.6827889680862427 | validation loss 0.004444381952100452 | val

In [63]:
# Continue training the model currently bound to `trained_model` for 10 more
# epochs (no padding mask). Which model that is depends on the cell that was
# executed last.
trained_model = train_with_mask(
    trained_model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=10,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.004431699380400268 | train acc 0.6826319098472595 | validation loss 0.00444557509926737 | validation acc 0.6838927268981934
epoch 1 | train loss 0.00443036327824165 | train acc 0.6833778023719788 | validation loss 0.004442813077452117 | validation acc 0.6842713356018066
epoch 2 | train loss 0.004430697209011043 | train acc 0.6830637454986572 | validation loss 0.0044402719305457585 | validation acc 0.6842514276504517
epoch 3 | train loss 0.00442907197199687 | train acc 0.6826809644699097 | validation loss 0.004440097894391273 | validation acc 0.6843510866165161
epoch 4 | train loss 0.004429804837745357 | train acc 0.6826319098472595 | validation loss 0.004439055525557594 | validation acc 0.6830955147743225
epoch 5 | train loss 0.004429921711415951 | train acc 0.6828674674034119 | validation loss 0.004439195501024132 | validation acc 0.683992326259613
epoch 6 | train loss 0.0044280264489165505 | train acc 0.6831913590431213 | validation loss 0.004439981848509906 | 

In [60]:
# Masked-skip model (embedding 16, 2 heads, feed-forward 32, dropout 0.1),
# 5 epochs, no padding mask.
model = attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndMaskedSkip(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=2,
    dim_feedforward=32,
    num_layers=1,
    dropout=0.1,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=5,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.004558818094226433 | train acc 0.667399525642395 | validation loss 0.004454303977711659 | validation acc 0.6842913031578064
epoch 1 | train loss 0.004451525872912558 | train acc 0.6803745031356812 | validation loss 0.004450733713420792 | validation acc 0.6830556392669678
epoch 2 | train loss 0.004445482194294975 | train acc 0.6813167333602905 | validation loss 0.004455863461623025 | validation acc 0.6834741830825806
epoch 3 | train loss 0.0044437008830905894 | train acc 0.681002676486969 | validation loss 0.004449086251896711 | validation acc 0.6829360723495483
epoch 4 | train loss 0.004439700977822962 | train acc 0.6817878484725952 | validation loss 0.004454112519647888 | validation acc 0.6798868179321289


In [59]:
# Masked-skip model (embedding 16, 2 heads, feed-forward 16, dropout 0.1),
# 5 epochs, no padding mask.
model = attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndMaskedSkip(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=2,
    dim_feedforward=16,
    num_layers=1,
    dropout=0.1,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=5,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.004562477396196222 | train acc 0.6657604575157166 | validation loss 0.004450621608435652 | validation acc 0.6825374960899353
epoch 1 | train loss 0.004449456841825278 | train acc 0.6795501112937927 | validation loss 0.004446163879260801 | validation acc 0.6823979616165161
epoch 2 | train loss 0.004444686722258415 | train acc 0.6808456182479858 | validation loss 0.004448350926098528 | validation acc 0.6832150816917419
epoch 3 | train loss 0.004442044778466084 | train acc 0.6815326809883118 | validation loss 0.004442728167323738 | validation acc 0.6836734414100647
epoch 4 | train loss 0.004437503202797274 | train acc 0.6813560128211975 | validation loss 0.004450901516009958 | validation acc 0.6836535334587097


In [58]:
# Masked-skip model (embedding 16, 1 head, feed-forward 16, dropout 0.1),
# 5 epochs, no padding mask.
model = attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndMaskedSkip(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=16,
    num_layers=1,
    dropout=0.1,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=5,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.004566744389802208 | train acc 0.6652402877807617 | validation loss 0.004452310507577768 | validation acc 0.6779735088348389
epoch 1 | train loss 0.004450675888085298 | train acc 0.6802665591239929 | validation loss 0.00445954530198146 | validation acc 0.6788703799247742
epoch 2 | train loss 0.004445265197890094 | train acc 0.6811302900314331 | validation loss 0.004445237333278115 | validation acc 0.6811423897743225
epoch 3 | train loss 0.004442772522379162 | train acc 0.6806100606918335 | validation loss 0.004444013541262616 | validation acc 0.68359375
epoch 4 | train loss 0.0044400807542915935 | train acc 0.6816995143890381 | validation loss 0.004442508612184462 | validation acc 0.6838528513908386


In [57]:
# Masked-skip model (embedding 16, 1 head, feed-forward 16, dropout 0.1),
# 3 epochs, no padding mask.
model = attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndMaskedSkip(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=16,
    num_layers=1,
    dropout=0.1,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=3,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.004562939379724613 | train acc 0.6657310128211975 | validation loss 0.004451121592939337 | validation acc 0.6804049611091614
epoch 1 | train loss 0.004451848961993229 | train acc 0.6797659993171692 | validation loss 0.004445281202847861 | validation acc 0.6827567219734192
epoch 2 | train loss 0.004443730473642392 | train acc 0.6796776652336121 | validation loss 0.004446334957812761 | validation acc 0.6839126348495483


In [48]:
# Pooling + skip-on-word-position model with the "TwoLayers" decoder
# (dim_hidden_decoder=8; embedding 16, 1 head), 3 epochs, no padding mask.
model = attention_models.MyAttentionModelWithPoolingAndSkipOnWordPositionTwoLayers(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=16,
    num_layers=1,
    dropout=0.1,
    dim_hidden_decoder=8,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=3,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.004561093941666701 | train acc 0.6672424674034119 | validation loss 0.004456883064727774 | validation acc 0.6819396018981934
epoch 1 | train loss 0.004441026115442518 | train acc 0.6805904507637024 | validation loss 0.004460767029703842 | validation acc 0.6764987111091614
epoch 2 | train loss 0.00443008779703134 | train acc 0.6815817356109619 | validation loss 0.004456883669968656 | validation acc 0.6793686151504517


In [44]:
# Pooling + skip-on-word-position model (embedding 16, 1 head,
# feed-forward 16, dropout 0.1), 3 epochs, no padding mask.
model = attention_models.MyAttentionModelWithPoolingAndSkipOnWordPosition(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=16,
    num_layers=1,
    dropout=0.1,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=3,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.004573507227434136 | train acc 0.6675958037376404 | validation loss 0.004451421496627035 | validation acc 0.682637095451355
epoch 1 | train loss 0.004435483337439662 | train acc 0.6813756227493286 | validation loss 0.004449188586904154 | validation acc 0.6812220811843872
epoch 2 | train loss 0.004429199820736554 | train acc 0.6815621256828308 | validation loss 0.004456665414701482 | validation acc 0.6779934763908386


In [45]:
# Pooling + skip-on-word-position model with a wider embedding (32),
# scheduled for 10 epochs; the recorded run was interrupted after epoch 4.
model = attention_models.MyAttentionModelWithPoolingAndSkipOnWordPosition(
    vocab_size=len(voc),
    embedding_dim=32,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=16,
    num_layers=1,
    dropout=0.1,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=10,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.004545213624948505 | train acc 0.6680472493171692 | validation loss 0.004450402385020647 | validation acc 0.6837731003761292
epoch 1 | train loss 0.004441837943246794 | train acc 0.6808063983917236 | validation loss 0.004455881574005839 | validation acc 0.6787109375
epoch 2 | train loss 0.004430041527205928 | train acc 0.6810517311096191 | validation loss 0.004459751671006694 | validation acc 0.6799466013908386
epoch 3 | train loss 0.0044234250729199235 | train acc 0.6824552416801453 | validation loss 0.004465781026626747 | validation acc 0.680644154548645
epoch 4 | train loss 0.004414622861588954 | train acc 0.6838980317115784 | validation loss 0.004474507580031355 | validation acc 0.6763990521430969


KeyboardInterrupt: 

In [36]:
# Pooling + skip model (embedding 16, 1 head, feed-forward 16, dropout 0.1),
# scheduled for 10 epochs; the recorded run was interrupted after epoch 5.
model = attention_models.MyAttentionModelWithPoolingAndSkip(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=16,
    num_layers=1,
    dropout=0.1,
).to(device)
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=10,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.005416679883729105 | train acc 0.505790650844574 | validation loss 0.005410850713295596 | validation acc 0.522062361240387
epoch 1 | train loss 0.00539411854551141 | train acc 0.5345379114151001 | validation loss 0.0053290227524299474 | validation acc 0.6283681392669678
epoch 2 | train loss 0.005162442772562648 | train acc 0.6292497515678406 | validation loss 0.004978231641425922 | validation acc 0.652742326259613
epoch 3 | train loss 0.00485021103057664 | train acc 0.6664474606513977 | validation loss 0.004828228293894315 | validation acc 0.6515864133834839
epoch 4 | train loss 0.0046802696511009005 | train acc 0.6747801303863525 | validation loss 0.004775377461741374 | validation acc 0.625996470451355
epoch 5 | train loss 0.004580687657940487 | train acc 0.6788238286972046 | validation loss 0.004776985090218807 | validation acc 0.6235650777816772


KeyboardInterrupt: 

In [49]:
# Experiment: MyAttentionModelWithMaskOnWordPositionOutputAndSkip with 16-dim
# embeddings, one head, one layer and a 16-unit feed-forward block.
model = attention_models.MyAttentionModelWithMaskOnWordPositionOutputAndSkip(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=16,
    num_layers=1,
    dropout=0.1,
).to(device)

# Short 3-epoch run; X/y go in as integer tensors, masks as boolean tensors.
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=3,
    batch_size=64,
    lr=0.001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.0045634866499075265 | train acc 0.6658095121383667 | validation loss 0.0044626091501190875 | validation acc 0.6758809089660645
epoch 1 | train loss 0.004447784168053718 | train acc 0.6799230575561523 | validation loss 0.004450265819156941 | validation acc 0.6823381781578064
epoch 2 | train loss 0.00443883963358055 | train acc 0.6803745031356812 | validation loss 0.00445738075329561 | validation acc 0.6811822652816772


# PICKED

In [154]:
# Experiment: MyAttentionModelWithMaskOnWordPositionAndSkip with 16-dim
# embeddings, one head, two layers and a 16-unit feed-forward block.
# NOTE(review): the class is referenced without the `attention_models.` prefix,
# so it must already be defined/imported in the notebook namespace — confirm.
model = MyAttentionModelWithMaskOnWordPositionAndSkip(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=16,
    num_layers=2,
    dropout=0.1,
).to(device)

# 4 epochs at lr=0.0005; X/y go in as integer tensors, masks as boolean tensors.
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=4,
    batch_size=64,
    lr=0.0005,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.004600985258251391 | train acc 0.6613732576370239 | validation loss 0.004453893274850478 | validation acc 0.6813217401504517
epoch 1 | train loss 0.004465438979683706 | train acc 0.6777932643890381 | validation loss 0.0044495356188996756 | validation acc 0.6800262928009033
epoch 2 | train loss 0.004463747952363908 | train acc 0.6779306530952454 | validation loss 0.004447118593593679 | validation acc 0.6836734414100647
epoch 3 | train loss 0.004458789922380071 | train acc 0.6799623370170593 | validation loss 0.004443505789005977 | validation acc 0.6834143996238708


In [156]:
# Experiment: MyAttentionModelWithMaskOnWordPositionAndSkip with 32-dim
# embeddings, four heads, two layers and a 32-unit feed-forward block.
# NOTE(review): the class is referenced without the `attention_models.` prefix,
# so it must already be defined/imported in the notebook namespace — confirm.
model = MyAttentionModelWithMaskOnWordPositionAndSkip(
    vocab_size=len(voc),
    embedding_dim=32,
    max_seq_len=max_sequence_length,
    num_heads=4,
    dim_feedforward=32,
    num_layers=2,
    dropout=0.1,
).to(device)

# 4 epochs at lr=0.0005; X/y go in as integer tensors, masks as boolean tensors.
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=4,
    batch_size=64,
    lr=0.0005,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.004582824037046448 | train acc 0.66144198179245 | validation loss 0.004461520132893811 | validation acc 0.6788902878761292
epoch 1 | train loss 0.004471411472818248 | train acc 0.675820529460907 | validation loss 0.0044550802915033945 | validation acc 0.6831353902816772
epoch 2 | train loss 0.004463508199103616 | train acc 0.6763603091239929 | validation loss 0.0044491798919863166 | validation acc 0.6817801594734192
epoch 3 | train loss 0.004463460496024242 | train acc 0.6776853203773499 | validation loss 0.004452569045813051 | validation acc 0.679109513759613


In [150]:
# Experiment: MyAttentionModelWithMaskOnWordPositionAndSkip with 16-dim
# embeddings, one head, two layers and a 16-unit feed-forward block.
# NOTE(review): the class is referenced without the `attention_models.` prefix,
# so it must already be defined/imported in the notebook namespace — confirm.
model = MyAttentionModelWithMaskOnWordPositionAndSkip(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=16,
    num_layers=2,
    dropout=0.1,
).to(device)

# Up to 10 epochs at lr=0.0001; X/y go in as integer tensors, masks as boolean tensors.
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=10,
    batch_size=64,
    lr=0.0001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.005069418102748523 | train acc 0.6058515310287476 | validation loss 0.004597217420992746 | validation acc 0.6716158986091614
epoch 1 | train loss 0.004528155480014235 | train acc 0.6707561016082764 | validation loss 0.004451946965988954 | validation acc 0.6833147406578064
epoch 2 | train loss 0.004470993669635742 | train acc 0.6781269907951355 | validation loss 0.004442791472906627 | validation acc 0.683195173740387
epoch 3 | train loss 0.0044613515777716965 | train acc 0.678342878818512 | validation loss 0.004443389930517641 | validation acc 0.6836535334587097
epoch 4 | train loss 0.004452196304802198 | train acc 0.6785489916801453 | validation loss 0.00443882708394976 | validation acc 0.6840122938156128
epoch 5 | train loss 0.004454163813831322 | train acc 0.6786667704582214 | validation loss 0.004440363247080573 | validation acc 0.6839525103569031
epoch 6 | train loss 0.0044524092853172265 | train acc 0.6792851090431213 | validation loss 0.00443882212412249 | 

In [151]:
# Experiment: MyAttentionModelWithMaskOnWordPositionAndSkip with 16-dim
# embeddings, one head, two layers and a 16-unit feed-forward block.
# NOTE(review): the class is referenced without the `attention_models.` prefix,
# so it must already be defined/imported in the notebook namespace — confirm.
model = MyAttentionModelWithMaskOnWordPositionAndSkip(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=16,
    num_layers=2,
    dropout=0.1,
).to(device)

# Up to 10 epochs at lr=0.00005; X/y go in as integer tensors, masks as boolean tensors.
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=10,
    batch_size=64,
    lr=0.00005,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.005304634000698863 | train acc 0.5619994401931763 | validation loss 0.0050715585904462 | validation acc 0.6253387928009033
epoch 1 | train loss 0.004846395652661634 | train acc 0.6425290703773499 | validation loss 0.004623212458385269 | validation acc 0.6627471446990967
epoch 2 | train loss 0.004568052345924266 | train acc 0.6661824584007263 | validation loss 0.004484202303538783 | validation acc 0.6790696978569031
epoch 3 | train loss 0.004485624396135731 | train acc 0.6772436499595642 | validation loss 0.004451954369350071 | validation acc 0.6827766299247742
epoch 4 | train loss 0.0044629388908860295 | train acc 0.6784704923629761 | validation loss 0.0044476447451137465 | validation acc 0.6821588277816772
epoch 5 | train loss 0.004455520204577277 | train acc 0.6794617772102356 | validation loss 0.004440744633475147 | validation acc 0.6832549571990967
epoch 6 | train loss 0.004451952254412527 | train acc 0.6800702810287476 | validation loss 0.004441470330657095 

In [152]:
# Experiment: MyAttentionModelWithMaskOnWordPositionAndSkip with 16-dim
# embeddings, one head, two layers, a 16-unit feed-forward block and dropout 0.2.
# NOTE(review): the class is referenced without the `attention_models.` prefix,
# so it must already be defined/imported in the notebook namespace — confirm.
model = MyAttentionModelWithMaskOnWordPositionAndSkip(
    vocab_size=len(voc),
    embedding_dim=16,
    max_seq_len=max_sequence_length,
    num_heads=1,
    dim_feedforward=16,
    num_layers=2,
    dropout=0.2,
).to(device)

# Up to 10 epochs at lr=0.0001; X/y go in as integer tensors, masks as boolean tensors.
trained_model = train_with_mask(
    model,
    torch.LongTensor(X_train),
    torch.LongTensor(X_test),
    torch.LongTensor(y_train),
    torch.LongTensor(y_test),
    torch.BoolTensor(mask_train),
    torch.BoolTensor(mask_test),
    n_epochs=10,
    batch_size=64,
    lr=0.0001,
    max_samples=None,
    weight_true=0.5,
)

epoch 0 | train loss 0.00509055516239134 | train acc 0.6039572954177856 | validation loss 0.004623973122334621 | validation acc 0.6701809763908386
epoch 1 | train loss 0.00453963147662931 | train acc 0.6696078181266785 | validation loss 0.004458486681034294 | validation acc 0.6815210580825806
epoch 2 | train loss 0.004476245543153151 | train acc 0.6769688129425049 | validation loss 0.004444234651674954 | validation acc 0.6826570630073547
epoch 3 | train loss 0.004462162147676245 | train acc 0.6780288219451904 | validation loss 0.004446033453711366 | validation acc 0.6831752061843872
epoch 4 | train loss 0.004456018370163192 | train acc 0.6785882711410522 | validation loss 0.004439338907475906 | validation acc 0.6836734414100647
epoch 5 | train loss 0.004452455091821357 | train acc 0.6787158250808716 | validation loss 0.004451745376822406 | validation acc 0.6818199753761292
epoch 6 | train loss 0.0044524114556732265 | train acc 0.6792163848876953 | validation loss 0.004440945487918465 |

In [215]:
# Ask interactively where to store the weights. Surrounding whitespace is
# stripped so a pasted path with a trailing space/newline does not end up
# in the file name.
MODEL_PATH = input("Specify the path you wish to save the Attention model to: ").strip()
if not MODEL_PATH:
    # Fail with an explicit message instead of handing an empty path to torch.save.
    raise ValueError("No path was given; the Attention model has not been saved.")

# Only the parameters (state_dict) of the current `model` are written, not the
# module object itself.
torch.save(model.state_dict(), MODEL_PATH)