# Model 1: Attention TRAIN

In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle

import help_functions
import data_processor

## STRUCTURE
3. Build the model with embedding and Attention.
5. Train on the data.

## 3. Build the model

### Get the data

In [4]:
def load_pickle_data(filename):
    with open(filename, "rb") as load_file:
        return pickle.load(load_file)

In [5]:
X_train = load_pickle_data("saved_data/splitted_X_train_eq.pickle")
X_test = load_pickle_data("saved_data/splitted_X_test_eq.pickle")
y_train = load_pickle_data("saved_data/splitted_y_train_eq.pickle")
y_test = load_pickle_data("saved_data/splitted_y_test_eq.pickle")
voc = load_pickle_data("saved_data/all_voc.pickle")

In [40]:
X_train = load_pickle_data("attention_data/splitted_with_mask_X_train.pickle")
X_test = load_pickle_data("attention_data/splitted_with_mask_X_test.pickle")
y_train = load_pickle_data("attention_data/splitted_with_mask_y_train.pickle")
y_test = load_pickle_data("attention_data/splitted_with_mask_y_test.pickle")
mask_train = load_pickle_data("attention_data/splitted_with_mask_mask_train.pickle")
mask_test = load_pickle_data("attention_data/splitted_with_mask_mask_test.pickle")
voc = load_pickle_data("attention_data/splitted_with_mask_voc.pickle")

In [35]:
max_sequence_length = len(X_train[0])

In [6]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Using', device)

Using cpu


In [16]:
print(f'Number of training samples: {len(y_train)}')
print(f'Number of test samples: {len(y_test)}')
print("")
print(f'Sequence length per sample: {max_sequence_length}')

Number of training samples: 101905
Number of test samples: 50193

Sequence length per sample: 283


In [46]:
def batchify(x, y, batch_size, mask=None):
    random_indices = torch.randperm(len(x))
    for i in range(0, len(x) - batch_size + 1, batch_size):
        indices = random_indices[i:i+batch_size]
        if not type(mask) == type(None):
            yield x[indices].to(device), y[indices].to(device), mask[indices].to(device)
        else:
            yield x[indices].to(device), y[indices].to(device)

In [10]:
class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens
        in the sequence. The positional encodings have the same dimension as
        the embeddings, so that the two can be summed. Here, we use sine and cosine
        functions of different frequencies.
    .. math::
        \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
        \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        \text{where pos is the word position and i is the embed idx)
    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """

        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [229]:
class MyAttentionModel(nn.Module):
    """My Attention model, based on the Transformer encoder."""

    def __init__(self, vocab_size, embedding_dim, max_seq_len, num_heads, dim_feedforward, num_layers, dropout=0.5):
        super(MyAttentionModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(embedding_dim, dropout=0.1, max_len=max_seq_len)
        encoder_layers = TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward, dropout)
        # output shape (batch_size, max_seq_len, embedding_dim)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers) 
        self.encoder = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim
        self.decoder = nn.Linear(embedding_dim*max_seq_len, 1)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.weight)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        #print(f"Input shape: {src.shape}")
        src = self.encoder(src) * math.sqrt(self.embedding_dim)
        #print(f"Embedded shape: {src.shape}")
        src = self.pos_encoder(src)
        #print(f"Positional encoding shape: {src.shape}")
        output = self.transformer_encoder(src, self.src_mask)
        #print(f"Transformer encoder output shape: {output.shape}")
        # (batch_size, n_tokens, emb_dim) -> (batch_size, 1, emb_dim)
        output = output.view(output.shape[0], -1)
        #print(f"Reshaped output shape: {output.shape}")
        output = self.decoder(output)
        #print(f"Decoder output shape: {output.shape}")
        #print(h)
        return F.log_softmax(output, dim=-1)

In [27]:
class MyAttentionModelWithPooling(nn.Module):
    """My Attention model, based on the Transformer encoder."""

    def __init__(self, vocab_size, embedding_dim, max_seq_len, num_heads, dim_feedforward, num_layers, dropout=0.5):
        super(MyAttentionModelWithPooling, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(embedding_dim, dropout=0.1, max_len=max_seq_len)
        encoder_layers = TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward, dropout)
        # output shape (batch_size, max_seq_len, embedding_dim)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers) 
        self.encoder = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim
        self.pooler = nn.AvgPool1d(max_seq_len, stride=1)
        self.decoder = nn.Linear(embedding_dim, 1)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.weight)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        #print(f"Input shape: {src.shape}")
        src = self.encoder(src) * math.sqrt(self.embedding_dim)
        #print(f"Embedded shape: {src.shape}")
        src = self.pos_encoder(src)
        #print(f"Positional encoding shape: {src.shape}")
        output = self.transformer_encoder(src, self.src_mask)
        #print(f"Transformer encoder output shape: {output.shape}")
        # (batch_size, n_tokens, emb_dim) -> (batch_size, 1, emb_dim)
        output = self.pooler(output.permute(0,2,1))
        #print(f"Pooled output shape: {output.shape}")
        output = self.decoder(output.view(-1,output.shape[1]))
        #print(f"Decoder output shape: {output.shape}")
        #print(h)
        return F.log_softmax(output, dim=-1)

In [122]:
class MyAttentionModelWithMaskOnWordPosition(nn.Module):
    """My Attention model, based on the Transformer encoder."""

    def __init__(self, vocab_size, embedding_dim, max_seq_len, num_heads, dim_feedforward, num_layers, dropout=0.5):
        super(MyAttentionModelWithMaskOnWordPosition, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(embedding_dim, dropout=0.1, max_len=max_seq_len)
        encoder_layers = TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward, dropout)
        # output shape (batch_size, max_seq_len, embedding_dim)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers) 
        self.encoder = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim
        self.decoder = nn.Linear(embedding_dim, 1)
        self.decoder_act = nn.Sigmoid()

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.weight)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, word_position_mask, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        print(f"Input shape: {src.shape}")
        print(f"Mask shape: {word_position_mask.shape}")
        print(h)
        src = self.encoder(src) * math.sqrt(self.embedding_dim)
        #print(f"Embedded shape: {src.shape}")
        src = self.pos_encoder(src)
        #print(f"Positional encoding shape: {src.shape}")
        #print(f"Word position mask shape after unsqueeze: {word_position_mask.unsqueeze(-1).expand(src.shape).shape}")
        src = torch.masked_select(src, word_position_mask.unsqueeze(-1).expand(src.shape)).view(-1, 1, self.embedding_dim)
        #print(f"Masked output shape: {src.shape}")
        output = self.transformer_encoder(src, self.src_mask)
        #print(f"Transformer encoder output shape: {output.shape}")
        output = self.decoder(output)
        #print(f"Decoder output shape: {output.shape}")
        return self.decoder_act(output.squeeze(2))

In [145]:
class MyAttentionModelWithMaskOnWordPositionAndSkip(nn.Module):
    """My Attention model, based on the Transformer encoder."""

    def __init__(self, vocab_size, embedding_dim, max_seq_len, num_heads, dim_feedforward, num_layers, dropout=0.5):
        super(MyAttentionModelWithMaskOnWordPositionAndSkip, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(embedding_dim, dropout=0.1, max_len=max_seq_len)
        encoder_layers = TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward, dropout)
        # output shape (batch_size, max_seq_len, embedding_dim)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers) 
        self.encoder = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim
        self.decoder = nn.Linear(embedding_dim, 1)
        self.batch_normer = torch.nn.BatchNorm1d(embedding_dim)
        self.decoder_act = nn.Sigmoid()

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.weight)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, word_position_mask, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        #print(f"Input shape: {src.shape}")
        #print(f"Mask shape: {word_position_mask.shape}")
        src = self.encoder(src) * math.sqrt(self.embedding_dim)
        #print(f"Embedded src shape: {src.shape}")
        skip_src = torch.masked_select(src, word_position_mask.unsqueeze(-1).expand(src.shape)).view(-1, 1, self.embedding_dim)
        #print(f"Skip src shape: {skip_src.shape}")
        src = self.pos_encoder(src)
        #print(f"Positional encoding shape: {src.shape}")
        #print(f"Word position mask shape after unsqueeze: {word_position_mask.unsqueeze(-1).expand(src.shape).shape}")
        src = torch.masked_select(src, word_position_mask.unsqueeze(-1).expand(src.shape)).view(-1, 1, self.embedding_dim)
        #print(f"Masked output shape: {src.shape}")
        output = self.transformer_encoder(src, self.src_mask)
        #print(f"Transformer encoder output shape: {output.shape}")
        output = torch.add(output, skip_src).squeeze(1)
        #print(f"Added output shape: {output.shape}")
        output = self.batch_normer(output)
        output = self.decoder(output)
        #print(f"Decoder output shape: {output.shape}")
        return self.decoder_act(output)

## Train the transformer!

In [12]:
def train(model, X_train, X_test, y_train, y_test, n_epochs=1, batch_size=100, lr=0.001, max_samples=None, weight_true=0.5):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    min_ppl = float('inf')
    for t in range(n_epochs):
        model.train()
        loss_fun = F.binary_cross_entropy

        loss_sum = 0
        accuracy_sum = 0
        nbr_train_batches = 0
        for bx, by in batchify(X_train, y_train, batch_size):
            nbr_train_batches += 1
            optimizer.zero_grad()
            output = model.forward(bx)
            sample_weight = (by.eq(1)*weight_true)+(by.eq(0)*(1-weight_true))
            #print(sample_weight)
            loss = loss_fun(output, by.type(torch.FloatTensor), weight=sample_weight)
            loss_sum += loss.item()
            accuracy = (output.eq(by)).sum()
            accuracy_sum += accuracy

            if max_samples and updater.n >= max_samples:
                break
            loss.backward()
            optimizer.step()

        train_loss = loss_sum/(nbr_train_batches*batch_size)
        train_acc = torch.true_divide(accuracy_sum,(nbr_train_batches*batch_size))
        model.eval()
        with torch.no_grad():
            loss_sum = 0
            accuracy_sum = 0
            nbr_test_batches = 0
            for bx, by in batchify(X_test, y_test, batch_size):
                nbr_test_batches += 1
                output = model.forward(bx)
                sample_weight = (by.eq(1)*weight_true)+(by.eq(0)*(1-weight_true))
                loss = loss_fun(output, by.type(torch.FloatTensor), weight=sample_weight)
                loss_sum += loss.item()
                accuracy = (output.eq(by)).sum()
                accuracy_sum += accuracy
        test_loss = loss_sum/(nbr_test_batches*batch_size)
        test_acc = torch.true_divide(accuracy_sum,(nbr_test_batches*batch_size))

        print(f'epoch {t} | train loss {train_loss} | train acc {train_acc} | validation loss {test_loss} | validation acc {test_acc}')

    return model

In [112]:
def train_with_mask(model, X_train, X_test, y_train, y_test, mask_train, mask_test, n_epochs=1, batch_size=100, lr=0.001, max_samples=None, weight_true=0.5):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    min_ppl = float('inf')
    for t in range(n_epochs):
        model.train()
        loss_fun = F.binary_cross_entropy

        loss_sum = 0
        accuracy_sum = 0
        nbr_train_batches = 0
        for bx, by, bm in batchify(X_train, y_train, batch_size, mask_train):
            nbr_train_batches += 1
            optimizer.zero_grad()
            output = model.forward(bx, bm)
            sample_weight = (by.eq(1)*weight_true)+(by.eq(0)*(1-weight_true))
            #print("Output after word position mask selection: ")
            #print(torch.masked_select(bx, bm))
            #print(sample_weight)
            #print(f"by shape: {by.shape}")
            #print(f"output shape: {output.shape}")
            loss = loss_fun(output, by.type(torch.FloatTensor), weight=sample_weight)
            loss_sum += loss.item()
            #print(f"output rounded: {output.round()}")
            #print(f"by: {by}")
            accuracy = (output.round().eq(by)).sum()
            accuracy_sum += accuracy

            if max_samples and updater.n >= max_samples:
                break
            loss.backward()
            optimizer.step()

        train_loss = loss_sum/(nbr_train_batches*batch_size)
        train_acc = torch.true_divide(accuracy_sum,(nbr_train_batches*batch_size))
        model.eval()
        with torch.no_grad():
            loss_sum = 0
            accuracy_sum = 0
            nbr_test_batches = 0
            for bx, by, bm in batchify(X_test, y_test, batch_size, mask_test):
                nbr_test_batches += 1
                output = model.forward(bx, bm)
                sample_weight = (by.eq(1)*weight_true)+(by.eq(0)*(1-weight_true))
                loss = loss_fun(output, by.type(torch.FloatTensor), weight=sample_weight)
                loss_sum += loss.item()
                accuracy = (output.round().eq(by)).sum()
                accuracy_sum += accuracy
        test_loss = loss_sum/(nbr_test_batches*batch_size)
        test_acc = torch.true_divide(accuracy_sum,(nbr_test_batches*batch_size))

        print(f'epoch {t} | train loss {train_loss} | train acc {train_acc} | validation loss {test_loss} | validation acc {test_acc}')

    return model

In [13]:
def get_data_subset(sub_percentage, X_train, X_test, y_train, y_test):
    train_sub_size = int(sub_percentage*len(y_train))
    test_sub_size = int(sub_percentage*len(y_test))

    X_train_sub = X_train[:train_sub_size]
    X_test_sub = X_test[:test_sub_size]
    y_train_sub = y_train[:train_sub_size]
    y_test_sub = y_test[:test_sub_size]

    return X_train_sub, X_test_sub, y_train_sub, y_test_sub

### Initial model (BAD)

In [209]:
data_subset = get_data_subset(0.01, X_train, X_test, y_train, y_test)

model = MyAttentionModel(vocab_size=len(voc), embedding_dim=32, max_seq_len=max_sequence_length, num_heads=2, dim_feedforward=32, num_layers=1, dropout=0.1).to(device)
trained_model = train(model, torch.LongTensor(data_subset[0]), torch.LongTensor(data_subset[1]), torch.LongTensor(data_subset[2]), torch.LongTensor(data_subset[3]), n_epochs=5, batch_size=32, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.7985761088709677 | train acc 0.48891130089759827 | validation loss 0.771484375 | validation acc 0.5062500238418579
epoch 1 | train loss 0.8001512096774194 | train acc 0.4879032373428345 | validation loss 0.7584635416666666 | validation acc 0.5145833492279053
epoch 2 | train loss 0.8033014112903226 | train acc 0.4858871102333069 | validation loss 0.751953125 | validation acc 0.518750011920929
epoch 3 | train loss 0.8048765120967742 | train acc 0.4848790466785431 | validation loss 0.7649739583333334 | validation acc 0.5104166865348816
epoch 4 | train loss 0.8001512096774194 | train acc 0.4879032373428345 | validation loss 0.771484375 | validation acc 0.5062500238418579


### Model with word position (BETTER), lr = 0.0001

In [113]:
model = MyAttentionModelWithMaskOnWordPosition(vocab_size=len(voc), embedding_dim=16, max_seq_len=max_sequence_length, num_heads=1, dim_feedforward=16, num_layers=1, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=5, batch_size=64, lr=0.0001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.005195849543804742 | train acc 0.5914042592048645 | validation loss 0.004801498082162318 | validation acc 0.6525430679321289
epoch 1 | train loss 0.004706355374559161 | train acc 0.6589882969856262 | validation loss 0.004541682469663306 | validation acc 0.6789700388908386
epoch 2 | train loss 0.0045658173427119125 | train acc 0.6713548302650452 | validation loss 0.004487742078003512 | validation acc 0.6823381781578064
epoch 3 | train loss 0.004529521219671035 | train acc 0.6731312870979309 | validation loss 0.004465106048808927 | validation acc 0.6829759478569031
epoch 4 | train loss 0.004496643676869478 | train acc 0.6748292446136475 | validation loss 0.004453594098758541 | validation acc 0.6838328838348389


### Model with word position (BETTER), lr = 0.001

In [114]:
model = MyAttentionModelWithMaskOnWordPosition(vocab_size=len(voc), embedding_dim=16, max_seq_len=max_sequence_length, num_heads=1, dim_feedforward=16, num_layers=1, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=5, batch_size=64, lr=0.001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.004614046647859241 | train acc 0.66082364320755 | validation loss 0.004465662476925028 | validation acc 0.6778938174247742
epoch 1 | train loss 0.004470847523507976 | train acc 0.6789513826370239 | validation loss 0.0044546720672194486 | validation acc 0.6808235049247742
epoch 2 | train loss 0.004456827052998773 | train acc 0.6795108318328857 | validation loss 0.004446458911323654 | validation acc 0.6837332844734192
epoch 3 | train loss 0.004451406081546398 | train acc 0.6808750629425049 | validation loss 0.00444886220021563 | validation acc 0.6838528513908386
epoch 4 | train loss 0.004447987597978594 | train acc 0.6819056272506714 | validation loss 0.004447791070560925 | validation acc 0.6822983026504517


### Model with word position (BETTER), lr = 0.001, more epochs

In [115]:
model = MyAttentionModelWithMaskOnWordPosition(vocab_size=len(voc), embedding_dim=16, max_seq_len=max_sequence_length, num_heads=1, dim_feedforward=16, num_layers=1, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=10, batch_size=64, lr=0.001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.004616558228428686 | train acc 0.6617069840431213 | validation loss 0.0044684916478106565 | validation acc 0.6789700388908386
epoch 1 | train loss 0.004466166189782808 | train acc 0.6786962151527405 | validation loss 0.004452353190423978 | validation acc 0.6822983026504517
epoch 2 | train loss 0.0044582220102688435 | train acc 0.6807082295417786 | validation loss 0.0044462922137891115 | validation acc 0.6825374960899353
epoch 3 | train loss 0.004452732713698835 | train acc 0.6806198954582214 | validation loss 0.0044470997726216875 | validation acc 0.6838528513908386
epoch 4 | train loss 0.00444922012106712 | train acc 0.6811989545822144 | validation loss 0.004444805624875791 | validation acc 0.6839525103569031
epoch 5 | train loss 0.004448093323419619 | train acc 0.680933952331543 | validation loss 0.00444489560471325 | validation acc 0.6834343075752258
epoch 6 | train loss 0.0044408707047903905 | train acc 0.6825631856918335 | validation loss 0.00444611956214541

### Model with word position (BETTER), lr=0.001, embedding_dim=32

In [116]:
model = MyAttentionModelWithMaskOnWordPosition(vocab_size=len(voc), embedding_dim=32, max_seq_len=max_sequence_length, num_heads=1, dim_feedforward=16, num_layers=1, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=5, batch_size=64, lr=0.001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.004566876791463229 | train acc 0.6648672819137573 | validation loss 0.0044660135632623174 | validation acc 0.6817402839660645
epoch 1 | train loss 0.004461310231466146 | train acc 0.6788139939308167 | validation loss 0.004486466950097844 | validation acc 0.6727718710899353
epoch 2 | train loss 0.00445315194545529 | train acc 0.6811106204986572 | validation loss 0.0044526342048347755 | validation acc 0.6838528513908386
epoch 3 | train loss 0.0044480670664995324 | train acc 0.6816307902336121 | validation loss 0.004455088610150281 | validation acc 0.6832349896430969
epoch 4 | train loss 0.004442146246850341 | train acc 0.6825534105300903 | validation loss 0.004455286609151458 | validation acc 0.6823580861091614


### Model with word position (BETTER), lr=0.001, num_heads=2

In [117]:
model = MyAttentionModelWithMaskOnWordPosition(vocab_size=len(voc), embedding_dim=16, max_seq_len=max_sequence_length, num_heads=2, dim_feedforward=16, num_layers=1, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=5, batch_size=64, lr=0.001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.004608239061415579 | train acc 0.6631497144699097 | validation loss 0.004470323396868034 | validation acc 0.6770169138908386
epoch 1 | train loss 0.004469045385514765 | train acc 0.6779306530952454 | validation loss 0.004455907273579066 | validation acc 0.6804049611091614
epoch 2 | train loss 0.004453843937872034 | train acc 0.6807769536972046 | validation loss 0.004447608298505656 | validation acc 0.6834542155265808
epoch 3 | train loss 0.004449923895416435 | train acc 0.6818369030952454 | validation loss 0.004446920593404591 | validation acc 0.6836934089660645
epoch 4 | train loss 0.004448208277846158 | train acc 0.6814345121383667 | validation loss 0.004450966856485157 | validation acc 0.683613657951355


### Model with word position (BETTER), lr=0.001, dim_feed_forward=32

In [118]:
model = MyAttentionModelWithMaskOnWordPosition(vocab_size=len(voc), embedding_dim=16, max_seq_len=max_sequence_length, num_heads=1, dim_feedforward=32, num_layers=1, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=5, batch_size=64, lr=0.001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.004628330949417798 | train acc 0.6602641940116882 | validation loss 0.00445457686502629 | validation acc 0.6825175285339355
epoch 1 | train loss 0.0044671814928495856 | train acc 0.6794813871383667 | validation loss 0.004449236860358612 | validation acc 0.6823979616165161
epoch 2 | train loss 0.004461987267585293 | train acc 0.6793930530548096 | validation loss 0.0044524290632190445 | validation acc 0.680086076259613
epoch 3 | train loss 0.0044525637734218046 | train acc 0.6799426674842834 | validation loss 0.0044472632630923005 | validation acc 0.6834940910339355
epoch 4 | train loss 0.004447900771746066 | train acc 0.6815621256828308 | validation loss 0.004448444883954445 | validation acc 0.6834542155265808


### Model (BETTER), lr=0.001, num_layers=2

In [119]:
model = MyAttentionModelWithMaskOnWordPosition(vocab_size=len(voc), embedding_dim=16, max_seq_len=max_sequence_length, num_heads=1, dim_feedforward=16, num_layers=2, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=5, batch_size=64, lr=0.001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.0046285387800173395 | train acc 0.6600973606109619 | validation loss 0.004467735785102396 | validation acc 0.681640625
epoch 1 | train loss 0.004472482840977516 | train acc 0.6792752742767334 | validation loss 0.0044523523357216915 | validation acc 0.6832947731018066
epoch 2 | train loss 0.00446194501569724 | train acc 0.6793636083602905 | validation loss 0.004453408547287464 | validation acc 0.681620717048645
epoch 3 | train loss 0.004452104662251355 | train acc 0.6805708408355713 | validation loss 0.004447732718568771 | validation acc 0.6831552982330322
epoch 4 | train loss 0.004451527385287988 | train acc 0.6808750629425049 | validation loss 0.004447868717204761 | validation acc 0.6837332844734192


### Model with SKIP

In [147]:
model = MyAttentionModelWithMaskOnWordPositionAndSkip(vocab_size=len(voc), embedding_dim=16, max_seq_len=max_sequence_length, num_heads=1, dim_feedforward=16, num_layers=1, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=5, batch_size=64, lr=0.001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.004567755323930657 | train acc 0.6651322841644287 | validation loss 0.00446053385991147 | validation acc 0.6810028553009033
epoch 1 | train loss 0.004472423260464057 | train acc 0.6766842007637024 | validation loss 0.004462138659080813 | validation acc 0.6785315871238708
epoch 2 | train loss 0.004464741773586299 | train acc 0.678382158279419 | validation loss 0.0044527496142925845 | validation acc 0.6825773119926453
epoch 3 | train loss 0.004468777842045148 | train acc 0.6767725348472595 | validation loss 0.004450293900195642 | validation acc 0.6780133843421936
epoch 4 | train loss 0.004464458588782192 | train acc 0.6788728833198547 | validation loss 0.0044479133179340016 | validation acc 0.6821588277816772


In [148]:
model = MyAttentionModelWithMaskOnWordPositionAndSkip(vocab_size=len(voc), embedding_dim=16, max_seq_len=max_sequence_length, num_heads=1, dim_feedforward=16, num_layers=2, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=5, batch_size=64, lr=0.001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.004565130642073547 | train acc 0.664739727973938 | validation loss 0.004464899359047603 | validation acc 0.6797273755073547
epoch 1 | train loss 0.004474202448990074 | train acc 0.6763995885848999 | validation loss 0.004452605968773631 | validation acc 0.6837531924247742
epoch 2 | train loss 0.0044677315027413225 | train acc 0.6775871515274048 | validation loss 0.0044462886476789466 | validation acc 0.6828762888908386
epoch 3 | train loss 0.004462914391895803 | train acc 0.6781760454177856 | validation loss 0.004452702448565551 | validation acc 0.6820989847183228
epoch 4 | train loss 0.004463629105482859 | train acc 0.6780386567115784 | validation loss 0.004445910977962313 | validation acc 0.6800462603569031


# PICKED

In [154]:
model = MyAttentionModelWithMaskOnWordPositionAndSkip(vocab_size=len(voc), embedding_dim=16, max_seq_len=max_sequence_length, num_heads=1, dim_feedforward=16, num_layers=2, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=4, batch_size=64, lr=0.0005, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.004600985258251391 | train acc 0.6613732576370239 | validation loss 0.004453893274850478 | validation acc 0.6813217401504517
epoch 1 | train loss 0.004465438979683706 | train acc 0.6777932643890381 | validation loss 0.0044495356188996756 | validation acc 0.6800262928009033
epoch 2 | train loss 0.004463747952363908 | train acc 0.6779306530952454 | validation loss 0.004447118593593679 | validation acc 0.6836734414100647
epoch 3 | train loss 0.004458789922380071 | train acc 0.6799623370170593 | validation loss 0.004443505789005977 | validation acc 0.6834143996238708


In [156]:
model = MyAttentionModelWithMaskOnWordPositionAndSkip(vocab_size=len(voc), embedding_dim=32, max_seq_len=max_sequence_length, num_heads=4, dim_feedforward=32, num_layers=2, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=4, batch_size=64, lr=0.0005, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.004582824037046448 | train acc 0.66144198179245 | validation loss 0.004461520132893811 | validation acc 0.6788902878761292
epoch 1 | train loss 0.004471411472818248 | train acc 0.675820529460907 | validation loss 0.0044550802915033945 | validation acc 0.6831353902816772
epoch 2 | train loss 0.004463508199103616 | train acc 0.6763603091239929 | validation loss 0.0044491798919863166 | validation acc 0.6817801594734192
epoch 3 | train loss 0.004463460496024242 | train acc 0.6776853203773499 | validation loss 0.004452569045813051 | validation acc 0.679109513759613


In [150]:
model = MyAttentionModelWithMaskOnWordPositionAndSkip(vocab_size=len(voc), embedding_dim=16, max_seq_len=max_sequence_length, num_heads=1, dim_feedforward=16, num_layers=2, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=10, batch_size=64, lr=0.0001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.005069418102748523 | train acc 0.6058515310287476 | validation loss 0.004597217420992746 | validation acc 0.6716158986091614
epoch 1 | train loss 0.004528155480014235 | train acc 0.6707561016082764 | validation loss 0.004451946965988954 | validation acc 0.6833147406578064
epoch 2 | train loss 0.004470993669635742 | train acc 0.6781269907951355 | validation loss 0.004442791472906627 | validation acc 0.683195173740387
epoch 3 | train loss 0.0044613515777716965 | train acc 0.678342878818512 | validation loss 0.004443389930517641 | validation acc 0.6836535334587097
epoch 4 | train loss 0.004452196304802198 | train acc 0.6785489916801453 | validation loss 0.00443882708394976 | validation acc 0.6840122938156128
epoch 5 | train loss 0.004454163813831322 | train acc 0.6786667704582214 | validation loss 0.004440363247080573 | validation acc 0.6839525103569031
epoch 6 | train loss 0.0044524092853172265 | train acc 0.6792851090431213 | validation loss 0.00443882212412249 | 

In [151]:
model = MyAttentionModelWithMaskOnWordPositionAndSkip(vocab_size=len(voc), embedding_dim=16, max_seq_len=max_sequence_length, num_heads=1, dim_feedforward=16, num_layers=2, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=10, batch_size=64, lr=0.00005, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.005304634000698863 | train acc 0.5619994401931763 | validation loss 0.0050715585904462 | validation acc 0.6253387928009033
epoch 1 | train loss 0.004846395652661634 | train acc 0.6425290703773499 | validation loss 0.004623212458385269 | validation acc 0.6627471446990967
epoch 2 | train loss 0.004568052345924266 | train acc 0.6661824584007263 | validation loss 0.004484202303538783 | validation acc 0.6790696978569031
epoch 3 | train loss 0.004485624396135731 | train acc 0.6772436499595642 | validation loss 0.004451954369350071 | validation acc 0.6827766299247742
epoch 4 | train loss 0.0044629388908860295 | train acc 0.6784704923629761 | validation loss 0.0044476447451137465 | validation acc 0.6821588277816772
epoch 5 | train loss 0.004455520204577277 | train acc 0.6794617772102356 | validation loss 0.004440744633475147 | validation acc 0.6832549571990967
epoch 6 | train loss 0.004451952254412527 | train acc 0.6800702810287476 | validation loss 0.004441470330657095 

In [152]:
model = MyAttentionModelWithMaskOnWordPositionAndSkip(vocab_size=len(voc), embedding_dim=16, max_seq_len=max_sequence_length, num_heads=1, dim_feedforward=16, num_layers=2, dropout=0.2).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=10, batch_size=64, lr=0.0001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.00509055516239134 | train acc 0.6039572954177856 | validation loss 0.004623973122334621 | validation acc 0.6701809763908386
epoch 1 | train loss 0.00453963147662931 | train acc 0.6696078181266785 | validation loss 0.004458486681034294 | validation acc 0.6815210580825806
epoch 2 | train loss 0.004476245543153151 | train acc 0.6769688129425049 | validation loss 0.004444234651674954 | validation acc 0.6826570630073547
epoch 3 | train loss 0.004462162147676245 | train acc 0.6780288219451904 | validation loss 0.004446033453711366 | validation acc 0.6831752061843872
epoch 4 | train loss 0.004456018370163192 | train acc 0.6785882711410522 | validation loss 0.004439338907475906 | validation acc 0.6836734414100647
epoch 5 | train loss 0.004452455091821357 | train acc 0.6787158250808716 | validation loss 0.004451745376822406 | validation acc 0.6818199753761292
epoch 6 | train loss 0.0044524114556732265 | train acc 0.6792163848876953 | validation loss 0.004440945487918465 |

In [155]:
MODEL_PATH = input("Specify the path you wish to save the Attention model to: ")
torch.save(model.state_dict(), MODEL_PATH)