# Model 1: Attention TRAIN

In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle

import help_functions
import data_processor

## STRUCTURE
3. Build the model with embedding and Attention.
5. Train on the data.

## 3. Build the model

### Get the data

In [4]:
def load_pickle_data(filename):
    with open(filename, "rb") as load_file:
        return pickle.load(load_file)

In [5]:
X_train = load_pickle_data("saved_data/splitted_X_train_eq.pickle")
X_test = load_pickle_data("saved_data/splitted_X_test_eq.pickle")
y_train = load_pickle_data("saved_data/splitted_y_train_eq.pickle")
y_test = load_pickle_data("saved_data/splitted_y_test_eq.pickle")
voc = load_pickle_data("saved_data/all_voc.pickle")

In [40]:
X_train = load_pickle_data("attention_data/splitted_with_mask_X_train.pickle")
X_test = load_pickle_data("attention_data/splitted_with_mask_X_test.pickle")
y_train = load_pickle_data("attention_data/splitted_with_mask_y_train.pickle")
y_test = load_pickle_data("attention_data/splitted_with_mask_y_test.pickle")
mask_train = load_pickle_data("attention_data/splitted_with_mask_mask_train.pickle")
mask_test = load_pickle_data("attention_data/splitted_with_mask_mask_test.pickle")
voc = load_pickle_data("attention_data/splitted_with_mask_voc.pickle")

In [35]:
max_sequence_length = len(X_train[0])

In [6]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Using', device)

Using cpu


In [16]:
print(f'Number of training samples: {len(y_train)}')
print(f'Number of test samples: {len(y_test)}')
print("")
print(f'Sequence length per sample: {max_sequence_length}')

Number of training samples: 101905
Number of test samples: 50193

Sequence length per sample: 283


In [46]:
def batchify(x, y, batch_size, mask=None):
    random_indices = torch.randperm(len(x))
    for i in range(0, len(x) - batch_size + 1, batch_size):
        indices = random_indices[i:i+batch_size]
        if not type(mask) == type(None):
            yield x[indices].to(device), y[indices].to(device), mask[indices].to(device)
        else:
            yield x[indices].to(device), y[indices].to(device)

In [10]:
class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens
        in the sequence. The positional encodings have the same dimension as
        the embeddings, so that the two can be summed. Here, we use sine and cosine
        functions of different frequencies.
    .. math::
        \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
        \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        \text{where pos is the word position and i is the embed idx)
    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """

        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [229]:
class MyAttentionModel(nn.Module):
    """My Attention model, based on the Transformer encoder."""

    def __init__(self, vocab_size, embedding_dim, max_seq_len, num_heads, dim_feedforward, num_layers, dropout=0.5):
        super(MyAttentionModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(embedding_dim, dropout=0.1, max_len=max_seq_len)
        encoder_layers = TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward, dropout)
        # output shape (batch_size, max_seq_len, embedding_dim)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers) 
        self.encoder = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim
        self.decoder = nn.Linear(embedding_dim*max_seq_len, 1)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.weight)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        #print(f"Input shape: {src.shape}")
        src = self.encoder(src) * math.sqrt(self.embedding_dim)
        #print(f"Embedded shape: {src.shape}")
        src = self.pos_encoder(src)
        #print(f"Positional encoding shape: {src.shape}")
        output = self.transformer_encoder(src, self.src_mask)
        #print(f"Transformer encoder output shape: {output.shape}")
        # (batch_size, n_tokens, emb_dim) -> (batch_size, 1, emb_dim)
        output = output.view(output.shape[0], -1)
        #print(f"Reshaped output shape: {output.shape}")
        output = self.decoder(output)
        #print(f"Decoder output shape: {output.shape}")
        #print(h)
        return F.log_softmax(output, dim=-1)

In [27]:
class MyAttentionModelWithPooling(nn.Module):
    """My Attention model, based on the Transformer encoder."""

    def __init__(self, vocab_size, embedding_dim, max_seq_len, num_heads, dim_feedforward, num_layers, dropout=0.5):
        super(MyAttentionModelWithPooling, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(embedding_dim, dropout=0.1, max_len=max_seq_len)
        encoder_layers = TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward, dropout)
        # output shape (batch_size, max_seq_len, embedding_dim)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers) 
        self.encoder = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim
        self.pooler = nn.AvgPool1d(max_seq_len, stride=1)
        self.decoder = nn.Linear(embedding_dim, 1)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.weight)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        #print(f"Input shape: {src.shape}")
        src = self.encoder(src) * math.sqrt(self.embedding_dim)
        #print(f"Embedded shape: {src.shape}")
        src = self.pos_encoder(src)
        #print(f"Positional encoding shape: {src.shape}")
        output = self.transformer_encoder(src, self.src_mask)
        #print(f"Transformer encoder output shape: {output.shape}")
        # (batch_size, n_tokens, emb_dim) -> (batch_size, 1, emb_dim)
        output = self.pooler(output.permute(0,2,1))
        #print(f"Pooled output shape: {output.shape}")
        output = self.decoder(output.view(-1,output.shape[1]))
        #print(f"Decoder output shape: {output.shape}")
        #print(h)
        return F.log_softmax(output, dim=-1)

In [85]:
class MyAttentionModelWithMaskOnWordPosition(nn.Module):
    """My Attention model, based on the Transformer encoder."""

    def __init__(self, vocab_size, embedding_dim, max_seq_len, num_heads, dim_feedforward, num_layers, dropout=0.5):
        super(MyAttentionModelWithMaskOnWordPosition, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(embedding_dim, dropout=0.1, max_len=max_seq_len)
        encoder_layers = TransformerEncoderLayer(embedding_dim, num_heads, dim_feedforward, dropout)
        # output shape (batch_size, max_seq_len, embedding_dim)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers) 
        self.encoder = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dim = embedding_dim
        self.decoder = nn.Linear(embedding_dim, 1)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.weight)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, word_position_mask, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        #print(f"Input shape: {src.shape}")
        src = self.encoder(src) * math.sqrt(self.embedding_dim)
        #print(f"Embedded shape: {src.shape}")
        src = self.pos_encoder(src)
        #print(f"Positional encoding shape: {src.shape}")
        #print(f"Word position mask shape after unsqueeze: {word_position_mask.unsqueeze(-1).expand(src.shape).shape}")
        src = torch.masked_select(src, word_position_mask.unsqueeze(-1).expand(src.shape)).view(-1, 1, self.embedding_dim)
        #print(f"Masked output shape: {src.shape}")
        output = self.transformer_encoder(src, self.src_mask)
        #print(f"Transformer encoder output shape: {output.shape}")
        output = self.decoder(output)
        output = output.squeeze(2)
        #print(f"Decoder output shape: {output.shape}")
        return F.log_softmax(output, dim=-1)

## Train the transformer!

In [12]:
def train(model, X_train, X_test, y_train, y_test, n_epochs=1, batch_size=100, lr=0.001, max_samples=None, weight_true=0.5):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    min_ppl = float('inf')
    for t in range(n_epochs):
        model.train()
        loss_fun = F.binary_cross_entropy

        loss_sum = 0
        accuracy_sum = 0
        nbr_train_batches = 0
        for bx, by in batchify(X_train, y_train, batch_size):
            nbr_train_batches += 1
            optimizer.zero_grad()
            output = model.forward(bx)
            sample_weight = (by.eq(1)*weight_true)+(by.eq(0)*(1-weight_true))
            #print(sample_weight)
            loss = loss_fun(output, by.type(torch.FloatTensor), weight=sample_weight)
            loss_sum += loss.item()
            accuracy = (output.eq(by)).sum()
            accuracy_sum += accuracy

            if max_samples and updater.n >= max_samples:
                break
            loss.backward()
            optimizer.step()

        train_loss = loss_sum/(nbr_train_batches*batch_size)
        train_acc = torch.true_divide(accuracy_sum,(nbr_train_batches*batch_size))
        model.eval()
        with torch.no_grad():
            loss_sum = 0
            accuracy_sum = 0
            nbr_test_batches = 0
            for bx, by in batchify(X_test, y_test, batch_size):
                nbr_test_batches += 1
                output = model.forward(bx)
                sample_weight = (by.eq(1)*weight_true)+(by.eq(0)*(1-weight_true))
                loss = loss_fun(output, by.type(torch.FloatTensor), weight=sample_weight)
                loss_sum += loss.item()
                accuracy = (output.eq(by)).sum()
                accuracy_sum += accuracy
        test_loss = loss_sum/(nbr_test_batches*batch_size)
        test_acc = torch.true_divide(accuracy_sum,(nbr_test_batches*batch_size))

        print(f'epoch {t} | train loss {train_loss} | train acc {train_acc} | validation loss {test_loss} | validation acc {test_acc}')

    return model

In [86]:
def train_with_mask(model, X_train, X_test, y_train, y_test, mask_train, mask_test, n_epochs=1, batch_size=100, lr=0.001, max_samples=None, weight_true=0.5):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    min_ppl = float('inf')
    for t in range(n_epochs):
        model.train()
        loss_fun = F.binary_cross_entropy

        loss_sum = 0
        accuracy_sum = 0
        nbr_train_batches = 0
        for bx, by, bm in batchify(X_train, y_train, batch_size, mask_train):
            nbr_train_batches += 1
            optimizer.zero_grad()
            output = model.forward(bx, bm)
            sample_weight = (by.eq(1)*weight_true)+(by.eq(0)*(1-weight_true))
            #print(sample_weight)
            #print(f"by shape: {by.shape}")
            #print(f"output shape: {output.shape}")
            loss = loss_fun(output, by.type(torch.FloatTensor), weight=sample_weight)
            loss_sum += loss.item()
            accuracy = (output.eq(by)).sum()
            accuracy_sum += accuracy

            if max_samples and updater.n >= max_samples:
                break
            loss.backward()
            optimizer.step()

        train_loss = loss_sum/(nbr_train_batches*batch_size)
        train_acc = torch.true_divide(accuracy_sum,(nbr_train_batches*batch_size))
        model.eval()
        with torch.no_grad():
            loss_sum = 0
            accuracy_sum = 0
            nbr_test_batches = 0
            for bx, by, bm in batchify(X_test, y_test, batch_size, mask_test):
                nbr_test_batches += 1
                output = model.forward(bx, bm)
                sample_weight = (by.eq(1)*weight_true)+(by.eq(0)*(1-weight_true))
                loss = loss_fun(output, by.type(torch.FloatTensor), weight=sample_weight)
                loss_sum += loss.item()
                accuracy = (output.eq(by)).sum()
                accuracy_sum += accuracy
        test_loss = loss_sum/(nbr_test_batches*batch_size)
        test_acc = torch.true_divide(accuracy_sum,(nbr_test_batches*batch_size))

        print(f'epoch {t} | train loss {train_loss} | train acc {train_acc} | validation loss {test_loss} | validation acc {test_acc}')

    return model

In [13]:
def get_data_subset(sub_percentage, X_train, X_test, y_train, y_test):
    train_sub_size = int(sub_percentage*len(y_train))
    test_sub_size = int(sub_percentage*len(y_test))

    X_train_sub = X_train[:train_sub_size]
    X_test_sub = X_test[:test_sub_size]
    y_train_sub = y_train[:train_sub_size]
    y_test_sub = y_test[:test_sub_size]

    return X_train_sub, X_test_sub, y_train_sub, y_test_sub

In [209]:
data_subset = get_data_subset(0.01, X_train, X_test, y_train, y_test)

model = MyAttentionModel(vocab_size=len(voc), embedding_dim=32, max_seq_len=max_sequence_length, num_heads=2, dim_feedforward=32, num_layers=1, dropout=0.1).to(device)
trained_model = train(model, torch.LongTensor(data_subset[0]), torch.LongTensor(data_subset[1]), torch.LongTensor(data_subset[2]), torch.LongTensor(data_subset[3]), n_epochs=5, batch_size=32, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.7985761088709677 | train acc 0.48891130089759827 | validation loss 0.771484375 | validation acc 0.5062500238418579
epoch 1 | train loss 0.8001512096774194 | train acc 0.4879032373428345 | validation loss 0.7584635416666666 | validation acc 0.5145833492279053
epoch 2 | train loss 0.8033014112903226 | train acc 0.4858871102333069 | validation loss 0.751953125 | validation acc 0.518750011920929
epoch 3 | train loss 0.8048765120967742 | train acc 0.4848790466785431 | validation loss 0.7649739583333334 | validation acc 0.5104166865348816
epoch 4 | train loss 0.8001512096774194 | train acc 0.4879032373428345 | validation loss 0.771484375 | validation acc 0.5062500238418579


In [32]:
data_subset = get_data_subset(1, X_train, X_test, y_train, y_test)

model = MyAttentionModelWithPooling(vocab_size=len(voc), embedding_dim=32, max_seq_len=max_sequence_length, num_heads=4, dim_feedforward=16, num_layers=1, dropout=0.1).to(device)
trained_model = train(model, torch.LongTensor(data_subset[0]), torch.LongTensor(data_subset[1]), torch.LongTensor(data_subset[2]), torch.LongTensor(data_subset[3]), n_epochs=5, batch_size=64, lr=0.0001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.3897662138819096 | train acc 0.5010992288589478 | validation loss 0.3924155721859056 | validation acc 0.4977080821990967
epoch 1 | train loss 0.3897738816151068 | train acc 0.5010894536972046 | validation loss 0.39236886160714285 | validation acc 0.4977678656578064
epoch 2 | train loss 0.3897432106823178 | train acc 0.5011286735534668 | validation loss 0.3923532914142219 | validation acc 0.4977877736091614
epoch 3 | train loss 0.3897662138819096 | train acc 0.5010992288589478 | validation loss 0.39238443180006377 | validation acc 0.49774792790412903
epoch 4 | train loss 0.3897585461487123 | train acc 0.5011090636253357 | validation loss 0.39243114237882654 | validation acc 0.4976881444454193


In [88]:
#data_subset = get_data_subset(1, X_train, X_test, y_train, y_test)

model = MyAttentionModelWithMaskOnWordPosition(vocab_size=len(voc), embedding_dim=16, max_seq_len=max_sequence_length, num_heads=1, dim_feedforward=16, num_layers=1, dropout=0.1).to(device)
trained_model = train_with_mask(model, torch.LongTensor(X_train), torch.LongTensor(X_test), torch.LongTensor(y_train), torch.LongTensor(y_test), torch.BoolTensor(mask_train), torch.BoolTensor(mask_test), n_epochs=3, batch_size=64, lr=0.0001, max_samples=None, weight_true=0.5)

epoch 0 | train loss 0.3897662138819096 | train acc 0.5010992288589478 | validation loss 0.39236886160714285 | validation acc 0.4977678656578064
epoch 1 | train loss 0.3897738816151068 | train acc 0.5010894536972046 | validation loss 0.39236886160714285 | validation acc 0.4977678656578064
epoch 2 | train loss 0.3897662138819096 | train acc 0.5010992288589478 | validation loss 0.39243114237882654 | validation acc 0.4976881444454193


In [238]:
model

MyAttentionModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=16, out_features=16, bias=True)
        )
        (linear1): Linear(in_features=16, out_features=2, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2, out_features=16, bias=True)
        (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (encoder): Embedding(8420, 16)
  (decoder): Linear(in_features=4528, out_features=1, bias=True)
)

In [239]:
nbr_params = 0
for parameter in model.parameters():
    nbr_params = nbr_params + len(parameter)
nbr_params

8650

In [None]:
MODEL_PATH = input("Specify the path you wish to save the Attention model to: ")
torch.save(model.state_dict(), MODEL_PATH)

In [None]:
model = MyAttentionModelWithPooling(vocab_size=len(voc), embedding_dim=32, max_seq_len=max_sequence_length, num_heads=4, dim_feedforward=16, num_layers=1, dropout=0.1)
model.load_state_dict(torch.load(MODEL_PATH))
model.eval()