# Sentiment Analysis (Stanford Sentiment Treebank)

---

**Various models for sentiment analysis on Stanford Sentiment Treebank dataset**

In [1]:
!pip install pytorch-lightning -q

[K     |████████████████████████████████| 409kB 9.0MB/s 
[K     |████████████████████████████████| 829kB 19.0MB/s 
[K     |████████████████████████████████| 276kB 29.8MB/s 
[K     |████████████████████████████████| 2.8MB 46.8MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone
[31mERROR: tensorflow 2.3.0 has requirement tensorboard<3,>=2.3.0, but you'll have tensorboard 2.2.0 which is incompatible.[0m


In [3]:
# import necessary modules

import os
import torch
import torchtext

import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint

from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import (pad_sequence, pack_padded_sequence, 
                                  pad_packed_sequence)

from collections import namedtuple, Counter

## Multi-Layer Perceptron

The first model is based on an architecture presented in the paper, [Bag of Tricks for Efficient Text Classification](https://www.aclweb.org/anthology/E17-2068/). In particular, we use a Multi-Layer Perceptron (MLP) to predict sentiment using an aggregated representation of the input. The input is tokenized into individual words, and each word is then represented by its embedding. Inputs can be of different length, thus, the embeddings are aggregated by the use of a pooling function (often mean-pooling - that is, simply the element-wise mean) to create a fixed-size representation.

Our implementation differs slightly from the presentation in the paper, since we make use of pre-trained word embeddings and do not implement a hierarchical softmax. Regarding the use of embeddings, there are two options: (1) keep the pre-trained embeddings fixed during training, and (2) include the embeddings in the optimisation. In our implementation we choose to optimise the embeddings.

### Pre-processing

In [4]:
# use a named-tuple to store data examples
Example = namedtuple("Example", ["text", "label"])

In [5]:
class SST_Dataset(Dataset):
    
    def __init__(self, dataset):
        self.dataset = dataset
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx): 
        element = self.dataset[idx] 
        X = element.text
        Y = element.label
        return X, Y

In [6]:
class SSTDataModuleBase(pl.LightningDataModule):

    def __init__(self):
        super().__init__()

        self.PAD_token = 0
        self.UNK_token = 1
        self.SOS_token = 2
        self.EOS_token = 3

        self.targetEncoding = {'negative': 0, 'positive': 1}
        
    
    def _format_data(self, dataset):
        
        tokenized_dataset = []
        for element in dataset:
            encoding = self._tokenize(element)
            tokenized_dataset.append(Example(text=encoding[0], label=encoding[1]))

        return tokenized_dataset


    def embedding_matrix(self):

        glove = torchtext.vocab.GloVe(name='6B', dim=300, 
                                      unk_init = torch.Tensor.normal_)
        matrix_len = len(self._wordlist)
        weights_matrix = np.zeros((matrix_len, 300))

        for i, word in enumerate(self._wordlist):
            try: 
                weights_matrix[i] = glove.vectors[glove.stoi[word]]
            except KeyError:
                weights_matrix[i] = np.random.normal(scale=0.5, size=(300, ))

        return weights_matrix   

    @staticmethod
    def _flatten(lst):
        return [item for sublist in lst for item in sublist]
        
    def _build_vocab(self, data):
        vocab_counter = Counter(self._flatten([example.text for example in data]))
        return vocab_counter


    def _build_encoding(self, vocab_count, min_freq=3):

        self._wordlist = ["<PAD>", "<UNK>", "<SOS>", "<EOS>"]
        self.encoding = {}

        svocabCount = {k: v for k, v in reversed(sorted(vocab_count.items(), 
                                        key=lambda item: item[1]))}

        for word in svocabCount:
            if svocabCount[word] >= min_freq:
                self._wordlist.append(word)
        self.encoding.update({tok: i for i, tok in enumerate(self._wordlist)})
        
    def _tokenize(self, element):

        text = (torch.tensor([self.SOS_token] + 
          [self.encoding.get(word, self.UNK_token) for word in element.text] + 
          [self.EOS_token]))
        label = torch.tensor(self.targetEncoding[element.label])

        return text, label  

    def setup(self, stage=None, min_freq=3):

        TEXT = torchtext.data.Field(tokenize='spacy', lower=True)
        LABEL = torchtext.data.Field(sequential=False)
        
        train_data, val_data, test_data = torchtext.datasets.SST.splits(TEXT, 
          LABEL, filter_pred=lambda ex: ex.label != 'neutral', train_subtrees=True)
        
      
        vocab_counter = self._build_vocab(train_data)
        self._build_encoding(vocab_counter, min_freq)

        if stage == 'fit' or stage is None:
            self.sst_train = SST_Dataset(self._format_data(train_data))
            self.sst_val = SST_Dataset(self._format_data(val_data))


        if stage == 'test' or stage is None:
            self.sst_test = SST_Dataset(self._format_data(test_data))

    def train_dataloader(self):
        raise NotImplementedError

    def val_dataloader(self):
        raise NotImplementedError

    def test_dataloader(self):
        raise NotImplementedError


In [7]:
class SSTDataModuleMLP(SSTDataModuleBase):

    def __init__(self):
        super().__init__()

    @staticmethod
    def _collate_fn(batch):
        # get data and targets from batch
        data = [item[0] for item in batch]
        targets = [item[1] for item in batch]
        lengths = [len(el) for el in data]
        offsets = np.cumsum(lengths)
        offsets = np.concatenate([[0], offsets[:-1]])

        return (torch.LongTensor(torch.cat(data).long()), 
                torch.Tensor(targets).float(), 
                torch.LongTensor(offsets))
        
    def train_dataloader(self):
        return DataLoader(self.sst_train, batch_size=64, 
                              collate_fn=self._collate_fn, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.sst_val, batch_size=64, 
                              collate_fn=self._collate_fn)

    def test_dataloader(self):
        return DataLoader(self.sst_test, batch_size=64,
                              collate_fn=self._collate_fn)

### Model

In [8]:
ds = SSTDataModuleMLP()
ds.setup(min_freq=3)

downloading trainDevTestTrees_PTB.zip


trainDevTestTrees_PTB.zip: 100%|██████████| 790k/790k [00:01<00:00, 500kB/s]


extracting


In [9]:
class FastText(pl.LightningModule):
    
    def __init__(self, input_size, embed_mat=None):
        
        super().__init__()

        self.embedding = nn.EmbeddingBag(input_size, 300, mode="mean")
        if embed_mat is not None:
            self.embedding = self.embedding.from_pretrained(torch.from_numpy(embed_mat).float())
        self.fc = nn.Sequential(
            nn.Linear(300, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 1),
        )

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.0001, 
                                     weight_decay=1e-05)
        return optimizer

    def forward(self, batch):

        inputs, _, offsets = batch
        # inputs: [SUM(SEQ_LENGTHS)]

        x = self.embedding(inputs, offsets)
        # x: [BATCH_SIZE, EMBED_DIM]

        x = self.fc(x).squeeze()
        # x: [BATCH_SIZE]
        
        return x

    def training_step(self, batch, batch_idx):

        x, y, offsets = batch
        y_hat = self(batch)

        # compute loss
        loss = F.binary_cross_entropy_with_logits(y_hat, y)

        return {'loss': loss, 
                "batch_size": len(y)}

    def training_epoch_end(self, outputs):

        total = sum([x['batch_size'] for x in outputs])
        avg_loss = sum([x['loss']*x['batch_size'] for x in outputs])/total

        print(f"Epoch {self.current_epoch}:\t Train loss: {avg_loss:.4f}")
        return {'avg_train_loss': avg_loss}

    def validation_step(self, batch, batch_idx):

        x, y, offsets = batch
        y_hat = self(batch)

        # compute loss
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        
        # compute acc
        preds = torch.round(torch.sigmoid(y_hat))
        correct = (preds == y).float().sum()
        acc = correct/len(y)

        return {"loss": loss, 
                "acc": acc, 
                "batch_size": len(y)}

    def validation_epoch_end(self, outputs, mode="val"):
      
        total = sum([x['batch_size'] for x in outputs])
        avg_loss = sum([x['loss']*x['batch_size'] for x in outputs])/total
        avg_acc = sum([x['acc']*x['batch_size'] for x in outputs])/total
      
        if mode=='val':
            print(f"Epoch {self.current_epoch}:\t Validation acc: {avg_acc:.4f}\t Validation loss: {avg_loss:.4f}")
        
        return {"epoch_val_loss": avg_loss, "epoch_val_acc": avg_acc}

    def test_step(self, batch, batch_idx):

        return self.validation_step(batch, batch_idx)
      

    def test_epoch_end(self, outputs):

        outputs = self.validation_epoch_end(outputs, mode="test")
        return {"test_loss": outputs['epoch_val_loss'], 
                "test_acc": outputs['epoch_val_acc']
                }
      

In [10]:
# load the pre-trained embedding - this
# may take a couple minutes at first execution.
embed_mat = ds.embedding_matrix()

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
100%|█████████▉| 399668/400000 [00:35<00:00, 11085.67it/s]

In [11]:
model = FastText(len(ds.encoding), embed_mat=embed_mat)


early_stop_callback = EarlyStopping(
   monitor='epoch_val_loss',
   min_delta=0.0001,
   patience=3,
   verbose=False,
   mode='min'
)

checkpoint_callback = ModelCheckpoint(
    filepath='./checkpoints/'+'{epoch}',
    save_top_k=1,
    verbose=False,
    monitor='epoch_val_loss',
    mode='min',
    prefix=model.__class__.__name__+"_"
)

trainer = pl.Trainer(gpus=1, progress_bar_refresh_rate=0, max_epochs=30, 
                     num_sanity_val_steps=0, 
                     early_stop_callback=early_stop_callback,
                     checkpoint_callback=checkpoint_callback)
trainer.fit(model, ds)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type         | Params
-------------------------------------------
0 | embedding | EmbeddingBag | 4 M   
1 | fc        | Sequential   | 77 K  


Epoch 0:	 Validation acc: 0.7580	 Validation loss: 0.4897
Epoch 0:	 Train loss: 0.4815
Epoch 1:	 Validation acc: 0.7638	 Validation loss: 0.4723
Epoch 1:	 Train loss: 0.3753
Epoch 2:	 Validation acc: 0.7695	 Validation loss: 0.4649
Epoch 2:	 Train loss: 0.3624
Epoch 3:	 Validation acc: 0.7672	 Validation loss: 0.4607
Epoch 3:	 Train loss: 0.3569
Epoch 4:	 Validation acc: 0.7706	 Validation loss: 0.4590
Epoch 4:	 Train loss: 0.3523
Epoch 5:	 Validation acc: 0.7672	 Validation loss: 0.4585
Epoch 5:	 Train loss: 0.3480
Epoch 6:	 Validation acc: 0.7661	 Validation loss: 0.4564
Epoch 6:	 Train loss: 0.3441
Epoch 7:	 Validation acc: 0.7683	 Validation loss: 0.4558
Epoch 7:	 Train loss: 0.3401
Epoch 8:	 Validation acc: 0.7695	 Validation loss: 0.4536
Epoch 8:	 Train loss: 0.3368
Epoch 9:	 Validation acc: 0.7729	 Validation loss: 0.4524
Epoch 9:	 Train loss: 0.3336
Epoch 10:	 Validation acc: 0.7683	 Validation loss: 0.4518
Epoch 10:	 Train loss: 0.3305
Epoch 11:	 Validation acc: 0.7787	 Valida

Saving latest checkpoint..


Epoch 22:	 Validation acc: 0.7867	 Validation loss: 0.4419
Epoch 22:	 Train loss: 0.2989


1

In [12]:
test_results = trainer.test(model, 
                           ds.test_dataloader(), 
                           ckpt_path=checkpoint_callback.best_model_path)

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': tensor(0.8023, device='cuda:0'),
 'test_loss': tensor(0.4259, device='cuda:0')}
--------------------------------------------------------------------------------




## 2. LSTM

The appeal of the `FastText` model is its simplicity, however, it suffers from some inherent disadvantages. In particular, the model does not consider word order. The [paper](https://www.aclweb.org/anthology/E17-2068/) addresses this issue by including additional n-gram features to capture local word ordering. An alternative approach is to use a different model architecture designed for sequential data, recurrent neural networks. We now implement a single-layer LSTM for the sentiment classification task.

### Pre-processing

In [13]:
class SSTDataModuleLSTM(SSTDataModuleBase):

    def __init__(self):
        super().__init__()

    @staticmethod
    def _collate_fn(batch):
        # get inputs and targets
        data = [item[0] for item in batch]
        targets = [item[1] for item in batch]

        # to be able to pack sequences later on, need
        # the original sequence lengths
        seqlengths = [len(el) for el in data]
    
        # pad the sequences
        x = pad_sequence(data, batch_first=True)

        return (x, torch.Tensor(targets).float(), 
                      seqlengths)
        
    def train_dataloader(self):
        return DataLoader(self.sst_train, batch_size=64, 
                              collate_fn=self._collate_fn, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.sst_val, batch_size=64, 
                              collate_fn=self._collate_fn)

    def test_dataloader(self):
        return DataLoader(self.sst_test, batch_size=64,
                              collate_fn=self._collate_fn)

### Model

In [15]:
class SentimentLSTM(pl.LightningModule):

    def __init__(self, input_size, embed_mat):
    
        super().__init__()    

        self.embedding = nn.Embedding(input_size, 300, padding_idx=0)
        if embed_mat is not None:
            self.embedding = self.embedding.from_pretrained(torch.from_numpy(embed_mat).float())
        self.lstm = nn.LSTM(300, 256, num_layers = 1, batch_first = True)
        self.linear1 = nn.Linear(256, 64)
        self.linear2 = nn.Linear(64, 1)
        self.dropout = nn.Dropout()

    def forward(self, batch):

        inputs, _, seqlengths = batch
        # inputs: [BATCH_SIZE, LONGEST_SEQ]
    
        embeds = self.embedding(inputs.long())
        # embeds: [BATCH_SIZE, LONGEST_SEQ, EMBED_DIM]

        embeds = self.dropout(embeds)

        inputs = pack_padded_sequence(embeds, seqlengths, 
              enforce_sorted=False, batch_first=True)
        # inputs: [SUM(SEQ_LENGTHS), EMBED_DIM)

        packed_output, (hidden, cell) = self.lstm(inputs)
        # packed_outputs: [SUM(SEQ_LENGTHS), LSTM_OUT]
        # hidden: [1, BATCH_SIZE, LSTM_OUT]

        lastState = hidden[-1]
        # lastState: [BATCH_SIZE, LSTM_OUT]

        output = self.dropout(F.relu(self.linear1(lastState)))
        output = self.linear2(output).squeeze()

        return output

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-4, 
                                     weight_decay=1e-05)
        return optimizer

    def training_step(self, batch, batch_idx):

        x, y , seqlengths = batch
        y_hat = self(batch)

        # compute loss
        loss = F.binary_cross_entropy_with_logits(y_hat, y)

        return {'loss': loss, 
                "batch_size": len(y)}

    def training_epoch_end(self, outputs):

        total = sum([x['batch_size'] for x in outputs])
        avg_loss = sum([x['loss']*x['batch_size'] for x in outputs])/total

        print(f"Epoch {self.current_epoch}:\t Train loss: {avg_loss:.4f}")
        return {'avg_train_loss': avg_loss}

    
    def validation_step(self, batch, batch_idx):

        x, y, offsets = batch
        y_hat = self(batch)

        # compute loss
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        
        # compute acc
        preds = torch.round(torch.sigmoid(y_hat))
        correct = (preds == y).float().sum()
        acc = correct/len(y)

        return {"loss": loss, 
                "acc": acc, 
                "batch_size": len(y)}

    def validation_epoch_end(self, outputs, mode="val"):
      
        total = sum([x['batch_size'] for x in outputs])
        avg_loss = sum([x['loss']*x['batch_size'] for x in outputs])/total
        avg_acc = sum([x['acc']*x['batch_size'] for x in outputs])/total

        if mode=='val':
            print(f"Epoch {self.current_epoch}:\t Validation acc: {avg_acc:.4f}\t Validation loss: {avg_loss:.4f}")

        return {"epoch_val_loss": avg_loss, "epoch_val_acc": avg_acc}

    def test_step(self, batch, batch_idx):

        return self.validation_step(batch, batch_idx)
      
    def test_epoch_end(self, outputs):

        outputs = self.validation_epoch_end(outputs, mode="test")
        return {"test_loss": outputs['epoch_val_loss'], 
                "test_acc": outputs['epoch_val_acc']}

In [16]:
ds = SSTDataModuleLSTM()
ds.setup(min_freq=3)
embed_mat = ds.embedding_matrix()

In [17]:
model = SentimentLSTM(len(ds.encoding), embed_mat=embed_mat)

early_stop_callback = EarlyStopping(
   monitor='epoch_val_loss',
   min_delta=0.0001,
   patience=5,
   verbose=False,
   mode='min'
)

checkpoint_callback = ModelCheckpoint(
    filepath='./checkpoints/'+'{epoch}',
    save_top_k=1,
    verbose=False,
    monitor='epoch_val_loss',
    mode='min',
    prefix=model.__class__.__name__+"_"
)


trainer = pl.Trainer(gpus=1, progress_bar_refresh_rate=0, max_epochs=30, 
                     num_sanity_val_steps=0, 
                     early_stop_callback=early_stop_callback,
                     checkpoint_callback=checkpoint_callback)
trainer.fit(model, ds)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4 M   
1 | lstm      | LSTM      | 571 K 
2 | linear1   | Linear    | 16 K  
3 | linear2   | Linear    | 65    
4 | dropout   | Dropout   | 0     


Epoch 0:	 Validation acc: 0.7775	 Validation loss: 0.4544
Epoch 0:	 Train loss: 0.4833
Epoch 1:	 Validation acc: 0.7936	 Validation loss: 0.4476
Epoch 1:	 Train loss: 0.4026
Epoch 2:	 Validation acc: 0.7878	 Validation loss: 0.4417
Epoch 2:	 Train loss: 0.3812
Epoch 3:	 Validation acc: 0.8085	 Validation loss: 0.4194
Epoch 3:	 Train loss: 0.3599
Epoch 4:	 Validation acc: 0.8108	 Validation loss: 0.4111
Epoch 4:	 Train loss: 0.3443
Epoch 5:	 Validation acc: 0.8142	 Validation loss: 0.4158
Epoch 5:	 Train loss: 0.3315
Epoch 6:	 Validation acc: 0.7947	 Validation loss: 0.4708
Epoch 6:	 Train loss: 0.3194
Epoch 7:	 Validation acc: 0.8280	 Validation loss: 0.3892
Epoch 7:	 Train loss: 0.3087
Epoch 8:	 Validation acc: 0.8280	 Validation loss: 0.3967
Epoch 8:	 Train loss: 0.2982
Epoch 9:	 Validation acc: 0.8177	 Validation loss: 0.3964
Epoch 9:	 Train loss: 0.2886
Epoch 10:	 Validation acc: 0.8314	 Validation loss: 0.4092
Epoch 10:	 Train loss: 0.2783
Epoch 11:	 Validation acc: 0.8257	 Valida

Saving latest checkpoint..


Epoch 12:	 Validation acc: 0.8154	 Validation loss: 0.4435
Epoch 12:	 Train loss: 0.2646


1

In [18]:
test_results = trainer.test(model, 
                            ds.test_dataloader(), 
                            ckpt_path=checkpoint_callback.best_model_path)

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': tensor(0.8325, device='cuda:0'),
 'test_loss': tensor(0.3854, device='cuda:0')}
--------------------------------------------------------------------------------




## Attention-LSTM

In the LSTM-model presented above, the dense layer receives the last hidden state as input. This implies that all the information required to make a prediction must be contained in the last hidden state, thus creating a bottleneck of information.

Rather than using the last hidden state as input, we can compute a representation through the use of weighted sum of all the hidden states by using an attention mechanism. In that way, we can "focus" on words that heavily influence the sentiment of the sentence.

The model implemented is the word-sequence and word-attention model presented in [Hierarchical Attention Networks for Document Classification](https://www.aclweb.org/anthology/N16-1174/).

In [31]:
class Attention(nn.Module):

    def __init__(self, hidden_size, att_dim):
        super().__init__()
        self.w = nn.Linear(2*hidden_size, att_dim)
        self.u_w = nn.Linear(att_dim, 1)

    def forward(self, outputs, mask):

        outputs = outputs.permute(0,2,1)
        # outputs: [BATCH_SIZE, LSTM_OUT, LONGEST_SEQ]

        # compute u_{it} representations of each of the hidden 
        u_it = torch.einsum('ki,bij->bkj', self.w.weight, outputs)
        # u_it: [BATCH_SIZE, ATTN_OUT, LONGEST_SEQ]
        
        
        # compute alpha
        alpha = torch.einsum('ij, bjk->bk', self.u_w.weight, u_it)
        # alpha: [BATCH_SIZE, LONGEST_SEQ]

        # use mask
        alpha = alpha.masked_fill(mask == 0, -1e-10)

        return F.softmax(alpha, dim=1)

In [32]:
class AttentionLSTM(pl.LightningModule):

    def __init__(self, input_size, embed_mat):
    
        super().__init__()    

        self.embedding = nn.Embedding(input_size, 300, padding_idx=0)
        if embed_mat is not None:
            self.embedding = self.embedding.from_pretrained(torch.from_numpy(embed_mat).float())
        self.attention = Attention(256, 200)
        self.lstm = nn.LSTM(300, 256, bidirectional=True, batch_first = True)
        self.fc1 = nn.Linear(256*2, 64)
        self.fc2 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.5)
        self.embed_dropout = nn.Dropout(0.1)

    def _create_mask(self, inputs):
        mask = (inputs != 0)
        return mask 

    def forward(self, batch):

        inputs, _, seqlengths = batch
        # inputs: [BATCH_SIZE, LONGEST_SEQ]

        mask = self._create_mask(inputs)
        # mask: [BATCH_SIZE, 1, LONGEST_SEQ]

        embeds = self.embedding(inputs.long())
        # embeds: [BATCH_SIZE, LONGEST_SEQ, EMBED_DIM]

        embeds = self.embed_dropout(embeds)

        inputs = pack_padded_sequence(embeds, seqlengths, 
              enforce_sorted=False, batch_first=True)
        # inputs: [SUM(SEQ_LENGTHS), EMBED_DIM)
     
        packed_outputs, (hidden, cell) = self.lstm(inputs)
        # packed_outputs: [SUM(SEQ_LENGTHS), N_DIR * LSTM_OUT]
        # hidden: [N_DIR * N_LAYERS, BATCH_SIZE, LSTM_OUT]

        outputs, _ = pad_packed_sequence(packed_outputs, batch_first=True)
        # outputs: [BATCH_SIZE, LONGEST_SEQ, N_DIR * LSTM_OUT]

        a = self.attention(outputs, mask)
        # a: [BATCH_SIZE, LONGEST_SEQ]

        context = torch.einsum('bj, bjk -> bk', a, outputs)
        # context: [BATCH_SIZE, N_DIR * LSTM_OUT]
        
        # linear layer
        output = self.dropout(F.relu(self.fc1(context)))
        output = self.fc2(output)

        return output.squeeze()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.0001, 
                                     weight_decay=1e-05)
        return optimizer

    def training_step(self, batch, batch_idx):

        x, y , seqlengths = batch
        y_hat = self(batch)

        # compute loss
        loss = F.binary_cross_entropy_with_logits(y_hat, y)

        return {'loss': loss, 
                "batch_size": len(y)}

    def training_epoch_end(self, outputs):

        total = sum([x['batch_size'] for x in outputs])
        avg_loss = sum([x['loss']*x['batch_size'] for x in outputs])/total

        print(f"Epoch {self.current_epoch}:\t Train loss: {avg_loss:.4f}")
        return {'avg_train_loss': avg_loss}

    
    def validation_step(self, batch, batch_idx):

        x, y, offsets = batch
        y_hat = self(batch)

        # compute loss
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        
        # compute acc
        preds = torch.round(torch.sigmoid(y_hat))
        correct = (preds == y).float().sum()
        acc = correct/len(y)

        return {"loss": loss, 
                "acc": acc, 
                "batch_size": len(y)}

    def validation_epoch_end(self, outputs, mode="val"):
      
        total = sum([x['batch_size'] for x in outputs])
        avg_loss = sum([x['loss']*x['batch_size'] for x in outputs])/total
        avg_acc = sum([x['acc']*x['batch_size'] for x in outputs])/total
      
        if mode=='val':
            print(f"Epoch {self.current_epoch}:\t Validation acc: {avg_acc:.4f}\t Validation loss: {avg_loss:.4f}")

        return {"epoch_val_loss": avg_loss, "epoch_val_acc": avg_acc}

    def test_step(self, batch, batch_idx):

        return self.validation_step(batch, batch_idx)
      

    def test_epoch_end(self, outputs):

        outputs = self.validation_epoch_end(outputs, mode="test")
        return {"test_loss": outputs['epoch_val_loss'], 
                "test_acc": outputs['epoch_val_acc']
                }
    

In [33]:
model = AttentionLSTM(len(ds.encoding), embed_mat=embed_mat)

early_stop_callback = EarlyStopping(
   monitor='epoch_val_loss',
   min_delta=0.0001,
   patience=10,
   verbose=False,
   mode='min'
)

checkpoint_callback = ModelCheckpoint(
    filepath='./checkpoints/'+'{epoch}',
    save_top_k=1,
    verbose=False,
    monitor='epoch_val_loss',
    mode='min',
    prefix=model.__class__.__name__+"_"
)


trainer = pl.Trainer(gpus=1, progress_bar_refresh_rate=0, max_epochs=50, 
                     num_sanity_val_steps=0, 
                     early_stop_callback=early_stop_callback,
                     checkpoint_callback=checkpoint_callback)
trainer.fit(model, ds)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type      | Params
--------------------------------------------
0 | embedding     | Embedding | 4 M   
1 | attention     | Attention | 102 K 
2 | lstm          | LSTM      | 1 M   
3 | fc1           | Linear    | 32 K  
4 | fc2           | Linear    | 65    
5 | dropout       | Dropout   | 0     
6 | embed_dropout | Dropout   | 0     


Epoch 0:	 Validation acc: 0.7821	 Validation loss: 0.4538
Epoch 0:	 Train loss: 0.4089
Epoch 1:	 Validation acc: 0.8073	 Validation loss: 0.4214
Epoch 1:	 Train loss: 0.3346
Epoch 2:	 Validation acc: 0.8177	 Validation loss: 0.4063
Epoch 2:	 Train loss: 0.3058
Epoch 3:	 Validation acc: 0.8211	 Validation loss: 0.3970
Epoch 3:	 Train loss: 0.2824
Epoch 4:	 Validation acc: 0.8360	 Validation loss: 0.3754
Epoch 4:	 Train loss: 0.2646
Epoch 5:	 Validation acc: 0.8314	 Validation loss: 0.3844
Epoch 5:	 Train loss: 0.2460
Epoch 6:	 Validation acc: 0.8303	 Validation loss: 0.4043
Epoch 6:	 Train loss: 0.2318
Epoch 7:	 Validation acc: 0.8245	 Validation loss: 0.4055
Epoch 7:	 Train loss: 0.2161
Epoch 8:	 Validation acc: 0.8394	 Validation loss: 0.3940
Epoch 8:	 Train loss: 0.2046
Epoch 9:	 Validation acc: 0.8268	 Validation loss: 0.4221
Epoch 9:	 Train loss: 0.1912
Epoch 10:	 Validation acc: 0.8383	 Validation loss: 0.4076
Epoch 10:	 Train loss: 0.1794
Epoch 11:	 Validation acc: 0.8463	 Valida

Saving latest checkpoint..


Epoch 14:	 Validation acc: 0.8337	 Validation loss: 0.4407
Epoch 14:	 Train loss: 0.1438


1

In [34]:
test_results = trainer.test(model, 
                            ds.test_dataloader(), 
                            ckpt_path=checkpoint_callback.best_model_path)

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': tensor(0.8413, device='cuda:0'),
 'test_loss': tensor(0.4147, device='cuda:0')}
--------------------------------------------------------------------------------




## CNN

We can use a CNN for text classification as presented in [Convolutional Neural Networks for Sentence Classification](https://www.aclweb.org/anthology/D14-1181/), by using convolution operations on the embeddings.

### Pre-processing

In [36]:
class SSTDataModuleCNN(SSTDataModuleBase):

    def __init__(self):
        super().__init__()

    @staticmethod
    def _collate_fn(batch):
        # get inputs and targets
        data = [item[0] for item in batch]
        targets = [item[1] for item in batch]
    
        # pad the sequences
        x = pad_sequence(data, batch_first=True)

        return x, torch.Tensor(targets).float()
        
    def train_dataloader(self):
        return DataLoader(self.sst_train, batch_size=64, 
                              collate_fn=self._collate_fn, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.sst_val, batch_size=64, 
                              collate_fn=self._collate_fn)

    def test_dataloader(self):
        return DataLoader(self.sst_test, batch_size=64,
                              collate_fn=self._collate_fn)

### Model

In [46]:
# TODO: rename variables
class TextCNN(pl.LightningModule):
    
    def __init__(self, input_size, embed_mat=None):
        
        super().__init__()
        self.embedding = nn.Embedding(input_size, 300, padding_idx=0)
        if embed_mat is not None:
            self.embedding = self.embedding.from_pretrained(torch.from_numpy(embed_mat).float())

        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = 300, 
                                              out_channels = 100, 
                                              kernel_size = fs)
                                    for fs in [3,4,5]
                                    ])
        
        self.fc = nn.Linear(3 * 100, 1)
        
        self.dropout = nn.Dropout()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.0001, 
                                     weight_decay=1e-05)
        return optimizer

    def forward(self, batch):

        inputs, _ = batch
        # inputs:  [BATCH_SIZE, LONGEST_SEQ]
        
        embeds = self.embedding(inputs).permute(0,2,1)
        # embeds = [BATCH_SIZE, EMBED_DIM, LONGEST_SEQ]
        
        convs = [F.relu(conv(embeds)) for conv in self.convs]
        # convs: List[[BATCH_SIZE, CONV_OUT_DIM, LONGEST_SEQ - KERNEL_SIZE + 1]]  
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in convs]
        # pooled: List[[BATCH_SIZE, CONV_OUT_DIM]]
        
        conv_out = self.dropout(torch.cat(pooled, dim = 1))
        # conv_out: [BATCH_SIZE, N_FILTERS * CONV_OUT_DIM]
            
        return self.fc(conv_out).squeeze()

    def training_step(self, batch, batch_idx):

        x, y = batch
        y_hat = self(batch)

        # compute loss
        loss = F.binary_cross_entropy_with_logits(y_hat, y)

        return {'loss': loss, 
                "batch_size": len(y)}

    def training_epoch_end(self, outputs):

        total = sum([x['batch_size'] for x in outputs])
        avg_loss = sum([x['loss']*x['batch_size'] for x in outputs])/total

        print(f"Epoch {self.current_epoch}:\t Train loss: {avg_loss:.4f}")
        return {'avg_train_loss': avg_loss}

    def validation_step(self, batch, batch_idx):

        x, y = batch
        y_hat = self(batch)

        # compute loss
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        
        # compute acc
        preds = torch.round(torch.sigmoid(y_hat))
        correct = (preds == y).float().sum()
        acc = correct/len(y)

        return {"loss": loss, 
                "acc": acc, 
                "batch_size": len(y)}

    def validation_epoch_end(self, outputs, mode="val"):
      
        total = sum([x['batch_size'] for x in outputs])
        avg_loss = sum([x['loss']*x['batch_size'] for x in outputs])/total
        avg_acc = sum([x['acc']*x['batch_size'] for x in outputs])/total
      
        if mode=='val':
            print(f"Epoch {self.current_epoch}:\t Validation acc: {avg_acc:.4f}\t Validation loss: {avg_loss:.4f}")
        
        return {"epoch_val_loss": avg_loss, "epoch_val_acc": avg_acc}

    def test_step(self, batch, batch_idx):

        return self.validation_step(batch, batch_idx)
      

    def test_epoch_end(self, outputs):

        outputs = self.validation_epoch_end(outputs, mode="test")
        return {"test_loss": outputs['epoch_val_loss'], 
                "test_acc": outputs['epoch_val_acc']
                }

In [38]:
ds = SSTDataModuleCNN()
ds.setup(min_freq=3)
embed_mat = ds.embedding_matrix()

In [47]:
model = TextCNN(len(ds.encoding), embed_mat=embed_mat)

early_stop_callback = EarlyStopping(
   monitor='epoch_val_loss',
   min_delta=0.0001,
   patience=5,
   verbose=False,
   mode='min'
)

checkpoint_callback = ModelCheckpoint(
    filepath='./checkpoints/'+'{epoch}',
    save_top_k=1,
    verbose=False,
    monitor='epoch_val_loss',
    mode='min',
    prefix=model.__class__.__name__+"_"
)


trainer = pl.Trainer(gpus=1, progress_bar_refresh_rate=0, max_epochs=30, 
                     num_sanity_val_steps=0, 
                     early_stop_callback=early_stop_callback,
                     checkpoint_callback=checkpoint_callback)
trainer.fit(model, ds)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params
-----------------------------------------
0 | embedding | Embedding  | 4 M   
1 | convs     | ModuleList | 360 K 
2 | fc        | Linear     | 301   
3 | dropout   | Dropout    | 0     


Epoch 0:	 Validation acc: 0.7924	 Validation loss: 0.4427
Epoch 0:	 Train loss: 0.4201
Epoch 1:	 Validation acc: 0.8016	 Validation loss: 0.4236
Epoch 1:	 Train loss: 0.3209
Epoch 2:	 Validation acc: 0.7947	 Validation loss: 0.4177
Epoch 2:	 Train loss: 0.2948
Epoch 3:	 Validation acc: 0.7993	 Validation loss: 0.4236
Epoch 3:	 Train loss: 0.2759
Epoch 4:	 Validation acc: 0.8016	 Validation loss: 0.4195
Epoch 4:	 Train loss: 0.2592
Epoch 5:	 Validation acc: 0.8028	 Validation loss: 0.4072
Epoch 5:	 Train loss: 0.2422
Epoch 6:	 Validation acc: 0.8085	 Validation loss: 0.4087
Epoch 6:	 Train loss: 0.2274
Epoch 7:	 Validation acc: 0.8062	 Validation loss: 0.4110
Epoch 7:	 Train loss: 0.2146
Epoch 8:	 Validation acc: 0.8154	 Validation loss: 0.4069
Epoch 8:	 Train loss: 0.2043
Epoch 9:	 Validation acc: 0.8073	 Validation loss: 0.4110
Epoch 9:	 Train loss: 0.1939
Epoch 10:	 Validation acc: 0.8154	 Validation loss: 0.4044
Epoch 10:	 Train loss: 0.1837
Epoch 11:	 Validation acc: 0.8131	 Valida

Saving latest checkpoint..


Epoch 15:	 Validation acc: 0.8177	 Validation loss: 0.4244
Epoch 15:	 Train loss: 0.1490


1

In [48]:
test_results = trainer.test(model, 
                            ds.test_dataloader(), 
                            ckpt_path=checkpoint_callback.best_model_path)

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': tensor(0.8413, device='cuda:0'),
 'test_loss': tensor(0.3777, device='cuda:0')}
--------------------------------------------------------------------------------




## Recurrent Neural Filters

We now present a variant of the CNN model which uses a recurrent network as a convolution filter. The embeddings are split into chunks, and an LSTM is applied to each chunk to compute a representation. The representations are then aggregated using a pooling operations. More details are in the paper, [Convolutional Neural Networks with Recurrent Neural Filters](https://www.aclweb.org/anthology/D18-1109/).

In [49]:
class TimeDistributedLSTM(pl.LightningModule):
    def __init__(self, time_axis):        
        super().__init__()

        self.time_axis = time_axis
        self.lstm = nn.LSTM(300, 300, batch_first = True)

    def forward(self, x):

        batch_size = x.shape[0]
        time_steps = x.shape[self.time_axis]
        embed_dim = x.shape[-1] 
        outputs = torch.zeros(x.shape[0], time_steps, embed_dim, device=self.device)
        
        for i in range(time_steps):
            x_input = torch.index_select(x, dim=self.time_axis, index=torch.tensor([i], device=self.device).long()).squeeze()
            
            output_t, (cell_t, hidden_t) = self.lstm(x_input)
            
            outputs[:, i, :] = hidden_t
           
        return outputs

In [50]:
def format_conv_input(x, filter_width, sent_len):

    chunks = []
    for i in range(sent_len - filter_width + 1):
        chunk = x[:, i:i+filter_width, :]
        chunk = chunk.unsqueeze(1)
        chunks.append(chunk)
    return torch.cat(chunks, 1)

In [56]:
class RNF(pl.LightningModule):
    
    def __init__(self, input_size, embed_mat=None):
        super().__init__()

        self.embedding = nn.Embedding(input_size, 300, padding_idx=0)
        if embed_mat is not None:
            self.embedding = self.embedding.from_pretrained(torch.from_numpy(embed_mat).float())

        self.filter_width = 5
        self.time_lstm = TimeDistributedLSTM(time_axis=1)

        self.fc = nn.Sequential(
            nn.Linear(300, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 1),
        )

    def forward(self, batch):

        inputs, _ = batch
        # inputs: [BATCH_SIZE, LONGEST_SEQ]
        
        embedded = self.embedding(inputs)       
        # embedded: [BATCH_SIZE, LONGEST_SEQ, EMBED_DIM]

        lstm_inputs = format_conv_input(embedded, 
                                        filter_width=self.filter_width, 
                                        sent_len=embedded.shape[1])
        # lstm_inputs: [BATCH SIZE, LONGEST SEQ - FILTER_WIDTH + 1, 
        # FILTER_WIDTH, EMBED_DIM]

        lstm_outputs = self.time_lstm(lstm_inputs)
        # lstm_outputs: [BATCH SIZE, LONGEST SEQ - FILTER_WIDTH + 1, 
        # FILTER_WIDTH, LSTM_OUT]

        lstm_outputs = F.max_pool1d(lstm_outputs.permute(0,2,1), 
                                    kernel_size=lstm_outputs.shape[1]).squeeze()
        # lstm_outputs: [BATCH SIZE, LSTM_OUT]
        
        outputs = self.fc(lstm_outputs)
        return outputs.squeeze()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.0001, 
                                     weight_decay=1e-05)
        return optimizer

    def training_step(self, batch, batch_idx):

        x, y = batch
        y_hat = self(batch)

        # compute loss
        loss = F.binary_cross_entropy_with_logits(y_hat, y)

        return {'loss': loss, 
                "batch_size": len(y)}

    def training_epoch_end(self, outputs):

        total = sum([x['batch_size'] for x in outputs])
        avg_loss = sum([x['loss']*x['batch_size'] for x in outputs])/total

        print(f"Epoch {self.current_epoch}:\t Train loss: {avg_loss:.4f}")
        return {'avg_train_loss': avg_loss}

    def validation_step(self, batch, batch_idx):

        x, y = batch
        y_hat = self(batch)

        # compute loss
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        
        # compute acc
        preds = torch.round(torch.sigmoid(y_hat))
        correct = (preds == y).float().sum()
        acc = correct/len(y)

        return {"loss": loss, 
                "acc": acc, 
                "batch_size": len(y)}

    def validation_epoch_end(self, outputs, mode="val"):
      
        total = sum([x['batch_size'] for x in outputs])
        avg_loss = sum([x['loss']*x['batch_size'] for x in outputs])/total
        avg_acc = sum([x['acc']*x['batch_size'] for x in outputs])/total
      
        if mode=='val':
            print(f"Epoch {self.current_epoch}:\t Validation acc: {avg_acc:.4f}\t Validation loss: {avg_loss:.4f}")
        
        return {"epoch_val_loss": avg_loss, "epoch_val_acc": avg_acc}

    def test_step(self, batch, batch_idx):

        return self.validation_step(batch, batch_idx)
      

    def test_epoch_end(self, outputs):

        outputs = self.validation_epoch_end(outputs, mode="test")
        return {"test_loss": outputs['epoch_val_loss'], 
                "test_acc": outputs['epoch_val_acc']
                }


In [None]:
ds = SSTDataModuleCNN()
ds.setup(min_freq=3)
emlbed_mat = ds.embedding_matrix()

In [57]:
model = RNF(len(ds.encoding), embed_mat=embed_mat)

early_stop_callback = EarlyStopping(
   monitor='epoch_val_loss',
   min_delta=0.0001,
   patience=5,
   verbose=False,
   mode='min'
)

checkpoint_callback = ModelCheckpoint(
    filepath='./checkpoints/'+'{epoch}',
    save_top_k=1,
    verbose=False,
    monitor='epoch_val_loss',
    mode='min',
    prefix=model.__class__.__name__+"_"
)


trainer = pl.Trainer(gpus=1, progress_bar_refresh_rate=0, max_epochs=30, 
                     num_sanity_val_steps=0, 
                     early_stop_callback=early_stop_callback,
                     checkpoint_callback=checkpoint_callback)
trainer.fit(model, ds)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                | Params
--------------------------------------------------
0 | embedding | Embedding           | 4 M   
1 | time_lstm | TimeDistributedLSTM | 722 K 
2 | fc        | Sequential          | 77 K  


Epoch 0:	 Validation acc: 0.7810	 Validation loss: 0.4485
Epoch 0:	 Train loss: 0.3957
Epoch 1:	 Validation acc: 0.7936	 Validation loss: 0.4311
Epoch 1:	 Train loss: 0.3145
Epoch 2:	 Validation acc: 0.8062	 Validation loss: 0.4130
Epoch 2:	 Train loss: 0.2826
Epoch 3:	 Validation acc: 0.8119	 Validation loss: 0.4066
Epoch 3:	 Train loss: 0.2540
Epoch 4:	 Validation acc: 0.8165	 Validation loss: 0.4076
Epoch 4:	 Train loss: 0.2246
Epoch 5:	 Validation acc: 0.8177	 Validation loss: 0.4065
Epoch 5:	 Train loss: 0.2000
Epoch 6:	 Validation acc: 0.8131	 Validation loss: 0.4149
Epoch 6:	 Train loss: 0.1779
Epoch 7:	 Validation acc: 0.8096	 Validation loss: 0.4280
Epoch 7:	 Train loss: 0.1599
Epoch 8:	 Validation acc: 0.8222	 Validation loss: 0.4298
Epoch 8:	 Train loss: 0.1438
Epoch 9:	 Validation acc: 0.8096	 Validation loss: 0.4515
Epoch 9:	 Train loss: 0.1298


Saving latest checkpoint..


Epoch 10:	 Validation acc: 0.8222	 Validation loss: 0.4782
Epoch 10:	 Train loss: 0.1184


1

In [58]:
test_results = trainer.test(model, 
                            ds.test_dataloader(), 
                            ckpt_path=checkpoint_callback.best_model_path)



--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': tensor(0.8298, device='cuda:0'),
 'test_loss': tensor(0.4330, device='cuda:0')}
--------------------------------------------------------------------------------


## Neural Semantic Encoders

The final model is an implemenations of [Neural Semantic Encoders](https://www.aclweb.org/anthology/E17-1038/).

In [59]:
class NSE(pl.LightningModule):


    def __init__(self, input_size, embed_mat=None):
        super().__init__()

        self.embedding = nn.Embedding(input_size, 300, padding_idx=0)
        if embed_mat is not None:
            self.embedding = self.embedding.from_pretrained(torch.from_numpy(embed_mat).float())

        self.read_lstm = nn.LSTM(300, 300, batch_first = True)
        self.write_lstm = nn.LSTM(2*300, 300, batch_first=True)
        self.compose_layer = nn.Linear(2*300, 2*300)

        self.fc = nn.Sequential(
            nn.Linear(300, 256),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(256, 1),
        )

    def _init_hidden(self, batch_size, hidden_dim):
        return (torch.zeros(1, batch_size, hidden_dim, device=self.device).requires_grad_(),
                torch.zeros(1, batch_size, hidden_dim, device=self.device).requires_grad_())
    
    def _compose(self, o_t, m_t):

        """
        Compose operation.
        Eq. (4)
        """
        c_t = self.compose_layer(torch.cat([o_t, m_t], dim=1))
        # c_t: [BATCH_SIZE, 2*N_UNITS]
        return c_t

    def _read(self, M_t, x_t, hidden):

        """
        Read operation.
        Eq. (1)-(3)
        """
        o_t, hidden = self.read_lstm(F.dropout(x_t, 0.3), hidden)
        # o_t: [BATCH_SIZE, 1, DIM]

        o_t = o_t.squeeze(1)
        # o_t: [BATCH_SIZE, DIM]
        
        z_t = F.softmax(torch.einsum("bo,bko->bk", o_t, M_t), dim=1)
        m_rt = torch.einsum("bk,bko->bo", z_t, M_t)
        return o_t, m_rt, z_t, hidden

    def _write(self, M_t, c_t, z_t, hidden):

        batch_size = c_t.shape[0]
        len = z_t.shape[1]
        dim = M_t.shape[2]

        h_t, hidden = self.write_lstm(F.dropout(c_t.unsqueeze(1), 0.3), hidden)
        z_t_e_k = torch.einsum('ki,kj->kji', [torch.ones(batch_size, dim, device=self.device), z_t])
        M_t = (1 - z_t_e_k) * M_t + torch.einsum('ki,kj->kij', 
            [torch.ones(batch_size,len, device=self.device), 
             h_t.squeeze(1)]) * z_t_e_k

        return M_t, h_t, hidden

    def forward(self, batch):
        
        inputs, _, seqlengths = batch
        # inputs: [BATCH_SIZE, LONGEST_SEQ]

        embeds = self.embedding(inputs.long())
        # embeds: [BATCH_SIZE, LONGEST_SEQ, EMBED_DIM]

        M_t = embeds

        all_outputs = torch.zeros(inputs.shape[0], inputs.shape[1], 
                                  300, device=self.device)
        idx = torch.tensor(seqlengths)
        idx = idx - 1 
        
        read_hidden = self._init_hidden(inputs.shape[0], 300)
        write_hidden = self._init_hidden(inputs.shape[0], 300)

        for i in range(inputs.shape[1]):

            x_t = torch.index_select(embeds, 1, torch.tensor([i]).long().cuda())
            # x_t: [BATCH_SIZE, 1, DIM]

            o_t, m_rt, z_t, read_hidden = self._read(M_t, x_t, read_hidden)

            c_t = self._compose(o_t, m_rt)

            M_t, h_t, write_hidden = self._write(M_t, c_t, z_t, write_hidden)

            all_outputs[:, i, :] = h_t.squeeze(1)

        output = all_outputs[torch.arange(all_outputs.size(0)), idx]
        output = self.fc(output)

        return output.squeeze()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.0001, 
                                     weight_decay=1e-05)
        return optimizer

    def training_step(self, batch, batch_idx):

        x, y , seqlengths = batch
        y_hat = self(batch)

        # compute loss
        loss = F.binary_cross_entropy_with_logits(y_hat, y)

        return {'loss': loss, 
                "batch_size": len(y)}

    def training_epoch_end(self, outputs):

        total = sum([x['batch_size'] for x in outputs])
        avg_loss = sum([x['loss']*x['batch_size'] for x in outputs])/total

        print(f"Epoch {self.current_epoch}:\t Train loss: {avg_loss:.4f}")
        return {'avg_train_loss': avg_loss}

    
    def validation_step(self, batch, batch_idx):

        x, y, offsets = batch
        y_hat = self(batch)

        # compute loss
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        
        # compute acc
        preds = torch.round(torch.sigmoid(y_hat))
        correct = (preds == y).float().sum()
        acc = correct/len(y)

        return {"loss": loss, 
                "acc": acc, 
                "batch_size": len(y)}

    def validation_epoch_end(self, outputs, mode="val"):
      
        total = sum([x['batch_size'] for x in outputs])
        avg_loss = sum([x['loss']*x['batch_size'] for x in outputs])/total
        avg_acc = sum([x['acc']*x['batch_size'] for x in outputs])/total
      
        if mode=='val':
            print(f"Epoch {self.current_epoch}:\t Validation acc: {avg_acc:.4f}\t Validation loss: {avg_loss:.4f}")

        return {"epoch_val_loss": avg_loss, "epoch_val_acc": avg_acc}

    def test_step(self, batch, batch_idx):

        return self.validation_step(batch, batch_idx)
      

    def test_epoch_end(self, outputs):

        outputs = self.validation_epoch_end(outputs, mode="test")
        return {"test_loss": outputs['epoch_val_loss'], 
                "test_acc": outputs['epoch_val_acc']
                }

In [60]:
ds = SSTDataModuleLSTM()
ds.setup(min_freq=3)
embed_mat = ds.embedding_matrix()

In [61]:
model = NSE(len(ds.encoding), embed_mat=embed_mat)

early_stop_callback = EarlyStopping(
   monitor='epoch_val_loss',
   min_delta=0.0001,
   patience=3,
   verbose=False,
   mode='min'
)

checkpoint_callback = ModelCheckpoint(
    filepath='./checkpoints/'+'{epoch}',
    save_top_k=1,
    verbose=False,
    monitor='epoch_val_loss',
    mode='min',
    prefix=model.__class__.__name__+"_"
)


trainer = pl.Trainer(gpus=1, progress_bar_refresh_rate=0, max_epochs=30, 
                     num_sanity_val_steps=0, 
                     early_stop_callback=early_stop_callback,
                     checkpoint_callback=checkpoint_callback)
trainer.fit(model, ds)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type       | Params
---------------------------------------------
0 | embedding     | Embedding  | 4 M   
1 | read_lstm     | LSTM       | 722 K 
2 | write_lstm    | LSTM       | 1 M   
3 | compose_layer | Linear     | 360 K 
4 | fc            | Sequential | 77 K  


Epoch 0:	 Validation acc: 0.7775	 Validation loss: 0.4591
Epoch 0:	 Train loss: 0.3850
Epoch 1:	 Validation acc: 0.8050	 Validation loss: 0.4244
Epoch 1:	 Train loss: 0.3064
Epoch 2:	 Validation acc: 0.8016	 Validation loss: 0.4206
Epoch 2:	 Train loss: 0.2825
Epoch 3:	 Validation acc: 0.8096	 Validation loss: 0.4218
Epoch 3:	 Train loss: 0.2643
Epoch 4:	 Validation acc: 0.8234	 Validation loss: 0.3987
Epoch 4:	 Train loss: 0.2494
Epoch 5:	 Validation acc: 0.8131	 Validation loss: 0.4431
Epoch 5:	 Train loss: 0.2330
Epoch 6:	 Validation acc: 0.7833	 Validation loss: 0.4833
Epoch 6:	 Train loss: 0.2181


Saving latest checkpoint..


Epoch 7:	 Validation acc: 0.8039	 Validation loss: 0.4322
Epoch 7:	 Train loss: 0.2046


1

In [62]:
test_results = trainer.test(model, 
                            ds.test_dataloader(), 
                            ckpt_path=checkpoint_callback.best_model_path)



--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': tensor(0.8292, device='cuda:0'),
 'test_loss': tensor(0.3782, device='cuda:0')}
--------------------------------------------------------------------------------
