# Sentiment classification with LSTM
In this notebook we use LSTMs (and GRUs) to do sentiment classification on the [IMDb movie review dataset](http://ai.stanford.edu/~amaas/data/sentiment/).

In [1]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 

## Dataset

To get the data: <br>
`wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz`

In [3]:
def unpack_dataset():
    """Download the IMDb review archive and extract it under ``data/``.

    The body consists of IPython ``!`` shell escapes, so this only works when
    run from a notebook / IPython session where ``mkdir``, ``wget`` and
    ``tar`` are available on the shell path.
    """
    # make sure the target directory exists (-p: no error if it already does)
    ! mkdir -p data/aclImdb
    # the archive is downloaded into the current working directory
    ! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    # -C data: extract the archive contents inside data/
    ! tar -zxvf aclImdb_v1.tar.gz -C data

In [4]:
#unpack_dataset()

In [5]:
from pathlib import Path
PATH = Path("data/aclImdb/")
list(PATH.iterdir())

[PosixPath('data/aclImdb/imdbEr.txt'),
 PosixPath('data/aclImdb/imdb.vocab'),
 PosixPath('data/aclImdb/models'),
 PosixPath('data/aclImdb/train'),
 PosixPath('data/aclImdb/test'),
 PosixPath('data/aclImdb/README')]

In [6]:
path = PATH/"train/pos/0_9.txt"
path.read_text()

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

## Tokenization

In [7]:
# first time run this
#!python3 -m spacy download en

In [8]:
# Matches HTML line-break tags such as <br>, <br/> or <BR /> (case-insensitive).
re_br = re.compile(r'<\s*br\s*/?>', flags=re.IGNORECASE)


def sub_br(x):
    """Return `x` with every HTML <br> tag replaced by a newline character."""
    text_without_breaks = re_br.sub("\n", x)
    return text_without_breaks

# spaCy English pipeline; only its tokenizer component is used below.
my_tok = spacy.load('en')


def spacy_tok(x):
    """Tokenize `x` with the spaCy tokenizer and return the token strings.

    HTML <br> tags are first turned into newlines via `sub_br`.
    """
    doc = my_tok.tokenizer(sub_br(x))
    return [token.text for token in doc]

In [9]:
path = PATH/"train/pos/0_9.txt"
spacy_tok(path.read_text())[:10]

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at']

### Computing vocab2index

In [10]:
pos_files = list((PATH/"train"/"pos").iterdir())
neg_files = list((PATH/"train"/"neg").iterdir())
all_files = pos_files + neg_files
all_files[:5]

[PosixPath('data/aclImdb/train/pos/10544_8.txt'),
 PosixPath('data/aclImdb/train/pos/9530_9.txt'),
 PosixPath('data/aclImdb/train/pos/9901_8.txt'),
 PosixPath('data/aclImdb/train/pos/11951_8.txt'),
 PosixPath('data/aclImdb/train/pos/7441_7.txt')]

In [11]:
# Count how often each token occurs across all files in `all_files`.
# Takes some time: every file is read and tokenized with spacy_tok.
counts = Counter()
for path in all_files:
    counts.update(spacy_tok(path.read_text()))

In [12]:
#counts

In [13]:
len(counts.keys())

103504

In [14]:
# Drop rare tokens: every word seen fewer than 5 times is removed from `counts`.
# list(counts) takes a snapshot of the keys because entries are deleted inside the loop.
for word in list(counts):
    if counts[word] < 5:
        del counts[word]

In [15]:
len(counts.keys())

33907

In [16]:
# Build the token -> index mapping (`vocab2index`) and its inverse list (`words`).
# Index 0 is reserved for the padding token and index 1 for unknown words;
# every token still present in `counts` receives the next free index.
vocab2index = {"<PAD>":0, "UNK":1}
words = ["<PAD>", "UNK"]
for word in counts:
    # len(words) is the next unused index because `words` grows in lockstep
    vocab2index[word] = len(words)
    words.append(word)

In [17]:
#vocab2index

## Dataset

In [18]:
# spacy_tok takes a while. Run it just once
def encode_sentence(path, vocab2index, N=400, padding_start=True):
    """Encode the review stored at `path` as a fixed-length array of word indices.

    The text is tokenized with `spacy_tok`; tokens missing from `vocab2index`
    are mapped to the index of "UNK". At most the first `N` tokens are kept and
    the remaining positions are left at 0.

    Args:
        path: object with a `read_text()` method (e.g. a `pathlib.Path`).
        vocab2index: dict mapping token -> integer index; must contain "UNK"
            whenever the text has at least one token.
        N: length of the returned array.
        padding_start: selects where the tokens are written. NOTE the naming:
            with True the tokens occupy the *start* of the array (zeros at the
            end); with False the tokens are right-aligned (zeros at the start).

    Returns:
        (encoded, length): an int32 array of shape (N,) and the number of
        tokens actually stored, i.e. min(N, number of tokens).
    """
    tokens = spacy_tok(path.read_text())
    token_ids = np.array([vocab2index.get(tok, vocab2index["UNK"]) for tok in tokens])
    n_kept = min(N, len(token_ids))
    encoded = np.zeros(N, dtype=np.int32)
    if not padding_start:
        # right-align: zeros first, then the kept tokens
        encoded[N - n_kept:] = token_ids[:n_kept]
    else:
        # left-align: kept tokens first, zeros after
        encoded[:n_kept] = token_ids[:n_kept]
    return encoded, n_kept

In [19]:
path = PATH/"train/neg/211_4.txt"
encode_sentence(path, vocab2index, N=400, padding_start=False)

(array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            1,  1782,     4,  2723,     4,    29,    36,    37,    15,
         2388,   320,     6,    23,   351,     8,   316,   254,   680,
         1382,   103,   334,  7723,   103, 12826,     3,   369,   951,
            3, 16949,    19,  1791,    55,  1573,    36,     1,   153,
      

In [20]:
class ImdbDataset(Dataset):
    """IMDb reviews encoded once, up front, as fixed-length index arrays.

    Args:
        PATH: root directory of the dataset (a path object supporting `/`).
        train: name of the split sub-folder to read, e.g. "train" or "test".
        N: length of every encoded review (forwarded to `encode_sentence`).
        padding_start: forwarded unchanged to `encode_sentence`.

    Each item is a tuple `(encoded, length, label)`; the label is 1 for files
    listed under `pos/` and 0 for files listed under `neg/`.
    """

    def __init__(self, PATH, train="train", N=400, padding_start=True):
        split_dir = PATH / train
        # despite its name, this is the split directory whose pos/ and neg/ folders are listed below
        self.path_to_images = split_dir
        self.pos_files = [f for f in (split_dir / "pos").iterdir()]
        self.neg_files = [f for f in (split_dir / "neg").iterdir()]
        self.files = self.pos_files + self.neg_files
        # labels follow the file order: positives (1) first, negatives (0) after
        self.y = np.array([1] * len(self.pos_files) + [0] * len(self.neg_files), dtype=int)
        # tokenization is slow, so every review is encoded here once
        # instead of on every __getitem__ call
        self.X = []
        for review_path in self.files:
            self.X.append(encode_sentence(review_path, vocab2index, N, padding_start))

    def __len__(self):
        """Number of reviews in the split."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return `(encoded review, stored length, label)` for position `idx`."""
        encoded, length = self.X[idx]
        return encoded, length, self.y[idx]

In [21]:
train_ds_v0 = ImdbDataset(PATH, padding_start=False)
valid_ds_v0 = ImdbDataset(PATH, "test", padding_start=False)

In [22]:
batch_size = 1000
train_dl_v0 = DataLoader(train_ds_v0, batch_size=batch_size, shuffle=True)
valid_dl_v0 = DataLoader(valid_ds_v0, batch_size=batch_size)

In [23]:
train_ds_v0[1]

(array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

## Understanding LSTMs 

nn.LSTM() <br>
`input_size` – The dimension of the embedding for each word <br>
`hidden_size` – The number of features in the hidden state h <br>
`num_layers` – Number of recurrent layers <br>
`batch_first` – If True, then the input and output tensors are provided as (batch, seq, feature) <br>

In [24]:
inputs = [torch.randn(1, 2) for _ in range(7)] # make a sequence of length 7
inputs = torch.cat(inputs).view(1, len(inputs), -1)
inputs

tensor([[[ 0.3183,  0.3120],
         [-0.8239,  2.5844],
         [-0.4005,  1.1637],
         [ 0.7870,  0.8620],
         [-0.9666,  1.1664],
         [ 1.5673, -0.7956],
         [ 1.1117,  1.0850]]])

In [25]:
# batch size x sequence length x embedding size
inputs.shape

torch.Size([1, 7, 2])

In [26]:
lstm = nn.LSTM(input_size=2, hidden_size=4, batch_first=True)

In [27]:
out, (hidden, cell) = lstm(inputs)

In [28]:
out.shape

torch.Size([1, 7, 4])

In [29]:
hidden.shape

torch.Size([1, 1, 4])

## LSTM V0 model

In [30]:
class LSTMV0Model(torch.nn.Module):
    """Embedding -> dropout -> single-layer LSTM -> linear head (one logit per sequence).

    The padded batch is fed to the LSTM as is (no packing); the logit is
    computed from the final hidden state of the LSTM.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # padding_idx=0: index 0 is the padding token
        self.embeddings = nn.Embedding(num_embeddings=vocab_size,
                                       embedding_dim=embedding_dim,
                                       padding_idx=0)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_dim, out_features=1)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Return raw logits of shape (batch, 1) for a (batch, seq) tensor of word indices."""
        embedded = self.dropout(self.embeddings(x))
        _, state = self.lstm(embedded)
        final_hidden = state[0]  # state is the (hidden, cell) pair
        return self.linear(final_hidden[-1])

In [31]:
def update_optimizer(optimizer, lr):
    """Set the learning rate of every parameter group of `optimizer` to `lr` (in place)."""
    for group in optimizer.param_groups:
        group["lr"] = lr

In [32]:
def train_epocs_v0(model, optimizer, train_dl, val_dl, epochs=10):
    """Train `model` for `epochs` passes over `train_dl`, validating after each pass.

    Args:
        model: module called as `model(x)` that returns one logit per example.
        optimizer: optimizer that is stepped once per batch.
        train_dl: iterable of `(x, s, y)` batches; `s` is not used by this loop.
        val_dl: validation batches, handed to `val_metrics_v0` after every epoch.
        epochs: number of passes over `train_dl`.

    Batches are moved to the device that holds the model parameters instead of
    unconditionally calling `.cuda()`, so the loop also runs on a CPU-only model.
    Losses and accuracy are printed only on epochs where `i % 5 == 1`.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            # s is not used in this model
            x = x.long().to(device)
            y = y.float().to(device)
            y_pred = model(x)
            optimizer.zero_grad()
            # y_pred has a trailing dimension of size 1, so the labels get one too
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            # weight the batch loss by batch size to get a per-example average below
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics_v0(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [33]:
def val_metrics_v0(model, valid_dl):
    """Evaluate `model` on `valid_dl` and return `(mean loss, accuracy)`.

    Args:
        model: module called as `model(x)` that returns one logit per example.
        valid_dl: iterable of `(x, s, y)` batches; `s` is not used here. It must
            yield at least one example, otherwise the final division fails.

    Returns:
        (loss, accuracy): the per-example mean of the binary cross-entropy as a
        float, and the fraction of correct predictions as a 0-dim tensor.

    The model is switched to eval mode and left in it. Evaluation runs under
    `torch.no_grad()` so no autograd graph is kept for the validation batches,
    and batches are moved to the device holding the model parameters rather
    than unconditionally to CUDA.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    correct = 0
    total = 0
    sum_loss = 0.0
    with torch.no_grad():
        for x, s, y in valid_dl:
            # s is not used here
            x = x.long().to(device)
            y = y.float().unsqueeze(1).to(device)
            y_hat = model(x)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            # a logit above 0 corresponds to a probability above 0.5
            y_pred = y_hat > 0
            correct += (y_pred.float() == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [38]:
batch_size = 5000
train_dl = DataLoader(train_ds_v0, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds_v0, batch_size=batch_size)

In [39]:
vocab_size = len(words)
print(vocab_size)
model = LSTMV0Model(vocab_size, 50, 50).cuda()

parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

33909


In [40]:
train_epocs_v0(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.683 val loss 0.677 and val accuracy 0.572
train loss 0.622 val loss 0.668 and val accuracy 0.642
train loss 0.487 val loss 0.560 and val accuracy 0.741
train loss 0.380 val loss 0.528 and val accuracy 0.797
train loss 0.285 val loss 0.700 and val accuracy 0.788
train loss 0.245 val loss 0.593 and val accuracy 0.823


In [41]:
update_optimizer(optimizer, lr=0.001)
train_epocs_v0(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.192 val loss 0.635 and val accuracy 0.828
train loss 0.181 val loss 0.626 and val accuracy 0.829
train loss 0.175 val loss 0.677 and val accuracy 0.821
train loss 0.170 val loss 0.686 and val accuracy 0.823
train loss 0.164 val loss 0.662 and val accuracy 0.828
train loss 0.159 val loss 0.702 and val accuracy 0.825


## GRU Model 
GRUs have fewer parameters than LSTMs but often perform as well or better.

In [58]:
class GRUV0Model(torch.nn.Module):
    """Embedding -> dropout -> single-layer GRU -> linear head (one logit per sequence).

    The padded batch is fed to the GRU as is (no packing); the logit is
    computed from the final hidden state of the GRU.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        # padding_idx=0: index 0 is the padding token
        self.embeddings = nn.Embedding(num_embeddings=vocab_size,
                                       embedding_dim=embedding_dim,
                                       padding_idx=0)
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_dim, out_features=1)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Return raw logits of shape (batch, 1) for a (batch, seq) tensor of word indices."""
        embedded = self.dropout(self.embeddings(x))
        gru_result = self.gru(embedded)
        final_hidden = gru_result[1]  # the GRU returns (output, hidden)
        return self.linear(final_hidden[-1])

In [59]:
vocab_size = len(words)
model = GRUV0Model(vocab_size, 50, 50).cuda()

parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

In [60]:
train_epocs_v0(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.685 val loss 0.678 and val accuracy 0.564
train loss 0.599 val loss 0.659 and val accuracy 0.677
train loss 0.458 val loss 0.976 and val accuracy 0.604
train loss 0.302 val loss 0.647 and val accuracy 0.763
train loss 0.199 val loss 0.624 and val accuracy 0.812
train loss 0.139 val loss 0.581 and val accuracy 0.839


In [61]:
update_optimizer(optimizer, lr=0.001)
train_epocs_v0(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.106 val loss 0.572 and val accuracy 0.849
train loss 0.104 val loss 0.586 and val accuracy 0.847
train loss 0.097 val loss 0.605 and val accuracy 0.846
train loss 0.096 val loss 0.603 and val accuracy 0.847
train loss 0.093 val loss 0.610 and val accuracy 0.849
train loss 0.090 val loss 0.631 and val accuracy 0.846


## Model with variable length
dynamic padding + pack_padded_sequence

`pack_padded_sequence` packs a Tensor containing padded sequences of variable length.

In [44]:
def encode_sentence_no_padding(path, vocab2index):
    """Encode the review at `path` as an array of word indices, without padding or truncation.

    The text is tokenized with `spacy_tok`; tokens missing from `vocab2index`
    are mapped to the index of "UNK".
    """
    token_ids = []
    for token in spacy_tok(path.read_text()):
        token_ids.append(vocab2index.get(token, vocab2index["UNK"]))
    return np.array(token_ids)

In [45]:
path = PATH/"train/neg/211_4.txt"
#encode_sentence_no_padding(path, vocab2index)

In [46]:
class ImdbDataset2(Dataset):
    """IMDb reviews encoded once, up front, as variable-length index arrays.

    Args:
        PATH: root directory of the dataset (a path object supporting `/`).
        train: name of the split sub-folder to read, e.g. "train" or "test".

    Each item is a tuple `(encoded, label)`; the label is 1 for files listed
    under `pos/` and 0 for files listed under `neg/`.
    """

    def __init__(self, PATH, train="train"):
        split_dir = PATH / train
        # despite its name, this is the split directory whose pos/ and neg/ folders are listed below
        self.path_to_images = split_dir
        self.pos_files = [f for f in (split_dir / "pos").iterdir()]
        self.neg_files = [f for f in (split_dir / "neg").iterdir()]
        self.files = self.pos_files + self.neg_files
        # labels follow the file order: positives (1) first, negatives (0) after
        self.y = np.array([1] * len(self.pos_files) + [0] * len(self.neg_files), dtype=int)
        # every review is encoded here once instead of on every __getitem__ call
        self.X = []
        for review_path in self.files:
            self.X.append(encode_sentence_no_padding(review_path, vocab2index))

    def __len__(self):
        """Number of reviews in the split."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return `(encoded review, label)` for position `idx`."""
        return self.X[idx], self.y[idx]

In [47]:
train_ds = ImdbDataset2(PATH)
valid_ds = ImdbDataset2(PATH, "test")

In [48]:
train_ds[0]

(array([  2,   3,   4,   5,   4,   6,   7,   8,   3,   9,  10,  11,  12,
          3,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
         25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  14,  35,   3,
         36,  37,  15,  38,  19,  36,  39,  31,  40,   3,  41,  42,  43,
         44,  45,  46,  47,  16,  48,  49,   3,  50,  51,  19,  36,  52,
         53,  22,  54,  33,  55,  56,  57,  58,  59,  36,  60,  31,  61,
         62,  12,   3,   4,  63,   4,   6,  64,   8,   3,  13,  47,  29,
         65,  66,   3,  67,  43,  13,  68,  44,  69,  70,   4,   5,   4,
         67,  71,  72,  16,  54,  73,  74,  57,  58,  75,  31,  76,   4,
         77,   4,  78,  79,  80,  18,  19,  81,   9,  82,  83,  55,  84,
         85,   5,  86,  85,   3,  87,  16,  54,  88,  75,  59,  36,  80,
         89,  90,  91,  92,  93,  88,  94,  95,  91,  16,  54,  57,  95,
         91,  58,  59,  96,  97,  98,  31,  99, 100,  15,  36, 101, 102,
        103, 104,   6, 105, 106,  15,  16, 107, 108

### collate_fn function
The `collate_fn` merges a list of samples to form a mini-batch. It is an optional parameter to our data loader.

In [49]:
def collate_fn(data):
    """Create padded mini-batch tensors from a list of (sentence, label) tuples.

    A custom collate function is needed because the default one cannot merge
    sequences of different lengths. Every sentence is padded on the right with
    0 up to the longest sentence of this mini-batch (dynamic padding). The
    samples keep their original order; nothing is sorted here.

    Args:
        data: non-empty list of tuples (sentence, label).
            - sentence: sequence of word indices of variable length
            - label: 0 or 1

    Returns:
        sents: long tensor of shape (batch_size, max_len), zero-padded.
        lengths: list with the unpadded length of each sentence.
        labels: float tensor of shape (batch_size,).
    """
    sentences, labels = zip(*data)

    # labels are returned as a float tensor, as before
    labels = torch.tensor([float(label) for label in labels], dtype=torch.float)

    lengths = [len(s) for s in sentences]

    sents = torch.zeros(len(sentences), max(lengths), dtype=torch.long)
    for i, (s, end) in enumerate(zip(sentences, lengths)):
        # Convert straight to int64. torch.Tensor(...) would build a float32
        # tensor first, which rounds indices above 2**24.
        sents[i, :end] = torch.as_tensor(s, dtype=torch.long)

    return sents, lengths, labels

In [50]:
# tiny example of our data
data = [([4, 545, 23, 1], 0), ([34, 84], 1), ([23, 6, 774], 0)]

In [51]:
# note how the data is padded to the longest sentence; the original order is kept
collate_fn(data)

(tensor([[  4, 545,  23,   1],
         [ 34,  84,   0,   0],
         [ 23,   6, 774,   0]]),
 [4, 2, 3],
 tensor([0., 1., 0.]))

In [52]:
# more realistic example
data = [train_ds[0], train_ds[1], train_ds[2]]

In [53]:
collate_fn(data)

(tensor([[  2,   3,   4,  ...,   0,   0,   0],
         [181, 182, 183,  ...,   0,   0,   0],
         [156,  12, 262,  ..., 271, 281, 214]]),
 [359, 188, 710],
 tensor([1., 1., 1.]))

In [54]:
batch_size = 5
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, collate_fn=collate_fn)

In [55]:
sents, lengths, labels = next(iter(train_dl))

In [56]:
sents.shape, labels.shape

(torch.Size([5, 1089]), torch.Size([5]))

In [57]:
lengths

[134, 1089, 431, 292, 323]

In [58]:
class GRUModel(torch.nn.Module):
    """GRU classifier for variable-length batches.

    The padded batch is packed with `pack_padded_sequence` before it reaches
    the GRU. `enforce_sorted=False` is passed, so `lengths` may come in any
    order.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        # padding_idx=0: index 0 is the padding token
        self.embeddings = nn.Embedding(num_embeddings=vocab_size,
                                       embedding_dim=embedding_dim,
                                       padding_idx=0)
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_dim, out_features=1)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x, lengths):
        """Return raw logits of shape (batch, 1).

        Args:
            x: (batch, seq) tensor of word indices.
            lengths: unpadded length of every row of `x`.
        """
        embedded = self.dropout(self.embeddings(x))
        packed = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        gru_result = self.gru(packed)
        final_hidden = gru_result[1]  # the GRU returns (output, hidden)
        return self.linear(final_hidden[-1])

In [59]:
def train_epocs(model, optimizer, train_dl, valid_dl, epochs=10):
    """Train `model` for `epochs` passes over `train_dl`, validating after each pass.

    Args:
        model: module called as `model(x, s)` that returns one logit per example.
        optimizer: optimizer that is stepped once per batch.
        train_dl: iterable of `(x, s, y)` batches; `s` holds the sequence
            lengths and is passed to the model untouched.
        valid_dl: validation batches, handed to `val_metrics` after every epoch.
        epochs: number of passes over `train_dl`.

    `x` and `y` are moved to the device that holds the model parameters instead
    of unconditionally calling `.cuda()`, so the loop also runs on a CPU-only
    model. Losses and accuracy are printed only on epochs where `i % 5 == 1`.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.long().to(device)
            y = y.float().to(device)
            y_pred = model(x, s)
            optimizer.zero_grad()
            # y_pred has a trailing dimension of size 1, so the labels get one too
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            # weight the batch loss by batch size to get a per-example average below
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, valid_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [60]:
def val_metrics(model, valid_dl):
    """Evaluate `model` on `valid_dl` and return `(mean loss, accuracy)`.

    Args:
        model: module called as `model(x, s)` that returns one logit per example.
        valid_dl: iterable of `(x, s, y)` batches; `s` is passed to the model
            untouched. It must yield at least one example, otherwise the final
            division fails.

    Returns:
        (loss, accuracy): the per-example mean of the binary cross-entropy as a
        float, and the fraction of correct predictions as a 0-dim tensor.

    The model is switched to eval mode and left in it. Evaluation runs under
    `torch.no_grad()` so no autograd graph is kept for the validation batches,
    and `x` / `y` are moved to the device holding the model parameters rather
    than unconditionally to CUDA.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    correct = 0
    total = 0
    sum_loss = 0.0
    with torch.no_grad():
        for x, s, y in valid_dl:
            x = x.long().to(device)
            y = y.float().unsqueeze(1).to(device)
            y_hat = model(x, s)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            # a logit above 0 corresponds to a probability above 0.5
            y_pred = y_hat > 0
            correct += (y_pred.float() == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [61]:
batch_size = 3000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, collate_fn=collate_fn)

In [62]:
vocab_size = len(words)
model = GRUModel(vocab_size, 50, 50).cuda()

parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

In [63]:
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.671 val loss 0.621 and val accuracy 0.652
train loss 0.491 val loss 0.723 and val accuracy 0.740
train loss 0.266 val loss 0.485 and val accuracy 0.837
train loss 0.160 val loss 0.405 and val accuracy 0.875
train loss 0.110 val loss 0.457 and val accuracy 0.880
train loss 0.087 val loss 0.599 and val accuracy 0.858


In [64]:
update_optimizer(optimizer, lr=0.001)
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.056 val loss 0.587 and val accuracy 0.872
train loss 0.055 val loss 0.576 and val accuracy 0.873
train loss 0.052 val loss 0.590 and val accuracy 0.873
train loss 0.049 val loss 0.590 and val accuracy 0.874
train loss 0.050 val loss 0.597 and val accuracy 0.875
train loss 0.047 val loss 0.610 and val accuracy 0.873


## Reordering in the forward function

In [65]:
# back to padding
train_ds = ImdbDataset(PATH, padding_start=True)
valid_ds= ImdbDataset(PATH, "test", padding_start=True)

batch_size = 7
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

x, s, y = next(iter(train_dl)) # here s is the length of the sentences

In [66]:
x.shape, s.shape

(torch.Size([7, 400]), torch.Size([7]))

In [67]:
s

tensor([230, 213, 400, 188, 169, 274, 323])

In [68]:
y

tensor([0, 0, 0, 0, 0, 1, 1])

In [69]:
# sort by length so we can use pack_padded_sequence
s, index = s.sort(0, descending=True)
x = x[index]

In [70]:
s

tensor([400, 323, 274, 230, 213, 188, 169])

In [71]:
index

tensor([2, 6, 5, 0, 1, 3, 4])

In [72]:
y[index]

tensor([0, 1, 1, 0, 0, 0, 0])

In [73]:
vocab_size = len(words)
embedding_dim = 10
embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

In [74]:
x = embed(x.long())
x.shape

torch.Size([7, 400, 10])

In [75]:
hidden_dim = 9
lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

In [76]:
# RNN will not perform calculation on pad elements if pack_padded_sequence is used
x_pack = pack_padded_sequence(x, list(s), batch_first=True)

In [77]:
out_pack, (ht, ct) = lstm(x_pack)

In [78]:
## final hidden layer
ht.shape

torch.Size([1, 7, 9])

In [79]:
ht[-1].shape

torch.Size([7, 9])

In [80]:
linear = nn.Linear(hidden_dim, 1)
y_hat = linear(ht[-1])
y_hat

tensor([[-0.0780],
        [-0.0163],
        [-0.0245],
        [-0.0111],
        [-0.0230],
        [-0.0821],
        [-0.0500]], grad_fn=<AddmmBackward>)

In [81]:
index.unsqueeze(1).shape

torch.Size([7, 1])

In [82]:
# takes back to the original ordering
h = torch.zeros_like(y_hat).scatter_(0, index.unsqueeze(1), y_hat)

In [83]:
h

tensor([[-0.0111],
        [-0.0230],
        [-0.0780],
        [-0.0821],
        [-0.0500],
        [-0.0245],
        [-0.0163]], grad_fn=<ScatterBackward0>)

In [84]:
y_hat

tensor([[-0.0780],
        [-0.0163],
        [-0.0245],
        [-0.0111],
        [-0.0230],
        [-0.0821],
        [-0.0500]], grad_fn=<AddmmBackward>)

In [85]:
index

tensor([2, 6, 5, 0, 1, 3, 4])

### Model with sorting in the forward

In [86]:
class LSTMModel(torch.nn.Module) :
    """LSTM classifier that sorts each batch by length inside `forward`.

    The batch is sorted by decreasing length, packed with
    `pack_padded_sequence`, run through the LSTM, and the resulting logits are
    scattered back so that row i of the output belongs to row i of the input.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTMModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.5)
        # padding_idx=0: index 0 is the padding token
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)

    def forward(self, x, s):
        """Return raw logits of shape (batch, 1), in the original batch order.

        Args:
            x: (batch, seq) long tensor of word indices, padded at the end.
            s: 1-D tensor with the unpadded length of every row of `x`
               (pack_padded_sequence requires every length to be at least 1).
        """
        # sort by decreasing length, as pack_padded_sequence expects by default
        s, sort_index = torch.sort(s, 0, descending=True)
        lengths = s.tolist()  # works for CPU and GPU tensors alike
        # keep the index on the same device as the data instead of forcing CUDA
        sort_index = sort_index.to(x.device)
        x = x[sort_index]
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, lengths, batch_first=True)
        out_pack, (ht, ct) = self.lstm(x_pack)
        out = self.linear(ht[-1])
        # undo the sort: row k of `out` goes back to position sort_index[k]
        return torch.zeros_like(out).scatter_(0, sort_index.unsqueeze(1), out)

In [87]:
batch_size = 2000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [88]:
vocab_size = len(words)
print(vocab_size)
model = LSTMModel(vocab_size, 50, 50).cuda()

parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

33909


In [89]:
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.665 val loss 0.634 and val accuracy 0.639
train loss 0.450 val loss 0.752 and val accuracy 0.710
train loss 0.425 val loss 0.639 and val accuracy 0.767
train loss 0.223 val loss 0.562 and val accuracy 0.829
train loss 0.172 val loss 0.478 and val accuracy 0.861
train loss 0.125 val loss 0.617 and val accuracy 0.850


In [90]:
update_optimizer(optimizer, 0.001)
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.100 val loss 0.665 and val accuracy 0.851
train loss 0.092 val loss 0.679 and val accuracy 0.851
train loss 0.089 val loss 0.705 and val accuracy 0.848
train loss 0.083 val loss 0.704 and val accuracy 0.851
train loss 0.082 val loss 0.697 and val accuracy 0.852
train loss 0.081 val loss 0.739 and val accuracy 0.847


In [91]:
def save_model(m, p):
    """Write the state dict of module `m` to the file at path `p`."""
    state = m.state_dict()
    torch.save(state, p)
    
def load_model(m, p):
    """Load the state dict stored at path `p` into module `m` (in place).

    The checkpoint is first mapped to the CPU so that a file saved from a GPU
    model can also be restored on a machine without CUDA; `load_state_dict`
    then copies the values into the parameters of `m`.
    """
    # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
    state = torch.load(p, map_location="cpu")
    m.load_state_dict(state)

In [92]:
# Create the checkpoint directory. Unlike a plain `mkdir`, this does not
# complain when the directory already exists, so the cell can be re-run.
(PATH/"models").mkdir(parents=True, exist_ok=True)

In [93]:
p = PATH/"models/model-855.pth"
save_model(model, p)

In [94]:
val_metrics(model, valid_dl)

(0.7423409295082092, tensor(0.8471, device='cuda:0'))

In [95]:
load_model(model, p)

## GRU model with dropout

In [96]:
class GRUModel(torch.nn.Module) :
    """GRU classifier that sorts each batch by length inside `forward`.

    NOTE: this redefines (and therefore shadows) the `GRUModel` class defined
    earlier in the notebook, whose `forward` takes a list of lengths and packs
    with `enforce_sorted=False`.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(GRUModel, self).__init__()
        self.hidden_dim = hidden_dim
        # padding_idx=0: index 0 is the padding token
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.dropout = nn.Dropout(0.5)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)

    def forward(self, x, s):
        """Return raw logits of shape (batch, 1), in the original batch order.

        Args:
            x: (batch, seq) long tensor of word indices, padded at the end.
            s: 1-D tensor with the unpadded length of every row of `x`
               (pack_padded_sequence requires every length to be at least 1).
        """
        # sort by decreasing length, as pack_padded_sequence expects by default
        s, sort_index = torch.sort(s, 0, descending=True)
        lengths = s.tolist()  # works for CPU and GPU tensors alike
        # keep the index on the same device as the data instead of forcing CUDA
        sort_index = sort_index.to(x.device)
        x = x[sort_index]
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, lengths, batch_first=True)
        out_pack, ht = self.gru(x_pack)
        out = self.linear(ht[-1])
        # undo the sort: row k of `out` goes back to position sort_index[k]
        return torch.zeros_like(out).scatter_(0, sort_index.unsqueeze(1), out)

## Bidirectional and multiple layers GRUs / LSTMs

In [97]:
batch_size = 7
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

x,s,y = next(iter(train_dl)) # here s is the length of the sentences

In [98]:
vocab_size = len(words)
embedding_dim = 10
hidden_dim = 9
embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
lstm2 = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)

In [99]:
s, index = s.sort(0, descending=True)
x = x[index]
x = embed(x.long())
x_pack = pack_padded_sequence(x, list(s), batch_first=True)

In [100]:
# Feed the packed batch (built in the previous cell) so that the padded
# positions are skipped; the shape of `ht` is the same as with the padded input.
lstm_out, (ht, ct) = lstm1(x_pack)

In [101]:
ht.shape

torch.Size([2, 7, 9])

In [102]:
ht[-2,:,:].shape

torch.Size([7, 9])

In [103]:
# Same as above for the 2-layer bidirectional LSTM: use the packed batch so
# that padding is not processed; the shape of `ht2` is unchanged.
lstm_out, (ht2, ct2) = lstm2(x_pack)

In [104]:
ht2.shape

torch.Size([4, 7, 9])

In [105]:
ht2[-2,:,:].shape, ht2[-1,:,:].shape

(torch.Size([7, 9]), torch.Size([7, 9]))

In [106]:
# concat the final forward (ht2[-2,:,:]) and backward (ht2[-1,:,:]) hidden states of the last layer
h = torch.cat((ht2[-2,:,:], ht2[-1,:,:]), dim = 1)
h.shape

torch.Size([7, 18])

In [107]:
# This is slow

In [108]:
class BiGRUModel(torch.nn.Module) :
    """Two-layer bidirectional GRU classifier that sorts each batch by length in `forward`.

    The final forward and backward hidden states of the last GRU layer are
    concatenated and passed through a linear layer that outputs one logit.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(BiGRUModel,self).__init__()
        self.hidden_dim = hidden_dim
        # padding_idx=0: index 0 is the padding token
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2, batch_first=True,
                          dropout=0.3, bidirectional=True)

        self.dropout = nn.Dropout(0.5)
        # 2*hidden_dim: forward and backward states are concatenated
        self.linear = nn.Linear(2*hidden_dim, 1)

    def forward(self, x, s):
        """Return raw logits of shape (batch, 1), in the original batch order.

        Args:
            x: (batch, seq) long tensor of word indices, padded at the end.
            s: 1-D tensor with the unpadded length of every row of `x`
               (pack_padded_sequence requires every length to be at least 1).
        """
        # sort by decreasing length, as pack_padded_sequence expects by default
        s, sort_index = torch.sort(s, 0, descending=True)
        lengths = s.tolist()  # works for CPU and GPU tensors alike
        # keep the index on the same device as the data instead of forcing CUDA
        sort_index = sort_index.to(x.device)
        x = x[sort_index]
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, lengths, batch_first=True)
        out_pack, ht = self.gru(x_pack)
        # ht[-2] / ht[-1]: final forward / backward hidden state of the last layer
        h = torch.cat((ht[-2,:,:], ht[-1,:,:]), dim = 1)
        h = self.linear(h)
        # undo the sort: row k of `h` goes back to position sort_index[k]
        return torch.zeros_like(h).scatter_(0, sort_index.unsqueeze(1), h)

In [109]:
vocab_size = len(words)
model = BiGRUModel(vocab_size, 50, 50).cuda()

parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

In [110]:
batch_size = 2000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [111]:
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.661 val loss 0.662 and val accuracy 0.569
train loss 0.297 val loss 0.366 and val accuracy 0.863
train loss 0.150 val loss 0.392 and val accuracy 0.879
train loss 0.090 val loss 0.421 and val accuracy 0.883
train loss 0.056 val loss 0.526 and val accuracy 0.879
train loss 0.038 val loss 0.584 and val accuracy 0.872


In [112]:
update_optimizer(optimizer, 0.001)
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.028 val loss 0.644 and val accuracy 0.875
train loss 0.023 val loss 0.673 and val accuracy 0.873
train loss 0.022 val loss 0.675 and val accuracy 0.873
train loss 0.019 val loss 0.700 and val accuracy 0.872
train loss 0.020 val loss 0.718 and val accuracy 0.870
train loss 0.019 val loss 0.730 and val accuracy 0.871


## References

The model in this notebook is adapted from this [pytorch tutorial](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html). 