# AWD LSTM

In [1]:
!pip install git+https://github.com/NVIDIA/apex

Collecting git+https://github.com/NVIDIA/apex
  Cloning https://github.com/NVIDIA/apex to /tmp/pip-req-build-xg_zth_f
  Running command git clone -q https://github.com/NVIDIA/apex /tmp/pip-req-build-xg_zth_f
  Running command git submodule update --init --recursive -q
Building wheels for collected packages: apex
  Building wheel for apex (setup.py) ... [?25ldone
[?25h  Created wheel for apex: filename=apex-0.1-py3-none-any.whl size=186403 sha256=35e9cddc095033a21aa47495676b5d145926e2f502a142a99130790a2dfe4aec
  Stored in directory: /tmp/pip-ephem-wheel-cache-kl_botqq/wheels/dd/7b/dc/dc522332f3f6f60db5440cbcc4ee70aa155c2cf7d1f15b6900
Successfully built apex


In [2]:
# export
from exp.nb_12 import *

In [3]:
%reload_ext autoreload
%autoreload 2

%matplotlib inline

In [4]:
!ls /datasets/fast-ai-nlp

ag_news_csv.tgz			sogou_news_csv.tgz
amazon_review_full_csv.tgz	wikitext-103.tgz
amazon_review_polarity_csv.tgz	wikitext-2.tgz
dbpedia_csv.tgz			yahoo_answers_csv.tgz
giga-fren.tgz			yelp_review_full_csv.tgz
imdb.tgz			yelp_review_polarity_csv.tgz


## Data

In [5]:
path = datasets.untar_data(datasets.URLs.IMDB)

We have to preprocess the data again to pickle it because if we try to load the previous `SplitLabeledData` with pickle, it will complain some of the functions aren't in main.

In [6]:
# Build the item list from `path`, restricted to the 'train', 'test' and 'unsup' entries
# (presumably sub-folders of the IMDB archive -- confirm against TextList.from_files).
il = TextList.from_files(path, include=['train', 'test', 'unsup'])
# Split with `random_splitter` and p_valid=0.1 (presumably ~10% of the items end up in validation).
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1))

In [7]:
proc_tok, proc_num = TokenizeProcessor(max_workers=8), NumericalizeProcessor()

In [8]:
# Every item receives the constant label 0 (the labelling function ignores its argument);
# the inputs are processed by the tokenizer and numericalizer created above.
ll = label_by_func(sd, lambda x: 0, proc_x = [proc_tok,proc_num])

In [9]:
# Context managers guarantee both files are flushed and closed before the next cell reads them back.
with open(path/'ll_lm.pkl', 'wb') as fh: pickle.dump(ll, fh)
with open(path/'vocab_lm.pkl', 'wb') as fh: pickle.dump(proc_num.vocab, fh)

In [10]:
# NOTE: unpickling can execute arbitrary code -- only load files produced by this notebook.
# Context managers close the file handles as soon as the objects are loaded.
with open(path/'ll_lm.pkl', 'rb') as fh: ll = pickle.load(fh)
with open(path/'vocab_lm.pkl', 'rb') as fh: vocab = pickle.load(fh)

In [11]:
bs, bptt = 64, 70
# Call the helper so `data` holds the resulting data bunch rather than the function object.
# Argument order (labelled data, batch size, bptt) is assumed from exp.nb_12 -- confirm there.
data = lm_databunchify(ll, bs, bptt)

## AWD-LSTM

Before explaining what an AWD LSTM is, we need to start with an LSTM. RNNs were covered in part 1; if you need a refresher, there is a great visualization of them on [this website](http://joshvarty.github.io/VisualizingRNNs/).

[Jump to lesson 12 video](https://course.fast.ai/videos/?lesson=12&t=6330)

### LSTM from scratch

We need to implement those equations (where $\sigma$ stands for sigmoid):

![LSTM cell and equations](images/lstm.jpg)
(picture from [Understanding LSTMs](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) by Chris Olah.)

If we want to take advantage of our GPU, it's better to do one big matrix multiplication than four smaller ones. So we compute the values of the four gates all at once. Since there is a matrix multiplication and a bias, we use `nn.Linear` to do it.

We need two linear layers: one for the input and one for the hidden state.

In [12]:
class LSTMCell(nn.Module):
    """One LSTM time step, with the four gates computed by two fused linear layers.

    `ih` projects the input (size `ni`) and `hh` projects the previous hidden
    state (size `nh`); each produces the four gate pre-activations stacked
    along dim 1 (hence the `4 * nh` output size).
    """

    def __init__(self, ni, nh):
        super().__init__()
        self.ih = nn.Linear(ni, 4 * nh)
        self.hh = nn.Linear(nh, 4 * nh)

    def forward(self, inp, state):
        """Advance the cell by one step.

        inp:   2-D tensor whose last dim is `ni` (the gates are chunked along dim 1).
        state: tuple `(h, c)` holding the previous hidden and cell states.
        Returns `(h, (h, c))`: the new hidden state, plus the updated state tuple.
        """
        h, c = state
        # one big multiplication for all the gates is better than 4 small ones.
        # The recurrent projection reads the previous hidden state `h`, not the input.
        gates = (self.ih(inp) + self.hh(h)).chunk(4, 1)
        ingate, forgetgate, outgate = map(torch.sigmoid, gates[:3])
        cellgate = gates[3].tanh()
        c = (forgetgate * c) + (ingate * cellgate)
        h = outgate * c.tanh()
        return h, (h, c)

In [13]:
class LSTMLayer(nn.Module):
    "Applies a recurrent cell to every slice of the input taken along dim 1."

    def __init__(self, cell, *cell_args):
        super().__init__()
        # `cell` is a class/factory; it is instantiated here with `cell_args`.
        self.cell = cell(*cell_args)

    def forward(self, inp, state):
        "Feed each dim-1 slice to the cell in order; return the stacked outputs and the final state."
        outputs = []
        for step in inp.unbind(1):
            # The state returned by one step is the state given to the next.
            out, state = self.cell(step, state)
            outputs.append(out)
        return torch.stack(outputs, dim=1), state
        

In [14]:
lstm = LSTMLayer(LSTMCell, 300, 300)

In [15]:
x = torch.randn(64, 70, 300)
h = (torch.zeros(64, 300),torch.zeros(64, 300))

#### CPU

In [16]:
%timeit -n 10 y,h1 = lstm(x,h)

116 ms ± 8.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
lstm = lstm.cuda()
x = x.cuda()
h = (h[0].cuda(), h[1].cuda())

In [18]:
def time_fn(f):
    "Call `f` once, then block until `torch.cuda.synchronize()` returns."
    _ = f()  # the result is discarded; only the elapsed time is of interest
    # Synchronize after the call so a surrounding timer measures up to this point.
    torch.cuda.synchronize()

#### CUDA

In [19]:
f = partial(lstm,x,h)
time_fn(f)

In [20]:
%timeit -n 10 time_fn(f)

25.4 ms ± 545 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Built-in version

In [21]:
lstm = nn.LSTM(300, 300, 1, batch_first=True)

In [22]:
x = torch.randn(64, 70, 300)
h = (torch.zeros(1, 64, 300),torch.zeros(1, 64, 300))

#### CPU

In [23]:
%timeit -n 10 y,h1 = lstm(x,h)

92.5 ms ± 2.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### GPU

In [24]:
lstm = lstm.cuda()
x = x.cuda()
h = (h[0].cuda(), h[1].cuda())

In [25]:
f = partial(lstm,x,h)
time_fn(f)

In [26]:
%timeit -n 10 time_fn(f)

4.71 ms ± 618 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Jit version

In [27]:
import torch.jit as jit
from torch import Tensor

In [28]:
class LSTMCell(jit.ScriptModule):
    "TorchScript LSTM cell that stores the raw weight and bias parameters of the four gates."
    def __init__(self, ni, nh):
        super().__init__()
        self.ni, self.nh = ni, nh
        n_gates = 4 * nh
        # Parameters are created in a fixed order so the random initial values are reproducible.
        self.w_ih = nn.Parameter(torch.randn(n_gates, ni))
        self.w_hh = nn.Parameter(torch.randn(n_gates, nh))
        self.bias_ih = nn.Parameter(torch.randn(n_gates))
        self.bias_hh = nn.Parameter(torch.randn(n_gates))

    @jit.script_method
    def forward(self, input:Tensor, state:Tuple[Tensor, Tensor])->Tuple[Tensor, Tuple[Tensor, Tensor]]:
        hx, cx = state
        # Input contribution first, then the recurrent one, accumulated left to right.
        in_part = input @ self.w_ih.t() + self.bias_ih
        gates = in_part + hx @ self.w_hh.t() + self.bias_hh
        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

        # New cell state: keep part of the old one, add the gated candidate.
        cy = (torch.sigmoid(forgetgate) * cx) + (torch.sigmoid(ingate) * torch.tanh(cellgate))
        hy = torch.sigmoid(outgate) * torch.tanh(cy)

        return hy, (hy, cy)

In [29]:
class LSTMLayer(jit.ScriptModule):
    "Scripted layer that applies one cell to each slice of `input` taken along dim 1."
    def __init__(self, cell, *cell_args):
        super().__init__()
        # `cell` is called with `cell_args` to build the submodule used at every step.
        self.cell = cell(*cell_args)

    @jit.script_method
    def forward(self, input:Tensor, state:Tuple[Tensor, Tensor])->Tuple[Tensor, Tuple[Tensor, Tensor]]:
        # Split along dim 1 so that each element is the input of one step.
        inputs = input.unbind(1)
        outputs = []
        for i in range(len(inputs)):
            # `state` is threaded through: each call receives the state returned by the previous one.
            out, state = self.cell(inputs[i], state)
            outputs += [out]
        # Re-assemble the per-step outputs along dim 1 and return them with the last state.
        return torch.stack(outputs, dim=1), state

In [30]:
lstm = LSTMLayer(LSTMCell, 300, 300)

In [31]:
x = torch.randn(64, 70, 300)
h = (torch.zeros(64, 300),torch.zeros(64, 300))

In [32]:
%timeit -n 10 y,h1 = lstm(x,h)

107 ms ± 12.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [33]:
lstm = lstm.cuda()
x = x.cuda()
h = (h[0].cuda(), h[1].cuda())

In [34]:
f = partial(lstm,x,h)
time_fn(f)

In [35]:
%timeit -n 10 time_fn(f)

19.2 ms ± 363 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Dropout

We want to use the AWD-LSTM from [Stephen Merity et al.](https://arxiv.org/abs/1708.02182). First, we'll need several different kinds of dropout. Dropout consists in replacing some coefficients by 0 with probability p. To ensure that the average of the weights remains constant, we scale the weights that aren't nullified by a factor of `1/(1-p)` (think of what happens to the activations if you want to figure out why!)

We usually apply dropout by drawing a mask that tells us which elements to nullify or not:

In [36]:
#export
def dropout_mask(x, sz, p):
    "Mask of shape `sz`, built from `x` via `x.new`: entries are 0 with probability `p`, else `1/(1-p)`."
    keep = 1 - p
    # Draw the keep/drop pattern in place, then rescale the kept entries.
    mask = x.new(*sz).bernoulli_(keep)
    return mask.div(keep)

In [37]:
x = torch.randn(10,10)
mask = dropout_mask(x, (10,10), 0.5); mask

tensor([[0., 2., 2., 0., 0., 0., 2., 0., 2., 2.],
        [0., 2., 0., 2., 2., 2., 2., 2., 0., 2.],
        [0., 2., 2., 2., 2., 2., 0., 0., 2., 2.],
        [2., 2., 2., 0., 2., 0., 2., 2., 2., 2.],
        [2., 2., 2., 2., 2., 2., 2., 2., 0., 0.],
        [2., 2., 2., 0., 2., 0., 0., 0., 2., 0.],
        [2., 2., 0., 0., 2., 0., 0., 2., 0., 2.],
        [0., 0., 2., 0., 0., 0., 2., 0., 0., 2.],
        [0., 0., 2., 0., 2., 2., 0., 2., 2., 0.],
        [0., 0., 2., 0., 2., 2., 2., 2., 0., 2.]])

Once we have a dropout mask `mask`, applying the dropout to `x` is simply done by `x = x * mask`. We create our own dropout mask and don't rely on pytorch dropout because we do not want to nullify all the coefficients independently at random: we want to always replace the same positions by zero along the seq_len dimension.

In [38]:
(x*mask).std(),x.std()

(tensor(1.6396), tensor(1.0133))

Inside an RNN, a tensor x will have three dimensions: bs, seq_len, vocab_size.  Recall that we want to consistently apply the dropout mask across the seq_len dimension; therefore, we create a dropout mask for the first and third dimensions and broadcast it along the seq_len dimension.

In [50]:
#export
class RNNDropout(nn.Module):
    "Dropout whose mask is drawn per (dim 0, dim 2) position and shared along dim 1."
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, xb):
        active = self.training and self.p != 0.
        if active:
            # The size-1 middle dimension lets the mask broadcast along dim 1.
            mask_shape = (xb.size(0), 1, xb.size(2))
            return xb * dropout_mask(xb.data, mask_shape, self.p)
        # Evaluation mode or p == 0: the input is returned untouched.
        return xb

In [51]:
dp = RNNDropout(.3)
test_input = torch.randn(3, 3, 7)
test_input, dp(test_input)

(tensor([[[-0.6271, -1.1787, -1.2264, -1.3855,  0.4905,  0.5311,  1.0003],
          [ 0.2630,  0.1117, -0.3415,  0.3866,  0.4288,  0.2056, -0.3860],
          [ 0.3656, -0.9671,  0.4248, -0.6932, -0.2435,  0.0457,  0.1831]],
 
         [[ 0.9614, -0.5359, -0.5053,  0.6134,  0.9013,  0.1548, -1.0373],
          [ 0.1242, -1.2375,  1.4367, -0.0171, -0.1743, -0.2439,  0.6390],
          [-1.5063,  0.4196,  0.5764,  0.5497, -0.1499,  1.5334, -1.7988]],
 
         [[-0.0188, -0.7649, -0.7905, -1.1936, -0.0619,  0.2954, -0.4649],
          [-0.4612, -0.3387, -0.6082,  1.1636, -1.6281, -0.6541, -0.9733],
          [ 0.8226,  0.6989, -1.1267, -0.3920,  0.5949,  1.9333,  1.8637]]]),
 tensor([[[-0.8958, -1.6838, -1.7521, -1.9793,  0.7007,  0.7587,  1.4291],
          [ 0.3757,  0.1595, -0.4879,  0.5523,  0.6125,  0.2937, -0.5514],
          [ 0.5222, -1.3815,  0.6069, -0.9903, -0.3479,  0.0653,  0.2616]],
 
         [[ 1.3735, -0.7656, -0.7218,  0.8763,  1.2875,  0.0000, -1.4819],
          [ 0

WeightDropout is dropout applied to the weights of the inner LSTM hidden to hidden matrix. This is a little hacky if we want to preserve the CuDNN speed and not reimplement the cell from scratch. We add a parameter that will contain the raw weights, and we replace the weight matrix in the LSTM at the beginning of the forward pass.

In [65]:
import torch
from torch import nn
import torch.nn.functional as F

In [66]:
#export
import warnings

WEIGHT_HH = 'weight_hh_l0'

class WeightDropout(nn.Module):
    """Wrap `module` and apply dropout to some of its weights at every forward pass.

    module:      the wrapped module; every name in `layer_names` must be an attribute of it.
    weight_p:    dropout probability, a float handed to `F.dropout` as `p`.
    layer_names: names of the weights to drop; stored as a fresh list per instance.

    For each selected weight, a `<name>_raw` parameter is registered on this wrapper and
    the entry in `module._parameters` is overwritten from it before each forward call.
    """
    def __init__(self, module, weight_p=0., layer_names=(WEIGHT_HH,)):
        super().__init__()
        # `weight_p` must be a number (F.dropout compares it with 0 and 1), so the default is
        # a plain float. `layer_names` is copied so instances never share one default object.
        self.module,self.weight_p,self.layer_names = module,weight_p,list(layer_names)
        for layer in self.layer_names:
            #Makes a copy of the weights of the selected layers.
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
            self.module._parameters[layer] = F.dropout(w, p=self.weight_p, training=False)

    def _setweights(self):
        "Overwrite each selected weight of the wrapped module with a dropped-out version of its raw copy."
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            # `training=self.training`: dropout is only active in training mode.
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=self.training)

    def forward(self, *args):
        "Refresh the dropped-out weights, then delegate to the wrapped module."
        self._setweights()
        with warnings.catch_warnings():
            #To avoid the warning that comes because the weights aren't flattened.
            warnings.simplefilter("ignore")
            return self.module.forward(*args)

In [67]:
module = nn.LSTM(5, 2)
dp_module = WeightDropout(module, 0.4)
getattr(dp_module.module, WEIGHT_HH)

Parameter containing:
tensor([[ 0.3743,  0.5702],
        [-0.4095, -0.1545],
        [ 0.6281, -0.4933],
        [-0.4411, -0.5618],
        [-0.3762, -0.5055],
        [-0.1844,  0.6921],
        [-0.6187,  0.1410],
        [-0.1217, -0.4306]], requires_grad=True)

In [68]:
#export
class EmbeddingDropout(nn.Module):
    """Embedding lookup with dropout applied to whole rows of the embedding matrix.

    emb:     the wrapped `nn.Embedding`; its weight and lookup options are reused.
    embed_p: probability of zeroing a row (the mask has shape `(n_rows, 1)`, so a
             dropped row is zero for every position where that index appears).
    """
    def __init__(self, emb, embed_p):
        super().__init__()
        self.emb, self.embed_p = emb,embed_p
        # `pad_idx` uses -1 as the "no padding index" sentinel.
        self.pad_idx = self.emb.padding_idx
        if self.pad_idx is None: self.pad_idx = -1

    def forward(self, words, scale=None):
        """Look up `words` in the (possibly masked) embedding matrix.

        words: index tensor passed to `F.embedding`.
        scale: optional multiplier applied to the embedding matrix before the lookup
               (ignored when falsy).
        """
        if self.training and self.embed_p != 0:
            size = (self.emb.weight.size(0),1)
            mask = dropout_mask(self.emb.weight.data, size, self.embed_p)
            masked_embed = self.emb.weight * mask
        else: masked_embed = self.emb.weight
        # Out-of-place multiply: when no mask is applied `masked_embed` IS the weight parameter,
        # so an in-place op would raise (leaf requiring grad) or, under no_grad, rescale the
        # stored weights permanently on every call.
        if scale: masked_embed = masked_embed * scale
        # F.embedding interprets a negative padding_idx as counting from the end, so the -1
        # sentinel would mark the last row as padding; translate it back to None.
        pad_idx = None if self.pad_idx == -1 else self.pad_idx
        return F.embedding(words, masked_embed, pad_idx, self.emb.max_norm,
                           self.emb.norm_type, self.emb.scale_grad_by_freq, self.emb.sparse)

In [69]:
enc = nn.Embedding(100, 7, padding_idx=1)
enc_dp = EmbeddingDropout(enc, 0.5)
tst_input = torch.randint(0,100,(8,))
enc_dp(tst_input)

tensor([[ 4.6001,  1.0425,  3.3220,  0.2483, -0.1753,  1.9429,  0.8193],
        [ 0.0000,  0.0000, -0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.0000,  0.0000, -0.0000, -0.0000,  0.0000,  0.0000, -0.0000],
        [ 1.4824, -2.0635, -0.9413,  2.2616, -0.4480, -1.7999,  1.3532],
        [ 0.0000, -0.0000, -0.0000, -0.0000,  0.0000, -0.0000, -0.0000],
        [-0.0000, -0.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000],
        [-1.5572, -1.3358,  1.5200,  0.4026, -2.5249, -3.1842, -2.6833],
        [ 0.0000,  0.0000,  0.0000, -0.0000, -0.0000, -0.0000,  0.0000]],
       grad_fn=<EmbeddingBackward>)

## Main Model

The main model is a regular LSTM with several layers, but using all those kinds of dropouts.

In [70]:
#export
def to_detach(h):
    """Detaches `h` from its history.

    `h` is either a tensor or an iterable (possibly nested) of tensors. A tensor is
    returned detached; any other iterable is rebuilt as a tuple of detached items.
    """
    # isinstance (not an exact type match) so Tensor subclasses such as nn.Parameter are
    # detached as a whole instead of being iterated row by row into a tuple.
    return h.detach() if isinstance(h, torch.Tensor) else tuple(to_detach(v) for v in h)