In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [2]:
PATH='C:\\Users\\nelson\\Fastai\\data\\nietzsche\\'

In [3]:
# Download the corpus (get_data is a fastai helper) and read it into memory.
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
# Use a context manager so the file handle is closed as soon as the text is read.
with open(f'{PATH}nietzsche.txt') as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 600901


In [4]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 86


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [6]:
# Reserve index 0 for a padding character. Guarded so that re-running this
# cell does not insert a second "\0" and shift every character index.
if not chars or chars[0] != "\0":
    chars.insert(0, "\0")

''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

Map from chars to indices and back again

In [7]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

*idx* will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)

In [8]:
idx = [char_indices[c] for c in text]

idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [9]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Three char model

### Create inputs

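Create four lists, each taking every 3rd character (step `cs=3`), starting at the 0th, 1st, 2nd, and 3rd characters respectively. The first three lists are the inputs and the fourth is the character to predict.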

In [10]:
cs=3
c1_dat = [idx[i]   for i in range(0, len(idx)-cs, cs)]
c2_dat = [idx[i+1] for i in range(0, len(idx)-cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx)-cs, cs)]
c4_dat = [idx[i+3] for i in range(0, len(idx)-cs, cs)]

print(c1_dat[:10])
print(c2_dat[:10])
print(c3_dat[:10])

[40, 30, 29, 1, 40, 43, 31, 61, 2, 74]
[42, 25, 1, 43, 40, 33, 2, 54, 44, 73]
[29, 27, 1, 45, 39, 38, 73, 73, 71, 61]


Our inputs

In [11]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

Our output

In [12]:
y = np.stack(c4_dat)

The first 4 inputs and outputs

In [13]:
print(x1[:4], x2[:4], x3[:4])

print(y[:4])

print(x1.shape, y.shape)

[40 30 29  1] [42 25  1 43] [29 27  1 45]
[30 29  1 40]
(200300,) (200300,)


### Create and train model

Pick a size for our hidden state

In [14]:
n_hidden = 256

The number of latent factors to create (i.e. the size of the embedding matrix)

In [15]:
n_fac = 42

In [20]:
class Char3Model(nn.Module):
    """Predict the 4th character from the previous three, with the recurrence unrolled by hand.

    The same input, hidden and output layers are reused for every character.
    The hidden size is read from the notebook-level ``n_hidden``.

    Args:
        vocab_size: number of distinct character indices (embedding rows and output classes).
        n_fac: size of each character embedding.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)

        # The 'green arrow' from our diagram - the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)

        # The 'orange arrow' from our diagram - the layer operation from hidden to hidden
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        
        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, c1, c2, c3):
        """Return log-probabilities over the vocabulary for the character after c1, c2, c3."""
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        # Start from an all-zero hidden state shaped like one input activation.
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))
        
        # Pass dim explicitly: relying on the implicit dimension is deprecated,
        # and the other models in this notebook already use dim=-1.
        return F.log_softmax(self.l_out(h), dim=-1)

In [21]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [22]:
m = Char3Model(vocab_size, n_fac).cuda()

In [23]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [24]:
opt = optim.Adam(m.parameters(), 1e-2)

In [25]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      2.104875   3.403616  



[array([3.40362])]

In [26]:
set_lrs(opt, 0.001)

In [27]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.85302    2.831596  



[array([2.8316])]

### Test model

In [85]:
def get_next(inp):
    """Return the character the current model `m` scores highest after the string `inp`."""
    codes = np.array([char_indices[ch] for ch in inp])
    # One model argument per input character.
    log_probs = m(*VV(T(codes)))
    best = np.argmax(to_np(log_probs))
    return chars[best]

In [86]:
get_next('dec')

'i'

In [87]:
get_next('ppl')

'e'

In [88]:
get_next(' th')

'e'

In [89]:
get_next('and')

' '

## Our first RNN!

### Create inputs

This is the size of our unrolled RNN.

In [19]:
cs=8

# For each starting offset j, take the cs consecutive characters idx[j], ..., idx[j+cs-1]. Windows start at every position, so consecutive windows overlap. Each window is the cs inputs to our model.

c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs)]

# Then create a list of the character that follows each window. This will be the labels for our model.

c_out_dat = [idx[j+cs] for j in range(len(idx)-cs)]

xs = np.stack(c_in_dat, axis=0)

y = np.stack(c_out_dat)

# Each row below is one window of cs characters from the text; each row is the previous row shifted by one character.

xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

...and this is the next character after each sequence.

In [25]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

### Create and train model

In [25]:
val_idx = get_cv_idxs(len(idx)-cs-1)
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)
md.trn_dl.dataset.xs[0].shape

(480715,)

In [26]:
class CharLoopModel(nn.Module):
    """A hand-written RNN: the same three layers are applied in a loop over the input characters.

    The hidden size is read from the notebook-level ``n_hidden``.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Take one tensor of character indices per time step; return log-probabilities for the next character."""
        batch = cs[0].size(0)
        state = V(torch.zeros(batch, n_hidden).cuda())
        for step in cs:
            embedded = self.e(step)
            projected = F.relu(self.l_in(embedded))
            # The input activation is added to the running hidden state.
            state = F.tanh(self.l_hidden(state + projected))

        logits = self.l_out(state)
        return F.log_softmax(logits, dim=-1)

In [27]:

m = CharLoopModel(vocab_size, 42).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.99507    1.979378  



[array([1.97938])]

In [29]:
set_lrs(opt, 0.001)
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.840493   1.84219   



[array([1.84219])]

In [30]:
import torch
from torch.autograd import Variable
class MyCharLoopModel(nn.Module):
    """Same looped RNN as CharLoopModel, but with the hidden size passed in rather than read from a global.

    Args:
        vocab_size: number of character indices (embedding rows and output classes).
        n_fac: embedding width.
        n_hidden: width of the hidden state.
    """
    def __init__(self, vocab_size, n_fac, n_hidden):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, n_fac)
        self.input = nn.Linear(n_fac, n_hidden)
        self.hidden = nn.Linear(n_hidden, n_hidden)
        self.output = nn.Linear(n_hidden, vocab_size)
        self.n_hidden = n_hidden

    def forward(self, *cs):
        """Run the recurrence over every tensor in `cs`; return log-probabilities for the next character."""
        n_rows = cs[0].size(0)
        state = V(torch.zeros(n_rows, self.n_hidden).cuda())
        for step in cs:
            step_act = F.relu(self.input(self.embedding(step)))
            state = F.tanh(self.hidden(state + step_act))
        return F.log_softmax(self.output(state), dim=-1)

In [32]:
mym = MyCharLoopModel(vocab_size, n_fac, 256).cuda()
myopt = optim.Adam(mym.parameters(), 1e-2)

fit(mym, md, 1, myopt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      2.013592   2.003067  



[array([2.00307])]

In [33]:
set_lrs(myopt, 0.001)

In [34]:
fit(mym, md, 1, myopt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.740829   1.725917  



[array([1.72592])]

In [34]:
class CharLoopConcatModel(nn.Module):
    """Looped RNN that concatenates the hidden state with the character embedding instead of adding them.

    The hidden size is read from the notebook-level ``n_hidden``.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        # The input layer sees [hidden state, embedding] side by side.
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Take one tensor of character indices per time step; return log-probabilities for the next character."""
        batch = cs[0].size(0)
        state = V(torch.zeros(batch, n_hidden).cuda())
        for step in cs:
            joined = torch.cat((state, self.e(step)), 1)
            mixed = F.relu(self.l_in(joined))  # output dim: n_hidden
            state = F.tanh(self.l_hidden(mixed))

        return F.log_softmax(self.l_out(state), dim=-1)

In [48]:
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

fit(m, md, 1, opt, F.nll_loss)

set_lrs(opt, 1e-4)

fit(m, md, 1, opt, F.nll_loss)

In [None]:
class CharLoopConcatModel(nn.Module):
    """Re-implementation of the concatenating looped RNN with descriptive attribute names.

    The hidden size is read from the notebook-level ``n_hidden``, as in the
    version of this class defined earlier in the notebook.

    Args:
        vocab_size: number of character indices (embedding rows and output classes).
        n_fac: embedding width.
    """
    def __init__(self, vocab_size, n_fac):
        # `super` must be called: `super.__init__()` addresses the built-in type, not the parent class.
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, n_fac)
        # The input layer sees [hidden state, embedding] side by side.
        self.input = nn.Linear(n_fac + n_hidden, n_hidden)
        self.hidden = nn.Linear(n_hidden, n_hidden)
        self.output = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Take one tensor of character indices per time step; return log-probabilities for the next character."""
        batch_size = cs[0].size(0)
        h = V(torch.zeros(batch_size, n_hidden).cuda())
        for c in cs:
            joined = torch.cat((h, self.embedding(c)), 1)
            h = F.tanh(self.hidden(F.relu(self.input(joined))))
        return F.log_softmax(self.output(h), dim=-1)


### Test model

In [53]:
def get_next(inp):
    """Return the character the current model `m` scores highest after the string `inp`."""
    codes = np.array([char_indices[ch] for ch in inp])
    # One model argument per input character.
    log_probs = m(*VV(T(codes)))
    best = np.argmax(to_np(log_probs))
    return chars[best]

In [54]:
get_next('for thos')

'e'

In [55]:
get_next('part of ')

't'

In [56]:
get_next('queens a')

'n'

## RNN with pytorch

In [57]:
class CharRnn(nn.Module):
    """Next-character model that uses nn.RNN in place of the hand-written loop.

    The hidden size is read from the notebook-level ``n_hidden``.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Take one tensor of character indices per time step; return log-probabilities for the next character."""
        batch = cs[0].size(0)
        h0 = V(torch.zeros(1, batch, n_hidden))
        # Stack the per-step tensors along a new leading (time) axis before embedding.
        embedded = self.e(torch.stack(cs))
        seq_out, _ = self.rnn(embedded, h0)

        # Only the final time step's output is used for the prediction.
        final_step = seq_out[-1]
        return F.log_softmax(self.l_out(final_step), dim=-1)

In [58]:
m = CharRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [59]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [60]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [61]:
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [62]:
t = m(*V(xs)); t.size()

torch.Size([512, 86])

In [67]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.678139   1.664934  
    1      1.590221   1.589249                                                                                         
    2      1.535725   1.547329                                                                                         
    3      1.499757   1.524068                                                                                         


[array([1.52407])]

In [68]:
set_lrs(opt, 1e-4)

In [69]:
fit(m, md, 2, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.434743   1.481081  
    1      1.42892    1.477565                                                                                         


[array([1.47757])]

### Test model

In [70]:
def get_next(inp):
    """Return the character the current model `m` scores highest after the string `inp`."""
    codes = np.array([char_indices[ch] for ch in inp])
    # One model argument per input character.
    log_probs = m(*VV(T(codes)))
    best = np.argmax(to_np(log_probs))
    return chars[best]

In [71]:
get_next('for thos')

'e'

In [72]:
def get_next_n(inp, n):
    """Extend `inp` by `n` predicted characters, one at a time.

    Each prediction is made by `get_next` on a sliding window: the oldest
    character is dropped and the newly predicted one appended.
    Returns the original string followed by the generated characters.
    """
    window = inp
    pieces = [inp]
    for _ in range(n):
        predicted = get_next(window)
        pieces.append(predicted)
        window = window[1:] + predicted
    return ''.join(pieces)

In [73]:
get_next_n('for thos', 40)

'for those the sense of the same time that it is '

## Multi-output model

### Setup

Let's take non-overlapping sets of characters this time

In [74]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]

Then create the exact same thing, offset by 1, as our labels

In [75]:
c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [76]:
xs = np.stack(c_in_dat)
xs.shape

(75112, 8)

In [77]:
ys = np.stack(c_out_dat)
ys.shape

(75112, 8)

In [78]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [ 1,  1, 43, 45, 40, 40, 39, 43],
       [33, 38, 31,  2, 73, 61, 54, 73],
       [ 2, 44, 71, 74, 73, 61,  2, 62],
       [72,  2, 54,  2, 76, 68, 66, 54],
       [67,  9,  9, 76, 61, 54, 73,  2],
       [73, 61, 58, 67, 24,  2, 33, 72],
       [ 2, 73, 61, 58, 71, 58,  2, 67]])

In [79]:
ys[:cs,:cs]

array([[42, 29, 30, 25, 27, 29,  1,  1],
       [ 1, 43, 45, 40, 40, 39, 43, 33],
       [38, 31,  2, 73, 61, 54, 73,  2],
       [44, 71, 74, 73, 61,  2, 62, 72],
       [ 2, 54,  2, 76, 68, 66, 54, 67],
       [ 9,  9, 76, 61, 54, 73,  2, 73],
       [61, 58, 67, 24,  2, 33, 72,  2],
       [73, 61, 58, 71, 58,  2, 67, 68]])

### Create and train model

In [80]:
val_idx = get_cv_idxs(len(xs)-cs-1)

In [81]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [82]:
class CharSeqRnn(nn.Module):
    """Like CharRnn, but emits a prediction at every time step rather than only the last.

    The hidden size is read from the notebook-level ``n_hidden``.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Take one tensor of character indices per time step; return log-probabilities for every step."""
        batch = cs[0].size(0)
        h0 = V(torch.zeros(1, batch, n_hidden))
        embedded = self.e(torch.stack(cs))
        seq_out, _ = self.rnn(embedded, h0)
        # The output layer is applied to the whole sequence; the single-output
        # model applied it to seq_out[-1] only.
        per_step_logits = self.l_out(seq_out)
        return F.log_softmax(per_step_logits, dim=-1)

In [83]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [84]:
it = iter(md.trn_dl)
*xst,yt = next(it)

In [85]:
yt.size()

torch.Size([512, 8])

In [86]:
def nll_loss_seq(inp, targ):
    """Negative log-likelihood over every time step of a sequence model's output.

    Args:
        inp: 3-D tensor of log-probabilities, unpacked as (sequence length,
            batch size, number of classes). For CharSeqRnn the last dimension
            is the vocabulary size.
        targ: 2-D tensor of target indices, laid out batch-first; it is
            transposed here so that it lines up with `inp` once both are flattened.
    """
    seq_len, batch, n_classes = inp.size()
    # transpose() returns a non-contiguous view; .contiguous() is required before .view().
    flat_targ = targ.transpose(0, 1).contiguous().view(-1)
    # .view plays the role of numpy's reshape: collapse the time and batch axes into one.
    flat_inp = inp.view(-1, n_classes)
    return F.nll_loss(flat_inp, flat_targ)

In [87]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      2.610982   2.416335  
    1      2.296517   2.204556                                                                                         
    2      2.141938   2.09074                                                                                          
    3      2.053606   2.016665                                                                                         


[array([2.01667])]

In [88]:
set_lrs(opt, 1e-4)

In [89]:
fit(m, md, 1, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      2.001587   2.001735  


[array([2.00173])]

### Identity init!

In [90]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [91]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden)) #replace initial l_hidden to identity matrix


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.cuda.FloatTensor of size 256x256 (GPU 0)]

In [92]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      2.420647   2.245032  
    1      2.144757   2.087881                                                                                         
    2      2.036088   2.006885                                                                                         
    3      1.969272   1.955609                                                                                         


[array([1.95561])]

In [93]:
set_lrs(opt, 1e-3)

In [94]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.881395   1.889439  
    1      1.86761    1.883745                                                                                         
    2      1.857679   1.87592                                                                                          
    3      1.851769   1.868826                                                                                         


[array([1.86883])]

## Stateful model

### Setup

In [5]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

PATH='C:\\Users\\nelson\\Fastai\\data\\nietzsche\\'

TRN_PATH = 'trn\\'
VAL_PATH = 'val\\'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

%ls {PATH}


 Volume in drive C has no label.
 Volume Serial Number is 6894-D615

 Directory of C:\Users\nelson\Fastai\data\nietzsche

2018-06-20  10:15 PM    <DIR>          .
2018-06-20  10:15 PM    <DIR>          ..
2018-06-20  10:15 PM           600,901 nietzsche.txt
2018-06-21  09:43 AM    <DIR>          trn
2018-06-21  09:43 AM    <DIR>          val
               1 File(s)        600,901 bytes
               4 Dir(s)  56,092,921,856 bytes free


In [6]:
%ls {PATH}trn

 Volume in drive C has no label.
 Volume Serial Number is 6894-D615

 Directory of C:\Users\nelson\Fastai\data\nietzsche\trn

2018-06-21  09:43 AM    <DIR>          .
2018-06-21  09:43 AM    <DIR>          ..
2018-06-21  12:18 PM           498,874 nietzsche.txt
               1 File(s)        498,874 bytes
               2 Dir(s)  56,092,905,472 bytes free


In [7]:
TEXT = data.Field(lower=True, tokenize=list)
bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(942, 55, 1, 482972)

### RNN

In [8]:
class CharSeqStatefulRnn(nn.Module):
    """Sequence RNN that carries its hidden state from one batch to the next.

    The hidden size is read from the notebook-level ``n_hidden``.

    Args:
        vocab_size: number of token indices (embedding rows and output classes).
        n_fac: embedding width.
        bs: batch size used to allocate the initial hidden state.
    """
    def __init__(self, vocab_size, n_fac, bs):
        # Initialise nn.Module before setting any attribute, as the other
        # model classes in this notebook do.
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        """Return log-probabilities for every position, flattened to 2-D with vocab_size columns."""
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs) # in case the batch size changes, recreate hidden layer
        outp,h = self.rnn(self.e(cs), self.h)
        # Keep the hidden state for the next batch.
        # NOTE(review): repackage_var is a fastai helper; presumably it detaches the state from the previous graph - confirm.
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size) #turns 3D matrix to 2D
    
    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros for a batch of size `bs`."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [9]:
m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [10]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.892517   1.863772  
    1      1.711806   1.706482                                                                                         
    2      1.624342   1.642142                                                                                         
    3      1.571973   1.5988                                                                                           



[array([1.5988])]

In [11]:
set_lrs(opt, 1e-4)

fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.4987     1.560913  
    1      1.495585   1.554332                                                                                         
    2      1.495321   1.551035                                                                                         
    3      1.489097   1.547275                                                                                         



[array([1.54727])]

### RNN loop

In [12]:
# From the pytorch source

def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    """One tanh RNN step: tanh(linear(input) + linear(hidden)).

    `w_ih`/`b_ih` are the weight and bias applied to the input; `w_hh`/`b_hh`
    are those applied to the previous hidden state.
    """
    from_input = F.linear(input, w_ih, b_ih)
    from_hidden = F.linear(hidden, w_hh, b_hh)
    return F.tanh(from_input + from_hidden)

In [13]:
class CharSeqStatefulRnn2(nn.Module):
    """Stateful sequence RNN built from nn.RNNCell with an explicit loop over time steps.

    The hidden size is read from the notebook-level ``n_hidden``.

    Args:
        vocab_size: number of token indices (embedding rows and output classes).
        n_fac: embedding width.
        bs: batch size used to allocate the initial hidden state.
    """
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        """Return log-probabilities for every position, flattened to 2-D with vocab_size columns."""
        bs = cs[0].size(0)
        # The cell's hidden state is 2-D (batch, hidden), so the batch axis is axis 0.
        if self.h.size(0) != bs: self.init_hidden(bs)
        outp = []
        o = self.h
        for c in cs: 
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        # Keep the last hidden state for the next batch.
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros for a batch of size `bs`.

        nn.RNNCell takes a (batch, hidden) state with no leading layer axis,
        unlike nn.RNN; the 3-D state used previously only worked through broadcasting.
        """
        self.h = V(torch.zeros(bs, n_hidden))

In [13]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [8]:
fit(m, md, 4, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.81013  1.7969 ]                                 
[ 1.       1.62515  1.65346]                                 
[ 2.       1.53913  1.58065]                                 
[ 3.       1.48698  1.54217]                                 



### GRU

In [18]:
class CharSeqStatefulGRU(nn.Module):
    """Stateful sequence model that uses nn.GRU as its recurrent layer.

    The hidden size is read from the notebook-level ``n_hidden``.
    """
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities for every position, flattened to 2-D with vocab_size columns."""
        batch = cs[0].size(0)
        # Re-allocate the state whenever the incoming batch size differs from the stored one.
        if self.h.size(1) != batch:
            self.init_hidden(batch)
        seq_out, new_state = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(new_state)
        log_probs = F.log_softmax(self.l_out(seq_out), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros for a batch of size `bs`."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [None]:
# From the pytorch source code - for reference

def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    """One GRU step, written out for reference.

    The input and hidden projections are each split into three equal chunks
    along dim 1: reset gate, input (update) gate, and candidate state.
    """
    proj_in = F.linear(input, w_ih, b_ih)
    proj_hid = F.linear(hidden, w_hh, b_hh)
    in_reset, in_update, in_cand = proj_in.chunk(3, 1)
    hid_reset, hid_update, hid_cand = proj_hid.chunk(3, 1)

    reset = F.sigmoid(in_reset + hid_reset)
    update = F.sigmoid(in_update + hid_update)
    # The reset gate scales only the hidden contribution to the candidate.
    candidate = F.tanh(in_cand + reset * hid_cand)
    # Blend the candidate and the previous state, weighted by the update gate.
    return candidate + update * (hidden - candidate)

In [27]:
m = CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

In [29]:
fit(m, md, 6, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.68409  1.67784]                                 
[ 1.       1.49813  1.52661]                                 
[ 2.       1.41674  1.46769]                                 
[ 3.       1.36359  1.43818]                                 
[ 4.       1.33223  1.41777]                                 
[ 5.       1.30217  1.40511]                                 



In [30]:
set_lrs(opt, 1e-4)

In [31]:
fit(m, md, 3, opt, F.nll_loss)

A Jupyter Widget

[ 0.       1.22708  1.36926]                                 
[ 1.       1.21948  1.3696 ]                                 
[ 2.       1.22541  1.36969]                                 



### Putting it all together: LSTM

In [15]:
from fastai import sgdr

n_hidden=512

In [16]:
class CharSeqStatefulLSTM(nn.Module):
    """Stateful sequence model built on a multi-layer nn.LSTM with dropout 0.5.

    The hidden size is read from the notebook-level ``n_hidden``.

    Args:
        vocab_size: number of token indices (embedding rows and output classes).
        n_fac: embedding width.
        bs: batch size used to allocate the initial state.
        nl: number of LSTM layers.
    """
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size = vocab_size
        self.nl = nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities for every position, flattened to 2-D with vocab_size columns."""
        batch = cs[0].size(0)
        # The state is a pair of tensors; check the batch axis of the first one.
        stored_batch = self.h[0].size(1)
        if stored_batch != batch:
            self.init_hidden(batch)
        seq_out, new_state = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(new_state)
        log_probs = F.log_softmax(self.l_out(seq_out), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored state to a pair of zero tensors, each (nl, bs, n_hidden)."""
        first = V(torch.zeros(self.nl, bs, n_hidden))
        second = V(torch.zeros(self.nl, bs, n_hidden))
        self.h = (first, second)

In [17]:
m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)  # also does differential learning rate and weight decay

In [18]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [19]:
fit(m, md, 2, lo.opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.804978   1.727243  
    1      1.701076   1.6237                                                                                           



[array([1.6237])]

In [20]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.531045   1.47773   
    1      1.582613   1.523262                                                                                         
    2      1.45291    1.42327                                                                                          
    3      1.603681   1.540545                                                                                         
    4      1.5276     1.476972                                                                                         
    5      1.442815   1.416055                                                                                         
    6      1.374056   1.376026                                                                                         
    7      1.581675   1.526962                                                                                         
    8  

[array([1.34149])]

In [26]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=63), HTML(value='')))

epoch      trn_loss   val_loss                                                                                         
    0      1.021382   1.389068  
    1      1.019383   1.389035                                                                                         
    2      1.014936   1.389124                                                                                         
    3      1.014783   1.38957                                                                                          
    4      1.017301   1.389435                                                                                         
    5      1.017675   1.389378                                                                                         
    6      1.021754   1.38936                                                                                          
    7      1.022754   1.389495                                                                                         
    8  

[array([1.39268])]

### Test

In [27]:
def get_next(inp):
    """Sample the next token after `inp` from the current model `m`.

    Unlike the earlier argmax versions, the token is drawn at random from
    the predicted distribution at the last position.
    """
    token_ids = TEXT.numericalize(inp)
    log_probs = m(VV(token_ids.transpose(0,1)))
    # The model returns log-probabilities; exponentiate the last row to get sampling weights.
    last_probs = log_probs[-1].exp()
    sampled = torch.multinomial(last_probs, 1)
    return TEXT.vocab.itos[to_np(sampled)[0]]

In [28]:
get_next('for thos')

'e'

In [29]:
def get_next_n(inp, n):
    """Extend `inp` by `n` generated characters, one at a time.

    Each character comes from `get_next` on a sliding window: the oldest
    character is dropped and the newly generated one appended.
    Returns the original string followed by the generated characters.
    """
    window = inp
    pieces = [inp]
    for _ in range(n):
        generated = get_next(window)
        pieces.append(generated)
        window = window[1:] + generated
    return ''.join(pieces)

In [30]:
print(get_next_n('for thos', 400))

for those are oncermay permittes wearisome [this is even they the writtensity, which still and stay, leathing one fordisguish, wagner,--books, however the bad good," or "betrayed? to at least grapatians again: it is now benefit in usacces, and guides all noise--endurance, does not in europe.--235. the savage) him hears it in thefundamental in our eyes such artpulicultioss, and subsedness of what is nowada
