In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

  return f(*args, **kwds)
  return f(*args, **kwds)
  from numpy.core.umath_tests import inner1d


## Setup

We are going to download the collected works of Neitzsche to use as our data for this excercise.

In [2]:
PATH = '/fastai/data/nietzsche/'

In [476]:
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt",f"{PATH}nietzsche.txt")
text = open(f'{PATH}nietzsche.txt').read()
print('corpus length: ',len(text))

nietzsche.txt: 606kB [00:01, 510kB/s]                             

corpus length:  600893





In [9]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [10]:
chars = sorted(list(set(text)))## this gets all unique characters in the text sorted
vocab_size = len(chars)+1
print('total chars: ', vocab_size)

total chars:  85


Sometimes its useful to have an additional 0 inserted in the dataset , e.g. for padding

In [12]:
chars.insert(0,"\0")
''.join(chars[1:-5])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

Map from chars to indices and back again

In [13]:
char_indices = {c:i for i,c in enumerate(chars)}
indices_char = {i:c for i,c in enumerate(chars)}

*idx* will be the data we use from now on - it simply converts all teh characters to their index (based on the mapping above)

In [14]:
idx = [char_indices[c] for c in text]

idx[:10] ## first ten characters in text

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [15]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Three char model

### Create inputs

Create a list of every 4th character, starting at the 0th,1st,2nd,then 3rd characters

In [27]:
cs=3
c1_dat = [idx[i]   for i in range(0,len(idx)-cs,cs)]
c2_dat = [idx[i+1] for i in range(0,len(idx)-cs,cs)]
c3_dat = [idx[i+2] for i in range(0,len(idx)-cs,cs)]
c4_dat = [idx[i+3] for i in range(0,len(idx)-cs,cs)]

Our inputs

In [29]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

Our output

In [30]:
y = np.stack(c4_dat)

The first 4 inputs and outputs

In [31]:
x1[:4],x2[:4],x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [32]:
y[:4]

array([30, 29,  1, 40])

In [33]:
x1.shape, y.shape

((200297,), (200297,))

### Create and train model

Pick a size for our hidden state

In [34]:
n_hidden = 256

The number of latent factors to create(i.e. the size of the embedding matrix)

In [35]:
n_fac = 42

In [53]:
class Char3Model(nn.Module):
    def __init__(self,vocab_size,n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac)
        
        #The 'green arrow' from diagram - refer to lesson-6 ppt.- layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)
        
        #The 'orange arrow' from digram - layer operation from hidden to hidden
        self.l_hidden =  nn.Linear(n_hidden, n_hidden)
        
        #The 'blue arrow' from diagram - layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self,c1,c2,c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = V(torch.zeros(in1.size()).cuda())
        h = torch.tanh(self.l_hidden(h+in1))
        h = torch.tanh(self.l_hidden(h+in2))
        h = torch.tanh(self.l_hidden(h+in3))
        
        return F.log_softmax(self.l_out(h))

In [54]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [55]:
m = Char3Model(vocab_size,n_fac).cuda()

In [56]:
# iterators are used here to inspect how the model data is obtained and used in training and testing.
# this cell is not used by the model, this is just for study of data loaders
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [57]:
opt = optim.Adam(m.parameters(),1e-2)

In [58]:
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.102345   0.789043  



[0.7890429496765137]

In [59]:
set_lrs(opt,0.001)

In [60]:
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.846418   0.364407  



[0.3644065856933594]

### Test model

In [61]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [62]:
get_next('y. ')

'T'

In [64]:
get_next('ppl')

'i'

In [65]:
get_next(' th')

'e'

In [66]:
get_next('and')

' '

In [69]:
get_next(', i')

'n'

In [72]:
get_next('wit')

'h'

## Our First RNN!

### Create inputs

This is the size of our unrolled RNN.

In [73]:
cs=8

For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to our model.

In [74]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx) -cs)]

Then create a list of the next character in each of these series. This will be the labels/y for our model

In [75]:
c_out_dat = [idx[j+cs] for j in range(len(idx)-cs)]

In [153]:
len(c_in_dat)

600885

In [76]:
xs = np.stack(c_in_dat, axis=0)

In [77]:
xs.shape

(600885, 8)

In [78]:
y = np.stack(c_out_dat)

In [253]:
y.shape

(600885,)

So each column below is one series of 8 characters from the text.

In [79]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

...and this is the next character after each sequence.

In [80]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

### Create and train model

In [81]:
val_idx = get_cv_idxs(len(idx) -cs-1)

In [82]:
md = ColumnarModelData.from_arrays('.',val_idx,xs,y,bs=512)

In [83]:
class CharLoopModel(nn.Module):
    # This is an RNN!
    def __init__(self,vocab_size,n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac)
        self.l_in = nn.Linear(n_fac,n_hidden)
        self.l_hidden = nn.Linear(n_hidden,n_hidden)
        self.l_out = nn.Linear(n_hidden,vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h  = V(torch.zeros(bs,n_hidden).cuda())
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = torch.tanh(self.l_hidden(h+inp))
            
        return F.log_softmax(self.l_out(h),dim=-1)

In [84]:
m = CharLoopModel(vocab_size, n_fac).cuda()
opt =  optim.Adam(m.parameters(), 1e-2)

In [90]:
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.978228   1.986723  



[1.9867232561651074]

In [91]:
set_lrs(opt,1e-4)

In [92]:
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.792315   1.796665  



[1.7966653191286244]

In [429]:
class CharLoopConcatModel(nn.Module):
    def __init__(self,vocab_size,n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac)
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden,n_hidden)
        self.l_out = nn.Linear(n_hidden,vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h  = V(torch.zeros(bs, n_hidden)).cuda()
        for c in cs:
            inp = torch.cat((h,self.e(c)),1)
            inp = F.relu(self.l_in(inp))
            h  = torch.tanh(self.l_hidden(inp))
        return F.log_softmax(self.l_out(h), dim=-1)

In [430]:
m = CharLoopConcatModel(vocab_size,n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [431]:
# iterators are used here to inspect how the model data is obtained and used in training and testing.
# this cell is not used by the model, this is just for study of data loaders
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [432]:
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.822335   1.789082  



[1.7890821937841133]

In [433]:
set_lrs(opt,1e-4)

In [434]:
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.706516   1.712417  



[1.7124166100276896]

### Test model

Before Testing with a string of characters, since our model has been built with 2 dimensional data, i.e. stacked array of strings eight of them, it expects 2 dimensional input. So we use tensor.view(-1,1) on the input idxs.
This converts the `[8]` tensor into `[8,1]` tensor. This saves a lot of runtime error in dimension mapping issues at concat and input dimension check in torch

In [435]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs.view(-1,1)))
    i = np.argmax(to_np(p))
    return chars[i]

In [436]:
get_next('for thos')

'e'

In [437]:
get_next('part of ')

't'

In [438]:
get_next('queens a')

'n'

In [439]:
get_next(' to Trut')

'h'

## RNN with pytorch

In [411]:
class CharRnn(nn.Module):
    def __init__(self,vocab_size,n_fac):
        super().__init__()
        self.e =  nn.Embedding(vocab_size,n_fac)
        self.rnn = nn.RNN(n_fac,n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs =  cs[0].size(0)
        h = V(torch.zeros(1,bs,n_hidden))
        inp =  self.e(torch.stack(cs))
        outp,h = self.rnn(inp,h)
        
        return F.log_softmax(self.l_out(outp[-1]), dim=-1)

In [412]:
m = CharRnn(vocab_size,n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [413]:
it = iter(md.trn_dl)
*xs, yt =  next(it)

In [414]:
xs[0].shape

torch.Size([512])

In [415]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [416]:
ht = V(torch.zeros(1,512,n_hidden))
outp, hn = m.rnn(t,ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [417]:
t = m(*V(xs)); t.size()

torch.Size([512, 85])

In [418]:
fit(m,md,4,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.870221   1.845782  
    1      1.674798   1.675024                               
    2      1.588222   1.596608                               
    3      1.530001   1.549112                               



[1.549112139394448]

In [419]:
set_lrs(opt,1e-4)

In [420]:
fit(m,md,2,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.473999   1.511363  
    1      1.470249   1.506412                               



[1.506411505672367]

### Test model

In [421]:
def get_next(inp):
    idxs =  T(np.array([char_indices[c] for c in inp]))
    # view adjust the one dimensional string to 2 dimensions and later it gets 3 inside model
    p = m(*VV(idxs.view(-1,1))) 
    i = np.argmax(to_np(p))
    return chars[i]

In [422]:
get_next('for thos')

'e'

In [423]:
def get_next_n(inp,n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [443]:
get_next_n(' to Trut',30)

' to Truth the self-conside of the self'

## Multi output model

### Setup

Lets take non-overlapping sets of characters this time

In [444]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]

Then create the exact same thing, offset by 1 as our labels

In [445]:
c_out_dat  = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [446]:
xs = np.stack(c_in_dat)
xs.shape

(75111, 8)

In [447]:
ys = np.stack(c_out_dat)
ys.shape

(75111, 8)

In [448]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [ 1,  1, 43, 45, 40, 40, 39, 43],
       [33, 38, 31,  2, 73, 61, 54, 73],
       [ 2, 44, 71, 74, 73, 61,  2, 62],
       [72,  2, 54,  2, 76, 68, 66, 54],
       [67,  9,  9, 76, 61, 54, 73,  2],
       [73, 61, 58, 67, 24,  2, 33, 72],
       [ 2, 73, 61, 58, 71, 58,  2, 67]])

In [449]:
ys[:cs,:cs]

array([[42, 29, 30, 25, 27, 29,  1,  1],
       [ 1, 43, 45, 40, 40, 39, 43, 33],
       [38, 31,  2, 73, 61, 54, 73,  2],
       [44, 71, 74, 73, 61,  2, 62, 72],
       [ 2, 54,  2, 76, 68, 66, 54, 67],
       [ 9,  9, 76, 61, 54, 73,  2, 73],
       [61, 58, 67, 24,  2, 33, 72,  2],
       [73, 61, 58, 71, 58,  2, 67, 68]])

### Create and train model

In [450]:
val_idx =  get_cv_idxs(len(xs)-cs-1)

In [451]:
md = ColumnarModelData.from_arrays('.',val_idx,xs,ys,bs=512)

In [452]:
class CharSeqRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac)
        self.rnn =  nn.RNN(n_fac,n_hidden)
        self.l_out =  nn.Linear(n_hidden,vocab_size)
        
    def forward(self,*cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1,bs,n_hidden))
        inp = self.e(torch.stack(cs))
        outp,h = self.rnn(inp,h)
        return F.log_softmax(self.l_out(outp),dim=-1)

In [453]:
m = CharSeqRnn(vocab_size,n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [456]:
it =  iter(md.trn_dl)
*xs,yt = next(it)

In [457]:
def nll_loss_seq(inp,targ):
    sl,bs,nh = inp.size()
    targ = targ.transpose(0,1).contiguous().view(-1)
    return F.nll_loss(inp.view(-1,nh), targ)

In [458]:
fit(m,md,4,opt,nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.611849   2.419612  
    1      2.29623    2.203371                              
    2      2.14263    2.088459                              
    3      2.048067   2.016603                              



[2.016603229715726]

In [459]:
set_lrs(opt,1e-4)

In [460]:
fit(m,md,1,opt,nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                             
    0      1.996522   2.000573  



[2.000573227027761]

### Identity init!

While RNNs are prone to vanishing or exploding gradients due to the fact thay they keep on updating gradients on weights and it goes out of control. In a paper Geofrey Hinton proposed using identity matrix to resolve this. Identity matrix when used to initialize would give the same matrix so there is no multiplication

In [461]:
m = CharSeqRnn(vocab_size,n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2) ## since using ID matrix we can try a higher learning rate than 1e-3

In [462]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]], device='cuda:0')

In [463]:
fit(m,md,4,opt,nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.394403   2.218189  
    1      2.142518   2.081137                              
    2      2.041495   2.011451                              
    3      1.982452   1.960882                              



[1.9608815786841707]

In [464]:
set_lrs(opt, 1e-3)

In [465]:
fit(m,md,4,opt,nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                             
    0      1.899471   1.910989  
    1      1.890142   1.902447                              
    2      1.883755   1.896751                              
    3      1.877774   1.891235                              



[1.8912352305118951]

## Stateful model

### Setup

In [4]:
from torchtext import vocab, data
from fastai.nlp import *
from fastai.lm_rnn import *

TRN_PATH = 'trn/'
VAL_PATH = 'val/'

TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

#in order to split the data into training and validation sets 
#use the following shell script

#split into 80 and 20
#split -b $[ $(wc -c nietzsche.txt | cut -d " " -f1) * 80/100] nietzsche.txt 

#this creates 2 files xaa (80%) and xab (20%)

#you can then move them into paths xaa -> trn/trn.txt and xab -> val/val.txt
#may be the target path can be mentioned in the split command, research needed.

%ls {PATH}




[0m[01;34mmodels[0m/  nietzsche.txt  [01;34mtrn[0m/  [01;34mval[0m/


In [5]:
%ls {TRN}

trn.txt


In [6]:
TEXT =  data.Field(lower=True, tokenize=list)
bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md =  LanguageModelData.from_text_files(PATH,TEXT,**FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(922, 55, 1, 472944)

### RNN

In [486]:
class CharSeqStatefulRnn(nn.Module):
    def __init__(self,vocab_size, n_fac, bs):
        self.vocab_size = vocab_size
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs : self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs),self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1,self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

Some wrinkles in RNN and fixes as discussed by Jeremy in the course.
* BPTT - Back prob Three times - used to get rid of hidden state history every batch so has to save memory and gain efficiency, that is keep h.data but give up V(h) create new V(h.data)
* Data sets from 1 single txt file create your own val set by spliting and follow torch trn, val folder structure
* Last batch may have less than mini batch size of data, and hidden state shape needs to be adjusted so use input size check `cs[0].size(0)` and re initialize hiddens, so last batch essentially is a mini epoch
* PyTorch does not like Tensor rank 3, so flatten out predictions F.log_softmax to `view(-1,vocab_size)` this way columns dim 1, 2nd dim would be vocab size and the rest as many rows as needed. This is for predictions, for targets, PyTorch does that automatically.

Also one more thing to note PyTorch made `F.log_softmax` to accept a dim/axis on which to perform log_softmax. In this case since the last axis is where the final outputs are we choose dim =-1 

In [487]:
m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [488]:
fit(m,md,4,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                 
    0      1.885203   1.869341  
    1      1.70582    1.726655                                 
    2      1.617787   1.65508                                  
    3      1.564241   1.610714                                 



[1.6107142205090867]

In [489]:
set_lrs(opt, 1e-4)
fit(m,md,4,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                 
    0      1.488474   1.567005  
    1      1.487551   1.561488                                 
    2      1.4835     1.558015                                 
    3      1.477052   1.554681                                 



[1.554681031780274]

### RNN Loop

In [504]:
# from the pytorch source

def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    return torch.tanh(F.linear(input,w_ih,b_ih) + F.linear(hidden,w_hh,b_hh))    

In [542]:
class CharSeqStatefulRnn2(nn.Module):
    def __init__(self, vocab_size,n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size,n_fac)
        self.rnn = nn.RNNCell(n_fac,n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self,cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp = []
        o = self.h.view(bs,-1)
        for c in cs:
            #print('hidden shape:', o.view(64,-1).shape)
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1,self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1,bs, n_hidden))

In [543]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [544]:
fit(m,md,4,opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      2.067511   2.026093  
    1      1.936494   1.91116                                 
    2      1.881003   1.860318                                
    3      1.840834   1.829997                                



[1.829996539210083]

### GRU

In [549]:
class CharSeqStatefulGRU(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac,n_hidden)
        self.l_out =  nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1,self.vocab_size)
    
    def init_hidden(self,bs): self.h = V(torch.zeros(1,bs,n_hidden))

In [550]:
#From pytorch source code for reference

def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    gi = F.linear(input, w_ih, b_ih)
    gh = F.linear(hidden, w_hh, b_hh)
    i_r, i_i, i_n = gi.chunk(3, 1)
    h_r, h_i, h_n = gh.chunk(3, 1)

    resetgate = F.sigmoid(i_r + h_r)
    inputgate = F.sigmoid(i_i + h_i)
    newgate = F.tanh(i_n + resetgate * h_n)
    return newgate + inputgate * (hidden - newgate)

In [551]:
m =  CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [552]:
fit(m,md,6,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss                                 
    0      1.767736   1.758577  
    1      1.573046   1.605528                                 
    2      1.479559   1.529787                                 
    3      1.429113   1.510058                                 
    4      1.39301    1.480393                                 
    5      1.364666   1.469121                                 



[1.469120693672499]

### Putting it all together LSTM

In [10]:
from fastai import sgdr

n_hidden = 512

In [11]:
class CharSeqStatefulLSTM(nn.Module):
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size,self.nl = vocab_size,nl
        self.e =  nn.Embedding(vocab_size, n_fac)
        self.rnn =  nn.LSTM(n_fac,n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h[0].size(1) != bs : self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs),self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self,bs):
        self.h = (V(torch.zeros(self.nl,bs,n_hidden)).cuda(),
                   (torch.zeros(self.nl,bs,n_hidden)).cuda())

In [12]:
m = CharSeqStatefulLSTM(md.nt,n_fac, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam,m, 1e-2, 1e-5)

In [13]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [14]:
fit(m,md,2,lo.opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.877079   1.807846  
    1      1.714286   1.662423                                



[1.6624228835946764]

In [15]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo,len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m,md,2**4-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.533485   1.501156  
    1      1.584882   1.532549                                
    2      1.458172   1.435467                                
    3      1.599143   1.564048                                
    4      1.519205   1.49313                                 
    5      1.42842    1.419875                                
    6      1.369472   1.384998                                
    7      1.575042   1.522615                                
    8      1.541191   1.511907                                
    9      1.507243   1.489766                                
    10     1.465212   1.446591                                
    11     1.42535    1.415619                                
    12     1.372329   1.383233                                
    13     1.326927   1.356423                                
    14     1.295668   1.343132                                



[1.3431318616065007]

In [16]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo,len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m,md,2**6-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=63), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.285478   1.341856  
    1      1.28309    1.339874                                
    2      1.281992   1.338606                                
    3      1.286787   1.336551                                
    4      1.276333   1.333741                                
    5      1.26223    1.332703                                
    6      1.260748   1.332167                                
    7      1.266959   1.330139                                
    8      1.258711   1.327977                                
    9      1.247987   1.326638                                
    10     1.243524   1.32443                                 
    11     1.23302    1.323292                                
    12     1.23064    1.323132                                
    13     1.227543   1.322438                                
    14     1.223114   1.322265                                
    15     1.227592   

[1.3830671261913157]

### Test

In [17]:
def get_next(inp):
    idxs =  TEXT.numericalize(inp)
    p = m(VV(idxs.transpose(0,1)))
    r = torch.multinomial(p[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(r)[0]]

In [18]:
get_next('for thos')

'e'

In [21]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [22]:
print(get_next_n('for thos',400))

for those "innate man,so first that shinesee with, like theme, inmodest. theprimitive and speaks itself in the sympathy withhis "tensit,"carry!"--ideas of science) ([ multiplicity, propudant, thatunders, as merely preignens and rates. qualifi!y is it now be understakes, jealous, well, "evil"; inshedions, would serdom that world of distinctions and imaliganity-intellectons for their feels man! advise the s


In [23]:
print(get_next_n('for thos',400))

for those what determinwards, "stole, a long time--not was it seem to such an investigather around? "god is no morethe man's scilapses as ahere meepssome smback!" everyform is too much men, must_" the sunk" afvains chuman class, long speliful and phenomena--men of man--is presumptanilysolitude, and morals or longly depresses every sympathy within humanity valees--and to security! all things is alone, the 
