In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

## Setup

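The original lesson downloads the collected works of Nietzsche to use as its data (see the commented-out `get_data` call below). This notebook instead trains on a local text file, `Life is nothing more than a dog race.txt`, stored under `PATH`.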

In [2]:
PATH='data/maddog/'

In [3]:
# get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')

In [6]:
# Read the whole corpus into memory. The context manager closes the file handle
# as soon as the read finishes instead of leaving it open for the garbage collector.
with open(f'{PATH}Life is nothing more than a dog race.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 293761


In [7]:
text[:400]

"Life is nothing more than a dog race.\nMaddog McDermutt\n\nOnce around is all you get.\nFleas Finnegan\n\nIt's that first turn that gets you.\nHugh Mungas\n\nNo photo finishes in this lifetime.\nLittle Elsie\n\nWolves? There are no wolves in Montana.\nBill Eagleberry\n\nThere are several ways to mount a dog.\nWorthington, Sr.\n\nSometimes I wonder about public education.\nDr. Mayonaisse\n\n \nDOWN UNDER JONES\nROBERT SC"

In [8]:
# Distinct characters of the corpus, in sorted order.
chars = sorted(set(text))
# One more than the number of distinct characters: leaves room for the "\0"
# entry that is inserted into `chars` in the following cell.
vocab_size = 1 + len(chars)
print('total chars:', vocab_size)

total chars: 84


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [9]:
# Reserve index 0 for a null character (usable as padding). The guard makes the
# cell idempotent: re-running it must not insert a second "\0", which would shift
# every character index and disagree with vocab_size.
if not chars or chars[0] != "\0":
    chars.insert(0, "\0")

''.join(chars[1:-6])

'\n !"$\'(),-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

Map from chars to indices and back again

In [10]:
# Forward lookup: character -> integer id (its position in `chars`).
char_indices = dict(zip(chars, range(len(chars))))
# Reverse lookup: integer id -> character.
indices_char = dict(enumerate(chars))

*idx* will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)

In [11]:
# Encode the entire corpus as a list of integer ids, one per character.
idx = list(map(char_indices.__getitem__, text))

idx[:10]

[37, 60, 57, 56, 2, 60, 70, 2, 65, 66]

In [12]:
''.join(indices_char[i] for i in idx[:70])

'Life is nothing more than a dog race.\nMaddog McDermutt\n\nOnce around is'

## Three char model

### Create inputs

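Create four lists, each taking every 3rd character of the text (the step is `cs=3`), starting at the 0th, 1st, 2nd, then 3rd characters. The first three lists are the model inputs; the fourth holds the character that follows each group of three and is used as the label.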

In [13]:
cs = 3
# Start positions of consecutive, non-overlapping groups of `cs` characters.
group_starts = range(0, len(idx) - cs, cs)
# Characters at offsets 0, 1 and 2 of every group; offset 3 is the character
# right after the group (i.e. the first character of the next group).
c1_dat = [idx[s] for s in group_starts]
c2_dat = [idx[s + 1] for s in group_starts]
c3_dat = [idx[s + 2] for s in group_starts]
c4_dat = [idx[s + 3] for s in group_starts]

Our inputs

In [14]:
# Turn each of the three input lists into a 1-D numpy array.
x1, x2, x3 = (np.stack(column) for column in (c1_dat, c2_dat, c3_dat))

Our output

In [15]:
y = np.stack(c4_dat)

In [16]:
len(c4_dat), type(c4_dat)

(97920, list)

In [17]:
# NOTE(review): this reports type(c4_dat), repeating the previous cell; type(y) was presumably intended — confirm.
len(y), type(c4_dat)

(97920, list)

In [18]:
np.all(y == c4_dat) # True if every element is true

True

The first 4 inputs and outputs

In [19]:
x1[:4], x2[:4], x3[:4]

(array([37, 56, 70, 66]), array([60,  2,  2, 71]), array([57, 60, 65, 59]))

In [20]:
y[:4]

array([56, 70, 66, 60])

In [21]:
x1.shape, y.shape

((97920,), (97920,))

### Create and train model

Pick a size for our hidden state

In [22]:
n_hidden = 256

The number of latent factors to create (i.e. the size of the embedding matrix)

In [23]:
n_fac = 42

In [24]:
class Char3Model(nn.Module):
    """Predict the next character from three input characters.

    The recurrence is unrolled by hand: the same embedding and the same
    input/hidden linear layers are applied to each of the three characters.
    The hidden size is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        """vocab_size: number of distinct character ids; n_fac: embedding width."""
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac) # brings in vocab size of 84, spits out factors in embedding

        # The 'green arrow' from our diagram - the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)

        # The 'orange arrow' from our diagram - the layer operation from hidden to hidden
        # THIS IS A SQUARE WEIGHT MATRIX
        self.l_hidden = nn.Linear(n_hidden, n_hidden)

        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, c1, c2, c3):
        """
        pass in three characters to forward
        stick it through embedding, linear layer, then relu

        Returns log-probabilities over the vocabulary for the next character.
        """
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))

        # This is from 1:29 in the video.
        # Start from an all-zero hidden state shaped like one projected input.
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))

        # Explicit dim, matching the other models in this notebook; relying on
        # the implicit dimension is deprecated in PyTorch.
        return F.log_softmax(self.l_out(h), dim=-1)

In [25]:
# Stack the three input arrays side by side into one (n, 3) array; y holds the labels.
# NOTE(review): the second argument looks like the validation indices, so [-1] would
# hold out only the last row for validation — confirm against ColumnarModelData.from_arrays.
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [26]:
m = Char3Model(vocab_size, n_fac).cuda()

In [27]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [28]:
opt = optim.Adam(m.parameters(), 1e-2)

In [29]:
m.parameters()

<generator object Module.parameters at 0x7fd66d28f410>

In [30]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.057481   3.511063  



[array([3.51106])]

In [31]:
set_lrs(opt, 0.001)

In [32]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.780182   3.213684  



[array([3.21368])]

### Test model

In [33]:
def get_next(inp):
    """Return the character that the current model `m` scores highest after `inp`.

    Each character of `inp` is passed to the model as a separate argument.
    """
    char_ids = np.array([char_indices[ch] for ch in inp])
    scores = m(*VV(T(char_ids)))
    best = np.argmax(to_np(scores))
    return chars[best]

In [34]:
get_next('y. ')

'I'

In [35]:
get_next('ppl')

'e'

In [36]:
get_next(' th')

'e'

In [37]:
get_next('and')

' '

In [38]:
get_next('fuc') # Fail

'k'

In [39]:
get_next('fai')

'd'

In [40]:
get_next('bee')

'n'

## Our first RNN!

### Create inputs

This is the size of our unrolled RNN.

In [43]:
cs=8

For every starting position in the text, create a list of the 8 consecutive characters beginning there (so neighbouring lists overlap by 7 characters). These will be the 8 inputs to our model.

In [44]:
# One window of `cs` consecutive character ids per starting position (windows overlap).
c_in_dat = [idx[start:start + cs] for start in range(len(idx) - cs)]

Then create a list of the next character in each of these series. This will be the labels for our model.

In [45]:
# Label for window j is the character at position j+cs, i.e. everything from position cs onward.
c_out_dat = idx[cs:]

In [46]:
xs = np.stack(c_in_dat, axis=0)

In [47]:
xs.shape

(293753, 8)

In [48]:
y = np.stack(c_out_dat)

So each row below is one series of 8 characters from the text.

In [49]:
xs[:cs,:cs]

array([[37, 60, 57, 56,  2, 60, 70,  2],
       [60, 57, 56,  2, 60, 70,  2, 65],
       [57, 56,  2, 60, 70,  2, 65, 66],
       [56,  2, 60, 70,  2, 65, 66, 71],
       [ 2, 60, 70,  2, 65, 66, 71, 59],
       [60, 70,  2, 65, 66, 71, 59, 60],
       [70,  2, 65, 66, 71, 59, 60, 65],
       [ 2, 65, 66, 71, 59, 60, 65, 58]])

...and this is the next character after each sequence.

In [50]:
y[:cs]

array([65, 66, 71, 59, 60, 65, 58,  2])

### Create and train model

In [51]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [52]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [53]:
class CharLoopModel(nn.Module):
    """Character model whose hidden-state update runs in a loop, i.e. a basic RNN.

    Accepts any number of character batches; the hidden size comes from the
    module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Fold every character batch in `cs` into the hidden state, then score the next character."""
        batch_size = cs[0].size(0)
        hidden = V(torch.zeros(batch_size, n_hidden).cuda())
        for char_batch in cs:
            projected = F.relu(self.l_in(self.e(char_batch)))
            # tanh is the usual choice for hidden-to-hidden transitions.
            hidden = F.tanh(self.l_hidden(hidden + projected))

        logits = self.l_out(hidden)
        return F.log_softmax(logits, dim=-1)

In [54]:
m = CharLoopModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [55]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.90625    1.911559  



[array([1.91156])]

In [56]:
set_lrs(opt, 0.001)

In [57]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.647764   1.65389   



[array([1.65389])]

In [58]:
class CharLoopConcatModel(nn.Module):
    """Looped character model that concatenates hidden state and embedding instead of adding them.

    Because of the concatenation, the input layer takes `n_fac + n_hidden` features.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Process the character batches in order and return log-probabilities for the next character."""
        batch_size = cs[0].size(0)
        hidden = V(torch.zeros(batch_size, n_hidden).cuda())
        for char_batch in cs:
            # Join the previous hidden state and the character embedding along the feature axis.
            joined = torch.cat((hidden, self.e(char_batch)), 1)
            activated = F.relu(self.l_in(joined))
            hidden = F.tanh(self.l_hidden(activated))

        logits = self.l_out(hidden)
        return F.log_softmax(logits, dim=-1)

In [59]:
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [60]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [61]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.917949   1.862536  



[array([1.86254])]

In [62]:
set_lrs(opt, 1e-4)

In [63]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.776788   1.784246  



[array([1.78425])]

### Test model

In [65]:
def get_next(inp):
    """Look up each character of `inp`, run the current model `m`, and return its top-scoring next character."""
    encoded = T(np.array([char_indices[ch] for ch in inp]))
    output = m(*VV(encoded))
    return chars[np.argmax(to_np(output))]

In [66]:
get_next('for thos')

' '

In [67]:
get_next('part of ')

't'

In [68]:
get_next('queens a')

'n'

In [69]:
get_next('fuck yo') # Grandpa Bob taught me a new word!!

'u'

In [70]:
get_next('jesus chris')

't'

In [71]:
get_next('Appl')

'e'

In [72]:
get_next('appl')

'e'

## RNN with pytorch

bookmark: 1:51 in video

In [73]:
class CharRnn(nn.Module):
    """Next-character model built on `nn.RNN` instead of a hand-written loop.

    Only the output of the last time step is used for the prediction.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Stack the character batches along a new leading axis, run the RNN, score the final step."""
        batch_size = cs[0].size(0)
        initial_hidden = V(torch.zeros(1, batch_size, n_hidden))
        embedded = self.e(torch.stack(cs))
        outp, _ = self.rnn(embedded, initial_hidden)

        final_step = outp[-1]
        return F.log_softmax(self.l_out(final_step), dim=-1)

In [74]:
m = CharRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [75]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [76]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [77]:
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [78]:
t = m(*V(xs)); t.size()

torch.Size([512, 84])

In [79]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.994121   1.95397   
    1      1.764066   1.754402                              
    2      1.644866   1.659595                              
    3      1.562051   1.597668                              



[array([1.59767])]

In [80]:
set_lrs(opt, 1e-4)

In [81]:
fit(m, md, 2, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.50475    1.557895  
    1      1.482968   1.551223                              



[array([1.55122])]

### Test model

In [82]:
def get_next(inp):
    """Return the single most likely next character for `inp` according to the current model `m`."""
    ids = [char_indices[ch] for ch in inp]
    log_probs = m(*VV(T(np.array(ids))))
    winner = np.argmax(to_np(log_probs))
    return chars[winner]

In [83]:
get_next('for thos')

'e'

In [84]:
def get_next_n(inp, n):
    """Extend `inp` by `n` predicted characters and return the full string.

    After each prediction the context slides forward by one: its first
    character is dropped and the predicted character is appended.
    """
    window, generated = inp, []
    for _ in range(n):
        nxt = get_next(window)
        generated.append(nxt)
        window = window[1:] + nxt
    return inp + ''.join(generated)

In [85]:
get_next_n('for thos', 40)

'for those the dog track and the dog track and th'

In [86]:
get_next_n('Charac', 40)

'Charached the dog the dog the dog the dog the '

In [87]:
get_next_n('time to', 40)

'time to the dog track and the dog track and the'

In [88]:
get_next_n('bear', 40)

'beard were would he was and the say dog said'

In [89]:
get_next_n('Robots', 40)

'Robots the dog the dog the dog the dog the dog'

## Multi-output model

### Setup

Let's take non-overlapping sets of characters this time

In [90]:
# Non-overlapping windows of `cs` character ids: start positions advance by `cs`.
c_in_dat = [idx[start:start + cs] for start in range(0, len(idx) - cs - 1, cs)]

Then create the exact same thing, offset by 1, as our labels

In [91]:
# Same windows shifted one character to the right: each label is the character following its input.
c_out_dat = [idx[start:start + cs] for start in range(1, len(idx) - cs, cs)]

In [92]:
xs = np.stack(c_in_dat)
xs.shape

(36719, 8)

In [93]:
ys = np.stack(c_out_dat)
ys.shape

(36719, 8)

In [94]:
xs[:cs,:cs] # notice how the rows no longer overlap: row 0 holds characters 0-7, row 1 holds characters 8-15, etc.

array([[37, 60, 57, 56,  2, 60, 70,  2],
       [65, 66, 71, 59, 60, 65, 58,  2],
       [64, 66, 69, 56,  2, 71, 59, 52],
       [65,  2, 52,  2, 55, 66, 58,  2],
       [69, 52, 54, 56, 11,  1, 38, 52],
       [55, 55, 66, 58,  2, 38, 54, 29],
       [56, 69, 64, 72, 71, 71,  1,  1],
       [40, 65, 54, 56,  2, 52, 69, 66]])

In [95]:
ys[:cs,:cs]

array([[60, 57, 56,  2, 60, 70,  2, 65],
       [66, 71, 59, 60, 65, 58,  2, 64],
       [66, 69, 56,  2, 71, 59, 52, 65],
       [ 2, 52,  2, 55, 66, 58,  2, 69],
       [52, 54, 56, 11,  1, 38, 52, 55],
       [55, 66, 58,  2, 38, 54, 29, 56],
       [69, 64, 72, 71, 71,  1,  1, 40],
       [65, 54, 56,  2, 52, 69, 66, 72]])

### Create and train model

In [96]:
val_idx = get_cv_idxs(len(xs)-cs-1)

In [97]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [98]:
class CharSeqRnn(nn.Module):
    """`nn.RNN` character model that emits a prediction at every time step, not just the last."""
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for every step of the stacked input sequence."""
        batch_size = cs[0].size(0)
        initial_hidden = V(torch.zeros(1, batch_size, n_hidden))
        embedded = self.e(torch.stack(cs))
        outp, _ = self.rnn(embedded, initial_hidden)
        logits = self.l_out(outp)
        return F.log_softmax(logits, dim=-1)

In [99]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [100]:
it = iter(md.trn_dl)
*xst,yt = next(it)

In [101]:
yt.size()

torch.Size([512, 8])

In [102]:
def nll_loss_seq(inp, targ):
    """Negative log-likelihood over every step of a sequence.

    `inp` must be 3-D; its last axis indexes the classes. `targ` has its first
    two axes swapped relative to `inp`, so it is transposed before both are
    flattened to one row per prediction.
    """
    _, _, n_classes = inp.size()
    flat_pred = inp.view(-1, n_classes)
    # transpose only changes strides, so make the result contiguous before view().
    flat_targ = targ.transpose(0, 1).contiguous().view(-1)
    return F.nll_loss(flat_pred, flat_targ)

In [103]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                            
    0      2.921394   2.569398  
    1      2.518146   2.31422                             
    2      2.309272   2.186012                            
    3      2.17652    2.101408                            



[array([2.10141])]

In [104]:
set_lrs(opt, 1e-4)

In [105]:
fit(m, md, 1, opt, nll_loss_seq) # md is the model data object (ColumnarModelData) which wraps up the training and validation sets

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                            
    0      2.068779   2.086818  



[array([2.08682])]

#### Questions

Pulling the triangle (output) into the loop, pulling the RNN in.
How to initialize the RNNs? Should we reset to zero each time?
Can we somehow keep the hidden state relevant call to call?

### Identity init!

In [106]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [107]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.cuda.FloatTensor of size 256x256 (GPU 0)]

In [108]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                            
    0      2.566655   2.24272   
    1      2.195947   2.013086                            
    2      2.016346   1.941978                            
    3      1.916489   1.881124                            



[array([1.88112])]

In [109]:
set_lrs(opt, 1e-3)

In [110]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                            
    0      1.786171   1.824411  
    1      1.77187    1.816175                            
    2      1.762572   1.809974                            
    3      1.75452    1.804519                            



[array([1.80452])]

## Stateful model

### Setup

In [148]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

# Fall back to the default data directory when PATH is unset or empty. Looking
# it up through globals() avoids the NameError that a bare `if not PATH` raises
# when this section is run on a fresh kernel without the setup cell.
PATH = globals().get('PATH') or 'data/maddog/'

# Sub-folders (relative to PATH) holding the training and validation text.
TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

%ls {PATH}

Life is nothing more than a dog race.txt  [0m[01;34mtrn[0m/  [01;34mval[0m/


In [149]:
%ls {PATH}trn

In [150]:
sys.getdefaultencoding()

'utf-8'

In [153]:
os.makedirs(TRN, exist_ok=True)
os.makedirs(VAL, exist_ok=True)

# Fraction of the corpus lines (counted from the top) that go to the training file.
train_perc = .8
# os.path.join copes with PATH ending in '/' without producing a double slash.
with open(os.path.join(PATH, 'Life is nothing more than a dog race.txt'), 'r', encoding="utf-8") as fp:
    lines = fp.readlines()
text_len = len(lines)

# Both output files are managed by the `with` statement, so they are flushed and
# closed even if a write fails part-way through.
with open(f'{TRN}maddog1.txt', 'w', encoding="utf-8") as part_train, \
     open(f'{VAL}maddog2.txt', 'w', encoding="utf-8") as part_val:
    for ix, l in enumerate(lines):
        # Lines in the first `train_perc` of the file are training data; the rest is validation.
        if ix/text_len < train_perc:
            part_train.write(l)
        else:
            part_val.write(l)

In [154]:
# Character-level field: lower-cases the text and tokenizes with `list`, which splits a string into single characters.
TEXT = data.Field(lower=True, tokenize=list)
# bs: batch size; bptt: presumably the number of characters per training sequence (TODO confirm);
# n_fac: embedding size; n_hidden: hidden-state size (read as a global by the model classes below).
bs=64; bptt=8; n_fac=42; n_hidden=256

# The validation folder is passed as the test set as well.
FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
# NOTE(review): min_freq=3 presumably drops characters seen fewer than 3 times from the vocabulary — confirm.
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(455, 54, 1, 233636)

### RNN

In [155]:
class CharSeqStatefulRnn(nn.Module):
    """`nn.RNN` character model that keeps its hidden state between calls.

    The hidden state is stored on the module as `self.h` and fed back in on the
    next forward pass; it is reset to zeros whenever the batch size changes.
    """
    def __init__(self, vocab_size, n_fac, bs):
        """vocab_size: number of tokens; n_fac: embedding width; bs: batch size for the initial hidden state."""
        # nn.Module requires the parent constructor to run before any attribute
        # assignment on the child; this also matches the other model classes here.
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Run the RNN over `cs` and return log-probabilities flattened to (-1, vocab_size)."""
        # assumes cs is laid out (seq_len, batch), so cs[0].size(0) is the batch size — TODO confirm
        bs = cs[0].size(0)
        # Stored state no longer matches the incoming batch size: start over from zeros.
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        # NOTE(review): repackage_var presumably detaches the state from the previous graph — confirm.
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros of shape (1, bs, n_hidden)."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [156]:
m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [157]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.948985   1.931623  
    1      1.728419   1.760322                              
    2      1.614846   1.678983                               
    3      1.544304   1.627518                               



[array([1.62752])]

In [158]:
set_lrs(opt, 1e-4)

fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.468805   1.579976  
    1      1.460512   1.574669                               
    2      1.461128   1.568446                               
    3      1.453147   1.565069                               



[array([1.56507])]

### RNN loop

In [120]:
# From the pytorch source

def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    """One step of a tanh RNN: tanh(input @ w_ih.T + b_ih + hidden @ w_hh.T + b_hh)."""
    from_input = F.linear(input, w_ih, b_ih)
    from_hidden = F.linear(hidden, w_hh, b_hh)
    return F.tanh(from_input + from_hidden)

In [121]:
class CharSeqStatefulRnn2(nn.Module):
    """Stateful character model that steps an `nn.RNNCell` through the sequence by hand.

    The hidden state is kept on the module as `self.h` with shape
    (bs, n_hidden), which is the 2-D layout `nn.RNNCell` documents for its
    hidden input. (A 3-D (1, bs, n_hidden) state only works where the cell
    happens to broadcast it and is rejected by PyTorch versions that validate
    the hidden shape.)
    """
    def __init__(self, vocab_size, n_fac, bs):
        """vocab_size: number of tokens; n_fac: embedding width; bs: batch size for the initial hidden state."""
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Step through `cs` one time step at a time; return log-probabilities flattened to (-1, vocab_size)."""
        # assumes cs is laid out (seq_len, batch), so cs[0].size(0) is the batch size — TODO confirm
        bs = cs[0].size(0)
        # The batch axis of the 2-D hidden state is axis 0.
        if self.h.size(0) != bs: self.init_hidden(bs)
        outp = []
        o = self.h
        for c in cs:
            o = self.rnn(self.e(c), o)
            outp.append(o)
        # Stack the per-step hidden states into (seq_len, bs, n_hidden) before the output layer.
        outp = self.l_out(torch.stack(outp))
        # NOTE(review): repackage_var presumably detaches the state from the previous graph — confirm.
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros of shape (bs, n_hidden)."""
        self.h = V(torch.zeros(bs, n_hidden))

In [122]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [123]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.886977   1.858232  
    1      1.715619   1.708369                               
    2      1.625817   1.64271                                
    3      1.579709   1.606165                               



[array([1.60616])]

### GRU

In [124]:
class CharSeqStatefulGRU(nn.Module):
    """Stateful character model using `nn.GRU`; the hidden state `self.h` is carried between calls."""
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Run the GRU over `cs` from the stored state; return log-probabilities flattened to (-1, vocab_size)."""
        batch_size = cs[0].size(0)
        # Re-create the stored state when its batch dimension does not match the input.
        if self.h.size(1) != batch_size:
            self.init_hidden(batch_size)
        outp, new_h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(new_h)
        log_probs = F.log_softmax(self.l_out(outp), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros of shape (1, bs, n_hidden)."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [125]:
# From the pytorch source code - for reference

def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    """One GRU step.

    The input and hidden projections are each split into three equal chunks
    along axis 1, used for the reset gate, the input gate and the new
    candidate state respectively.
    """
    in_r, in_z, in_n = F.linear(input, w_ih, b_ih).chunk(3, 1)
    hid_r, hid_z, hid_n = F.linear(hidden, w_hh, b_hh).chunk(3, 1)

    reset = F.sigmoid(in_r + hid_r)
    gate = F.sigmoid(in_z + hid_z)
    # The reset gate scales how much of the hidden projection enters the candidate.
    candidate = F.tanh(in_n + reset * hid_n)
    # Algebraically gate*hidden + (1-gate)*candidate.
    return candidate + gate * (hidden - candidate)

In [126]:
m = CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

In [127]:
fit(m, md, 6, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.760597   1.732481  
    1      1.577246   1.583987                               
    2      1.497724   1.521815                              
    3      1.439811   1.492859                               
    4      1.401781   1.471848                               
    5      1.381994   1.466927                               



[array([1.46693])]

In [128]:
set_lrs(opt, 1e-4)

In [129]:
fit(m, md, 3, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.283282   1.428976  
    1      1.285143   1.425369                              
    2      1.290324   1.422813                              



[array([1.42281])]

### Putting it all together: LSTM

In [159]:
from fastai import sgdr

n_hidden=512

In [160]:
class CharSeqStatefulLSTM(nn.Module):
    """Stateful character model using an `nl`-layer `nn.LSTM` with dropout 0.5.

    The stored state `self.h` is a pair of tensors, each of shape
    (nl, bs, n_hidden), carried from one forward call to the next.
    """
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size = vocab_size
        self.nl = nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Run the LSTM over `cs` from the stored state; return log-probabilities flattened to (-1, vocab_size)."""
        batch_size = cs[0].size(0)
        # Both state tensors share a shape, so checking the first one is enough.
        if self.h[0].size(1) != batch_size:
            self.init_hidden(batch_size)
        outp, new_state = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(new_state)
        log_probs = F.log_softmax(self.l_out(outp), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset both stored state tensors to zeros of shape (nl, bs, n_hidden)."""
        first = V(torch.zeros(self.nl, bs, n_hidden))
        second = V(torch.zeros(self.nl, bs, n_hidden))
        self.h = (first, second)

In [161]:
m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)

In [162]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [163]:
fit(m, md, 2, lo.opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.817387   1.75655   
    1      1.669245   1.649684                              



[array([1.64968])]

In [164]:
# Checkpoint callback: calls save_model on `m` with the path {PATH}models/cyc_<cycle>.
# NOTE(review): presumably invoked by CosAnneal at the end of each annealing cycle — confirm.
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
# NOTE(review): assumes cycle_mult=2 doubles each cycle's length, so cycles of 1+2+4+8 epochs
# fill the 2**4-1 = 15 epochs requested below — confirm.
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.494035   1.497901  
    1      1.52075    1.513176                              
    2      1.390988   1.431685                              
    3      1.538641   1.527066                              
    4      1.454338   1.473495                              
    5      1.363254   1.406748                              
    6      1.290435   1.368497                              
    7      1.523561   1.515765                              
    8      1.458539   1.478354                              
    9      1.427282   1.453635                              
    10     1.378828   1.42589                               
    11     1.321095   1.391252                              
    12     1.270087   1.361658                              
    13     1.223738   1.337873                              
    14     1.183369   1.324168                              



[array([1.32417])]

In [175]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=63), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.1679     1.322131  
    1      1.171417   1.32097                               
    2      1.163224   1.319656                              
    3      1.170152   1.319787                              
    4      1.157343   1.316973                              
    5      1.156655   1.316405                              
    6      1.143659   1.315699                              
    7      1.150516   1.31611                               
    8      1.143445   1.314734                              
    9      1.128443   1.313437                              
    10     1.124517   1.311476                              
    11     1.111789   1.311799                              
    12     1.10794    1.311456                              
    13     1.107094   1.310925                              
    14     1.104284   1.310883                              
    15     1.113688   1.312864                      

[array([1.4183])]

### Test

In [176]:
def get_next(inp):
    """Sample the next character for `inp` from the current model `m`.

    Unlike an argmax, the character is drawn at random in proportion to the
    predicted probabilities of the last output row, so repeated calls can differ.
    """
    token_ids = TEXT.numericalize(inp)
    log_probs = m(VV(token_ids.transpose(0,1)))
    # exp() turns the last row of log-probabilities back into probabilities.
    last_probs = log_probs[-1].exp()
    sampled = torch.multinomial(last_probs, 1)
    return TEXT.vocab.itos[to_np(sampled)[0]]

In [177]:
get_next('for thos')

'e'

In [178]:
def get_next_n(inp, n):
    """Return `inp` followed by `n` characters generated one at a time with `get_next`.

    The context passed to `get_next` keeps a constant length: each step drops
    its first character and appends the newly generated one.
    """
    pieces = [inp]
    context = inp
    for _step in range(n):
        ch = get_next(context)
        pieces.append(ch)
        context = context[1:] + ch
    return ''.join(pieces)

In [179]:
print(get_next_n('for thos', 400))

for those chow chows, looked at lait; the missing down a big blaws-out bencare; he shoot, bike here stuff. dog sled.sally .ustra miling.""hole key is faster the times. of i could sure. i could you-break from time." all floor. after school them garm.""altho," i asked:"filthy for a couple, mr. mcnulty pant; "gather, chapped a woman to the droof eyelady, and three, on a ninth gradub. give you tomfered then t


In [180]:
print(get_next_n('Apple Macintosh and Microsoft both', 400))

Apple Macintosh and Microsoft both of mile "it's the near, the chow chows, the sled fellow was a little dog off a lawe, took a lunker with a rush, or love met," fard said.sally nodded. no bleasing, i thought. all and called.fard hearked for the obsirity and called, opened the denk of squirt on over the south send the time," he said. he looked down the hall, and couched again. show my barn swim," i said.and i went to the one black 


In [181]:
print(get_next_n('Hasta la vista, baby. ', 400))

Hasta la vista, baby. the greyhound came at his air, stepped, out the atlantic ocean. memzery weeks of our greyhound; jones sat in, but not hering a dog straight, and once again said, "what's that?” his worthington mailbox share, and how me, will never dies, fard is.""billy?"""the night of time to time.""what'd he jumped down the turn his chronic belling montana black, flyas grunted, ticket them up, and i professed to 


In [182]:
print(get_next_n('Hasta la vista, baby. Judgement day has come. ', 400))

Hasta la vista, baby. Judgement day has come. "hey nothing. i took a run rapid city to the first place, was arose straight in moving the track, the guy who should i deven hope."aw you boots like a nice of much in cracks about it."what you like. a right ploges of thing, turn your ninth graders?"  did not seem to know the ruthless took one line at yourself constration, the big greyhound stopping."i'll have to consideration." maybe anyway. pocke


In [183]:
print(get_next_n("Hasta la vista, baby. ", 400))

Hasta la vista, baby. "the big greyhound laid bunghole for all the greyhound swim?" and i read out like a nice break.""you know, like when you did that with like smelly dog out lumping television. that's no way?""sheldon shraw right at geese. i thought so," maddog said."i'm not sucky. fard worthington was catch, principate my dahatch i would be forever. third period prep performed, and the head out for sally are in the


In [184]:
print(get_next_n("The soul and its passion are surely ", 400))

The soul and its passion are surely image any malls. i was slipped my eyes, probably would be macdourmouth," i said, reaching it for the antlers."maddog stepped in. "my buzing. one farty growl dump.""maddog," mr. worthington said, waiting my school. "school the class was looking mesiantly over the lead-out jump out the big greyhound laughing. your dogs in the shoulder and down three, looking laughing over by the move, something migh


In [185]:
print(get_next_n("Maddog said to Jones, ", 400))

Maddog said to Jones, swent those ipposent booking and pinfed jones in my phone, and i looked in. as saxty more.""what's that?"' you?""i'm sorry. we're remaining inborn.""well," sally said, fooling alongside which mungant, then went around the security guard apicalure, falling, picked, rushing, cross, my big enother called and put the truck, leaned his paper. jones ran off. "i don't prote that lights in great falls. pu


In [203]:
print(get_next_n("Jones was a good d", 400))

Jones was a good dog. you ran a dog track.""i'll bet," he said."what's he are in all the same black swarm, mr. mcnulty, is a dog sled to my dog.""miss neopolitan was something else. i have to be in a hour stepped and jones. i muttered. "the dog looked him around over on in to the plop, and the way to the experiecenshas, maddog could not be another turn, maddog, goes with ulm, montana.jones can't see much," i said.a


In [204]:
print(get_next_n("Jones was a good d", 400))

Jones was a good dog,” i said, "miss neopolitan was a morning erk after, when i'll bet the direction. jones had see the racers over at the river, then garcaid absolutely nothing of maddog mcdermutt."what's that?”first period pups for bark. that's a little mile, reaching behorry. sally had presumed to call me," harry said. "old stop," i said."i'm going to leave it was that bad timing perform, and glacier student, "l


In [205]:
print(get_next_n("Jones was a good d", 400))

Jones was a good day. this is maddog that wolf pupp to hear, mr. mcnulty, he wasn't suspect, and hesw legs, in a walk about this dog trainers rushed up on the beaken, stick on squishing, coming, , mrs. worthington, maddog.merry christmas. mrs. mcnulty, i am you with some name will do?" who i dressed up to my throat, but i absolutely sally, dag story, as his other i thought anywhere, mcdermouth, maddog grabbed his t


In [206]:
print(get_next_n("Jones was a ", 400))

Jones was a monumented after "his track you," he shrugged, helping a great cross, wast, after john's circle. i had a second. i pulled him; he knew that bush christmas very dusy brindles treative."he looked at a big greyhound, and it was a while-finally, we walked over.after nothing, then stared up their feet to he was sent relaxed. vening all that fawn-stuck on the road, then in our bediting. she'll gavelor f


In [207]:
print(get_next_n("Jones was a ", 400))

Jones was a right, florida ordined look, not mostly held there, then said this booking away?""sheldon said i would be his truck in before. we standed around them over her. i slipped his bat wet to sever have green. stending down our eyes weren't it, on the way then the guy proof of tickled. you tremet to miss neopolite went at the letter. this is a stirin' chear script deft, and a basid. i'm going to let me c


In [208]:
print(get_next_n("Jones was a ", 400))

Jones was a greyhounds, a cross, after its inseminated your wife. i'll tell you that," he said."the greyhound, jones go over her. i learned about some distember, fard?" i patting it on, fard's field mailbox piet."what's your dog, maddog," fleas finnegal, but i tell he wondered which finally. objecent me. fard said i had a litter of discations--------------------------------i could want on the truck with his p
