In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [2]:
PATH='data/nietzsche/'

In [3]:
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
text = open(f'{PATH}nietzsche.txt', encoding='utf-8').read()
print('corpus length:', len(text))

corpus length: 600893


In [4]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 85


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [6]:
chars.insert(0, "\0")

''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxy'

Map from chars to indices and back again

In [7]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

*idx* will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)

In [8]:
idx = [char_indices[c] for c in text]

idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [9]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Three char model

### Create inputs

Create a list of every 3rd character (the stride is `cs=3`), starting at the 0th, 1st, 2nd, then 3rd characters. The first three lists are the inputs and the fourth is the label.

In [10]:
cs = 3
# One list per offset 0..3: walk the text in strides of `cs`, stopping `cs`
# characters before the end so that the label at offset 3 always exists.
c1_dat, c2_dat, c3_dat, c4_dat = (
    [idx[start + offset] for start in range(0, len(idx) - cs, cs)]
    for offset in range(4)
)

Our inputs

In [11]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

Our output

In [12]:
y = np.stack(c4_dat)

In [13]:
len(c4_dat), type(c4_dat)

(200297, list)

In [14]:
len(y), type(c4_dat)

(200297, list)

In [15]:
np.all(y == c4_dat) # True if every element is true

True

The first 4 inputs and outputs

In [16]:
x1[:4], x2[:4], x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [17]:
y[:4]

array([30, 29,  1, 40])

In [18]:
x1.shape, y.shape

((200297,), (200297,))

### Create and train model

Pick a size for our hidden state

In [19]:
n_hidden = 256

The number of latent factors to create (i.e. the size of the embedding matrix)

In [20]:
n_fac = 42

In [21]:
class Char3Model(nn.Module):
    """Predict the next character from three input characters with a hand-unrolled RNN.

    The same three layers (input->hidden, hidden->hidden, hidden->output) are
    reused for every character, so the weights are shared across time steps.
    The hidden size is read from the notebook-level `n_hidden`.

    Args:
        vocab_size: number of rows in the embedding and number of output classes.
        n_fac: number of latent factors (embedding width) per character.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        # Embedding lookup: vocab_size rows (85 in this notebook), n_fac factors each
        self.e = nn.Embedding(vocab_size, n_fac)

        # The 'green arrow' from our diagram - the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)

        # The 'orange arrow' from our diagram - the layer operation from hidden to hidden
        # This is a square weight matrix (n_hidden x n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)

        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, c1, c2, c3):
        """Run the three characters through the unrolled network.

        Each character goes through embedding -> linear -> relu; the results are
        folded one at a time into the hidden state via the hidden->hidden layer
        and tanh.

        Returns:
            Log-probabilities over the vocabulary (log-softmax over the last axis).
        """
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))

        # Start from an all-zero hidden state with the same shape as one input activation.
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))

        # Explicit dim: the implicit-dim form is deprecated, and every other
        # model in this notebook normalises over the last (vocabulary) axis.
        return F.log_softmax(self.l_out(h), dim=-1)

In [22]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [23]:
m = Char3Model(vocab_size, n_fac).cuda()

In [24]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [25]:
opt = optim.Adam(m.parameters(), 1e-2)

In [26]:
m.parameters()

<generator object Module.parameters at 0x7f6a27f15570>

In [27]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.090757   0.664999  



[array([0.665])]

In [28]:
set_lrs(opt, 0.001)

In [29]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.807713   0.433815  



[array([0.43381])]

### Test model

In [30]:
def get_next(inp):
    """Return the character the current model `m` scores highest after `inp`.

    Each character of `inp` is mapped to its index via `char_indices` and
    passed to the model as a separate positional argument.
    """
    char_ids = np.array([char_indices[ch] for ch in inp])
    log_probs = m(*VV(T(char_ids)))
    # argmax over the flattened model output selects the winning vocabulary index
    best = np.argmax(to_np(log_probs))
    return chars[best]

In [31]:
get_next('y. ')

'T'

In [32]:
get_next('ppl')

'e'

In [33]:
get_next(' th')

'e'

In [34]:
get_next('and')

' '

In [35]:
get_next('fuc') # Fail

't'

In [36]:
get_next('fai')

'n'

In [37]:
get_next('bee')

'n'

## Our first RNN!

### Create inputs

This is the size of our unrolled RNN.

In [38]:
cs=8

For every position in the text, take the 8 consecutive characters starting at that position, so successive rows overlap and are shifted by one character. These will be the 8 inputs to our model.

In [39]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs)]

Then create a list of the next character in each of these series. This will be the labels for our model.

In [40]:
c_out_dat = [idx[j+cs] for j in range(len(idx)-cs)]

In [41]:
xs = np.stack(c_in_dat, axis=0)

In [42]:
xs.shape

(600885, 8)

In [43]:
y = np.stack(c_out_dat)

So each row below is one series of 8 consecutive characters from the text (each row is the previous one shifted by one character).

In [44]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

...and this is the next character after each sequence.

In [45]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

### Create and train model

In [46]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [47]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [48]:
class CharLoopModel(nn.Module):
    """The unrolled character model rewritten as a loop - i.e. a basic RNN.

    Accepts any number of character-index inputs and folds them one by one into
    a hidden state of width `n_hidden` (notebook-level global).
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)        # character -> n_fac factors
        self.l_in = nn.Linear(n_fac, n_hidden)          # input -> hidden
        self.l_hidden = nn.Linear(n_hidden, n_hidden)   # hidden -> hidden (square)
        self.l_out = nn.Linear(n_hidden, vocab_size)    # hidden -> output

    def forward(self, *cs):
        """Return log-probabilities for the character following the inputs `cs`."""
        batch_size = cs[0].size(0)
        hidden = V(torch.zeros(batch_size, n_hidden).cuda())
        # The same layers are applied at every step; the loop is what makes this an RNN.
        for char_ids in cs:
            step_in = F.relu(self.l_in(self.e(char_ids)))
            # tanh is the customary activation for the hidden-to-hidden transition
            hidden = F.tanh(self.l_hidden(hidden + step_in))

        return F.log_softmax(self.l_out(hidden), dim=-1)

In [49]:
m = CharLoopModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [50]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.959949   1.962806  



[array([1.96281])]

In [51]:
set_lrs(opt, 0.001)

In [52]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.678344   1.682052  



[array([1.68205])]

In [53]:
class CharLoopConcatModel(nn.Module):
    """Loop RNN that concatenates the hidden state with the input instead of adding them.

    Because of the concatenation, the input layer takes `n_fac + n_hidden`
    features. `n_hidden` is the notebook-level global.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        # Input layer sees [hidden state, embedding] side by side
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the inputs `cs`."""
        batch_size = cs[0].size(0)
        hidden = V(torch.zeros(batch_size, n_hidden).cuda())
        for char_ids in cs:
            # Join along the feature axis (dim 1): hidden state first, then embedding
            joined = torch.cat((hidden, self.e(char_ids)), 1)
            hidden = F.tanh(self.l_hidden(F.relu(self.l_in(joined))))

        return F.log_softmax(self.l_out(hidden), dim=-1)

In [54]:
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [55]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [56]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.833687   1.8232    



[array([1.8232])]

In [57]:
set_lrs(opt, 1e-4)

In [58]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.741866   1.737299  



[array([1.7373])]

### Test model

In [59]:
def get_next(inp):
    """Predict the single most likely character to follow the string `inp`.

    Uses whichever model is currently bound to the notebook-level name `m`.
    """
    encoded = [char_indices[ch] for ch in inp]
    scores = m(*VV(T(np.array(encoded))))
    # Index of the highest-scoring vocabulary entry
    winner = np.argmax(to_np(scores))
    return chars[winner]

In [60]:
get_next('for thos')

'e'

In [61]:
get_next('part of ')

't'

In [62]:
get_next('queens a')

'n'

In [63]:
get_next('fuck yo') # yay its training is complete

'u'

In [64]:
get_next('jesus chris')

't'

In [65]:
get_next('Appl') # Fail

'e'

In [66]:
get_next('appl')

'e'

## RNN with pytorch

bookmark: 1:51 in video

In [67]:
class CharRnn(nn.Module):
    """The same character model, with the hand-written loop replaced by `nn.RNN`.

    Only the output of the final time step is fed to the output layer, so the
    model predicts one character per input sequence.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)              # replaces l_in / l_hidden and the loop
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the inputs `cs`."""
        batch_size = cs[0].size(0)
        # Initial hidden state: one layer, one zero row per batch item
        h0 = V(torch.zeros(1, batch_size, n_hidden))
        # Stack the per-step inputs along a new leading (time) axis before embedding
        embedded = self.e(torch.stack(cs))
        outputs, _ = self.rnn(embedded, h0)

        # outputs[-1] is the RNN output at the last time step
        return F.log_softmax(self.l_out(outputs[-1]), dim=-1)

In [68]:
m = CharRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [69]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [70]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [71]:
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [72]:
t = m(*V(xs)); t.size()

torch.Size([512, 85])

In [73]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.873178   1.841134  
    1      1.679667   1.668369                              
    2      1.588351   1.592524                              
    3      1.535811   1.545502                              



[array([1.5455])]

In [74]:
set_lrs(opt, 1e-4)

In [75]:
fit(m, md, 2, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.46709    1.509235  
    1      1.463002   1.504566                              



[array([1.50457])]

### Test model

In [76]:
def get_next(inp):
    """Greedy one-step prediction: the top-scoring next character after `inp`.

    Uses the notebook-level model `m` and the `char_indices` / `chars` lookups.
    """
    ids = T(np.array([char_indices[symbol] for symbol in inp]))
    prediction = to_np(m(*VV(ids)))
    return chars[np.argmax(prediction)]

In [77]:
get_next('for thos')

'e'

In [78]:
def get_next_n(inp, n):
    """Extend `inp` by `n` predicted characters and return the full string.

    The prediction context is a sliding window: after each step the oldest
    character is dropped and the new prediction is appended, so the window
    keeps its length when `get_next` returns a single character.
    """
    pieces = [inp]
    window = inp
    for _ in range(n):
        predicted = get_next(window)
        pieces.append(predicted)
        window = window[1:] + predicted
    return ''.join(pieces)

In [79]:
get_next_n('for thos', 40)

'for those the same the same the same the same th'

In [80]:
get_next_n('Charac', 40)

'Charactions of the said the said the said the '

In [81]:
get_next_n('time to', 40)

'time to the same the same the same the same the'

In [82]:
get_next_n('bear', 40)

'bear of the who has and the who has and the '

In [83]:
get_next_n('Robots', 40)

'Robots of the said the said the said the said '

## Multi-output model

### Setup

Let's take non-overlapping sets of characters this time

In [84]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]

Then create the exact same thing, offset by 1, as our labels

In [85]:
c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [86]:
xs = np.stack(c_in_dat)
xs.shape

(75111, 8)

In [87]:
ys = np.stack(c_out_dat)
ys.shape

(75111, 8)

In [88]:
xs[:cs,:cs] # notice how these are no longer overlapping. first 8, next 16, etc.

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [ 1,  1, 43, 45, 40, 40, 39, 43],
       [33, 38, 31,  2, 73, 61, 54, 73],
       [ 2, 44, 71, 74, 73, 61,  2, 62],
       [72,  2, 54,  2, 76, 68, 66, 54],
       [67,  9,  9, 76, 61, 54, 73,  2],
       [73, 61, 58, 67, 24,  2, 33, 72],
       [ 2, 73, 61, 58, 71, 58,  2, 67]])

In [89]:
ys[:cs,:cs]

array([[42, 29, 30, 25, 27, 29,  1,  1],
       [ 1, 43, 45, 40, 40, 39, 43, 33],
       [38, 31,  2, 73, 61, 54, 73,  2],
       [44, 71, 74, 73, 61,  2, 62, 72],
       [ 2, 54,  2, 76, 68, 66, 54, 67],
       [ 9,  9, 76, 61, 54, 73,  2, 73],
       [61, 58, 67, 24,  2, 33, 72,  2],
       [73, 61, 58, 71, 58,  2, 67, 68]])

### Create and train model

In [90]:
val_idx = get_cv_idxs(len(xs)-cs-1)

In [91]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [92]:
class CharSeqRnn(nn.Module):
    """Multi-output RNN: emits a prediction at every time step, not just the last.

    Hidden size comes from the notebook-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities over the vocabulary for every step of `cs`."""
        batch_size = cs[0].size(0)
        # The hidden state is reset to zeros on every call
        h0 = V(torch.zeros(1, batch_size, n_hidden))
        embedded = self.e(torch.stack(cs))
        outputs, _ = self.rnn(embedded, h0)
        # The output layer is applied to all time steps at once
        return F.log_softmax(self.l_out(outputs), dim=-1)

In [93]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [94]:
it = iter(md.trn_dl)
*xst,yt = next(it)

In [95]:
yt.size()

torch.Size([512, 8])

In [96]:
def nll_loss_seq(inp, targ):
    """Negative log-likelihood loss for per-time-step sequence predictions.

    Args:
        inp: 3-D tensor of log-probabilities; the last axis indexes classes.
        targ: 2-D tensor of class indices with its first two axes in the
            opposite order to `inp`.

    Returns:
        The mean `F.nll_loss` over all positions.
    """
    _, _, n_classes = inp.size()
    # Swap the target axes so they line up with `inp`, then flatten.
    # transpose only changes strides, hence contiguous() before view().
    flat_targ = targ.transpose(0, 1).contiguous().view(-1)
    flat_inp = inp.view(-1, n_classes)
    return F.nll_loss(flat_inp, flat_targ)

In [97]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.617324   2.419185  
    1      2.297417   2.200689                              
    2      2.141111   2.089913                              
    3      2.048484   2.015776                              



[array([2.01578])]

In [98]:
set_lrs(opt, 1e-4)

In [99]:
fit(m, md, 1, opt, nll_loss_seq) # md is the model object which wraps up the training set, etc.

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.994798   2.000118  



[array([2.00012])]

#### Questions

Pulling the triangle (output) into the loop, pulling the RNN in.
How to initialize the RNNs? Should we reset to zero each time?
Can we somehow keep the hidden state relevant call to call?

### Identity init!

In [100]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [101]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.cuda.FloatTensor of size 256x256 (GPU 0)]

In [102]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.362368   2.208684  
    1      2.099917   2.029168                              
    2      1.992997   1.974731                              
    3      1.94451    1.942271                              



[array([1.94227])]

In [103]:
set_lrs(opt, 1e-3)

In [104]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.856283   1.86987   
    1      1.844921   1.86319                               
    2      1.840972   1.859212                              
    3      1.832497   1.853328                              



[array([1.85333])]

## Stateful model

### Setup

In [105]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

PATH='data/nietzsche/'

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

%ls {PATH}

nietzsche.txt


In [106]:
%ls {PATH}trn

ls: cannot access 'data/nietzsche/trn': No such file or directory


In [109]:
sys.getdefaultencoding()

'utf-8'

In [112]:
# Split the corpus by line into a training file (the first 80% of lines) and a
# validation file (the remainder), one directory each.
os.makedirs(TRN, exist_ok=True)
os.makedirs(VAL, exist_ok=True)

train_perc = .8
# PATH already ends with '/', so no extra separator is added here.
with open(f'{PATH}nietzsche.txt', 'r', encoding="utf-8") as fp:
    lines = fp.readlines()
text_len = len(lines)

# Context managers guarantee both output files are flushed and closed even if
# a write fails part-way through.
with open(f'{TRN}nietzsche1.txt', 'w', encoding="utf-8") as part_train, \
     open(f'{VAL}nietzsche2.txt', 'w', encoding="utf-8") as part_val:
    for ix,l in enumerate(lines):
        if ix/text_len<train_perc:
            part_train.write(l)
        else:
            part_val.write(l)

In [114]:
TEXT = data.Field(lower=True, tokenize=list)
bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(942, 55, 1, 482972)

### RNN

In [115]:
class CharSeqStatefulRnn(nn.Module):
    """Multi-output RNN that keeps its hidden state between calls.

    Unlike the earlier models, the hidden state is stored on the module and
    carried from one forward call to the next instead of being reset to zeros.

    Args:
        vocab_size: number of embedding rows and output classes.
        n_fac: embedding width.
        bs: batch size used to allocate the initial hidden state.
    """
    def __init__(self, vocab_size, n_fac, bs):
        # nn.Module must be initialised before any attribute is assigned,
        # matching the other stateful models in this notebook.
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to 2-D, one row per position."""
        bs = cs[0].size(0)
        # Re-allocate the state if the incoming batch size differs from the
        # stored one.
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        # Keep the values but (presumably) drop the autograd history so that
        # backprop does not reach into previous batches - TODO confirm against
        # fastai's repackage_var.
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros for batch size `bs`."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [116]:
m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [117]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.8817     1.85319   
    1      1.708547   1.700996                               
    2      1.626479   1.652269                               
    3      1.570658   1.595023                               



[array([1.59502])]

In [118]:
set_lrs(opt, 1e-4)

fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.496068   1.55741   
    1      1.495764   1.551006                               
    2      1.491435   1.547288                               
    3      1.485434   1.543568                               



[array([1.54357])]

### RNN loop

In [119]:
# From the pytorch source

def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    """One step of a tanh RNN, shown for reference.

    Computes tanh(input @ w_ih.T + b_ih + hidden @ w_hh.T + b_hh).

    Args:
        input: activations for the current time step.
        hidden: hidden state from the previous time step.
        w_ih, b_ih: weight and bias of the input-to-hidden linear map.
        w_hh, b_hh: weight and bias of the hidden-to-hidden linear map.

    Returns:
        The new hidden state.
    """
    # torch.tanh replaces the deprecated F.tanh; the result is identical.
    return torch.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))

In [120]:
class CharSeqStatefulRnn2(nn.Module):
    """Stateful multi-output RNN with the time loop written out using `nn.RNNCell`.

    Args:
        vocab_size: number of embedding rows and output classes.
        n_fac: embedding width.
        bs: batch size used to allocate the initial hidden state.
    """
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to 2-D, one row per position."""
        bs = cs[0].size(0)
        # The cell's hidden state is 2-D (batch, hidden), so batch is axis 0.
        if self.h.size(0) != bs: self.init_hidden(bs)
        outp = []
        o = self.h
        # Step through the sequence one time step at a time, collecting each
        # hidden state as that step's output.
        for c in cs:
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        # Carry the final state into the next call; repackage_var presumably
        # drops the autograd history - TODO confirm against fastai.
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros for batch size `bs`.

        nn.RNNCell takes a (batch, hidden) state, without the leading
        num_layers axis that nn.RNN uses.
        """
        self.h = V(torch.zeros(bs, n_hidden))

In [121]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [122]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.88604    1.855034  
    1      1.720621   1.707836                               
    2      1.624836   1.637873                               
    3      1.576277   1.596785                               



[array([1.59678])]

### GRU

In [123]:
class CharSeqStatefulGRU(nn.Module):
    """Stateful multi-output character model built on `nn.GRU`.

    The hidden state lives on the module and is carried across forward calls.
    Hidden size comes from the notebook-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to 2-D, one row per position."""
        batch_size = cs[0].size(0)
        # Start from a fresh zero state whenever the batch size changes
        if self.h.size(1) != batch_size:
            self.init_hidden(batch_size)
        outputs, new_state = self.rnn(self.e(cs), self.h)
        # Store the new state for the next call
        self.h = repackage_var(new_state)
        log_probs = F.log_softmax(self.l_out(outputs), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros for batch size `bs`."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [124]:
# From the pytorch source code - for reference

def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    """One step of a GRU, shown for reference.

    Args:
        input: activations for the current time step, 2-D (chunked along axis 1).
        hidden: hidden state from the previous time step.
        w_ih, b_ih: stacked input-to-hidden weights and biases for the three gates.
        w_hh, b_hh: stacked hidden-to-hidden weights and biases for the three gates.

    Returns:
        The new hidden state.
    """
    # One linear map each for input and hidden, then split along axis 1 into
    # the reset, input (update) and new-candidate parts.
    gi = F.linear(input, w_ih, b_ih)
    gh = F.linear(hidden, w_hh, b_hh)
    i_r, i_i, i_n = gi.chunk(3, 1)
    h_r, h_i, h_n = gh.chunk(3, 1)

    # torch.sigmoid / torch.tanh replace the deprecated F.sigmoid / F.tanh.
    resetgate = torch.sigmoid(i_r + h_r)
    inputgate = torch.sigmoid(i_i + h_i)
    # The reset gate scales how much of the previous state feeds the candidate.
    newgate = torch.tanh(i_n + resetgate * h_n)
    # Equivalent to (1 - inputgate) * newgate + inputgate * hidden
    return newgate + inputgate * (hidden - newgate)

In [125]:
m = CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

In [126]:
fit(m, md, 6, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.770099   1.750296  
    1      1.590002   1.595169                               
    2      1.497607   1.525161                               
    3      1.452196   1.500922                               
    4      1.404516   1.473532                               
    5      1.379007   1.466807                               



[array([1.46681])]

In [127]:
set_lrs(opt, 1e-4)

In [128]:
fit(m, md, 3, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.293917   1.432143  
    1      1.294985   1.427629                               
    2      1.298537   1.426238                               



[array([1.42624])]

### Putting it all together: LSTM

In [129]:
from fastai import sgdr

n_hidden=512

In [130]:
class CharSeqStatefulLSTM(nn.Module):
    """Stateful multi-output character model built on a multi-layer `nn.LSTM`.

    Args:
        vocab_size: number of embedding rows and output classes.
        n_fac: embedding width.
        bs: batch size used to allocate the initial state.
        nl: number of stacked LSTM layers.
    """
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size = vocab_size
        self.nl = nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities flattened to 2-D, one row per position."""
        batch_size = cs[0].size(0)
        # self.h is a pair of tensors; the batch-size check uses the first one
        if self.h[0].size(1) != batch_size:
            self.init_hidden(batch_size)
        outputs, new_state = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(new_state)
        log_probs = F.log_softmax(self.l_out(outputs), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored state to a pair of zero tensors, each (nl, bs, n_hidden)."""
        self.h = tuple(V(torch.zeros(self.nl, bs, n_hidden)) for _ in range(2))

In [131]:
m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)

In [132]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [133]:
fit(m, md, 2, lo.opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.778882   1.69891   
    1      1.674366   1.609515                              



[array([1.60951])]

In [134]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.510602   1.457199  
    1      1.562157   1.494797                              
    2      1.427333   1.399822                              
    3      1.589587   1.520603                              
    4      1.509469   1.463141                              
    5      1.411318   1.394489                              
    6      1.338177   1.35761                               
    7      1.567856   1.506363                              
    8      1.53584    1.482314                              
    9      1.493512   1.466788                              
    10     1.455223   1.424548                              
    11     1.407372   1.398012                              
    12     1.354024   1.361388                              
    13     1.29977    1.337605                              
    14     1.262123   1.324647                              



[array([1.32465])]

In [135]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=63), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.255991   1.324077  
    1      1.254069   1.321971                              
    2      1.254225   1.321996                              
    3      1.248691   1.320042                              
    4      1.249265   1.319571                              
    5      1.245013   1.318943                              
    6      1.241423   1.319451                              
    7      1.242276   1.318703                              
    8      1.2371     1.317134                              
    9      1.232466   1.316839                              
    10     1.232932   1.316576                              
    11     1.221255   1.315673                              
    12     1.22471    1.316267                              
    13     1.215549   1.316357                              
    14     1.214304   1.316589                              
    15     1.223393   1.316474                      

[array([1.35269])]

### Test

In [136]:
def get_next(inp):
    """Sample the next token after `inp` from the model's predicted distribution.

    Unlike a greedy argmax, this draws one index at random, weighted by the
    probabilities the model assigns at the last position.
    """
    token_ids = TEXT.numericalize(inp)
    log_probs = m(VV(token_ids.transpose(0,1)))
    # The model returns log-probabilities; exp() turns the last row back into
    # probabilities for sampling.
    last_step_probs = log_probs[-1].exp()
    sampled = torch.multinomial(last_step_probs, 1)
    return TEXT.vocab.itos[to_np(sampled)[0]]

In [137]:
get_next('for thos')

'e'

In [138]:
def get_next_n(inp, n):
    """Generate `n` further tokens after `inp` and return seed plus generated text.

    Each step feeds the current context to `get_next`, appends the result to
    the output, and slides the context forward by dropping its first character.
    """
    output, context = inp, inp
    remaining = n
    while remaining > 0:
        token = get_next(context)
        output = output + token
        context = context[1:] + token
        remaining -= 1
    return output

In [139]:
print(get_next_n('for thos', 400))

for thosebelief in the back comes upon a music.--this philosophy-in!--destiny and appear to itself, which first and barbarian, boundlessly critics of the god, as you,that is to life: it made his "world andbibolding, have, encoucher reasour, against immediately it, who sees a constants withall the domain of the knifes, from morals, then irroam, and, que0lor,_ andresprizing and obedience), that isly thought


In [141]:
print(get_next_n('Apple Macintosh and Microsoft both', 400))

Apple Macintosh and Microsoft both--is one of the world has been must: directions,where everything of the blood, and degreeanize of the struggle against the sense or lacking-day. perhaps romance in some virtues. the intention of truth is not possible that because it personnish upon the towards all those has always ween overcomethe, and they likewise--but if not not exercise, inclination and society pretents itself to trive him to 


In [144]:
print(get_next_n('Hasta la vista, baby. ', 400))

Hasta la vista, baby. minded, does notromand so6thung)--that is tohealthiness of the explanation? but whetherpoets a misunderstanding can be bound explain us.. love once one desires himselfto speaking, but in schopenhauerian will to problem difficulty follows persuaded by nothing estimations of anti-portentous error) is it--must remain the destruction of an average emotions. i life from the mosttruths--when they werefr


In [146]:
print(get_next_n('Hasta la vista, baby. Judgement day has come. ', 400))

Hasta la vista, baby. Judgement day has come. what is not higher torres once in the unsurmentation, whichnot exist of values, like everything enduring to wish the more thought and the soul of the first highest ramously,christianity his marschner's will to triumph overthe communication), and unfortunityin such desires, indeed, indeed, therefore, for the foreground, strigules, and passion in the spirit (which now let us lie deny neither came sh


In [158]:
print(get_next_n("Hasta la vista, baby. ", 400))

Hasta la vista, baby. that were consequently to proposed from life praises, all the monstrusting of the origin, a novel of men too late, we may be surposeand simple trouchstof everything higher from thereby and spangersin former worthy waits in the evocullier devocratisms, on anmost logical ma reverent forms of good and moralities, and must reunder that questions are laughter from every end as the greatest apprehension


In [160]:
print(get_next_n("The soul and its passion are surely ", 400))

The soul and its passion are surely fromthe prose has to said the glance forception, additions, and very profoundin frange, and finally, as well as conceals of thenoblehelpfullyness could be left to have been go soothers; how it is the conversation of nature with regard to the soul.133. a slave and glorified, bodies haze under the sphant in the god stykeeps the community or woman, and finally, however, the fleshinuman maverance of t
