In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *


from fastai.column_data import *

## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [2]:
PATH='data/nietzsche/'

In [3]:
# Fetch the corpus to the local path, then read the whole file into memory.
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
# A context manager closes the file handle as soon as the text has been read,
# instead of leaving it open until garbage collection.
with open(f'{PATH}nietzsche.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 600893


In [4]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [5]:
# Every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
# One more than the number of distinct characters: a later cell prepends "\0" to `chars`.
vocab_size = 1 + len(chars)
print('total chars:', vocab_size)

total chars: 85


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [6]:
chars[:100]

['\n',
 ' ',
 '!',
 '"',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '=',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'Æ',
 'ä',
 'æ',
 'é',
 'ë']

In [7]:
# Reserve index 0 for a null character (e.g. for padding). The guard keeps this
# cell idempotent: re-running it must not insert a second "\0", which would
# shift every character index by one.
if not chars or chars[0] != "\0":
    chars.insert(0, "\0")

''.join(chars[0:-6])

'\x00\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxy'

Map from chars to indices and back again

In [8]:
# Forward lookup: character -> its position in `chars`.
char_indices = dict(zip(chars, range(len(chars))))
# Reverse lookup: position in `chars` -> character.
indices_char = dict(enumerate(chars))

*idx* will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)

In [9]:
idx = [char_indices[c] for c in text]

idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [10]:
len(idx)

600893

In [11]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Three char model

### Create inputs

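Create four lists, each taking every 3rd character (a stride of `cs`), starting at the 0th, 1st, 2nd, then 3rd characters. The first three are the model inputs and the fourth is the label.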

In [12]:
cs=3
# Strided slices: list k holds the characters at positions k, k+cs, k+2*cs, ...
# The stop bound is shifted with the start so all four lists have the same length.
c1_dat = idx[0:len(idx)-cs:cs]
c2_dat = idx[1:len(idx)-cs+1:cs]
c3_dat = idx[2:len(idx)-cs+2:cs]
c4_dat = idx[3:len(idx)-cs+3:cs]

In [13]:
(c1_dat[:10],
c2_dat[:10],
c3_dat[:10])

([40, 30, 29, 1, 40, 43, 31, 61, 2, 74],
 [42, 25, 1, 43, 40, 33, 2, 54, 44, 73],
 [29, 27, 1, 45, 39, 38, 73, 73, 71, 61])

Our inputs

In [14]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

In [15]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs)]

In [16]:
cs=4

In [17]:
c_in_dat[0]

[40, 42, 29]

In [18]:
np.stack([x1,x2,x3],axis=1)

array([[40, 42, 29],
       [30, 25, 27],
       [29,  1,  1],
       ...,
       [72, 62, 67],
       [59, 74, 65],
       [67, 58, 72]])

Our output

In [19]:
[x1,x2,x3]

[array([40, 30, 29, ..., 72, 59, 67]),
 array([42, 25,  1, ..., 62, 74, 58]),
 array([29, 27,  1, ..., 67, 65, 72])]

In [20]:
y = np.stack(c4_dat)

In [21]:
c4_dat[:10]

[30, 29, 1, 40, 43, 31, 61, 2, 74, 2]

In [22]:
y[:10]

array([30, 29,  1, 40, 43, 31, 61,  2, 74,  2])

The first 4 inputs and outputs

In [23]:
x1[:4], x2[:4], x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [24]:
y[:4]

array([30, 29,  1, 40])

In [25]:
x1.shape, y.shape

((200297,), (200297,))

### Create and train model

Pick a size for our hidden state

In [13]:
n_hidden = 256

The number of latent factors to create (i.e. the size of the embedding matrix)

In [14]:
n_fac = 42

In [28]:
class Char3Model(nn.Module):
    """Predict the next character from three input characters with a hand-unrolled RNN.

    The same three layers are reused at every step: `l_in` (input -> hidden),
    `l_hidden` (hidden -> hidden) and `l_out` (hidden -> output).

    Args:
        vocab_size: number of embedding rows and width of the output layer.
        n_fac: width of each character embedding.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)

        # The 'green arrow' from our diagram - the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)

        # The 'orange arrow' from our diagram - the layer operation from hidden to hidden
        self.l_hidden = nn.Linear(n_hidden, n_hidden)

        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, c1, c2, c3):
        """Return log-probabilities over the vocabulary for the character following c1, c2, c3."""
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))

        # Start from an all-zero hidden state with the same size as one projected input.
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))

        # Name the softmax axis explicitly (the last one, i.e. the l_out outputs), as the
        # other models in this notebook do; relying on the implicit axis is deprecated.
        return F.log_softmax(self.l_out(h), dim=-1)

In [29]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [30]:
m = Char3Model(vocab_size, n_fac).cuda()

    Found GPU0 GeForce 930M which is of cuda capability 5.0.
    PyTorch no longer supports this GPU because it is too old.
    


In [31]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [32]:
xs[0]


  2
 72
 73
 67
 58
 67
 65
 66
  2
 67
 65
 62
 73
 68
 58
 68
 60
 61
 69
 58
 57
  2
 71
 58
  2
 58
 55
 61
 73
 58
 72
  2
  8
 58
  8
 76
 57
 54
  2
 68
 68
 54
 67
 58
  2
 61
  2
 54
 56
 60
 58
 71
 62
 26
 63
 57
 54
 73
 61
 71
 67
 54
 57
  2
  1
 58
 54
  2
 58
 72
 59
 58
 67
 73
 58
 58
 62
 65
 56
 72
 58
 62
 62
 67
 78
 58
 58
  2
 71
 65
 72
 72
 62
 65
 71
 27
 54
 68
  2
 76
 58
 57
 72
 67
 71
  2
  2
 74
 69
 71
 38
 65
  2
 73
  2
 62
 57
 68
 77
  2
 54
 66
 72
 37
 54
 62
  1
 62
  2
 67
 62
 62
 58
 71
 73
 62
  2
 74
 73
 68
 62
  2
 67
 73
 78
 58
 73
 29
 58
 67
  2
 58
 73
 54
 73
  1
  2
  2
  2
 60
 59
  2
  2
 72
 61
 78
 62
 67
 54
 62
 57
 71
 71
 61
 61
 73
 71
 58
  2
 67
 55
 72
 68
 54
 58
  1
 66
 62
 67
 58
 67
  2
 71
 68
 68
 68
  2
 76
  2
 61
  2
 65
  2
 73
 73
  8
 61
  2
 72
  2
 73
  2
  8
 56
  2
 74
 58
 73
 62
  8
 25
  2
 71
  2
 66
 57
 58
 72
 59
 71
 68
 74
  2
  9
 74
 73
 67
 72
 61
 73
 54
 54
 73
 57
  2
 71
 57
 68
 67
 58

In [33]:
for i in xs:
    print(i.size())

torch.Size([512])
torch.Size([512])
torch.Size([512])


In [34]:
opt = optim.Adam(m.parameters(), 1e-2)

In [35]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      2.093447   0.753646  



[0.7536463737487793]

In [36]:
set_lrs(opt, 0.001)

In [37]:
??set_lrs

In [38]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.844306   0.453826  



[0.4538259506225586]

### Test model

In [39]:
def get_next(inp):
    """Return the character that the global model `m` scores highest after `inp`.

    `inp` is a string; each of its characters is looked up in `char_indices`
    and passed to the model as a separate positional argument.
    """
    codes = [char_indices[ch] for ch in inp]
    idxs = T(np.array(codes))
    scores = m(*VV(idxs))
    best = np.argmax(to_np(scores))
    return chars[best]

In [40]:
get_next('y. ')

'T'

In [41]:
get_next('ppl')

'i'

In [42]:
get_next(' th')

'e'

In [43]:
get_next('and')

' '

## Our first RNN!

### Create inputs

This is the size of our unrolled RNN.

In [44]:
cs=8

For each starting position in the text, create a list of the 8 characters that begin there (so consecutive sequences overlap). These will be the 8 inputs to our model.

In [45]:
len(idx)-cs

600885

In [46]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs)]

Then create a list of the next character in each of these series. This will be the labels for our model.

In [47]:
c_out_dat = [idx[j+cs] for j in range(len(idx)-cs)]

In [48]:
xs = np.stack(c_in_dat, axis=0)

In [49]:
xs

array([[40, 42, 29, ..., 27, 29,  1],
       [42, 29, 30, ..., 29,  1,  1],
       [29, 30, 25, ...,  1,  1,  1],
       ...,
       [72, 62, 67, ..., 65, 67, 58],
       [62, 67, 59, ..., 67, 58, 72],
       [67, 59, 74, ..., 58, 72, 72]])

In [50]:
xs.shape

(600885, 8)

In [51]:
y = np.stack(c_out_dat)

In [52]:
y

array([ 1,  1, 43, ..., 72, 72, 10])

So each row below is one series of 8 characters from the text.

In [53]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

...and this is the next character after each sequence.

In [54]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

### Create and train model

In [55]:
a= list(range(10))

In [56]:
a

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [57]:
get_cv_idxs(10)

array([8, 1])

In [58]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [59]:
val_idx.shape

(120176,)

In [60]:
get_cv_idxs

<function fastai.dataset.get_cv_idxs>

In [61]:
val_idx

array([480310, 419017, 232803, ..., 134355, 389158, 330599])

In [62]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [63]:
class CharLoopModel(nn.Module):
    """The three-character model rewritten as a loop over any number of inputs.

    This is an RNN: the same input, hidden and output layers are applied at
    every step. The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Fold every input in `cs` into the hidden state; return log-softmax scores."""
        batch_size = cs[0].size(0)
        hidden = V(torch.zeros(batch_size, n_hidden).cuda())
        for char_idx in cs:
            embedded = self.e(char_idx)
            projected = F.relu(self.l_in(embedded))
            # The projected input is *added* to the running hidden state.
            hidden = F.tanh(self.l_hidden(hidden + projected))

        logits = self.l_out(hidden)
        return F.log_softmax(logits, dim=-1)

In [64]:
m = CharLoopModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [65]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      2.046541   2.05752   



[2.0575203075975224]

In [66]:
set_lrs(opt, 0.001)

In [67]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.764465   1.758459  



[1.758458547404247]

In [68]:
class CharLoopConcatModel(nn.Module):
    """Loop-style RNN that concatenates hidden state and embedding instead of adding them.

    `l_in` therefore takes `n_fac + n_hidden` input features. The hidden width
    is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Fold every input in `cs` into the hidden state; return log-softmax scores."""
        batch_size = cs[0].size(0)
        hidden = V(torch.zeros(batch_size, n_hidden).cuda())
        for char_idx in cs:
            # Join along dim 1: hidden state first, then the character embedding.
            joined = torch.cat((hidden, self.e(char_idx)), 1)
            projected = F.relu(self.l_in(joined))
            hidden = F.tanh(self.l_hidden(projected))

        logits = self.l_out(hidden)
        return F.log_softmax(logits, dim=-1)

In [69]:
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [70]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [71]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.852781   1.83004   



[1.8300403959343872]

In [72]:
set_lrs(opt, 1e-4)

In [73]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      1.747602   1.74473   



[1.7447298634879171]

### Test model

In [74]:
def get_next(inp):
    """Return the character that the global model `m` scores highest after `inp`.

    `inp` is a string; each of its characters is looked up in `char_indices`
    and passed to the model as a separate positional argument.
    """
    codes = [char_indices[ch] for ch in inp]
    idxs = T(np.array(codes))
    scores = m(*VV(idxs))
    best = np.argmax(to_np(scores))
    return chars[best]

In [75]:
get_next('for thos')

'e'

In [76]:
get_next('part of ')

't'

In [77]:
get_next('queens a')

'n'

## RNN with pytorch

In [78]:
class CharRnn(nn.Module):
    """Next-character model that uses `nn.RNN` in place of a hand-written loop.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Run the RNN over the stacked inputs; return log-softmax scores for the last step only."""
        batch_size = cs[0].size(0)
        h0 = V(torch.zeros(1, batch_size, n_hidden))
        # Stacking puts the inputs along a new leading axis before embedding.
        embedded = self.e(torch.stack(cs))
        outputs, _ = self.rnn(embedded, h0)

        last_step = outputs[-1]
        return F.log_softmax(self.l_out(last_step), dim=-1)

In [79]:
m = CharRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [80]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [81]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [82]:
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [83]:
t = m(*V(xs)); t.size()

torch.Size([512, 85])

In [84]:
set_lrs(opt, 1e-4)

In [85]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                              
    0      2.464662   2.448229  



[2.448229495321382]

### Test model

In [86]:
def get_next(inp):
    """Return the character that the global model `m` scores highest after `inp`.

    `inp` is a string; each of its characters is looked up in `char_indices`
    and passed to the model as a separate positional argument.
    """
    codes = [char_indices[ch] for ch in inp]
    idxs = T(np.array(codes))
    scores = m(*VV(idxs))
    best = np.argmax(to_np(scores))
    return chars[best]

In [87]:
get_next('for thos')

' '

In [88]:
def get_next_n(inp, n):
    """Extend the string `inp` by `n` characters, one `get_next` call at a time.

    After each prediction the context window drops its first character and
    gains the predicted one, so its length stays constant. Returns the
    original `inp` followed by everything generated.
    """
    window = inp
    pieces = [inp]
    for _ in range(n):
        predicted = get_next(window)
        pieces.append(predicted)
        window = window[1:] + predicted
    return ''.join(pieces)

In [89]:
get_next_n('for thos', 40)

'for thos the the the the the the the the the the'

## Multi-output model

### Setup

Let's take non-overlapping sets of characters this time

In [15]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]

Then create the exact same thing, offset by 1, as our labels

In [16]:
c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [17]:
xs = np.stack(c_in_dat)
xs.shape

(200297, 3)

In [18]:
ys = np.stack(c_out_dat)
ys.shape

(200297, 3)

In [19]:
xs[:cs,:cs]

array([[40, 42, 29],
       [30, 25, 27],
       [29,  1,  1]])

In [20]:
ys[:cs,:cs]

array([[42, 29, 30],
       [25, 27, 29],
       [ 1,  1,  1]])

### Create and train model

In [21]:
get_cv_idxs(len(xs)-cs-1)

array([166233, 142959,  99182, ..., 180432, 175712, 172970])

In [22]:
val_idx = get_cv_idxs(len(xs)-cs-1)

In [23]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [24]:
class CharSeqRnn(nn.Module):
    """`nn.RNN` model that emits a prediction at every time step, not just the last.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Run the RNN over the stacked inputs; return log-softmax scores for every step."""
        batch_size = cs[0].size(0)
        h0 = V(torch.zeros(1, batch_size, n_hidden))
        embedded = self.e(torch.stack(cs))
        outputs, _ = self.rnn(embedded, h0)
        # l_out is applied to the full output sequence, so the leading step axis is kept.
        return F.log_softmax(self.l_out(outputs), dim=-1)

m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [25]:
it = iter(md.trn_dl)
*xst,yt = next(it)

In [28]:
yt.size()

torch.Size([512, 3])

In [101]:
def nll_loss_seq(inp, targ):
    """Negative log-likelihood loss for a model that predicts at every time step.

    `inp` is a 3-D tensor whose last axis holds the per-class log-probabilities;
    its first two axes are flattened into one. `targ` is transposed (axes 0 and
    1 swapped) before flattening so that its elements line up with the
    flattened rows of `inp`.
    """
    n_steps, batch_size, n_classes = inp.size()
    flat_pred = inp.view(n_steps * batch_size, n_classes)
    # transpose() returns a non-contiguous view; view() needs contiguous memory.
    flat_targ = targ.transpose(0, 1).contiguous().view(-1)
    return F.nll_loss(flat_pred, flat_targ)

In [102]:
fit(m, md, 1, opt, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      2.605721   2.420993  



[2.420993103898476]

### Identity init!

In [103]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [104]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.cuda.FloatTensor of size 256x256 (GPU 0)]

In [105]:
fit(m, md, 1, opt, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      2.371583   2.226813  



[2.2268130838315434]

In [106]:
set_lrs(opt, 1e-3)

In [107]:
fit(m, md, 1, opt, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      2.146408   2.134873  



[2.1348727764048365]

## Stateful model

### Setup

In [108]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

PATH=Path('data/nietzsche/')

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
# A Path's string form has no trailing separator (unlike the plain-string PATH
# used earlier in this notebook), so one must be inserted explicitly; otherwise
# the result is "...nietzschetrn/" rather than ".../nietzsche/trn/".
TRN = f'{PATH}/{TRN_PATH}'
VAL = f'{PATH}/{VAL_PATH}'

%ls {PATH}

 Volume in drive C is Windows
 Volume Serial Number is C449-2EDF

 Directory of C:\Users\NokChan\Documents\SQL_Generator\data\nietzsche

21/04/2018  23:14    <DIR>          .
21/04/2018  23:14    <DIR>          ..
17/04/2018  00:07           600,901 nietzsche.txt
21/04/2018  23:15    <DIR>          trn
21/04/2018  23:15    <DIR>          val
               1 File(s)        600,901 bytes
               4 Dir(s)  21,839,384,576 bytes free


In [109]:
%ls {PATH}trn

 Volume in drive C is Windows
 Volume Serial Number is C449-2EDF

 Directory of C:\Users\NokChan\Documents\SQL_Generator\data



File Not Found


In [110]:
TEXT = data.Field(lower=True, tokenize=list)
bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(1153, 55, 1, 590960)

### RNN

In [111]:
class CharSeqStatefulRnn(nn.Module):
    """`nn.RNN` character model that carries its hidden state from one batch to the next.

    Args:
        vocab_size: number of embedding rows and width of the output layer.
        n_fac: width of each character embedding.
        bs: batch size used to allocate the initial hidden state.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac, bs):
        # Initialise nn.Module before assigning any attribute, as every other
        # model class in this notebook does.
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-softmax scores for every position, flattened to (-1, vocab_size)."""
        bs = cs[0].size(0)
        # Re-allocate the state when the incoming batch size differs from the stored one.
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        # Keep the final state for the next call. repackage_var is a fastai helper;
        # presumably it detaches the state from the autograd history - TODO confirm.
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros of shape (1, bs, n_hidden)."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [112]:
m = CharSeqStatefulRnn(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [113]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                                
    0      1.805127   1.800063  



[1.8000630281697734]

In [114]:
set_lrs(opt, 1e-4)

fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                                
    0      1.717884   1.735572  



[1.7355723484472552]

### RNN loop

In [115]:
# From the pytorch source

def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    """One tanh RNN step: tanh(linear(input, w_ih, b_ih) + linear(hidden, w_hh, b_hh))."""
    from_input = F.linear(input, w_ih, b_ih)
    from_hidden = F.linear(hidden, w_hh, b_hh)
    return F.tanh(from_input + from_hidden)

In [116]:
class CharSeqStatefulRnn2(nn.Module):
    """Stateful character model that steps an `nn.RNNCell` by hand over the sequence.

    Args:
        vocab_size: number of embedding rows and width of the output layer.
        n_fac: width of each character embedding.
        bs: batch size used to allocate the initial hidden state.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-softmax scores for every position, flattened to (-1, vocab_size)."""
        bs = cs[0].size(0)
        # The cell's hidden state is 2-D (batch, n_hidden), so the batch axis is 0.
        if self.h.size(0) != bs: self.init_hidden(bs)
        outp = []
        o = self.h
        for c in cs:
            # One step: the cell's output is also the hidden state fed to the next step.
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        # Keep the final state for the next call. repackage_var is a fastai helper;
        # presumably it detaches the state from the autograd history - TODO confirm.
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros of shape (bs, n_hidden).

        nn.RNNCell takes a per-step hidden state without the leading
        layer axis that nn.RNN uses, hence two dimensions rather than three.
        """
        self.h = V(torch.zeros(bs, n_hidden))

In [117]:
m = CharSeqStatefulRnn2(md.nt, n_fac, 512).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [118]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                                
    0      1.802524   1.804739  



[1.804739026033176]

### GRU

In [119]:
class CharSeqStatefulGRU(nn.Module):
    """Stateful character model built on `nn.GRU`; the hidden state persists across batches.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-softmax scores for every position, flattened to (-1, vocab_size)."""
        batch_size = cs[0].size(0)
        # Re-allocate the state when the incoming batch size differs from the stored one.
        if self.h.size(1) != batch_size:
            self.init_hidden(batch_size)
        outputs, new_h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(new_h)
        log_probs = F.log_softmax(self.l_out(outputs), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros of shape (1, bs, n_hidden)."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [120]:
# From the pytorch source code - for reference

def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
    """One GRU step, written out explicitly for reference.

    The input and hidden projections each produce three equal chunks along
    dim 1 (reset gate, input gate, new-state candidate). The return value
    blends the candidate with the previous `hidden` according to the input gate.
    """
    # One linear map per source; chunk(3, 1) splits each result into the three parts.
    in_reset, in_gate, in_cand = F.linear(input, w_ih, b_ih).chunk(3, 1)
    hid_reset, hid_gate, hid_cand = F.linear(hidden, w_hh, b_hh).chunk(3, 1)

    reset = F.sigmoid(in_reset + hid_reset)
    gate = F.sigmoid(in_gate + hid_gate)
    # The reset gate scales only the hidden-side contribution to the candidate.
    candidate = F.tanh(in_cand + reset * hid_cand)
    return candidate + gate * (hidden - candidate)

In [121]:
m = CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

In [122]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                                
    0      1.669922   1.665086  



[1.6650855121569161]

In [123]:
set_lrs(opt, 1e-4)

In [124]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                                
    0      1.574292   1.60323   



[1.6032295145184783]

### Putting it all together: LSTM

In [125]:
from fastai import sgdr

n_hidden=512

In [126]:
class CharSeqStatefulLSTM(nn.Module):
    """Stateful character model built on a multi-layer `nn.LSTM` with dropout 0.5.

    `nl` is the number of stacked LSTM layers. The state is a pair of tensors,
    each of shape (nl, bs, n_hidden). The hidden width is read from the
    module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size = vocab_size
        self.nl = nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-softmax scores for every position, flattened to (-1, vocab_size)."""
        batch_size = cs[0].size(0)
        # self.h is a tuple here, so the batch size is checked on its first element.
        if self.h[0].size(1) != batch_size:
            self.init_hidden(batch_size)
        outputs, new_state = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(new_state)
        log_probs = F.log_softmax(self.l_out(outputs), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset both state tensors to zeros of shape (nl, bs, n_hidden)."""
        state_shape = (self.nl, bs, n_hidden)
        self.h = (V(torch.zeros(*state_shape)),
                  V(torch.zeros(*state_shape)))

In [127]:
m = CharSeqStatefulLSTM(md.nt, n_fac, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)

In [128]:
# NOTE(review): PATH was rebound to a pathlib.Path in the "Stateful model" setup,
# and a Path's string form has no trailing separator. f'{PATH}models' therefore
# names a sibling directory ("...nietzschemodels"), not a "models" folder inside
# PATH. The save_model calls below build their paths the same way, so the two
# stay consistent with each other - if this is corrected, change them together.
os.makedirs(f'{PATH}models', exist_ok=True)

In [129]:
fit(m, md, 1, lo.opt, F.nll_loss)

epoch      trn_loss   val_loss                                
    0      1.789657   1.722253  



[1.722252681851387]

In [130]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 1, lo.opt, F.nll_loss, callbacks=cb)

epoch      trn_loss   val_loss                                
    0      1.571543   1.507939  



[1.507939008753779]

In [131]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 1, lo.opt, F.nll_loss, callbacks=cb)

epoch      trn_loss   val_loss                                
    0      1.553959   1.499733  



[1.4997327000780782]

### Test

In [132]:
def get_next(inp):
    """Sample the next token after `inp` from the global model `m`.

    Unlike an argmax, this draws one index at random, weighted by the
    exponentiated last row of the model output, and maps it back to a token
    through `TEXT.vocab.itos`.
    """
    token_ids = TEXT.numericalize(inp).transpose(0,1)
    scores = m(VV(token_ids))
    # The model emits log-softmax values; exp() turns the last row into sampling weights.
    weights = scores[-1].exp()
    drawn = torch.multinomial(weights, 1)
    return TEXT.vocab.itos[to_np(drawn)[0]]

In [133]:
get_next('for thos')

'e'

In [134]:
def get_next_n(inp, n):
    """Extend the string `inp` by `n` tokens, one `get_next` call at a time.

    After each prediction the context window drops its first character and
    gains the predicted token. Returns the original `inp` followed by
    everything generated.
    """
    window = inp
    pieces = [inp]
    for _ in range(n):
        predicted = get_next(window)
        pieces.append(predicted)
        window = window[1:] + predicted
    return ''.join(pieces)

In [135]:
print(get_next_n('for thos', 400))

for those a can grieves superfulward and itself and one sure words andbeen sary, the more might to a miself-siffer toit; that thisgory are prideness is a contempt, a be must opist,men desiration, he own despire and rack constance conceation--the that be always--aftience ishe a may"--andforce which who conouncerience are god is was a saif is a curation that this us serious of maked of visliarly,which you o
