In [151]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *


from fastai.column_data import *

## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [152]:
PATH='data/nietzsche/'

In [153]:
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
text = open(f'{PATH}nietzsche.txt',encoding='utf-8').read()
print('corpus length:', len(text))

corpus length: 600893


In [154]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [155]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 85


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [156]:
chars.insert(0, "\0")

''.join(chars[0:-6])

'\x00\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxy'

Map from chars to indices and back again

In [157]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

*idx* will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)

In [158]:
idx = [char_indices[c] for c in text]

idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [159]:
len(idx)

600893

In [160]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

In [161]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

### Create inputs

This is the size of our unrolled RNN.

In [162]:
cs=8

For each of 0 through 7, create a list of every 8th character with that starting point. These will be the 8 inputs to our model.

In [163]:
len(idx)-cs

600885

In [209]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs)]

Then create a list of the next character in each of these series. This will be the labels for our model.

In [210]:
c_out_dat = [idx[j+cs] for j in range(len(idx)-cs)]

In [166]:
xs = np.stack(c_in_dat, axis=0)

In [167]:
xs

array([[40, 42, 29, ..., 27, 29,  1],
       [42, 29, 30, ..., 29,  1,  1],
       [29, 30, 25, ...,  1,  1,  1],
       ...,
       [72, 62, 67, ..., 65, 67, 58],
       [62, 67, 59, ..., 67, 58, 72],
       [67, 59, 74, ..., 58, 72, 72]])

In [168]:
xs.shape

(600885, 8)

In [169]:
y = np.stack(c_out_dat)

In [170]:
y

array([ 1,  1, 43, ..., 72, 72, 10])

So each column below is one series of 8 characters from the text.

In [171]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

...and this is the next character after each sequence.

In [172]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

### Create and train model

In [173]:
a= list(range(10))

In [174]:
a

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [175]:
get_cv_idxs(10)

array([8, 1])

In [176]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [177]:
val_idx.shape

(120176,)

In [178]:
get_cv_idxs

<function fastai.dataset.get_cv_idxs>

In [179]:
val_idx

array([480310, 419017, 232803, ..., 134355, 389158, 330599])

In [180]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

### Test model

In [181]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [182]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

## Multi-output model

### Setup

Let's take non-overlapping sets of characters this time

In [183]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]

Then create the exact same thing, offset by 1, as our labels

In [184]:
c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [185]:
xs = np.stack(c_in_dat)
xs.shape

(75111, 8)

In [186]:
ys = np.stack(c_out_dat)
ys.shape

(75111, 8)

In [187]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [ 1,  1, 43, 45, 40, 40, 39, 43],
       [33, 38, 31,  2, 73, 61, 54, 73],
       [ 2, 44, 71, 74, 73, 61,  2, 62],
       [72,  2, 54,  2, 76, 68, 66, 54],
       [67,  9,  9, 76, 61, 54, 73,  2],
       [73, 61, 58, 67, 24,  2, 33, 72],
       [ 2, 73, 61, 58, 71, 58,  2, 67]])

In [188]:
ys[:cs,:cs]

array([[42, 29, 30, 25, 27, 29,  1,  1],
       [ 1, 43, 45, 40, 40, 39, 43, 33],
       [38, 31,  2, 73, 61, 54, 73,  2],
       [44, 71, 74, 73, 61,  2, 62, 72],
       [ 2, 54,  2, 76, 68, 66, 54, 67],
       [ 9,  9, 76, 61, 54, 73,  2, 73],
       [61, 58, 67, 24,  2, 33, 72,  2],
       [73, 61, 58, 71, 58,  2, 67, 68]])

### Create and train model

In [189]:
get_cv_idxs(len(xs)-cs-1)

array([25679, 44278, 25725, ..., 49739, 71742, 16822])

In [190]:
val_idx = get_cv_idxs(len(xs)-cs-1)

In [191]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [192]:
def nll_loss_seq(inp, targ):
    sl,bs,nh = inp.size()
    targ = targ.transpose(0,1).contiguous().view(-1)
    return F.nll_loss(inp.view(-1,nh), targ)

## Stateful model

### Setup

In [193]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

PATH=Path('data/nietzsche/')

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

%ls {PATH}

nietzsche.txt  [0m[01;34mtrn[0m/  [01;34mval[0m/


In [194]:
%ls {PATH}trn

ls: cannot access 'data/nietzschetrn': No such file or directory


In [195]:
TEXT = data.Field(lower=True, tokenize=list)
bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(1153, 55, 1, 590960)

### GRU

In [196]:
def get_next(inp):
    idxs = TEXT.numericalize(inp)
    p = m(VV(idxs.transpose(0,1)))
    r = torch.multinomial(p[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(r)[0]]

def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

# Single Direction

In [201]:
class CharSeqStatefulGRU(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(1, bs, n_hidden))

In [202]:
m = CharSeqStatefulGRU(md.nt, n_fac, 512).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

In [203]:
fit(m, md, 3, opt, F.nll_loss)

epoch      trn_loss   val_loss                                 
    0      1.675948   1.668035  
    1      1.515996   1.50374                                  
    2      1.444927   1.435919                                 



[array([1.43592])]

In [204]:
print(get_next_n('for thos', 400))

for those should always were not dif-present morality.83=patancem,precisely intentimento,. something enhated, amongtime be was usefulness of which is vailcap, asoot of the warlingsivalitys. like soul just demands "say finally pendery cast us, a beholder call that will to dangerousand shave all tracte of the somethings!""inexperances, of somelong morate. at the waring and sometexpeldays aboduagable expenia


# Bi-dircetional

In [205]:
class CharSeqStatefulGRU_Bidir(nn.Module):
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden, bidirectional = True)
        self.l_out = nn.Linear(n_hidden*2, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, cs):
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs): self.h = V(torch.zeros(2, bs, n_hidden))

In [211]:
m = CharSeqStatefulGRU_Bidir(md.nt, n_fac, 512).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

In [212]:
fit(m, md, 3, opt, F.nll_loss)

epoch      trn_loss   val_loss                                  
    0      0.489046   0.429586  
    1      0.464659   0.384862                                  
    2      0.415929   0.362482                                  



[array([0.36248])]

In [213]:
print(get_next_n('for thos', 400))

for thoselor alouslooksbourly know,rly know, lonker chilaker chrisker chrisker who soar who holy who holy who holy doover, odorier, oforier, of rath, of rath, deerath, deen the dest the dest this muthoursomuchoursomechoursome has some has some has some has some has some hesesome cosessmence-themence-them jelomops jeposess jecosess jecosess jecosess jecosess jecosess jecosess jecosesmajectses_ajocts simjoc
