In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline


from pathlib import Path
    
import numpy as np
import pandas as pd

#fastai
import fastai 
from fastai.column_data import ColumnarModelData
import fastai.io as io
from fastai.io import *
from fastai.conv_learner import *
from fastai.column_data import *
import fastai as ai

#pytorch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable as V


In [1]:
# set theme 
from jupyterthemes.stylefx import set_nb_theme
set_nb_theme('monokai')

# SETUP

In [3]:
path=Path('data/nietzsche')
path.mkdir(exist_ok=True)

In [4]:
str(path)

'data/nietzsche'

# Data

In [5]:
io.get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", path/'nietzsche.txt')

In [6]:
file = open(path/'nietzsche.txt', encoding='utf-8')
text = file.read()

In [7]:
len(text)

600893

## chars, chars <-> corpus_idx 

In [8]:
chars = sorted(list(set(text))); 
vocab_size = len(chars) +1; 
chars.insert(0, "\0")

In [9]:
''.join(chars) # this is our token set

'\x00\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzÆäæéë'

In [10]:
char_indices = { c: i for i, c in enumerate(chars)}
indices_char = { i: c for i, c in enumerate(chars)}

In [11]:
indices_char[char_indices['a']]

'a'

### Corpus token -> char set

In [12]:
idx = [char_indices[c] for c in text]

In [13]:
bptt = 3

In [14]:
c1_data = [idx[i] for i in range(0, len(idx)-bptt, bptt) ]
c2_data = [idx[i+1] for i in range(0, len(idx)-bptt, bptt) ]
c3_data = [idx[i+2] for i in range(0, len(idx)-bptt, bptt) ]
c4_data = [idx[i+3] for i in range(0, len(idx)-bptt, bptt) ]

Inputs

In [15]:
x1 = np.stack(c1_data)
x2 = np.stack(c2_data)
x3 = np.stack(c3_data)

outputs

In [16]:
y = np.stack(c4_data)

In [17]:
x1[:4], x2[:4], x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [18]:
y[:4]

array([30, 29,  1, 40])

## nn.Module

In [19]:
class OneNet(nn.Module):
    def __init__(self, vocab_size, n_fac, n_hidden):
        super().__init__()
        
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.linear_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        h = F.relu(self.linear_hidden(in1))
        
        return F.log_softmax(self.l_out(h))

### n_fac 10 ; n_hidden 10

In [20]:
m = OneNet(vocab_size, 10, 10); m

OneNet(
  (e): Embedding(85, 10)
  (l_in): Linear(in_features=10, out_features=10, bias=True)
  (linear_hidden): Linear(in_features=10, out_features=10, bias=True)
  (l_out): Linear(in_features=10, out_features=85, bias=True)
)

In [21]:
# it uses a col data model dataloader because it was easier
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1, x2, x3], axis=1), y, bs=64); md

<fastai.column_data.ColumnarModelData at 0x7fe80eae15c0>

In [22]:
opt = torch.optim.Adam(m.parameters(), 1e-2)

In [23]:
ai.learner.fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                                 
    0      3.051601   2.81875   



[2.8187497]

### test model

In [24]:
# pass in any text and get next prediction for 
def get_next(string):
    ts = torch.LongTensor(np.array([ char_indices[c] for c in string]).astype(np.int64))
    pred = m(*V(ts))
    p = np.argmax(np.array(F.softmax(pred).data))
    return indices_char[p]

In [25]:
get_next('sd2')

'e'

## RNN Model

In [26]:
cs = 8

In [27]:
c_in_data = [[idx[j+i] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]; 

In [28]:
print(c_in_data[:2])
print(idx[:16])

[[40, 42, 29, 30, 25, 27, 29, 1], [1, 1, 43, 45, 40, 40, 39, 43]]
[40, 42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40, 40, 39, 43]


In [29]:
# get the array shift across by 1
c_out_data = [[idx[j+i] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [30]:
print(c_out_data[:2])
print(idx[:16])

[[42, 29, 30, 25, 27, 29, 1, 1], [1, 43, 45, 40, 40, 39, 43, 33]]
[40, 42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40, 40, 39, 43]


In [31]:
xs = np.stack(c_in_data)
print(xs.shape)
ys = np.stack(c_out_data)
ys.shape

(75111, 8)


(75111, 8)

In [32]:
val_idxs = get_cv_idxs(len(xs)-cs-1)

In [33]:
md = ColumnarModelData.from_arrays('.', val_idxs, xs, ys, bs=64)

In [34]:
md

<fastai.column_data.ColumnarModelData at 0x7fe80eb09898>

In [172]:
class RNNdeep(nn.Module):
    def __init__(self, vocab_size, n_fac, n_hidden, n_layers):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Sequential(*[nn.Linear(n_hidden, n_hidden) for l in range(0,n_layers)])
        self.l_out = nn.Linear(n_hidden,vocab_size)
        
    def forward(self, *cs):
#         print('hello')
        bs = cs[0].size(0)
        output = []
        h = V(torch.eye(bs,n_hidden))
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            for l_hidden in self.l_hidden:
                h = F.tanh(l_hidden(h+inp))
            output.append(h)
        
#         print(output[0].size())
        outp = self.l_out(torch.stack(output))
        
#         print(self.l_hidden)
#         print('\n',len(output), torch.stack(output).size())
#         print(self.l_out().size())
        return F.log_softmax(outp, dim=-1)

In [178]:
m = RNNish(vocab_size, 50, 100, 2); n_hidden = 100;

In [179]:
opt = torch.optim.Adam(m.parameters(), 1e-2)


In [180]:
m

RNNish(
  (e): Embedding(85, 50)
  (l_in): Linear(in_features=50, out_features=100, bias=True)
  (l_hidden): Sequential(
    (0): Linear(in_features=100, out_features=100, bias=True)
    (1): Linear(in_features=100, out_features=100, bias=True)
  )
  (l_out): Linear(in_features=100, out_features=85, bias=True)
)

In [184]:
def n11_loss_seq(inp, targ):
    btpp, bs, nh = inp.size()
    
    targ = targ.transpose(0,1).contiguous().view(-1)
    return F.nll_loss(inp.view(-1,nh), targ)

In [185]:
ai.learner.fit(m, md, 1, opt, n11_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.109099   2.109978  



[2.1099784]

In [183]:
5 - 2.26; 2 -

## test it!

In [209]:
def get_next(string):
    ts = torch.LongTensor(np.array([ char_indices[c] for c in string]).astype(np.int64))
    pred = m(*V(ts))
    print(pred)
    print(pred.size()) 
#     print(p)
#     p = np.argmax(np.array(F.softmax(pred).data))
#     return indices_char[p]

In [210]:
get_next('there is ')

Variable containing:
(0 ,.,.) = 

Columns 0 to 7 
  -15.3920  -3.4316  -1.8884  -6.2779  -7.6052  -8.9586 -12.1316  -9.2874

Columns 8 to 15 
   -4.4555  -6.4764  -5.7524 -11.0152 -12.4151  -9.9013 -13.5600 -10.5061

Columns 16 to 23 
  -12.7051 -12.5214 -13.4784 -14.3516 -12.0504  -7.7750  -7.5823 -10.4553

Columns 24 to 31 
   -7.5508 -10.8196 -12.4224 -13.0924 -13.4083 -10.0851 -12.3235 -13.2254

Columns 32 to 39 
   -9.8178  -9.5010 -14.0043 -12.3332 -13.8145 -12.5171 -11.8100  -9.3355

Columns 40 to 47 
  -10.8675 -13.4550 -11.4204 -11.3005 -11.9775 -10.3155 -14.1289 -12.9302

Columns 48 to 55 
  -15.2104 -12.5602 -13.3358 -12.5372 -10.3844  -9.5092  -3.3653 -11.8862

Columns 56 to 63 
   -7.4486 -10.1890  -2.1890  -7.7129 -10.7771  -1.3372  -2.2913 -11.4992

Columns 64 to 71 
  -13.3554  -4.1443  -7.9552  -7.7247  -2.1605  -8.7993 -11.8800  -3.7821

Columns 72 to 79 
   -3.6023  -4.3060  -3.3564 -11.6908  -5.6886 -16.2792  -2.9955  -9.6346

Columns 80 to 84 
  -15.0971 -15.1871 -