In [1]:
# Re-import edited modules automatically before each cell runs, so library
# changes are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle

Based on fast.ai lesson4-imdb notebook

### Setup

In [2]:
# Root directory of the dataset. The trailing slash matters: sub-paths are
# appended to it by plain string concatenation.
PATH = 'data/factor/factor/'

# Sub-directories (relative to PATH) holding the training and validation files.
TRN_PATH, VAL_PATH = 'train/', 'test/'

# Full paths to the two sub-directories.
TRN = PATH + TRN_PATH
VAL = PATH + VAL_PATH

# List the dataset root to confirm which sub-directories are present.
%ls {PATH}

[0m[01;34mmodels[0m/  [01;34mtest[0m/  [01;34mtmp[0m/  [01;34mtrain[0m/


### Data Exploration

In [10]:
# Capture the output of `ls` on the training directory (IPython `!` shell
# capture returns the output lines as a list-like object).
trn_files = !ls {TRN}
# Show only the first ten entries.
trn_files[:10]

['0.txt',
 '100000.txt',
 '100001.txt',
 '100002.txt',
 '100003.txt',
 '100004.txt',
 '100005.txt',
 '100006.txt',
 '100007.txt',
 '100008.txt']

In [11]:
# Number of entries `ls` reported for the training directory.
len(trn_files)

120000

Example factor file...

In [6]:
# Read one training file with `cat`; the shell capture yields its lines.
factor_file = !cat {TRN}{trn_files[6]}
# Display the first line of that file.
factor_file[0]

'[ 1 0 0 0 0 5 =  3 * 5 * 5 9 * 1 1 3 ] '

Tokenize it...

In [7]:
# Tokenize the sample line with spacy_tok and re-join the tokens with single
# spaces so the token boundaries are visible in the output.
' '.join(spacy_tok(factor_file[0]))

'[ 1 0 0 0 0 5 =   3 * 5 * 5 9 * 1 1 3 ]'

### Create Model

In [3]:
# torchtext Field describing how raw text is preprocessed: lower-case it and
# tokenize with spacy_tok.
TEXT = data.Field(lower=True, tokenize=spacy_tok)

... or load it ...

In [8]:
# Alternative to building a fresh Field: restore the one pickled by an earlier
# run. The context manager closes the file handle as soon as loading finishes
# (the bare open() call previously left it to the garbage collector).
# NOTE: unpickling executes arbitrary code, so only load files you created.
with open(f'{PATH}models/TEXT.pkl', 'rb') as text_file:
    TEXT = pickle.load(text_file)

In [9]:
# Mini-batch size and back-propagation-through-time length; passed to the data
# loader as `bs` and `bptt` respectively.
batch_size, back_prop_tt = 64, 70

In [10]:
# Directory (relative to PATH) for each split.
# NOTE: the test split reuses the validation directory.
FILES = {'train': TRN_PATH, 'validation': VAL_PATH, 'test': VAL_PATH}
# Build the language-model data object from the text files under PATH, using
# TEXT for preprocessing and the batch/bptt sizes configured above.
md = LanguageModelData.from_text_files(
    PATH,
    TEXT,
    **FILES,
    bs=batch_size,
    bptt=back_prop_tt,
    min_freq=10)

Save vocab for later...

In [17]:
# Persist the Field (and with it the vocabulary) so a later session can reload
# it instead of rebuilding. The context manager guarantees the file is flushed
# and closed once the dump completes; the bare open() call previously left the
# handle open until garbage collection.
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

Here are the: # batches; # unique tokens in the vocab; # items in the training dataset (the whole corpus is loaded as one long text); # tokens in the training set

In [11]:
# (number of batches, vocabulary size, number of items in the training
#  dataset, number of tokens in the first training item)
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(510, 18, 1, 2291286)

Check mapping of integer id to token...

In [12]:
# 'itos': 'int-to-string' -- a list where the position is the integer id and
# the value is the corresponding token
TEXT.vocab.itos

['<unk>',
 '<pad>',
 '*',
 '1',
 '2',
 '3',
 '7',
 '9',
 ' ',
 '<eos>',
 '=',
 '[',
 ']',
 '5',
 '4',
 '6',
 '8',
 '0']

Convert training data into integer IDs of tokens...

In [13]:
# Map the first 12 tokens of the training text to their integer ids
# (numericalize takes a batch, hence the wrapping list).
TEXT.numericalize([md.trn_ds[0].text[:12]])

Variable containing:
   11
    3
    3
   13
    5
   13
   14
   10
    8
    4
    2
    3
[torch.cuda.LongTensor of size 12x1 (GPU 0)]

### Model Setup

In [14]:
# Model dimensions:
#   em_sz - size of each embedding vector (suggested = 200)
#   nh    - number of hidden activations per layer (suggested = 500)
#   nl    - number of layers (suggested = 3)
# NOTE(review): em_sz is 25x the suggested value -- confirm this is intentional.
em_sz, nh, nl = 5000, 500, 3

Researchers have found that large amounts of *momentum* (which we'll learn about later) don't work well with these kinds of *RNN* models, so we create a version of the *Adam* optimizer with less momentum than its default of `0.9`.

In [15]:
# Adam constructor with betas pre-bound to (0.7, 0.99); the first beta is the
# momentum term, lowered here from its usual 0.9.
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [16]:
# Build the learner from the data object, using the optimizer factory and the
# model dimensions chosen above, with one dropout setting per keyword.
learner = md.get_model(
    opt_fn,
    em_sz,
    nh,
    nl,
    dropouti=0.05,
    dropout=0.05,
    wdrop=0.1,
    dropoute=0.02,
    dropouth=0.05)
# Regularisation function with its alpha/beta weights pre-bound.
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# Clip threshold (presumably gradient clipping in the training loop -- verify).
learner.clip = 0.3

### Choose a Learning Rate

In [17]:
# Run the learning-rate finder; what it records is plotted through
# learner.sched in the cells below.
lrf=learner.lr_find()

 60%|█████▉    | 304/510 [11:29<07:46,  2.27s/it, loss=6.5] 

In [36]:
# For some reason, plots we want below won't always plot unless we
# first draw some other plot (e.g. un-comment the throwaway histogram below).
# NOTE(review): the original comment was cut off mid-sentence -- confirm intent.
#plt.hist([2,0,6])

In [37]:
# Plot the learning-rate schedule recorded during the finder run.
learner.sched.plot_lr()

In [38]:
# Plot the finder's results (presumably loss against learning rate -- verify)
# to pick a learning rate.
learner.sched.plot()

In [43]:
# Learning rate chosen after inspecting the finder plots.
learn_rate = 0.003

### Train

In [45]:
# Train at the chosen learning rate; the second positional argument is 1 and
# wds, cycle_len and cycle_mult are passed through to fit as given.
# NOTE(review): with a single cycle, cycle_mult presumably has no visible
# effect -- confirm against the fastai version in use.
learner.fit(learn_rate, 1, wds=1e-6, cycle_len=1, cycle_mult=2)

  2%|▏         | 11/510 [00:05<04:13,  1.97it/s, loss=1.12] 


Exception in thread Thread-8:
Traceback (most recent call last):
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



epoch      trn_loss   val_loss                              
    0      1.036507   2.002888  



[2.0028877]

In [None]:
# Save the trained encoder under the name 'adam1_enc' for later reuse.
learner.save_encoder('adam1_enc')

In [None]:
# Restore the encoder previously saved as 'adam1_enc'.
learner.load_encoder('adam1_enc')

In [None]:
# Persist the Field (and with it the vocabulary) alongside the saved encoder.
# The context manager guarantees the file is flushed and closed once the dump
# completes; the bare open() call previously left the handle open until
# garbage collection.
with open(f'{PATH}models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

### Test

In [52]:
# Prompt to feed the model: the beginning of a factorisation line.
ss = "[ 1 2 = 2"
# numericalize takes a batch (a list of token lists), so wrap the single
# tokenized prompt in a list before converting it to integer ids.
s = [spacy_tok(ss)]
t = TEXT.numericalize(s)
# Handle on the learner's underlying model, used by the cells below.
m = learner.model
# Echo the tokenized prompt as the cell output.
' '.join(s[0])

'[ 1 2 = 2'

In [53]:
# Run the prompt through the model as a single-item batch.
# Set batch size to 1
m[0].bs=1
try:
    # Turn off dropout
    m.eval()
    # Reset hidden state
    m.reset()
    # Get predictions from model (only the first returned value is kept)
    res,*_ = m(t)
finally:
    # Put the batch size back to what it was -- in a `finally` so a failed
    # forward pass cannot leave the model stuck at batch size 1 for later cells
    m[0].bs=batch_size

Let's see what the top 10 predictions were for the next word after our short text:

In [54]:
# Indices of the 10 highest-scoring entries in the last row of the model output.
nexts = res[-1].topk(10)[1]
# Translate each index back to its token via the vocabulary.
[TEXT.vocab.itos[idx] for idx in to_np(nexts)]

['3', '2', '*', '9', '6', '7', '4', ']', '0', '5']

...and let's see if our model can generate a bit more text all by itself!

In [51]:
# Echo the prompt, followed by a blank line.
print(ss,"\n")
# Generate 50 tokens, feeding each chosen token back into the model.
for i in range(50):
    # Indices of the two highest-scoring candidates for the next token.
    n=res[-1].topk(2)[1]
    # Use the runner-up when the top candidate has index 0, which is '<unk>'
    # in TEXT.vocab.itos; otherwise take the top candidate.
    n = n[1] if n.data[0]==0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    # Feed the chosen token back in to obtain the next set of predictions.
    res,*_ = m(n[0].unsqueeze(0))
print('...')

[ 1 2 =  

  3 * 3 * 3 * 3 * 7 * 7 ] <eos> [ 1 1 1 1 1 3 =   1 1 * 1 0 1 5 3 ] <eos> [ 1 1 1 1 1 3 =   1 1 * 1 0 1 5 3 ...
