In [1]:
# Reload the autoreload extension and switch it to mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle

In [3]:
PATH = 'data/aclImdb'

os.makedirs(f'{PATH}/train/all', exist_ok=True)
os.makedirs(f'{PATH}/test/all', exist_ok=True)
os.makedirs(f'{PATH}/models', exist_ok=True)
os.makedirs(f'{PATH}/tmp', exist_ok=True)

TRN_PATH = 'train/all'
VAL_PATH = 'test/all'

TRN = f'{PATH}/{TRN_PATH}'
VAL = f'{PATH}/{VAL_PATH}'

# !!cp -r {PATH}/train/pos/* {TRN}/
# !!cp -r {PATH}/train/neg/* {TRN}/
# !!cp -r {PATH}/train/unsup/* {TRN}/ # have to run this line in terminal for it to work!

# !!cp -r {PATH}/test/pos/* {VAL}/
# !!cp -r {PATH}/test/neg/* {VAL}/

%ls {PATH}

imdbEr.txt  imdb.vocab  models/  README  test/  tmp/  train/


In [None]:
# each review is stored as an individual text file;
# capture the `ls` output for the training directory as a list of file names
trn_files = !ls {TRN}

print(f'Total files in /train/all: {len(trn_files)}')
# show the first ten file names
trn_files[:10]

In [None]:
# an example review: `cat` one file (index 6 of trn_files) from the training directory;
# the shell output is captured as a list of lines, so review[0] is the first line of that file
review = !cat {TRN}/{trn_files[6]}
review[0]

In [None]:
# how many words in the dataset (train):
# concatenate every *.txt file found under TRN and count the words with `wc -w`
!find {TRN} -name '*.txt' | xargs cat | wc -w

In [None]:
# how many words in the dataset (val):
# concatenate every *.txt file found under VAL and count the words with `wc -w`
!find {VAL} -name '*.txt' | xargs cat | wc -w

In [None]:
# tokenize = split each sentence into a list of words;
# the tokens returned by spacy_tok are re-joined with spaces so the token boundaries are visible
' '.join(spacy_tok(review[0]))

In [4]:
# create a torchtext Field = describes how to preprocess a piece of text
# (here: lowercase it and tokenize it with spacy_tok)
TEXT = data.Field(lower=True, tokenize=spacy_tok)

In [5]:
# settings for the ModelData object used for language modeling
bs = 64      # batch size
bptt = 70    # passed on as `bptt` — presumably the back-prop-through-time sequence length; confirm

In [6]:
# Directory (relative to PATH) for each split; the test split reuses the validation directory.
FILES = {'train': TRN_PATH, 'validation': VAL_PATH, 'test': VAL_PATH}

# min_freq = 10 says, "treat any word that appears less than 10 times as the word <unk>"
md = LanguageModelData.from_text_files(
    PATH,
    TEXT,
    **FILES,
    bs=bs,
    bptt=bptt,
    min_freq=10,
)

In [7]:
# after building the ModelData object, TEXT.vocab is set.  because this will be needed again, save it.
# The file is opened in a `with` block so the handle is flushed and closed as soon as the dump finishes.
with open(f'{PATH}/models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

In [8]:
# summary of the data object, in order:
#   number of batches in the training data loader
#   number of unique tokens in the vocab
#   number of items in the training set (as far as a LanguageModel is concerned, there is only one thing: the whole corpus)
#   number of words in that single item
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(4602, 34945, 1, 20621966)

In [None]:
# int to string mapping: show the first 12 entries of the vocabulary
TEXT.vocab.itos[:12]

In [None]:
# string to int mapping: look up the integer id of the token 'the'
TEXT.vocab.stoi['the']

In [None]:
# in a LanguageModelData object there is only one item in each dataset: all the words joined together;
# show the first 12 tokens of that item
md.trn_ds[0].text[:12]

In [None]:
# torchtext will handle turning these words into integer ids
# (the same 12 tokens as above, wrapped in a list before being passed to numericalize)
TEXT.numericalize([md.trn_ds[0].text[:12]])

In [None]:
# Pull the first mini-batch from the training data loader and show the size of
# its two elements (presumably the inputs and the targets — confirm), then the batch itself.
batch = next(iter(md.trn_dl))
# Two separate statements: the original `print(...), print(...)` built a throwaway (None, None) tuple.
print(batch[0].size())
print(batch[1].size())

batch

In [9]:
# model size settings passed to md.get_model
emb_sz = 200       # size of each embedding vector
nh = 500           # of hidden activations per layer
nl = 3             # of layers

In [10]:
# for NLP, configure Adam to use less momentum than the default of 0.9
# (the first beta is lowered to 0.7); partial() leaves the remaining arguments to be supplied later
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [11]:
# Build the language-model learner from the optimizer factory and the size
# settings, with one dropout rate per keyword.
learner = md.get_model(
    opt_fn, emb_sz, nh, nl,
    dropouti=0.24,
    dropout=0.025,
    wdrop=0.05,
    dropoute=0.01,
    dropouth=0.025,
)

# Regularisation function applied by the learner: seq2seq_reg with alpha=2, beta=1.
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# presumably the gradient-clipping threshold — confirm against the learner implementation
learner.clip = 0.3

In [None]:
# run the learner's learning-rate finder
lrf = learner.lr_find() # took about 20 mins on AWS

In [None]:
# plot what the learner's scheduler recorded (presumably loss against learning rate from lr_find — confirm)
learner.sched.plot()

In [12]:
# first training run: 3e-3 and 2 are presumably the learning rate and the number of cycles (confirm);
# cycle_len=1 with cycle_mult=2 matches the three epochs in the log below (1 + 2)
learner.fit(3e-3, 2, wds=1e-6, cycle_len=1, cycle_mult=2) # (run time was not recorded)

[ 0.       4.80112  4.71374]                                  
[ 1.       4.60368  4.5115 ]                                  
[ 2.       4.50439  4.43294]                                  



In [13]:
# save the encoder under the name 'imdb_adam1_enc'
learner.save_encoder('imdb_adam1_enc')
# to restore it later instead of retraining:
# learner.load_encoder('imdb_adam1_enc')

In [14]:
# second training run, same 3e-3 first argument as before, now with cycle_len=10;
# cycle_save_name names the per-cycle checkpoints (one of them is reloaded later with load_cycle)
# earlier variant, kept for reference:
# learner.fit(3e-3, 4, wds=1e-6, cycle_len=10, cycle_save_name='imdb_adam2_4_10')
learner.fit(3e-3, 2, wds=1e-6, cycle_len=10, cycle_save_name='imdb_adam2_c2_cl10')

[ 0.       4.57838  4.4926 ]                                  
[ 1.       4.53194  4.43823]                                  
[ 2.       4.46775  4.3933 ]                                  
[ 3.       4.4383   4.36114]                                  
[ 4.       4.3947   4.32969]                                  
[ 5.       4.36486  4.30344]                                  
[ 6.       4.34693  4.28233]                                  
[ 7.       4.29624  4.26529]                                  
[ 8.       4.28105  4.25716]                                  
[ 9.       4.28325  4.25501]                                  
[ 10.        4.44738   4.36086]                               
[ 11.        4.41951   4.34473]                               
[ 12.        4.40578   4.32778]                               
[ 13.        4.3685    4.30527]                               
[ 14.        4.33997   4.28252]                               
[ 15.        4.305     4.25924]                        

In [19]:
# save the encoder under the name 'imdb_adam2_enc'
# NOTE(review): the execution count (19) is later than the cells below it, so this cell was run out of order — confirm which weights it captured
learner.save_encoder('imdb_adam2_enc')
# to restore it later instead of retraining:
# learner.load_encoder('imdb_adam2_enc')

In [16]:
# third training run: first argument lowered from 3e-3 to 3e-4, a single cycle of cycle_len=10
# earlier variant, kept for reference:
# learner.fit(3e-3, 1, wds=1e-6, cycle_len=20, cycle_save_name='imdb_adam2_1_20')
learner.fit(3e-4, 1, wds=1e-6, cycle_len=10, cycle_save_name='imdb_adam3_c1_cl10')

[ 0.       4.3052   4.28043]                                  
[ 1.       4.29035  4.26729]                                  
[ 2.       4.28406  4.25898]                                  
[ 3.       4.27913  4.25373]                                  
[ 4.       4.24722  4.24864]                                  
[ 5.       4.27078  4.24633]                                  
[ 6.       4.25099  4.24333]                                  
[ 7.       4.24903  4.24282]                                  
[ 8.       4.24694  4.24157]                                  
[ 9.       4.26147  4.2409 ]                                  



In [17]:
# save the encoder under the name 'imdb_adam3_enc'
learner.save_encoder('imdb_adam3_enc')

In [18]:
# reload the checkpoint saved under cycle_save_name 'imdb_adam2_c2_cl10' for cycle 1
# NOTE(review): run top-to-bottom, this presumably replaces the weights from the later 3e-4 fit,
# so the cells that follow would use this reloaded checkpoint — confirm that is intended
learner.load_cycle('imdb_adam2_c2_cl10', 1)

In [20]:
# metric: perplexity (how language model accuracy is generally measured) = exp() of the loss function
# NOTE(review): 4.21699 is typed in by hand and does not appear in the training logs shown in this notebook — confirm which run it came from
np.exp(4.21699)

67.829011387804172

## Test

In [93]:
# create a short bit of text to "prime" the predictions, then use torchtext to numericalize it
# so we can feed it into our language model
m = learner.model
# alternative prompt (was a dead assignment, immediately overwritten by the one below):
# ss = """. So, it wasn't quite what I was expecting, but I really liked it anways! The best"""
ss = """. I couldn't believe this movie was so scary, but I loved it. The best part"""
# tokenize the prompt and wrap it in a list before passing it to numericalize
s = [spacy_tok(ss)]
t = TEXT.numericalize(s)
# show the tokenized prompt
' '.join(s[0])

". I could n't believe this movie was so scary , but I loved it . The best part"

In [94]:
# run the priming text through the model once as a batch of size 1;
# only the first value returned by the model is kept in `res`
# NOTE(review): nothing here switches the model back out of eval mode afterwards
m[0].bs = 1      # set batch size = 1
m.eval()         # turn-off dropout
m.reset()        # reset hidden state
res, *_ = m(t)   # get predictions from model
m[0].bs = bs     # put batch size back to what it was

In [95]:
# top 10 predictions for next word:
# take the indices of the 10 largest values in the last row of `res` and map them back to tokens
nexts = torch.topk(res[-1], 10)[1]
[TEXT.vocab.itos[o] for o in to_np(nexts)]

['of', 'was', 'is', ',', '.', 'about', ':', 'in', 'for', 'i']

In [96]:
# try to generate more text: print the prompt, then 50 times pick a token from the
# current predictions, print it, and feed it back into the model
print(ss, "\n")

for i in range(50):
    # indices of the two highest-scoring entries in the last row of the output
    n = res[-1].topk(2)[1]
    # if the top choice is index 0, fall back to the runner-up
    # (index 0 is presumably the <unk> token — confirm against TEXT.vocab.itos)
    n = n[1] if n.data[0] == 0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    # feed the chosen token back in to get the predictions for the next position
    res, *_ = m(n[0].unsqueeze(0))

print('...')

. I couldn't believe this movie was so scary, but I loved it. The best part 

of the movie was the scene where he was in the car . the scene where he is in the car is a great scene . the scene where he is in the car is a great scene . i would recommend this movie to anyone who likes movies that ...
