In [1]:
# Reload edited modules automatically before each cell runs, and render
# matplotlib figures inline in the notebook.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle

In [3]:
# Root of the IMDB dataset; every other location below is derived from it.
PATH = 'data/aclImdb'

# Sub-directories (relative to PATH) that hold the pooled training and validation reviews.
TRN_PATH = 'train/all'
VAL_PATH = 'test/all'

# Full paths to the same two directories.
TRN = f'{PATH}/{TRN_PATH}'
VAL = f'{PATH}/{VAL_PATH}'

# Make sure the review, model and scratch directories exist (no-op when already present).
for _subdir in (TRN_PATH, VAL_PATH, 'models', 'tmp'):
    os.makedirs(f'{PATH}/{_subdir}', exist_ok=True)

# One-off data preparation, left commented out so it is not repeated on every run:
# pool the pos/neg/unsup training reviews into {TRN} and the pos/neg test reviews into {VAL}.
# !!cp -r {PATH}/train/pos/* {TRN}/
# !!cp -r {PATH}/train/neg/* {TRN}/
# !!cp -r {PATH}/train/unsup/* {TRN}/ # have to run this line in terminal for it to work!

# !!cp -r {PATH}/test/pos/* {VAL}/
# !!cp -r {PATH}/test/neg/* {VAL}/

# List the dataset root to confirm the expected layout.
%ls {PATH}

imdbEr.txt  imdb.vocab  [0m[01;34mmodels[0m/  README  [01;34mtest[0m/  [01;34mtmp[0m/  [01;34mtrain[0m/


In [None]:
# each review is stored as an individual text file
# capture the `ls` listing of the training directory (one entry per file name)
trn_files = !ls {TRN}

print(f'Total files in /train/all: {len(trn_files)}')
# preview the first ten file names
trn_files[:10]

In [None]:
# an example review
# (the `cat` output is captured line by line; only its first line is displayed)
review = !cat {TRN}/{trn_files[6]}
review[0]

In [None]:
# how many words in the dataset (train)
# concatenates every *.txt file found under the directory and counts words with `wc -w`
!find {TRN} -name '*.txt' | xargs cat | wc -w

In [None]:
# how many words in the dataset (val)
# same pipeline, run over the validation directory
!find {VAL} -name '*.txt' | xargs cat | wc -w

In [None]:
# tokenize = split each sentence into a list of words
# (the tokens are re-joined with spaces here only to make the result easy to read)
' '.join(spacy_tok(review[0]))

In [None]:
# create a torchtext field = describes how to preprocess a piece of text
# (here: lower-case the text and tokenize it with spacy_tok)
TEXT = data.Field(lower=True, tokenize=spacy_tok)

In [None]:
# settings used to create the ModelData object for language modeling
#   bs   - batch size
#   bptt - value passed as `bptt` when the language-model data is built
bs, bptt = 64, 70

In [None]:
# Map each split name to its directory (relative to PATH); the validation
# directory is reused for the test split.
FILES = {'train': TRN_PATH, 'validation': VAL_PATH, 'test': VAL_PATH}

# min_freq = 10 says, "treat any word that appears less than 10 times as the word <unk>"
md = LanguageModelData.from_text_files(
    PATH,
    TEXT,
    **FILES,
    bs=bs,
    bptt=bptt,
    min_freq=10,
)

In [None]:
# after building the ModelData object, TEXT.vocab is set.  because this will be needed again, save it
# (the context manager closes the file as soon as the dump completes, so the
# pickle is flushed to disk and no file handle is left open)
with open(f'{PATH}/models/TEXT.pkl', 'wb') as text_file:
    pickle.dump(TEXT, text_file)

In [None]:
# a tuple of four counts:
#   number of batches
#   number of unique tokens in vocab
#   number of items in training set (as LanguageModel is concerned, there is only one thing, the whole corpus)
#   number of words
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

In [None]:
# int to string mapping (first twelve entries)
TEXT.vocab.itos[:12]

In [None]:
# string to int mapping (the Id assigned to the word 'the')
TEXT.vocab.stoi['the']

In [None]:
# in a LanguageModelData object there is only one item in each dataset: all the words joined together
# (first twelve tokens shown)
md.trn_ds[0].text[:12]

In [None]:
# torchtext will handle turning these words into integer Ids
TEXT.numericalize([md.trn_ds[0].text[:12]])

In [None]:
# Fetch one mini-batch from the training loader and print the size of its two parts.
# NOTE(review): batch[0] / batch[1] are presumably the inputs and the targets - confirm.
batch = next(iter(md.trn_dl))
print(batch[0].size())
print(batch[1].size())

# display the batch itself as the cell output
batch

In [None]:
# model dimensions:
#   emb_sz - size of each embedding vector
#   nh     - number of hidden activations per layer
#   nl     - number of layers
emb_sz, nh, nl = 200, 500, 3

In [None]:
# for NLP, configure Adam to use less momentum than the default of 0.9
# (the first beta is lowered to 0.7; `partial` pre-binds the betas so the
# optimizer itself can be constructed later)
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [None]:
# Build the language-model learner from the model data, passing the optimizer
# factory, the model dimensions and the individual dropout rates.
learner = md.get_model(
    opt_fn, emb_sz, nh, nl,
    dropouti=0.24,
    dropout=0.025,
    wdrop=0.05,
    dropoute=0.01,
    dropouth=0.025,
)

# Additional training settings stored on the learner.
# NOTE(review): `clip` is presumably the gradient-clipping threshold and
# seq2seq_reg an activation regulariser weighted by alpha/beta - confirm
# against the fastai source.
learner.clip = 0.3
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)

In [None]:
# run the learning-rate finder
lrf = learner.lr_find() # took about 20 mins on AWS

In [None]:
# NOTE(review): presumably plots what the learning-rate finder recorded - confirm
learner.sched.plot()

In [None]:
# first training run: learning rate 3e-3, 2 cycles, cycle_len=1, cycle_mult=2
learner.fit(3e-3, 2, wds=1e-6, cycle_len=1, cycle_mult=2) # NOTE: run time was never recorded

In [None]:
# save the encoder after the first training run
learner.save_encoder('imdb_adam1_enc')
# learner.load_encoder('imdb_adam1_enc')

In [None]:
# learner.fit(3e-3, 4, wds=1e-6, cycle_len=10, cycle_save_name='imdb_adam2_4_10')
# second run: 2 cycles with cycle_len=10; the saved cycles are reloaded by name with load_cycle later on
learner.fit(3e-3, 2, wds=1e-6, cycle_len=10, cycle_save_name='imdb_adam2_c2_cl10')

In [None]:
# this is the encoder that the sentiment model loads with load_encoder
learner.save_encoder('imdb_adam2_enc')
# learner.load_encoder('imdb_adam2_enc')

In [None]:
# learner.fit(3e-3, 1, wds=1e-6, cycle_len=20, cycle_save_name='imdb_adam2_1_20')
# third run: one more cycle at a 10x lower learning rate (3e-4)
learner.fit(3e-4, 1, wds=1e-6, cycle_len=10, cycle_save_name='imdb_adam3_c1_cl10')

In [None]:
# save the encoder after the third run
learner.save_encoder('imdb_adam3_enc')

In [None]:
# restore the weights saved for cycle index 1 of the 'imdb_adam2_c2_cl10' run
learner.load_cycle('imdb_adam2_c2_cl10', 1)

In [None]:
# metric perplexity (how language model accuracy is generally measured) = exp() of loss function
# NOTE(review): 4.21699 is hard-coded, presumably a loss value copied from a training log - confirm
np.exp(4.21699)

## Test

In [None]:
# create a short bit of text to "prime" the predictions, then use torchtext to numericalize it
# so we can feed it into our language model
m = learner.model
# alternative prompt, kept for reference (swap it in to try a different primer):
# ss = """. So, it wasn't quite what I was expecting, but I really liked it anways! The best"""
ss = """. I couldn't believe this movie was so scary, but I loved it. The best part"""
# numericalize expects a batch, so the single tokenized prompt is wrapped in a list
s = [spacy_tok(ss)]
t = TEXT.numericalize(s)
# show the tokenized prompt
' '.join(s[0])

In [None]:
# run the numericalized priming text through the language model once;
# only the first element returned by the model is kept, as `res`
m[0].bs = 1      # set batch size = 1
m.eval()         # turn-off dropout
m.reset()        # reset hidden state
res, *_ = m(t)   # get predictions from model
m[0].bs = bs     # put batch size back to what it was

In [None]:
# top 10 predictions for next word
# (topk returns (values, indices); [1] keeps the indices for the last row of `res`)
nexts = torch.topk(res[-1], 10)[1]
# map each index back to its token string
[TEXT.vocab.itos[o] for o in to_np(nexts)]

In [None]:
# try to generate more text
print(ss, "\n")

# generate 50 tokens, feeding each chosen token back into the model
for i in range(50):
    # indices of the two highest-scoring candidates for the next token
    n = res[-1].topk(2)[1]
    # take the runner-up when the best candidate is index 0
    # NOTE(review): index 0 is presumably the <unk> token - confirm against TEXT.vocab.itos
    n = n[1] if n.data[0] == 0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    # the chosen token becomes the next model input
    res, *_ = m(n[0].unsqueeze(0))
    
print('...')

## Sentiment

In [4]:
# Settings for the sentiment section (same values as the language-model section).
#   bs   - batch size
#   bptt - forwarded to get_model when the sentiment learner is built
bs, bptt = 64, 70

# model dimensions:
#   emb_sz - size of each embedding vector
#   nh     - number of hidden activations per layer
#   nl     - number of layers
emb_sz, nh, nl = 200, 500, 3

# for NLP, configure Adam to use less momentum than the default of 0.9
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [5]:
# use the same vocab built from the language model so as to ensure words map to same Ids
# NOTE: unpickling can execute arbitrary code - only load a TEXT.pkl that you created yourself.
# The context manager closes the file once the object has been loaded.
with open(f'{PATH}/models/TEXT.pkl', 'rb') as text_file:
    TEXT = pickle.load(text_file)

In [6]:
# label field, declared non-sequential (sequential=False)
IMDB_LABEL = data.Field(sequential=False)
# build the torchtext IMDB splits with the shared TEXT field, using 'data/' as the root directory
splits = torchtext.datasets.IMDB.splits(TEXT, IMDB_LABEL, 'data/')

In [7]:
# first example of the first split
t = splits[0].examples[0]

In [8]:
# its label and its first ten tokens
t.label, ' '.join(t.text[:10])

('pos', 'this modern film noir with its off beat humour and')

In [7]:
# fastai can create a ModelData object directly from torchtext splits
# (bs is passed as the third positional argument)
md2 = TextData.from_splits(PATH, splits, bs)

In [8]:
# Build the sentiment learner with the same embedding / hidden / layer sizes that
# were used for the language model, whose saved encoder is loaded below.
# NOTE(review): the positional 1500 is presumably a maximum sequence length -
# confirm against TextData.get_model.
m3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=emb_sz, n_hid=nh, n_layers=nl,
                      dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3)

m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# start from the encoder saved as 'imdb_adam2_enc' during language-model training
m3.load_encoder('imdb_adam2_enc')

In [11]:
# NOTE(review): `clip` is presumably the gradient-clipping threshold - confirm
m3.clip = 25.
# NOTE(review): three learning rates, presumably one per layer group - confirm
lrs = np.array([1e-4, 1e-3, 1e-2])

In [12]:
m3.freeze_to(-1) # freeze everything except last layer
# train at half the learning rates while only the last layer is unfrozen
m3.fit(lrs/2, 1, metrics=[accuracy])

[ 0.       0.4      0.25893  0.89667]                        



In [13]:
# unfreeze the model and train one cycle (cycle_len=1) at the full learning rates
m3.unfreeze()
m3.fit(lrs, 1, metrics=[accuracy], cycle_len=1)

[ 0.       0.28697  0.2018   0.92192]                        



In [15]:
# 7 cycles of 2 epochs each (14 epochs in the recorded output); cycles are saved under 'imdb_sent1_c7_cl2'
m3.fit(lrs, 7, metrics=[accuracy], cycle_len=2, cycle_save_name='imdb_sent1_c7_cl2')

[ 0.       0.2563   0.20161  0.92508]                        
[ 1.       0.22474  0.18548  0.92913]                        
[ 2.       0.22421  0.17541  0.93361]                        
[ 3.       0.20896  0.17407  0.93498]                        
[ 4.       0.19948  0.17848  0.93325]                        
[ 5.       0.18096  0.16793  0.93726]                        
[ 6.       0.18384  0.16327  0.93902]                        
[ 7.       0.16519  0.1653   0.93938]                        
[ 8.       0.18561  0.17403  0.93562]                        
[ 9.       0.14946  0.16317  0.94034]                        
[ 10.        0.15682   0.16192   0.94159]                    
[ 11.        0.13577   0.16704   0.94086]                    
[ 12.        0.15243   0.1747    0.93806]                    
[ 13.        0.12568   0.17077   0.9393 ]                    



In [9]:
# restore the weights saved for cycle index 6 of the 'imdb_sent1_c7_cl2' run
m3.load_cycle('imdb_sent1_c7_cl2', 6)

In [10]:
# model outputs together with the corresponding targets
preds, y = m3.predict_with_targs()

In [11]:
# first ten rows of each
preds[:10], y[:10]

(array([[-22.55689,   4.9826 ,  -4.38031],
        [-22.27272,   6.23313,  -5.48389],
        [-18.47953,   4.35911,  -3.87336],
        [-18.03215,  -2.86309,   3.37033],
        [-21.05982,  -2.55109,   3.22313],
        [-22.5431 ,  -4.23797,   4.88301],
        [-19.66172,  -2.78495,   3.2821 ],
        [-25.73613,  -3.95318,   4.76337],
        [-18.97934,   5.57811,  -4.98234],
        [-23.82152,  -0.79602,   1.40898]], dtype=float32),
 array([1, 1, 1, 2, 2, 2, 2, 2, 1, 1]))

In [12]:
# exponentiate the model outputs for the first example
# NOTE(review): the recorded result contains a value above 1, so these are not normalised probabilities
np.exp(preds[0])

array([   0.     ,  145.85376,    0.01252], dtype=float32)