In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *
import pdb

import spacy
spacy_en = spacy.load('en')

# pandas and plotting config
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', -1)

In [3]:
PATH = 'data/aclImdb'
TRN_PATH = f'{PATH}/train'
VAL_PATH = f'{PATH}/test'

os.makedirs(f'{PATH}/models', exist_ok=True)
os.makedirs(f'{PATH}/tmp', exist_ok=True)

%ls {PATH}

imdbEr.txt  imdb.vocab  [0m[01;34mmodels[0m/  README  [01;34mtest[0m/  [01;34mtmp[0m/  [01;34mtrain[0m/


What is in the training folder?

In [4]:
trn_files = !ls {TRN_PATH}/all
trn_files[:10]

['0_0.txt',
 '0_3.txt',
 '0_9.txt',
 '10000_0.txt',
 '10000_4.txt',
 '10000_8.txt',
 '1000_0.txt',
 '10001_0.txt',
 '10001_10.txt',
 '10001_4.txt']

In [5]:
for line in open(f'{TRN_PATH}/all/0_0.txt', encoding='utf-8'):
    print(line)

I admit, the great majority of films released before say 1933 are just not for me. Of the dozen or so "major" silents I have viewed, one I loved (The Crowd), and two were very good (The Last Command and City Lights, that latter Chaplin circa 1931).<br /><br />So I was apprehensive about this one, and humor is often difficult to appreciate (uh, enjoy) decades later. I did like the lead actors, but thought little of the film.<br /><br />One intriguing sequence. Early on, the guys are supposed to get "de-loused" and for about three minutes, fully dressed, do some schtick. In the background, perhaps three dozen men pass by, all naked, white and black (WWI ?), and for most, their butts, part or full backside, are shown. Was this an early variation of beefcake courtesy of Howard Hughes?


What does a review look like?

In [6]:
review = !cat {TRN_PATH}/all/{trn_files[6]}
review[0]

"I have to say when a name like Zombiegeddon and an atom bomb on the front cover I was expecting a flat out chop-socky fung-ku, but what I got instead was a comedy. So, it wasn't quite was I was expecting, but I really liked it anyway! The best scene ever was the main cop dude pulling those kids over and pulling a Bad Lieutenant on them!! I was laughing my ass off. I mean, the cops were just so bad! And when I say bad, I mean The Shield Vic Macky bad. But unlike that show I was laughing when they shot people and smoked dope.<br /><br />Felissa Rose...man, oh man. What can you say about that hottie. She was great and put those other actresses to shame. She should work more often!!!!! I also really liked the fight scene outside of the building. That was done really well. Lots of fighting and people getting their heads banged up. FUN! Last, but not least Joe Estevez and William Smith were great as the...well, I wasn't sure what they were, but they seemed to be having fun and throwing out 

How many words in training and validation datasets

In [7]:
!find {TRN_PATH}/all -name '*.txt' | xargs cat | wc -w

17486581


In [8]:
!find {VAL_PATH}/all -name '*.txt' | xargs cat | wc -w

5686719


Tokenize using the new fastai.text package

In [9]:
' '.join(spacy_tok(review[0]))

"I have to say when a name like Zombiegeddon and an atom bomb on the front cover I was expecting a flat out chop - socky fung - ku , but what I got instead was a comedy . So , it was n't quite was I was expecting , but I really liked it anyway ! The best scene ever was the main cop dude pulling those kids over and pulling a Bad Lieutenant on them ! ! I was laughing my ass off . I mean , the cops were just so bad ! And when I say bad , I mean The Shield Vic Macky bad . But unlike that show I was laughing when they shot people and smoked dope . \n\n Felissa Rose ... man , oh man . What can you say about that hottie . She was great and put those other actresses to shame . She should work more often ! ! ! ! ! I also really liked the fight scene outside of the building . That was done really well . Lots of fighting and people getting their heads banged up . FUN ! Last , but not least Joe Estevez and William Smith were great as the ... well , I was n't sure what they were , but they seemed t

In [10]:
trn_docs, trn_labels = texts_labels_from_folders(TRN_PATH, ['neg', 'pos'])
val_docs, val_labels = texts_labels_from_folders(VAL_PATH, ['neg', 'pos'])

len(trn_docs), len(val_docs)

(25000, 25000)

In [11]:
f_tok = Tokenizer()
%time trn_docs_pp = f_tok.proc_all(trn_docs)
%time val_docs_pp = f_tok.proc_all(val_docs)

CPU times: user 59 s, sys: 196 ms, total: 59.2 s
Wall time: 59.2 s
CPU times: user 54 s, sys: 200 ms, total: 54.2 s
Wall time: 54.2 s


In [12]:
from collections import Counter, defaultdict

class Vocab:
    def __init__(self, tokens, min_freq=1, max_size=None, 
                 specials=['<unk>', '<pad>', '<bos>', '<eos>'], unk_idx=0):
        self.min_freq = max(min_freq, 1)
        self.specials = specials
        self.unk_idx = unk_idx
        
        self.tokens = list(specials)
        self.max_size = None if max_size is None else max_size + len(self.tokens)
        
        self.token_freqs = Counter(tokens)
        for t in self.specials: del self.token_freqs[t]
            
        # sort by frequency, then alphabetically
        self.token_freqs = sorted(self.token_freqs.items(), key=lambda tup: tup[0])
        self.token_freqs.sort(key=lambda tup: tup[1], reverse=True)
        
        for token, freq in self.token_freqs:
            if freq < self.min_freq or len(self.tokens) == self.max_size:
                break
            self.tokens.append(token)
            
        self.vocab_stoi = defaultdict(lambda x: self.unk_idx) # default is <unk>
        self.vocab_stoi.update({ tok: i for i, tok in enumerate(self.tokens) })
        
    def stoi(self, token):
        return self.vocab_stoi.get(token, self.unk_idx)
        
    def itos(self, idx):
        return self.tokens[idx]
        
    def token_freq(self, token):
        return self.token_freqs.get(token, 0)
        

In [13]:
class LanguageDataset(torch.utils.data.Dataset):
    def __init__(self, docs, newline_bos=True, newline_eos=True, vocab=None, min_freq=1, max_size=None):
        self.tokens = []
        for d in docs:
            if newline_bos: self.tokens.append('<bos>')
            self.tokens += d
            if newline_eos: self.tokens.append('<eos>')
            
        if (vocab):
            self.vocab = vocab
        else:
            self.vocab = Vocab(self.tokens, min_freq, max_size)
        
        self.data = np.array([[ self.vocab.stoi(t) for t in self.tokens ]])
        
    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(self):
        return len(self.data)


In [14]:
%time trn_ds = LanguageDataset(trn_docs_pp + val_docs_pp, newline_bos=False, min_freq=10)
%time val_ds = LanguageDataset(val_docs_pp[:100], newline_bos=False, vocab=trn_ds.vocab)

CPU times: user 9.47 s, sys: 124 ms, total: 9.59 s
Wall time: 9.59 s
CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 12.5 ms


In [15]:
print(len(trn_ds[0]), len(trn_ds.tokens), len(trn_ds.vocab.tokens), len(trn_ds))
print(len(val_ds[0]), len(val_ds.tokens), len(val_ds.vocab.tokens), len(val_ds))

14101165 14101165 29563 1
26210 26210 29563 1


In [16]:
print(trn_ds[0][:10])
print([ trn_ds.vocab.itos(idx) for idx in trn_ds[0][:10] ])

[  16  193   33  407   45    4 3469   59   17   28]
['i', 'ca', "n't", 'understand', 'all', 'the', 'hype', 'about', 'this', 'movie']


In [17]:
bsz = 64
bptt = 70

In [18]:
trn_dl = LanguageModelLoader(trn_ds[0], bsz, bptt)
val_dl = LanguageModelLoader(val_ds[0], bsz, bptt)

In [19]:
# next(iter(trn_dl))

In [20]:
md = LanguageModelData(PATH, 1, len(trn_ds.vocab.tokens), trn_dl, val_dl, bptt=bptt, min_freq=10)

In [21]:
len(md.trn_dl), md.nt, len(trn_ds), len(trn_ds.tokens)

(3146, 29563, 1, 14101165)

In [22]:
batch = next(iter(md.trn_dl))
print(batch[0].size()), print(batch[1].size())

# batch

torch.Size([68, 64])
torch.Size([4352])


(None, None)

In [23]:
em_sz = 200  # size of each embedding vector
nh = 500     # number of hidden activations per layer
nl = 3       # number of layers

In [24]:
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [25]:


learner = md.get_model(opt_fn, em_sz, nh, nl,
               dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learner.clip=0.3

In [42]:
# batch_iter = iter(md.val_dl)

In [43]:
# b = next(batch_iter) # x, y
# p = learner.model(V(b[0]))       # predictions

# b[0].shape, b[1].shape, p[0].shape

In [44]:
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

epoch      trn_loss   val_loss                                
    0      4.697988   4.57248   
    1      4.500699   4.348201                                
    2      4.371368   4.243143                                
    3      4.428565   4.301117                                
    4      4.335875   4.188483                                
    5      4.251823   4.102633                                
    6      4.216845   4.063244                                
    7      4.34847    4.217101                                
    8      4.313081   4.169003                                
    9      4.266176   4.126935                                
    10     4.240518   4.092284                                
    11     4.22108    4.034062                                
    12     4.149041   3.991385                                
    13     4.129282   3.971003                                
    14     4.112093   3.967932                                



[3.9679315]

In [45]:
learner.save_encoder('imdb_adam1_enc_full')
# learner.load_encoder('imdb_adam1_enc')

In [46]:
# learner.fit(3e-3, 4, wds=1e-6, cycle_len=10, cycle_save_name='imdb_adam2_4_10')
learner.fit(3e-3, 2, wds=1e-6, cycle_len=10, cycle_save_name='imdb_adam2_c2_cl10_full')

epoch      trn_loss   val_loss                                
    0      4.269407   4.146606  
    1      4.274711   4.125793                                
    2      4.248354   4.102614                                
    3      4.216576   4.070951                                
    4      4.189146   4.038531                                
    5      4.177811   4.007429                                
    6      4.126941   3.989224                                
    7      4.088423   3.938474                                
    8      4.064484   3.912925                                
    9      4.073764   3.916723                                
    10     4.273544   4.104843                                
    11     4.233156   4.096601                                
    12     4.218732   4.076135                                
    13     4.183688   4.045218                                
    14     4.170473   4.009149                                
    15     4.138231   

[3.8945541]

In [47]:

learner.save_encoder('imdb_adam2_enc_full')
# learner.load_encoder('imdb_adam2_enc')`

In [48]:
learner.fit(3e-4, 1, wds=1e-6, cycle_len=20, cycle_save_name='imdb_adam3_c1_cl20_full')

epoch      trn_loss   val_loss                                
    0      4.066651   3.888733  
    1      4.071485   3.899539                                
    2      4.091433   3.904781                                
    3      4.057991   3.891943                                
    4      4.048481   3.880265                                
    5      4.042373   3.879733                                
    6      4.057358   3.877355                                
    7      4.06226    3.876944                                
    8      4.044253   3.882051                                
    9      4.033689   3.868109                                
    10     4.057388   3.864538                                
    11     4.022268   3.86107                                 
    12     4.013979   3.862605                                
    13     4.011391   3.857065                                
    14     4.046247   3.853321                                
    15     4.010649   

[3.8845663]

In [26]:
learner.load_cycle('imdb_adam3_c1_cl20_full', 0) # load best cycle

In the sentiment analysis section, we'll just need half of the language model - the encoder, so we save that part.

In [27]:
# learner.save_encoder('imdb_adam3_enc_full')
learner.load_encoder('imdb_adam3_enc_full')

Language modeling accuracy is generally measured using the metric perplexity, which is simply exp() of the loss function we used.

In [None]:
math.exp(4.115031 ) # what I got when use a validation dataset

In [28]:
pickle.dump(trn_ds.vocab, open(f'{PATH}/models/vocab_full.pkl','wb'))

AttributeError: Can't pickle local object 'Vocab.__init__.<locals>.<lambda>'

## Test

We can play around with our language model a bit to check it seems to be working OK. First, let's create a short bit of text to 'prime' a set of predictions. We'll use our torchtext field to numericalize it so we can feed it to our language model.

In [30]:
vocab =trn_ds.vocab # = pickle.load(open(f'{PATH}/models/vocab.pkl', 'rb'))

In [58]:
# create a short bit of text to "prime" the precitions, then use torchtext to numericalize it
# so we can feed it into our language model
m = learner.model
# ss = """. I laughed so hard when"""
ss = """. The monster was crazy and I cried """
s = [f_tok.proc_text(ss)]
t = np.array([ vocab.stoi(tok) if tok in vocab.tokens else 0 for tok in s[0] ])   # TEXT.numericalize(s)
' '.join(s[0])

'. the monster was crazy and i cried'

In [59]:
t=T(t)
t = V(t.unsqueeze(0).cuda())
t

Variable containing:
    6     4   769    24   952     7    16  3600
[torch.cuda.LongTensor of size 1x8 (GPU 0)]

We haven't yet added methods to make it easy to test a language model, so we'll need to manually go through the steps.

In [60]:
m[0].bs = 1      # set batch size = 1
m.eval()         # turn-off dropout
m.reset()        # reset hidden state
res, *_ = m(t)   # get predictions from model
m[0].bs = bsz     # put batch size back to what it was

In [61]:
res[-1].size()   # the prediction based on the full sentence; the last prediction
len(res)         # the number of words in "t"

8

Let's see what the top 10 predictions were for the next word after our short text:

In [62]:
# top 10 predictions for next word
nexts = torch.topk(res[-1], 10)[1]           # return the 10 indexes of the top 10 predictions
[vocab.tokens[o] for o in to_np(nexts)]      # [TEXT.vocab.itos[o] for o in to_np(nexts)]

[',', 'at', 'out', '.', 'and', 'by', 'in', 'when', 'for', 'a']

...and let's see if our model can generate a bit more text all by itself!

In [63]:
# try to generate more text
print(ss, "\n")

for i in range(50):
    n = res[-1].topk(2)[1]
    n = n[1] if n.data[0] == 0 else n[0]
    print(vocab.itos(n.data[0]), end=' ')   #print(TEXT.vocab.itos[n.data[0]], end=' ')
    res, *_ = m(n[0].unsqueeze(0))
    
print('...')

. The monster was crazy and I cried  

, and the film is a bit of a disappointment . the film is a bit of a let down , but it 's not a bad film . it 's a very good film , but it 's not a bad film . <eos> i have seen this movie ...
