In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *
import pdb

import spacy
spacy_en = spacy.load('en')

# pandas and plotting config
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', -1)

In [3]:
PATH = 'data/aclImdb'
TRN_PATH = f'{PATH}/train'
VAL_PATH = f'{PATH}/test'

os.makedirs(f'{PATH}/models', exist_ok=True)
os.makedirs(f'{PATH}/tmp', exist_ok=True)

%ls {PATH}

imdbEr.txt  imdb.vocab  [0m[01;34mmodels[0m/  README  [01;34mtest[0m/  [01;34mtmp[0m/  [01;34mtrain[0m/


What is in the training folder?

In [4]:
trn_files = !ls {TRN_PATH}/all
trn_files[:10]

['0_0.txt',
 '0_3.txt',
 '0_9.txt',
 '10000_0.txt',
 '10000_4.txt',
 '10000_8.txt',
 '1000_0.txt',
 '10001_0.txt',
 '10001_10.txt',
 '10001_4.txt']

In [5]:
for line in open(f'{TRN_PATH}/all/0_0.txt', encoding='utf-8'):
    print(line)

I admit, the great majority of films released before say 1933 are just not for me. Of the dozen or so "major" silents I have viewed, one I loved (The Crowd), and two were very good (The Last Command and City Lights, that latter Chaplin circa 1931).<br /><br />So I was apprehensive about this one, and humor is often difficult to appreciate (uh, enjoy) decades later. I did like the lead actors, but thought little of the film.<br /><br />One intriguing sequence. Early on, the guys are supposed to get "de-loused" and for about three minutes, fully dressed, do some schtick. In the background, perhaps three dozen men pass by, all naked, white and black (WWI ?), and for most, their butts, part or full backside, are shown. Was this an early variation of beefcake courtesy of Howard Hughes?


What does a review look like?

In [6]:
review = !cat {TRN_PATH}/all/{trn_files[6]}
review[0]

"I have to say when a name like Zombiegeddon and an atom bomb on the front cover I was expecting a flat out chop-socky fung-ku, but what I got instead was a comedy. So, it wasn't quite was I was expecting, but I really liked it anyway! The best scene ever was the main cop dude pulling those kids over and pulling a Bad Lieutenant on them!! I was laughing my ass off. I mean, the cops were just so bad! And when I say bad, I mean The Shield Vic Macky bad. But unlike that show I was laughing when they shot people and smoked dope.<br /><br />Felissa Rose...man, oh man. What can you say about that hottie. She was great and put those other actresses to shame. She should work more often!!!!! I also really liked the fight scene outside of the building. That was done really well. Lots of fighting and people getting their heads banged up. FUN! Last, but not least Joe Estevez and William Smith were great as the...well, I wasn't sure what they were, but they seemed to be having fun and throwing out 

How many words in training and validation datasets

In [7]:
!find {TRN_PATH}/all -name '*.txt' | xargs cat | wc -w

17486581


In [8]:
!find {VAL_PATH}/all -name '*.txt' | xargs cat | wc -w

5686719


Tokenize using the new fastai.text package

In [9]:
' '.join(spacy_tok(review[0]))

"I have to say when a name like Zombiegeddon and an atom bomb on the front cover I was expecting a flat out chop - socky fung - ku , but what I got instead was a comedy . So , it was n't quite was I was expecting , but I really liked it anyway ! The best scene ever was the main cop dude pulling those kids over and pulling a Bad Lieutenant on them ! ! I was laughing my ass off . I mean , the cops were just so bad ! And when I say bad , I mean The Shield Vic Macky bad . But unlike that show I was laughing when they shot people and smoked dope . \n\n Felissa Rose ... man , oh man . What can you say about that hottie . She was great and put those other actresses to shame . She should work more often ! ! ! ! ! I also really liked the fight scene outside of the building . That was done really well . Lots of fighting and people getting their heads banged up . FUN ! Last , but not least Joe Estevez and William Smith were great as the ... well , I was n't sure what they were , but they seemed t

In [10]:
trn_docs, trn_labels = texts_labels_from_folders(TRN_PATH, ['neg', 'pos'])
val_docs, val_labels = texts_labels_from_folders(VAL_PATH, ['neg', 'pos'])

len(trn_docs), len(val_docs)

(25000, 25000)

In [11]:
f_tok = Tokenizer()
%time trn_docs_pp = f_tok.proc_all(trn_docs)
%time val_docs_pp = f_tok.proc_all(val_docs)

CPU times: user 47.5 s, sys: 164 ms, total: 47.6 s
Wall time: 47.6 s
CPU times: user 44.6 s, sys: 213 ms, total: 44.8 s
Wall time: 44.8 s


In [12]:
class LanguageDataset(torch.utils.data.Dataset):
    def __init__(self, docs, vocab=None, newline_bos=True, newline_eos=True):
        self.tokens = []
        for d in docs:
            if newline_bos: self.tokens.append('<bos>')
            self.tokens += d
            if newline_eos: self.tokens.append('<eos>')
        
        if (vocab):
            self.vocab = vocab
        else:
            u_tokens = set(self.tokens)
            [ u_tokens.discard(t) for t in ['<unk>', '<pad>', '<bos>', '<eos>']]
            self.vocab = ['<unk>', '<pad>', '<bos>', '<eos>'] + sorted(list(u_tokens))
        
        self.vocab_stoi = { token:idx for idx, token in enumerate(self.vocab) }
        
        self.data = np.array([[ self.vocab_stoi.get(t, 0) for t in self.tokens ]])
        
    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(self):
        return len(self.data)


In [13]:
%time trn_ds = LanguageDataset(trn_docs_pp)
%time val_ds = LanguageDataset(val_docs_pp, vocab=trn_ds.vocab)

CPU times: user 2.33 s, sys: 40 ms, total: 2.37 s
Wall time: 2.37 s
CPU times: user 1.9 s, sys: 40 ms, total: 1.94 s
Wall time: 1.95 s


In [14]:
len(trn_ds[0]), len(trn_ds.tokens), len(trn_ds.vocab), len(trn_ds)

(7153620, 7153620, 98525, 1)

In [15]:
print(trn_ds[0][:10])
print([ trn_ds.vocab[idx] for idx in trn_ds[0][:10]])

[    2 15457 56099 20338   119 93755   119 40526 76835    12]
['<bos>', 'carlos', 'mencia', 'continually', ',', 'violently', ',', 'hatefully', 'screaming', '"']


In [28]:
bsz = 8 #64
bptt = 70

In [29]:
trn_dl = LanguageModelLoader(trn_ds[0], bsz, bptt)
val_dl = LanguageModelLoader(val_ds[0], bsz, bptt)

In [30]:
# next(iter(trn_dl))

In [31]:
md = LanguageModelData(PATH, 1, len(trn_ds.vocab), trn_dl, val_dl, bptt=bptt)

In [32]:
len(md.trn_dl), md.nt, len(trn_ds), len(trn_ds.tokens)

(37257, 98525, 1, 7153620)

In [33]:
batch = next(iter(md.trn_dl))
print(batch[0].size()), print(batch[1].size())

# batch

torch.Size([31, 8])
torch.Size([248])


(None, None)

In [34]:
em_sz = 200  # size of each embedding vector
nh = 500     # number of hidden activations per layer
nl = 3       # number of layers

In [35]:
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [36]:
learner = md.get_model(opt_fn, em_sz, nh, nl,
               dropouti=0.05, dropout=0.05, wdrop=0.1, dropoute=0.02, dropouth=0.05)
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learner.clip=0.3

In [37]:
batch_iter = iter(md.val_dl)

In [38]:
b = next(batch_iter) # x, y
p = learner.model(V(b[0]))       # predictions

print(b[0].size(), b[1].size(), p.size())

> /home/wgilliam/development/_training/ml/fastai-course/fastai/courses/dl1/fastai/rnn_reg.py(46)dropout_mask()
-> return x.new(*sz).bernoulli_(1-dropout)/(1-dropout)
(Pdb) c


RuntimeError: cuda runtime error (48) : no kernel image is available for execution on the device at /opt/conda/conda-bld/pytorch_1518244421288/work/torch/lib/THC/generic/THCTensorMathPairwise.cu:102

In [33]:
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)

  0%|          | 0/12773 [00:00<?, ?it/s]


RuntimeError: cuda runtime error (48) : no kernel image is available for execution on the device at /opt/conda/conda-bld/pytorch_1518244421288/work/torch/lib/THC/generic/THCTensorMathPairwise.cu:102