In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
from fastai.text.all import *
# Resolve the IMDB dataset to a local directory and display that path.
path = untar_data(URLs.IMDB)
path

Path('C:/Users/zdomuell/.fastai/data/imdb')

# Chapter 10: RNNs

## Tokenization

In [3]:
# Collect the text files found under the train, test and unsup folders.
files = get_text_files(
    path,
    folders=['train', 'test', 'unsup'],
)

In [4]:
# Read the first review as UTF-8. `read_text` closes the file for us, and the
# explicit encoding avoids the platform default (e.g. cp1252 on Windows),
# matching how the corpus `txts` is loaded later in this notebook.
txt = files[0].read_text(encoding='utf-8')
txt[:75]

'Once again Mr. Costner has dragged out a movie for far longer than necessar'

### Word tokenization

The default word tokenizer in fastai is spaCy:

In [5]:
# The word tokenizer works on a collection of texts, so wrap the single
# review in a list and take the first (only) result.
spacy = WordTokenizer()
tokens = first(spacy([txt]))
# Show a truncated preview of the token list (first 30 items).
coll_repr(tokens, max_n=30)

"(#187) ['Once','again','Mr.','Costner','has','dragged','out','a','movie','for','far','longer','than','necessary','.','Aside','from','the','terrific','sea','rescue','sequences',',','of','which','there','are','very','few','I'...]"

Additional information is added by fastai's Tokenizer class, like:
- **xxbos**. Beginning of stream
- **xxmaj**. The next word starts with a capital (major) letter
- ...

In [6]:
# Wrap the word tokenizer in fastai's Tokenizer, which adds the special
# xx* tokens, then preview the first 31 tokens of the sample review.
tkn = Tokenizer(spacy)
coll_repr(tkn(txt), max_n=31)

"(#207) ['xxbos','xxmaj','once','again','xxmaj','mr','.','xxmaj','costner','has','dragged','out','a','movie','for','far','longer','than','necessary','.','xxmaj','aside','from','the','terrific','sea','rescue','sequences',',','of','which'...]"

The list of rules that produce these special tokens:

In [7]:
# Display the default text pre-processing rules (a list of functions).
defaults.text_proc_rules

[<function fastai.text.core.fix_html(x)>,
 <function fastai.text.core.replace_rep(t)>,
 <function fastai.text.core.replace_wrep(t)>,
 <function fastai.text.core.spec_add_spaces(t)>,
 <function fastai.text.core.rm_useless_spaces(t)>,
 <function fastai.text.core.replace_all_caps(t)>,
 <function fastai.text.core.replace_maj(t)>,
 <function fastai.text.core.lowercase(t, add_bos=True, add_eos=False)>]

In [8]:
# IPython `??` introspection: prints the signature and source of fix_html.
??fix_html

Signature: fix_html(x)
Source:   
def fix_html(x):
    "Various messy things we've seen in documents"
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace('nbsp;', ' ').replace(
        '#36;', '$').replace('\\n', "\n").replace('quot;', "'"

### Subword tokenization

In [9]:
# Load the first 2000 reviews as UTF-8 strings. `read_text` opens and closes
# each file, so we do not leave 2000 open file handles behind.
txts = L(file.read_text(encoding='utf-8') for file in files[:2000])

In [10]:
def subword(sz):
    """Show how a subword vocabulary of size `sz` splits the sample review.

    Fits a `SubwordTokenizer` with `vocab_sz=sz` on the notebook-level corpus
    `txts`, tokenizes the notebook-level sample `txt`, and returns its first
    40 tokens joined by single spaces.
    """
    sp = SubwordTokenizer(vocab_sz=sz)
    sp.setup(txts)
    pieces = first(sp([txt]))
    return ' '.join(pieces[:40])

In [11]:
# Medium vocabulary: frequent words stay whole, rarer ones are split into pieces.
subword(1000)

'▁O n ce ▁again ▁M r . ▁Co st n er ▁has ▁d ra g g ed ▁out ▁a ▁movie ▁for ▁far ▁long er ▁than ▁ ne ce s s ary . ▁A side ▁from ▁the ▁ ter ri f'

In [12]:
# Very small vocabulary: most words are broken into short pieces.
subword(200)

'▁ O n ce ▁a g a in ▁ M r . ▁ C o st n er ▁h a s ▁d ra g g ed ▁ o u t ▁a ▁movie ▁for ▁f ar ▁lo n g er ▁'

In [13]:
# Large vocabulary: most words are kept as a single token.
subword(10000)

'▁On ce ▁again ▁Mr . ▁Costner ▁has ▁dragged ▁out ▁a ▁movie ▁for ▁far ▁longer ▁than ▁necessary . ▁A side ▁from ▁the ▁terrific ▁sea ▁rescue ▁sequences , ▁of ▁which ▁there ▁are ▁very ▁few ▁I ▁just ▁did ▁not ▁care ▁about ▁any ▁of'

## Numericalization

In [14]:
# Re-tokenize the sample review with the full fastai Tokenizer (special
# tokens included) and preview the first 31 tokens.
tokens = tkn(txt)
coll_repr(tokens, max_n=31)

"(#207) ['xxbos','xxmaj','once','again','xxmaj','mr','.','xxmaj','costner','has','dragged','out','a','movie','for','far','longer','than','necessary','.','xxmaj','aside','from','the','terrific','sea','rescue','sequences',',','of','which'...]"

In [15]:
# Tokenize the first 200 reviews; this small sample is used to build the vocab.
toks200 = txts[:200].map(tkn)
toks200[0]

(#207) ['xxbos','xxmaj','once','again','xxmaj','mr','.','xxmaj','costner','has'...]

In [16]:
# Build the vocabulary from the 200 tokenized reviews, then preview its
# first 20 entries.
numericalizer = Numericalize()
numericalizer.setup(toks200)
coll_repr(numericalizer.vocab, max_n=20)

"(#1968) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the','.',',','a','and','of','to','is','it','i','in'...]"

In [17]:
# Map the sample review's tokens to their vocab indices; show the first 20.
numericalizer(tokens)[:20]

TensorText([   2,    8,  349,  183,    8, 1176,   10,    8, 1177,   60, 1455,   62,
          12,   25,   28,  189,  957,   93,  958,   10])

## Batching

In [18]:
# Numericalize all 200 tokenized reviews and feed them to a language-model
# data loader built with its default settings.
nums200 = toks200.map(numericalizer)
dl = LMDataLoader(nums200)

In [19]:
# Grab the first batch (inputs x, targets y) and compare their shapes.
x,y = first(dl)
x.shape, y.shape

(torch.Size([64, 72]), torch.Size([64, 72]))

In [20]:
# Decode the first 20 token ids of the first input row back into vocab strings.
[numericalizer.vocab[tok_id] for tok_id in x[0, :20]]

['xxbos',
 'xxmaj',
 'once',
 'again',
 'xxmaj',
 'mr',
 '.',
 'xxmaj',
 'costner',
 'has',
 'dragged',
 'out',
 'a',
 'movie',
 'for',
 'far',
 'longer',
 'than',
 'necessary',
 '.']

In [21]:
# Decode the first 20 token ids of the first target row back into vocab strings.
[numericalizer.vocab[tok_id] for tok_id in y[0, :20]]

['xxmaj',
 'once',
 'again',
 'xxmaj',
 'mr',
 '.',
 'xxmaj',
 'costner',
 'has',
 'dragged',
 'out',
 'a',
 'movie',
 'for',
 'far',
 'longer',
 'than',
 'necessary',
 '.',
 'xxmaj']

## Training a classifier

In [22]:
# Item getter: text files from the train, test and unsup folders.
get_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])

# Language-model DataLoaders over those files. 10% of the items are held out
# for validation; the splitter is seeded so the train/validation split (and
# therefore the reported metrics) is the same on every Restart & Run All.
dls_lm = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb,
    splitter=RandomSplitter(0.1, seed=42)
).dataloaders(path, path=path, bs=16, seq_len=80)

In [23]:
# Show two sample rows; the second text column is the first shifted by one token.
dls_lm.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"xxbos xxmaj raising xxmaj victor xxmaj vargas fails terribly in what it tries most to be : being real . xxmaj unfortunately , there is no reality to this film . xxmaj the characters and situations feel completely artificial and fake . \n\n xxmaj the reason ? xxmaj bad directing . xxmaj peter xxmaj sollett uses all the wrong tools in his arsenal . xxmaj it seems xxmaj mr . xxmaj sollett read somewhere that not lighting his film would","xxmaj raising xxmaj victor xxmaj vargas fails terribly in what it tries most to be : being real . xxmaj unfortunately , there is no reality to this film . xxmaj the characters and situations feel completely artificial and fake . \n\n xxmaj the reason ? xxmaj bad directing . xxmaj peter xxmaj sollett uses all the wrong tools in his arsenal . xxmaj it seems xxmaj mr . xxmaj sollett read somewhere that not lighting his film would give"
1,"twenty - first century film noir . xxmaj it is miraculous all the actors made it through the filming without lung cancer . xxmaj i 've never seen more people smoke in one film . \n\n xxmaj josh xxmaj hartnett is just too boyish looking to pull of the role he played . xxmaj he is miscast . \n\n xxmaj aaron xxmaj eckhart is guilty of over acting and over acting , over and over . \n\n xxmaj scarlet xxmaj","- first century film noir . xxmaj it is miraculous all the actors made it through the filming without lung cancer . xxmaj i 've never seen more people smoke in one film . \n\n xxmaj josh xxmaj hartnett is just too boyish looking to pull of the role he played . xxmaj he is miscast . \n\n xxmaj aaron xxmaj eckhart is guilty of over acting and over acting , over and over . \n\n xxmaj scarlet xxmaj johanson"


In [24]:
# Build an AWD_LSTM language-model learner that reports accuracy and
# perplexity, then switch it to fp16 training.
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    drop_mult=0.3,
    metrics=[accuracy, Perplexity()],
)
learn = learn.to_fp16()

In [25]:
# Train for one epoch with the one-cycle schedule at a peak learning rate of 2e-2.
# NOTE(review): the saved output shows a KeyboardInterrupt, so the recorded run
# did not finish; re-run this cell to obtain the metrics table.
learn.fit_one_cycle(n_epoch=1, lr_max=2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time


KeyboardInterrupt: 