In [None]:
# Workaround for training output not visible in JupyterNotebook https://github.com/microsoft/vscode-jupyter/issues/13163
from IPython.display import clear_output, DisplayHandle
def update_patch(self, obj, **kwargs):
    """Replacement for `DisplayHandle.update` that redraws the output instead of updating it in place.

    Clears the current cell output (with `wait=True`, so the clear is deferred
    until new output is available) and then shows `obj` through this handle.

    Args:
        self: the handle being updated; only its `display` method is used.
        obj: the object to show.
        **kwargs: forwarded unchanged to `self.display`, so callers that pass
            the keyword arguments accepted by the stock `update` keep working
            instead of raising `TypeError`.
    """
    clear_output(wait=True)
    self.display(obj, **kwargs)
# Install the patch on the class, so every DisplayHandle.update call goes through update_patch.
DisplayHandle.update = update_patch

In [None]:
from fastbook import *
from IPython.display import display,HTML

## NLP with RNNs

A language model is trained to guess the next word in a given text, based on the text it has read before. This is called self-supervised learning.
The IMDB example used a language model trained on Wikipedia, but training on a corpus of target text (in this case IMDB) produces much better results. In this example the IMDB dataset will have a lot more different words, slang, and names that aren't in the Wikipedia dataset. 

The process goes: 

Tokenization - converting the text into tokens which are almost words, sometimes words and sometimes parts of words. Can be subwords or characters too. 
Numericalization - Make a list of all unique words that appear (the vocabulary) and convert each into a number. 
Language model data loader creation - create an independent variable which is the sequence of words from 1 to n-1, and a dependent variable which is the sequence from 2 to n. 
Language model creation - use a Recurrent Neural Network to create an LM which can take large inputs. 




## Tokenization

In [None]:
from fastai.text.all import *
# `path` is whatever untar_data returns for the IMDB dataset; later cells read the review files from it.
path = untar_data(URLs.IMDB)  


In [None]:
# Gather the text files found under the train, test and unsup folders of `path`.
imdb_folders = ['train', 'test', 'unsup']
files = get_text_files(path, folders=imdb_folders)

In [None]:
# Read the first review. read_text() opens and closes the file itself, so no
# file handle is left open for the garbage collector to clean up.
txt = files[0].read_text()
# Preview the first 75 characters.
txt[:75]

In [None]:
# Build fastai's word tokenizer and run it on the single sample review.
spacy = WordTokenizer()
tokenized_docs = spacy([txt])  # the tokenizer is called with a collection of texts
toks = first(tokenized_docs)
print(coll_repr(toks, 30))

In [None]:
# Check how the tokenizer handles the periods in an abbreviation and in a decimal number.
sample_sentence = ['The U.S. dollar 1.00.']
first(spacy(sample_sentence))

In [None]:


# Wrap the word tokenizer in fastai's Tokenizer and print a truncated view of the sample review's tokens.
tkn = Tokenizer(spacy)
sample_tokens = tkn(txt)
print(coll_repr(sample_tokens, 31))
     


The `xx` is not common in English, so it's used as a prefix to indicate special tokens here.   The `xxbos` indicates start of a new text. (Beginning of Stream). `xxmaj` means the next word begins with a capital.  `xxunk` is an unknown word. 

Similarly there is `xxrep` to indicate repeated characters. 

In [None]:
# Tokenize a string that ends in a run of four exclamation marks.
turtle_tokens = tkn("I like turtles!!!!")
print(coll_repr(turtle_tokens, 31))

The tokenization helps with model training, letting it recognize important parts of a sentence. 

In [None]:
# Show the default text-processing rules stored on fastai's `defaults` object
# (presumably the rules `Tokenizer` applies around tokenization — confirm in the fastai docs).
defaults.text_proc_rules

In [None]:
# Run a string with an odd symbol, repeated spaces and mixed case through the tokenizer.
special_text = '©   Fast.ai www.fast.ai/INDEX'
coll_repr(tkn(special_text), 31)

## Subword tokenization

Another approach to tokenization, instead of full words, is subwords. Subword is useful for languages like Chinese, Japanese because they don't necessarily use spaces or have the same definition of word as in English. Similarly, Turkish, Hungarian and German can add many subwords without spaces to make new words. 

Analyze a corpus of documents to find the most commonly occurring groups of letters; these become the vocab.
Then, tokenize the corpus using this vocabulary of 'subword units'.

In [None]:
# Load the first 2000 reviews. read_text() closes each file as soon as it has
# been read, rather than leaving up to 2000 open handles to be reclaimed later.
txts = L(o.read_text() for o in files[:2000])

In [None]:
# Number of review texts loaded (at most 2000, given the files[:2000] slice).
len(txts)

In [None]:
def subword(sz):
    """Set up a SubwordTokenizer with vocab size `sz` on `txts` and preview it on `txt`.

    Relies on the notebook-level `txts` (corpus passed to `setup`) and `txt`
    (the sample review). Returns the first 40 tokens of the sample review,
    joined with single spaces.
    """
    tokenizer = SubwordTokenizer(vocab_sz=sz)
    tokenizer.setup(txts)
    sample_tokens = first(tokenizer([txt]))
    return ' '.join(sample_tokens[:40])
     

In [None]:
# subword(10000)
# Left commented out: running this call kept crashing the kernel.

## Numericalization with fastai
Numericalization maps those tokens to integers. 
Make a list of all possible levels of that categorical variable (the vocabulary), and replace each level with its index in the vocabulary. 



In [None]:
# Tokenize the sample review once and reuse the result for the preview,
# instead of running the tokenizer a second time just to print it.
toks = tkn(txt)
print(coll_repr(toks, 31))

In [None]:
# Tokenize the first 200 reviews and show the tokens of the first one.
first_200_texts = txts[:200]
toks200 = first_200_texts.map(tkn)
toks200[0]

In [None]:
# Set up the numericalizer on the 200 tokenized reviews, then peek at the start of its vocab.
num = Numericalize()
num.setup(toks200)
vocab_preview = coll_repr(num.vocab, 20)
vocab_preview

The default `max_vocab` in `Numericalize()` above is 60000. Any words beyond the 60K most common are replaced with `xxunk` (unknown). 

In [None]:
# Numericalize the sample review and keep the first 20 ids.
nums = num(toks)[:20]
nums

In [None]:
# Map each id back to its vocabulary entry to recover readable tokens.
id_to_token = num.vocab
' '.join(id_to_token[idx] for idx in nums)

### Putting Our Texts into Batches for a Language Model


In [None]:
# Tokenize a short example text and lay its tokens out as bs rows of seq_len tokens each.
stream = "In this chapter, we will go back over the example of classifying movie reviews we studied in chapter 1 and dig deeper under the surface. First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the PreProcessor used in the data block API.\nThen we will study how we build a language model and train it for a while."
tokens = tkn(stream)
bs,seq_len = 6,15
token_rows = [tokens[i*seq_len:(i+1)*seq_len] for i in range(bs)]
d_tokens = np.array(token_rows)
df = pd.DataFrame(d_tokens)
table_html = df.to_html(index=False,header=None)
display(HTML(table_html))

In [None]:
#hide_input
# First mini-batch: tokens 0-4 of each of the 6 fifteen-token rows.
bs,seq_len = 6,5
row_len = 15  # width of each row in the full 6x15 layout
row_starts = range(0, bs*row_len, row_len)
d_tokens = np.array([tokens[start:start+seq_len] for start in row_starts])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

In [None]:

#hide_input
# Second mini-batch: tokens 5-9 of each of the 6 fifteen-token rows.
bs,seq_len = 6,5
row_len = 15  # width of each row in the full 6x15 layout
row_starts = range(0, bs*row_len, row_len)
d_tokens = np.array([tokens[start+seq_len:start+2*seq_len] for start in row_starts])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

In [None]:
#hide_input
# Third mini-batch: tokens 10-14 of each of the 6 fifteen-token rows.
bs,seq_len = 6,5
row_len = 15  # width of each row in the full 6x15 layout
row_starts = range(0, bs*row_len, row_len)
d_tokens = np.array([tokens[start+10:start+row_len] for start in row_starts])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

The idea is to slice the documents into mini streams. Within each sub-batch the words are in order but otherwise the batches will be shuffled. 


In [None]:
# Apply the numericalizer `num` to each of the 200 tokenized reviews.
nums200 = toks200.map(num)

In [None]:
# Language-model DataLoader over the numericalized reviews, with LMDataLoader's default settings.
dl = LMDataLoader(nums200)

In [None]:
# Grab the first batch (x: independent variable, y: dependent variable) and show both shapes.
x,y = first(dl)
x.shape, y.shape

In [None]:
# Decode the first 20 ids of the first row of the independent variable back into tokens.
first_input_ids = x[0][:20]
' '.join(num.vocab[o] for o in first_input_ids)

In [None]:
# The dependent variable holds the same text as the independent one, shifted by one token.
first_target_ids = y[0][:20]
' '.join(num.vocab[o] for o in first_target_ids)

## Training a Text Classifier

In [None]:
# Item getter: the text files under the train, test and unsup folders.
get_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])

# Describe the language-model data (is_lm=True), holding out a random 10% for validation.
lm_block = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb,
    splitter=RandomSplitter(0.1),
)

# Build the DataLoaders: batches of 64 sequences, 80 tokens each.
dls_lm = lm_block.dataloaders(path, path=path, bs=64, seq_len=80)

In [None]:
# Display up to two samples from the language-model DataLoaders.
dls_lm.show_batch(max_n=2)

Now that our data is ready, we can fine-tune the pretrained language model.


## Fine-Tuning the Language Model

In [None]:

# Language-model learner on the AWD_LSTM architecture, reporting accuracy and
# perplexity, and converted with to_fp16().
lm_metrics = [accuracy, Perplexity()]
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    drop_mult=0.3,
    metrics=lm_metrics,
).to_fp16()

In [None]:
# Train for 1 epoch with the one-cycle schedule; 2e-2 is the learning-rate argument
# (fastai's signature is fit_one_cycle(n_epoch, lr_max)).
learn.fit_one_cycle(1, 2e-2)