### [GLOBAL IMPORTS]

In [30]:
from fastai.data.external import untar_data, URLs
from fastai.text.all import (
    defaults,
    get_text_files, 
    Tokenizer, WordTokenizer, 
    first, coll_repr
)

In [5]:
path = untar_data(URLs.IMDB)

 |████████████████████████████████████████| 100.00% [144441344/144440600 00:05<00:00]

In [23]:
files = get_text_files(path, folders=['train', 'test', 'unsup'])
txt = files[0].open().read()

files, txt[:175]

((#100000) [Path('/Users/mton/.fastai/data/imdb/test/neg/1821_4.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/9487_1.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/4604_4.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/2828_2.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/10890_1.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/3351_4.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/8070_2.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/1027_4.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/8248_3.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/4290_4.txt')...],
 "Alan Rickman & Emma Thompson give good performances with southern/New Orleans accents in this detective flick. It's worth seeing for their scenes- and Rickman's scene with Hal")

In [22]:
spacy = WordTokenizer()
toks = first(spacy([txt]))

print(coll_repr(toks, 30))

(#121) ['Alan','Rickman','&','Emma','Thompson','give','good','performances','with','southern','/','New','Orleans','accents','in','this','detective','flick','.','It',"'s",'worth','seeing','for','their','scenes-','and','Rickman',"'s",'scene'...]


In [25]:
# The char '.' is terminated in a sentence, but not the '1.00' acronym
# Tokenization logic needs to handle very subtle context
first(spacy(['The U.S. dollar $1 is $1.00.']))

(#9) ['The','U.S.','dollar','$','1','is','$','1.00','.']

In [34]:
tkn = Tokenizer(spacy)
print(coll_repr(tkn(txt)), 31)

'''
(#139) ['xxbos','xxmaj','alan','xxmaj','rickman','&','xxmaj','emma','xxmaj','thompson'...] 31

xx - not common prefix, these are special tokens
- xxbos : Beginning of stream
    - this token indicates the model will learn it needs to "forget" what was 
    said previously and focus on upcoming words
- xxmaj : Indicates the next word begins with a capital 
  (we lower cased everything)
- xxunk : Indicates a word is unknown
- xxrep : !!!!! => `repeated char token` + `!` so we can count repeats as 
  opposed to treating them as unique
- xxwrep : for repeated words as opposed to characters
'''

# check default rules
defaults.text_proc_rules

(#139) ['xxbos','xxmaj','alan','xxmaj','rickman','&','xxmaj','emma','xxmaj','thompson'...] 31


[<function fastai.text.core.fix_html(x)>,
 <function fastai.text.core.replace_rep(t)>,
 <function fastai.text.core.replace_wrep(t)>,
 <function fastai.text.core.spec_add_spaces(t)>,
 <function fastai.text.core.rm_useless_spaces(t)>,
 <function fastai.text.core.replace_all_caps(t)>,
 <function fastai.text.core.replace_maj(t)>,
 <function fastai.text.core.lowercase(t, add_bos=True, add_eos=False)>]

In [39]:
from fastai.text.core import replace_rep

# inspect source
# Tokenizer??
replace_rep??

[0;31mSignature:[0m [0mreplace_rep[0m[0;34m([0m[0mt[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mreplace_rep[0m[0;34m([0m[0mt[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"Replace repetitions at the character level: cccc -- TK_REP 4 c"[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0m_replace_rep[0m[0;34m([0m[0mm[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0mc[0m[0;34m,[0m[0mcc[0m [0;34m=[0m [0mm[0m[0;34m.[0m[0mgroups[0m[0;34m([0m[0;34m)[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0;34mf' {TK_REP} {len(cc)+1} {c} '[0m[0;34m[0m
[0;34m[0m    [0;32mreturn[0m [0m_re_rep[0m[0;34m.[0m[0msub[0m[0;34m([0m[0m_replace_rep[0m[0;34m,[0m [0mt[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      ~/miniconda3/envs/torch-gpu/lib/python3.9/site-packages/fastai/text/core.py
[0;31mType:[0m      function


### NLP Deep Dive : RNNs

Self-supervised learning is training a model using labels that are :
- EMBEDDED in the independent variable
- rather than requiring EXTERNAL labels
- example : training the model to predict the next word in a text

ULMFit - Universal Language Model Fine-tuning
- Fine tuning the :
    - (sequence based) language model prior to
    - fine-tuning the classification model yiels BETTER results

#### Text Processing

Predicting sentence length isn't obvious ... correlated to human breathe?
- Sentences can be of different LENGTHS
- Documents can be LONG

Review our background with how a single categorical variable can be used as an
indpendent variable, here's the approach we took for a single categorical var :
- 1 - Make a list of all the possible levels of that categorical var -- vocab --
- 2 - Replace each level with it's `index` in the -- vocab --
- 3 - Create an `embedding matrix` for this containing a row for each item
      .i.e for each item in the -- vocab --
- 4 - Use this `embedding matrix` as the first layer of a neural network  

```sh
        A dedicated **embedding matrix** can take as inputs the raw -- vocab --  
        indexes created in step 2;  
            - this is equivalent to  
            - but FASTER and more EFFICIENT than a matrix that takes as input  
            one-hot-encoded vectors representing the indexes  
```

We can do the same thing ^ with TEXT!  What is new is the idea of a sequence

- 1 - [ Tokenization ]
    - convert text into a list of (depending on granularity) :  
      - characters  
      - substrings - (GPT, HuggingFace)  
      - words  
- 2 - [ Numericalization ]  
    - -- vocab -- list hashed to an index number lookup
- 3 - Language Model [ Data Loader Creation ]   
    - `LMDataLoader` handles creating  
      - `dependent` variable that is  
      - `offset` from the `independent` by ONE `token`  
    - Also handles details such as :  
      - shuffling the training data so that the independent and dependent  
        variable maintain their structure as required  
      - latent breathe?  
- 4 - [ Language Model Creation ]  
    - RNN  
      - handles INPUT lists that can be of ARBITRARY LENGTH  


-- Tokenization --

Resolution

- 1 - Character
    - split into INDIVDUAL CHARS
- 2 - Subword
    - split into SMALLER parts
    - based on most COMMONLY occuring substrings
        - "occasion" => "o" "c" "ca" "sion"
- 3 - Word
    - apply language specific separator like 'white' space
    - generally punctuation marks are SEPARATE tokens
        - as opposed to totally NEW words