### [GLOBAL IMPORTS]

In [1]:
# m1 error fix
import sentencepiece

from fastai.data.external import untar_data, URLs
from fastai.data.core import L

from fastai.data.block import (
    DataBlock,
)

from fastai.data.transforms import RandomSplitter

from fastai.text.all import (
    defaults,
    # file handler
    get_text_files,
    # tabular util
    Tokenizer, WordTokenizer, SubwordTokenizer, Numericalize,
    LMDataLoader, 
    # data block
    TextBlock,
    # model
    AWD_LSTM,
    # metric
    Perplexity, accuracy,
    # learner
    language_model_learner,
    # debug log
    first, coll_repr
)

from fastai.text.core import replace_rep

from functools import partial

#### Inspect source code with `??`

In [2]:

# IPython's `??` suffix prints an object's signature, source, file and type
# (see the output below). The same works for other objects, e.g.:
# Tokenizer??
replace_rep??

[0;31mSignature:[0m [0mreplace_rep[0m[0;34m([0m[0mt[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mreplace_rep[0m[0;34m([0m[0mt[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"Replace repetitions at the character level: cccc -- TK_REP 4 c"[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0m_replace_rep[0m[0;34m([0m[0mm[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0mc[0m[0;34m,[0m[0mcc[0m [0;34m=[0m [0mm[0m[0;34m.[0m[0mgroups[0m[0;34m([0m[0;34m)[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0;34mf' {TK_REP} {len(cc)+1} {c} '[0m[0;34m[0m
[0;34m[0m    [0;32mreturn[0m [0m_re_rep[0m[0;34m.[0m[0msub[0m[0;34m([0m[0m_replace_rep[0m[0;34m,[0m [0mt[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      ~/miniconda3/envs/m1_torch_gpu/lib/python3.10/site-packages/fastai/text/core.py
[0;31mType:[0m      function

### NLP Deep Dive : RNNs

Self-supervised learning is training a model using labels that are :
- EMBEDDED in the independent variable
- rather than requiring EXTERNAL labels
- example : training the model to predict the next word in a text

ULMFit - Universal Language Model Fine-tuning
- Fine tuning the :
    - (sequence based) language model prior to
    - fine-tuning the classification model yields BETTER results

#### Text Processing

Predicting sentence length isn't obvious ... correlated to human breath?
- Sentences can be of different LENGTHS
- Documents can be LONG

Review our background with how a single categorical variable can be used as an
independent variable; here's the approach we took for a single categorical var :
- 1 - Make a list of all the possible levels of that categorical var -- vocab --
- 2 - Replace each level with its `index` in the -- vocab --
- 3 - Create an `embedding matrix` for this containing a row for each item,
      i.e. for each item in the -- vocab --
- 4 - Use this `embedding matrix` as the first layer of a neural network  

```sh
        A dedicated **embedding matrix** can take as inputs the raw -- vocab --  
        indexes created in step 2;  
            - this is equivalent to  
            - but FASTER and more EFFICIENT than a matrix that takes as input  
            one-hot-encoded vectors representing the indexes  
```

We can do the same thing ^ with TEXT!  What is new is the idea of a sequence

- 1 - [ Tokenization ]
    - convert text into a list of (depending on granularity) :  
      - characters  
      - substrings - (GPT, HuggingFace)  
      - words  
- 2 - [ Numericalization ]  
    - -- vocab -- list hashed to an index number lookup
- 3 - Language Model [ Data Loader Creation ]   
    - `LMDataLoader` handles creating  
      - `dependent` variable that is  
      - `offset` from the `independent` by ONE `token`  
    - Also handles details such as :  
      - shuffling the training data so that the independent and dependent  
        variable maintain their structure as required  
      - latent breath?  
- 4 - [ Language Model Creation ]  
    - RNN  
      - handles INPUT lists that can be of ARBITRARY LENGTH  


### -- Tokenization --

Resolution

- 1 - Character
    - split into INDIVIDUAL CHARS
- 2 - Subword
    - split into SMALLER parts
    - based on most COMMONLY occurring substrings
        - "occasion" => "o" "c" "ca" "sion"
- 3 - Word
    - apply language specific separator like 'white' space
    - generally punctuation marks are SEPARATE tokens
        - as opposed to totally NEW words

In [3]:
# Fetch the IMDB dataset through fastai's untar_data; `path` is the folder it returns.
path = untar_data(URLs.IMDB)

# Collect the text files found under the train, test and unsup folders.
files = get_text_files(path, folders=['train', 'test', 'unsup'])
# read_text() opens, reads and closes the file in one call, so no file handle
# is left open the way a bare open().read() leaves one.
txt = files[0].read_text()

# Show the file collection and the first 175 characters of the sample review.
files, txt[:175]

((#100000) [Path('/Users/mton/.fastai/data/imdb/test/neg/1821_4.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/9487_1.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/4604_4.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/2828_2.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/10890_1.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/3351_4.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/8070_2.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/1027_4.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/8248_3.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/4290_4.txt')...],
 "Alan Rickman & Emma Thompson give good performances with southern/New Orleans accents in this detective flick. It's worth seeing for their scenes- and Rickman's scene with Hal")

In [4]:
# Word-level tokenization of the sample review; print the first 30 tokens.
spacy = WordTokenizer()
toks = first(spacy([txt]))

print(coll_repr(toks, 30))

# A '.' that ends a sentence becomes its own token, but the '.' inside
# 'U.S.' or '1.00' stays attached to its word/number (see the output below).
# Tokenization logic needs to handle very subtle context.
first(spacy(['The U.S. dollar $1 is $1.00.']))

(#121) ['Alan','Rickman','&','Emma','Thompson','give','good','performances','with','southern','/','New','Orleans','accents','in','this','detective','flick','.','It',"'s",'worth','seeing','for','their','scenes-','and','Rickman',"'s",'scene'...]


(#9) ['The','U.S.','dollar','$','1','is','$','1.00','.']

#### -- Common -- [ Prefixes ( xx ) ]

xx - not common prefix, these are special tokens
- xxbos : Beginning of stream
    - this token indicates the model will learn it needs to "forget" what was  
    said previously and focus on upcoming words
- xxmaj : Indicates the next word begins with a capital 
  (we lower cased everything)
- xxunk : Indicates a word is unknown
- xxrep : !!!!! => `repeated char token` + `!` so we can count repeats as  
  opposed to treating them as unique
- xxwrep : for repeated words as opposed to characters

In [5]:
# Wrap the word tokenizer in fastai's Tokenizer; its output carries the special
# xx-prefixed tokens (xxbos, xxmaj, ...) described above.
tkn = Tokenizer(spacy)
# 31 is coll_repr's item limit (how many tokens to show before '...'),
# so it must be passed to coll_repr rather than to print.
print(coll_repr(tkn(txt), 31))

(#139) ['xxbos','xxmaj','alan','xxmaj','rickman','&','xxmaj','emma','xxmaj','thompson'...] 31


#### -- FastAi -- [ Text Processing Rules ]

In [6]:
# `??` prints the source of one rule: per its docstring, replace_rep rewrites a
# character repetition ("cccc") as TK_REP, the repeat count and the character.
replace_rep??

# The default list of text-processing rule functions.
defaults.text_proc_rules

[<function fastai.text.core.fix_html(x)>,
 <function fastai.text.core.replace_rep(t)>,
 <function fastai.text.core.replace_wrep(t)>,
 <function fastai.text.core.spec_add_spaces(t)>,
 <function fastai.text.core.rm_useless_spaces(t)>,
 <function fastai.text.core.replace_all_caps(t)>,
 <function fastai.text.core.replace_maj(t)>,
 <function fastai.text.core.lowercase(t, add_bos=True, add_eos=False)>]

[0;31mSignature:[0m [0mreplace_rep[0m[0;34m([0m[0mt[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mreplace_rep[0m[0;34m([0m[0mt[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"Replace repetitions at the character level: cccc -- TK_REP 4 c"[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0m_replace_rep[0m[0;34m([0m[0mm[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0mc[0m[0;34m,[0m[0mcc[0m [0;34m=[0m [0mm[0m[0;34m.[0m[0mgroups[0m[0;34m([0m[0;34m)[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0;34mf' {TK_REP} {len(cc)+1} {c} '[0m[0;34m[0m
[0;34m[0m    [0;32mreturn[0m [0m_re_rep[0m[0;34m.[0m[0msub[0m[0;34m([0m[0m_replace_rep[0m[0;34m,[0m [0mt[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      ~/miniconda3/envs/m1_torch_gpu/lib/python3.10/site-packages/fastai/text/core.py
[0;31mType:[0m      function

#### -- Subword -- [Vocabulary Size]

Subword tokenization provides an easy way to :
- easily scale between character and word tokenization
- handles EVERY human language (not just white space separated)
    - including music and genomic sequences

Vocabulary Size is a trade-off between :

- Larger - fewer tokens per sentence
    - faster training
    - less state
    - downside : LARGER EMBEDDING MATRIX
        - requires MORE data to LEARN

In [7]:
# Raw text of the first 2000 review files. read_text() closes each file as soon
# as it has been read, instead of leaving 2000 open handles to be cleaned up later.
txts = L(o.read_text() for o in files[:2000])
txts

(#2000) ["Alan Rickman & Emma Thompson give good performances with southern/New Orleans accents in this detective flick. It's worth seeing for their scenes- and Rickman's scene with Hal Holbrook. These three actors mannage to entertain us no matter what the movie, it seems. The plot for the movie shows potential, but one gets the impression in watching the film that it was not pulled off as well as it could have been. The fact that it is cluttered by a rather uninteresting subplot and mostly uninteresting kidnappers really muddles things. The movie is worth a view- if for nothing more than entertaining performances by Rickman, Thompson, and Holbrook.",'I have seen this movie and I did not care for this movie anyhow. I would not think about going to Paris because I do not like this country and its national capital. I do not like to learn french anyhow because I do not understand their language. Why would I go to France when I rather go to Germany or the United Kingdom? Germany and the U

In [9]:
def subword(sz):
    """Train a subword tokenizer with vocab size `sz` and preview its output.

    The tokenizer is fitted on the notebook-level `txts` collection and then
    applied to the notebook-level sample review `txt`.

    Returns the first 40 resulting tokens joined by single spaces.
    """
    tokenizer = SubwordTokenizer(vocab_sz=sz)
    tokenizer.setup(txts)
    pieces = first(tokenizer([txt]))
    return ' '.join(pieces[:40])

In [10]:
# Install sentencepiece (user site, via --user), excluding versions 0.1.90 and 0.1.91.
# NOTE(review): sentencepiece is already imported in the first cell, so this install
# presumably belongs before that import (or in environment setup) -- confirm.
%pip install --user 'sentencepiece!=0.1.90,!=0.1.91'

Collecting sentencepiece!=0.1.90,!=0.1.91
  Downloading sentencepiece-0.1.99-cp310-cp310-macosx_11_0_arm64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Note: you may need to restart the kernel to use updated packages.


In [10]:
# Vocab of 1000: frequent words ('give', 'good') stay whole, rarer ones are
# split into pieces ('R ick man').
subword(1000)

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=tmp/texts.out --vocab_size=1000 --model_prefix=tmp/spm --character_coverage=0.99999 --model_type=unigram --unk_id=9 --pad_id=-1 --bos_id=-1 --eos_id=-1 --minloglevel=2 --user_defined_symbols=▁xxunk,▁xxpad,▁xxbos,▁xxeos,▁xxfld,▁xxrep,▁xxwrep,▁xxup,▁xxmaj --hard_vocab_limit=false


'▁A l an ▁R ick man ▁ & ▁E mm a ▁Th o mp s on ▁give ▁good ▁performance s ▁with ▁so u ther n / N e w ▁O r le an s ▁a c c ent s ▁in'

In [11]:
# Vocab of 200: nearly every word is broken into single characters or very
# short pieces.
subword(200)

'▁A l an ▁ R ic k m an ▁ & ▁ E m m a ▁ T h o m p s on ▁ g i ve ▁ g o o d ▁p er f or m an ce'

In [12]:
# Vocab of 10,000: most words come out as a single token.
subword(10_000)

"▁Alan ▁Rick man ▁ & ▁Emma ▁Thompson ▁give ▁good ▁performances ▁with ▁southern / N ew ▁O rleans ▁accents ▁in ▁this ▁detective ▁flick . ▁It ' s ▁worth ▁seeing ▁for ▁their ▁scenes - ▁and ▁Rick man ' s ▁scene ▁with ▁Hal"

##### Numericalization with Fastai

Numericalization is the process of mapping tokens to integers
- 1 - vocabulary : list of all possible levels of categorical variable
    - RGB captures visual pixel values!
- 2 - replace each level with its index in the vocab
    - each R, G, B channel has value between 0 - 255

In [13]:
# Tokenize the sample review once and reuse the result for the preview,
# rather than running the tokenizer a second time just to print it.
toks = tkn(txt)
print(coll_repr(toks, 31))

(#139) ['xxbos','xxmaj','alan','xxmaj','rickman','&','xxmaj','emma','xxmaj','thompson','give','good','performances','with','southern','/','xxmaj','new','xxmaj','orleans','accents','in','this','detective','flick','.','xxmaj','it',"'s",'worth','seeing'...]


In [14]:
# Tokenize the first 200 reviews; show the tokens of the first one.
toks200 = txts[:200].map(tkn)
toks200[0]

(#139) ['xxbos','xxmaj','alan','xxmaj','rickman','&','xxmaj','emma','xxmaj','thompson'...]

In [15]:
# Set up Numericalize on the 200 tokenized reviews, then show the first
# 20 entries of the resulting vocab.
num = Numericalize()
num.setup(toks200)
coll_repr(num.vocab, 20)


"(#1984) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the','.',',','and','a','to','of','i','it','is','in'...]"

In [16]:
# Map the sample review's tokens to vocab indices and keep the first 20.
# Index 0 is 'xxunk' in the vocab shown above, so the 0s are tokens that are
# not in the vocab.
nums = num(toks)[:20]
nums

TensorText([   2,    8,    0,    8, 1442,  234,    8,    0,    8,    0,  199,
              64,  731,   29,    0,  122,    8,  253,    8,    0])

In [17]:
# Map the indices back to their vocab tokens and join them into one string.
' '.join(num.vocab[o] for o in nums)

'xxbos xxmaj xxunk xxmaj rickman & xxmaj xxunk xxmaj xxunk give good performances with xxunk / xxmaj new xxmaj xxunk'

##### Putting Our Texts into Batches for a Language Model

With IMAGEs for batching we needed to :

- RESIZE height and width
- so we could group and stack them in a single tensor

With TEXT
- can't resize arbitrarily varied lengths to a fixed length
- char order matters to predict next token
- each NEW batch MUST begin precisely where the old batch finished



In [18]:
# Numericalize each of the 200 tokenized reviews.
nums200 = toks200.map(num)

# Language-model data loader built from the numericalized reviews.
dl = LMDataLoader(nums200)

# Take the first batch; x and y have the same shape (64 x 72 in the output below).
x,y = first(dl)
x.shape, y.shape

(torch.Size([64, 72]), torch.Size([64, 72]))

In [19]:
# First 20 tokens of the first row of the independent variable.
' '.join(num.vocab[o] for o in x[0][:20])

'xxbos xxmaj xxunk xxmaj rickman & xxmaj xxunk xxmaj xxunk give good performances with xxunk / xxmaj new xxmaj xxunk'

In [20]:
# Same slice of the dependent variable: per the outputs, it is the same text
# shifted ahead by one token.
' '.join(num.vocab[o] for o in y[0][:20])

'xxmaj xxunk xxmaj rickman & xxmaj xxunk xxmaj xxunk give good performances with xxunk / xxmaj new xxmaj xxunk accents'

## -- Training a Text Classifier --

- 1 - fine-tune our `language model` trained on Wikipedia
- 2 - use that model to train our `classifier`

### Language Model Using DataBlock

In [21]:
# get_text_files pre-bound to the three IMDB folders, so it can be handed to
# DataBlock as its `get_items` callable.
get_imdb = partial(
    get_text_files,
    folders = ['train', 'test', 'unsup']
)

In [22]:
# Troubleshooting -- "FileNotFoundError: [Errno 2] No such file or directory:
# '.fastai/data/imdb_tok/counter.pkl'"
# This can occur if the tokenization process is cancelled part-way:
# - the cache is left in a malformed (incomplete) state
# - on rerun, only the existence of the 'imdb_tok' folder is checked
# - and loading `counter.pkl`, which was never actually written, fails
# NOTE(review): RandomSplitter is given no seed, so presumably the 10% validation
# split differs between runs -- confirm, since a saved checkpoint is reloaded later.
dls_lm = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb, 
    splitter=RandomSplitter(0.1)
# Original settings, kept for reference:
# ).dataloaders(path, path=path, bs=128, seq_len=80)
# Stepped down (smaller batch size and sequence length) to fit on an M1 Mac.
).dataloaders(path, path=path, bs=32, seq_len=32)

In [23]:
# Show two rows of a batch: per the output, `text_` is `text` shifted ahead by one token.
dls_lm.show_batch(max_n=2)

Unnamed: 0,text,text_
0,xxbos i must say that i really had no idea that i was going to sit down and watch this movie . i guess it was the fact that i had nothing,i must say that i really had no idea that i was going to sit down and watch this movie . i guess it was the fact that i had nothing better
1,single most unintentionally funny movie i have ever had the fortune / misfortune of finding in the bargain bin . xxmaj bad camera - work and sounds that apparently were all overdubbed,most unintentionally funny movie i have ever had the fortune / misfortune of finding in the bargain bin . xxmaj bad camera - work and sounds that apparently were all overdubbed in


### Fine-Tuning the Language Model



In [24]:
# Language-model learner over the IMDB dataloaders using the AWD_LSTM
# architecture, reporting accuracy and perplexity.
# to_fp16() presumably switches training to mixed precision -- confirm it is
# supported on the backend in use.
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    drop_mult=0.3,
    metrics=[accuracy, Perplexity()]
).to_fp16()

In [25]:
# One epoch of training is very expensive on this machine. Recorded runs:
#   epoch  train_loss  valid_loss  accuracy  perplexity  time
#       0    4.004853    3.903127  0.300107   49.557171  15:40:19  (~940 minutes)
#       0    4.211511    4.084476  0.288052   59.410797   5:51:50  (~351 minutes)
#
# So reuse the saved checkpoint when it exists, and only train (then save) when
# it is missing. This keeps the cell runnable on a fresh kernel without the
# training lines having to be commented in and out by hand.
# Checkpoints are stored under learn.path/learn.model_dir,
# e.g. ~/.fastai/data/imdb/models/1epoch.pth
ckpt_1epoch = learn.path/learn.model_dir/'1epoch.pth'
if ckpt_1epoch.exists():
    learn = learn.load('1epoch')
else:
    learn.fit_one_cycle(1, 2e-2)
    learn.save('1epoch')



epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.211511,4.084476,0.288052,59.410797,5:51:50


In [25]:
# Fine-tune after the initial model training is done: unfreeze, then train for
# 10 more epochs at a learning rate of 2e-3.
# NOTE(review): in the recorded run this cell aborted with "MPS backend out of
# memory" before any epoch row was logged; presumably a smaller bs / seq_len is
# needed on this machine -- confirm.

learn.unfreeze()
learn.fit_one_cycle(10, 2e-3)



epoch,train_loss,valid_loss,accuracy,perplexity,time


RuntimeError: MPS backend out of memory (MPS allocated: 5.59 GB, other allocations: 1.90 GB, max allowed: 9.07 GB). Tried to allocate 2.29 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).