## [GLOBAL IMPORTS]

In [1]:
# m1 error fix
import sentencepiece

from fastai.data.external import (
    untar_data, 
    URLs,
)

from fastai.data.core import L

from fastai.data.block import (
    DataBlock,
)

from fastai.data.transforms import (
    RandomSplitter, 
    parent_label,
    GrandparentSplitter,
)


from fastai.text.all import (
    defaults,
    # file handler
    get_text_files, parent_label,
    # tabular util
    Tokenizer, WordTokenizer, SubwordTokenizer, Numericalize,
    LMDataLoader, 
    # data block
    TextBlock, CategoryBlock,
    # model
    AWD_LSTM,
    # metric
    Perplexity, accuracy,
    # learner
    language_model_learner,
    text_classifier_learner,
    # debug log
    first, coll_repr
)

from fastai.text.core import replace_rep

from functools import partial

# suppress Cuda device warnings so we don't BLOW UP logs
import warnings

warnings.filterwarnings('ignore', category=UserWarning)

#### pip install

In [55]:
%pip install fastbook

python(19687) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Collecting fastbook
  Downloading fastbook-0.0.29-py3-none-any.whl (719 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.8/719.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting datasets (from fastbook)
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting ipywidgets<8 (from fastbook)
  Downloading ipywidgets-7.8.1-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting ipython-genutils~=0.2.0 (from ipywidgets<8->fastbook)
  Downloading ipython_genutils-0.2.0-py2.py3-none-any.whl (26 kB)
Collecting widgetsnbextension~=3.6.6 (from ipywidgets<8->fastbook)
  Downloading widgetsnbextension-3.6.6-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting jupyterlab-widgets<3,>=1.0.0 (from ipywidgets<8->fastbook)
  Downloading jupyterlab_widgets-1.1.7-py3-none-any.whl.metadata (3.7 kB)
Collecting pyarrow>=8.0.0 (from datasets->fastbook)
  Downloading pyarrow-14.0.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (3.0 kB)
Collecting pyarrow-hot

#### Inspect source code with `??`

In [2]:

# inspect source
# Tokenizer??
replace_rep??

[0;31mSignature:[0m [0mreplace_rep[0m[0;34m([0m[0mt[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mreplace_rep[0m[0;34m([0m[0mt[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"Replace repetitions at the character level: cccc -- TK_REP 4 c"[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0m_replace_rep[0m[0;34m([0m[0mm[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0mc[0m[0;34m,[0m[0mcc[0m [0;34m=[0m [0mm[0m[0;34m.[0m[0mgroups[0m[0;34m([0m[0;34m)[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0;34mf' {TK_REP} {len(cc)+1} {c} '[0m[0;34m[0m
[0;34m[0m    [0;32mreturn[0m [0m_re_rep[0m[0;34m.[0m[0msub[0m[0;34m([0m[0m_replace_rep[0m[0;34m,[0m [0mt[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      ~/miniconda3/envs/m1_torch_gpu/lib/python3.10/site-packages/fastai/text/core.py
[0;31mType:[0m      function

## NLP Deep Dive 

### CONCEPTS

#### -- RNNs --
- Self-supervised learning is training a model using labels that are :
  - **EMBEDDED** in the `independent` variable
    - training to predict the next word in a text is an example
  - rather than requiring EXTERNAL labels


#### -- ULMFit --
- Universal Language Model Fine-tuning
  - 1 - fine-tune the **sequence based** language model
  - 2 - then fine-tune the **classification** model
- This tends to yield BETTER results

#### -- Challenges --

- 1 - NON-UNIFORM word COUNT per **sentences**
- 2 - NON-UNIFORM sentence COUNT per **documents**

#### -- Roadmap --

- 1 - **capture** the FULL vocabulary i.e. all possible levels of categorical
variables
- 2 - **replace** each vocabulary **term** with it's **index** in the  
vocabulary (1)
- 3 - **generate** an **embedding matrix** for each term in the vocabulary
- 4 - embedding matrix is utilized as the **1st layer** of the neural network

##### ==[ chatGPT ]==

> Explain this roadmap like I am 5

Okay, let's imagine you have a toy box with lots of different toys, and you want
to organize them so that you can tell a robot exactly where each toy is.

1. **Capture the Full Vocabulary**: First, you make a list of every single toy 
you have, so you know what's in your toy box. This is like writing down every 
word you know in a big book.

2. **Replace Each Vocabulary Term with Its Index in the Vocabulary**: Then, you 
number every toy on your list. The robot doesn't understand "teddy bear" or 
"race car," but it does understand "Toy #1" and "Toy #2." So, the teddy bear 
might be #1, the race car might be #2, and so on.

3. **Generate an Embedding Matrix for Each Item in the Vocab**: Now, instead of 
telling the robot to take up a lot of space with one big toy spot for each toy, 
we give it a special toy map. This map shows a little picture with clues about 
each toy. The teddy bear's picture might show it's soft, cuddly, and good for 
hugging, while the race car's picture shows it's fast and fun.

4. **Embedding Matrix as the First Layer of the Neural Network**: The robot uses 
the toy map to understand toys better. When you tell the robot "Toy #1," it 
looks at the teddy bear's picture in the toy map and learns all about it. This 
is like the robot's brain—when it sees the number of the toy, it looks at the 
map to know what to do with the toy.

- The **robot takes the number** (like #1 for the teddy bear) and **finds** the 
**picture in the toy map**. This is much quicker than if we made the robot check 
every single toy spot (like one-hot encoding would).
- It's like a shortcut for the robot to know about the toys **without needing** 
such **a big toy spot for each one**. It makes things faster and saves space, 
which is really smart for the robot!

So, this roadmap helps the robot to be smart about toys, just like a computer 
learns about words!

#### -- Embedding Matrix ELI5 --

> Explain embedding matrix and one-hot encoding as if I am 5

##### ==[ chatGPT ]==

Okay! Imagine you have a big box of colored Lego bricks, and each color is a  
different word. Now, if you want to show someone the word "cat," you find a  
red brick because we agreed that "cat" means red. But you have so many  
bricks—hundreds of colors—for all the different words you know. If you want to  
show someone all the words you know one by one, it would take a long time  
because you have to find each color out of hundreds!

**One-Hot Encoding**

In the one-hot way, we make a super long necklace with beads, where each bead is  
a different color. The necklace is really, really long because it has a place  
for every color we can think of, even if we don't have the Lego brick for that  
color. When we want to show "cat," we find the one place on the necklace where  
the red bead goes and light it up! Every other bead is dark, only the "cat" bead  
is bright. But this is not the best way because the necklace is too long, and we  
waste time looking at all the dark beads just to find the one bright bead.

**Embedding Matrix**

Now, let's think about a smarter way with a magic coloring book. Instead of a  
big box or a long necklace, we have this book. When we want to say "cat," we go  
to a special page in the book and see a beautiful picture with colors that tell  
us a lot about "cat." Maybe it has some red, but also a bit of green and yellow  
to show that a cat is a pet, it's small, and it can be soft.

This `coloring book` is like an `embedding matrix`. Instead of : 

- looking for one color out of hundreds, 
- we look at a picture that has a mix of colors but isn't too big. 

It's better because the picture helps us understand more about "cat" than just  
the color red. We can learn about cats by looking at the colors and how they  
mix, which tells us how it's similar to things like "dog" or "mouse," which have  
their own pages with different color mixes!

So, one-hot encoding is like having a huge box or a long necklace where we pick  
just one color, but embedding is like having a magic coloring book that teaches  
us more about the word with beautiful color pictures.

## Text Processing

Predicting sentence length isn't obvious ... correlated to human breathe?
- Sentences can be of different LENGTHS
- Documents can be LONG

Review our background with how a single categorical variable can be used as an
indpendent variable, here's the approach we took for a single categorical var :
- 1 - Make a list of all the possible levels of that categorical var -- vocab --
- 2 - Replace each level with it's `index` in the -- vocab --
- 3 - Create an `embedding matrix` for this containing a row for each item
      .i.e for each item in the -- vocab --
- 4 - Use this `embedding matrix` as the first layer of a neural network  

```sh
        A dedicated **embedding matrix** can take as inputs the raw -- vocab --  
        indexes created in step 2;  
            - this is equivalent to  
            - but FASTER and more EFFICIENT than a matrix that takes as input  
            one-hot-encoded vectors representing the indexes  
```

We can do the same thing ^ with TEXT!  What is new is the idea of a sequence

- 1 - [ Tokenization ]
    - convert text into a list of (depending on granularity) :  
      - characters  
      - substrings - (GPT, HuggingFace)  
      - words  
- 2 - [ Numericalization ]  
    - -- vocab -- list hashed to an index number lookup
- 3 - Language Model [ Data Loader Creation ]   
    - `LMDataLoader` handles creating  
      - `dependent` variable that is  
      - `offset` from the `independent` by ONE `token`  
    - Also handles details such as :  
      - shuffling the training data so that the independent and dependent  
        variable maintain their structure as required  
      - latent breathe?  
- 4 - [ Language Model Creation ]  
    - RNN  
      - handles INPUT lists that can be of ARBITRARY LENGTH  


### -- Tokenization --

- Process of Splitting Text into Tokens

#### Resolution

- 1 - Character
    - split into INDIVDUAL CHARS
- 2 - Subword
    - split into SMALLER parts
    - based on most COMMONLY occuring substrings
        - "occasion" => "o" "c" "ca" "sion"
- 3 - Word
    - apply language specific separator like 'white' space
    - generally punctuation marks are SEPARATE tokens
        - as opposed to totally NEW words

In [57]:
# Get text data from IMDB
# folder structure == path + :
# - train/
# - test/
# - unsup/
path = untar_data(URLs.IMDB)

files = get_text_files(path, folders=['train', 'test', 'unsup'])
txt = files[0].open().read()

files[:3], txt[:175]

((#3) [Path('/Users/mton/.fastai/data/imdb/test/neg/1821_4.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/9487_1.txt'),Path('/Users/mton/.fastai/data/imdb/test/neg/4604_4.txt')],
 "Alan Rickman & Emma Thompson give good performances with southern/New Orleans accents in this detective flick. It's worth seeing for their scenes- and Rickman's scene with Hal")

In [58]:
# ==[ Split text to tokens ]==
# WordTokenizer is a Fastai tokenizing library that can : 
# - collect subwords
# - handling contextual corner cases like '.' as sentence end vs value marker
spacy = WordTokenizer()
# txt is wrapped in a [] list 
# - because tokenizer expects an iterable of text
# - not a single monolithic string
# first retrieves the 1st element from an iterable
# - gets the first (and presumably only) list of tokens produced by the 
# tokenizer for the given text
toks = first(spacy([txt]))

# coll_repr == collection representation
# - this prints the collection
#   - contents of toks
#   - up to 30
print(coll_repr(toks, 30))

# Tokenization logic needs to handle very subtle context
# Here the char '.' has 3 meanings :
# - 1 - terminator at the end of the sentence
# - 2 - decimal placeholder for '1.00'
# - 3 - abbreviation for United States to U.S.
first(spacy(['The U.S. dollar $1 is $1.00.']))

(#121) ['Alan','Rickman','&','Emma','Thompson','give','good','performances','with','southern','/','New','Orleans','accents','in','this','detective','flick','.','It',"'s",'worth','seeing','for','their','scenes-','and','Rickman',"'s",'scene'...]


(#9) ['The','U.S.','dollar','$','1','is','$','1.00','.']

#### -- Common -- [ Prefixes ( xx ) ]

xx - not common prefix, these are special tokens
- xxbos : Beginning of stream
    - this token indicates the model will learn it needs to "forget" what was  
    said previously and focus on upcoming words
- xxmaj : Indicates the next word begins with a capital 
  (we lower cased everything)
- xxunk : Indicates a word is unknown
- xxrep : !!!!! => `repeated char token` + `!` so we can count repeats as  
  opposed to treating them as unique
- xxwrep : for repeated words as opposed to characters

-- [x] -- Tokenizer(spacy)

@audit : Explain this

In [59]:
tkn = Tokenizer(spacy)
print(coll_repr(tkn(txt)), 31)

(#139) ['xxbos','xxmaj','alan','xxmaj','rickman','&','xxmaj','emma','xxmaj','thompson'...] 31


#### -- FastAi -- [ Text Processing Rules ]

In [60]:
# inspect source code
replace_rep??

# check default rules
defaults.text_proc_rules

[<function fastai.text.core.fix_html(x)>,
 <function fastai.text.core.replace_rep(t)>,
 <function fastai.text.core.replace_wrep(t)>,
 <function fastai.text.core.spec_add_spaces(t)>,
 <function fastai.text.core.rm_useless_spaces(t)>,
 <function fastai.text.core.replace_all_caps(t)>,
 <function fastai.text.core.replace_maj(t)>,
 <function fastai.text.core.lowercase(t, add_bos=True, add_eos=False)>]

[0;31mSignature:[0m [0mreplace_rep[0m[0;34m([0m[0mt[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mreplace_rep[0m[0;34m([0m[0mt[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"Replace repetitions at the character level: cccc -- TK_REP 4 c"[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0m_replace_rep[0m[0;34m([0m[0mm[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0mc[0m[0;34m,[0m[0mcc[0m [0;34m=[0m [0mm[0m[0;34m.[0m[0mgroups[0m[0;34m([0m[0;34m)[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0;34mf' {TK_REP} {len(cc)+1} {c} '[0m[0;34m[0m
[0;34m[0m    [0;32mreturn[0m [0m_re_rep[0m[0;34m.[0m[0msub[0m[0;34m([0m[0m_replace_rep[0m[0;34m,[0m [0mt[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      ~/miniconda3/envs/m1_torch_gpu/lib/python3.10/site-packages/fastai/text/core.py
[0;31mType:[0m      function

#### -- Subword -- [Vocabulary Size]

Subword tokenization provides an easy way to :
- easily scale between character and word tokenization
- handles EVERY human language (not just white space separated)
    - including music and genomic sequences

Vocabulary Size is a trade-off between :

- Larger - fewer tokens per sentences
    - faster training
    - less state
    - downside : LARGER EMBEDDING MATRIX
        - requires MORE data to LEARN

##### -- [x] -- txts 

In [73]:
txts = L(o.open().read() for o in files[:2000])
txts[:2]

(#2) ["Alan Rickman & Emma Thompson give good performances with southern/New Orleans accents in this detective flick. It's worth seeing for their scenes- and Rickman's scene with Hal Holbrook. These three actors mannage to entertain us no matter what the movie, it seems. The plot for the movie shows potential, but one gets the impression in watching the film that it was not pulled off as well as it could have been. The fact that it is cluttered by a rather uninteresting subplot and mostly uninteresting kidnappers really muddles things. The movie is worth a view- if for nothing more than entertaining performances by Rickman, Thompson, and Holbrook.",'I have seen this movie and I did not care for this movie anyhow. I would not think about going to Paris because I do not like this country and its national capital. I do not like to learn french anyhow because I do not understand their language. Why would I go to France when I rather go to Germany or the United Kingdom? Germany and the Unit

In [74]:
# @audit : Explain what is going on here
def subword(sz):
    sp = SubwordTokenizer(vocab_sz=sz)
    sp.setup(txts)
    return ' '.join(first(sp([txt]))[:40])

subword(200)

'▁A l an ▁ R ic k m an ▁ & ▁ E m m a ▁ T h o m p s on ▁ g i ve ▁ g o o d ▁p er f or m an ce'

In [10]:
%pip install --user 'sentencepiece!=0.1.90,!=0.1.91'

Collecting sentencepiece!=0.1.90,!=0.1.91
  Downloading sentencepiece-0.1.99-cp310-cp310-macosx_11_0_arm64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Note: you may need to restart the kernel to use updated packages.


##### Numericalization with Fastai

Numericalization is the process of mapping tokens to integers
- 1 - vocabulary : list of all possible levels of categorical variable
    - RGB captures visual pixel values!
- 2 - replace each level with it's index in the vocab
    - each R, G, B channel has value between 0 - 255

In [75]:
toks = tkn(txt)
print(coll_repr(tkn(txt), 31))

(#139) ['xxbos','xxmaj','alan','xxmaj','rickman','&','xxmaj','emma','xxmaj','thompson','give','good','performances','with','southern','/','xxmaj','new','xxmaj','orleans','accents','in','this','detective','flick','.','xxmaj','it',"'s",'worth','seeing'...]


In [76]:
toks200 = txts[:200].map(tkn)
toks200[0]

(#139) ['xxbos','xxmaj','alan','xxmaj','rickman','&','xxmaj','emma','xxmaj','thompson'...]

In [77]:
num = Numericalize()
num.setup(toks200)
coll_repr(num.vocab, 20)


"(#1984) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the','.',',','and','a','to','of','i','it','is','in'...]"

In [78]:
nums = num(toks)[:20]
nums

TensorText([   2,    8,    0,    8, 1442,  234,    8,    0,    8,    0,  199,
              64,  731,   29,    0,  122,    8,  253,    8,    0])

In [79]:
' '.join(num.vocab[o] for o in nums)

'xxbos xxmaj xxunk xxmaj rickman & xxmaj xxunk xxmaj xxunk give good performances with xxunk / xxmaj new xxmaj xxunk'

##### Putting Our Texts into Batches for a Language Model

With IMAGEs for batching we needed to :

- RESIZE height and width
- so we could group and stack them in a single tensor

With TEXT
- can't resize arbitrarily varied length to fix length
- char order matters to predict next token
- each NEW batch MUST begin precisely where the old batch finished



In [81]:
nums200 = toks200.map(num)

dl = LMDataLoader(nums200)

x,y = first(dl)
x.shape, y.shape
# @udit-ok : Explain this!
# - (torch.Size([64, 4]), torch.Size([64, 4]))   # our results
# - (torch.Size([64, 72]), torch.Size([64, 72])) # book results
# ANSWER : accidentally txts[2] instead of txts[2000]

(torch.Size([64, 72]), torch.Size([64, 72]))

In [82]:
' '.join(num.vocab[o] for o in x[0][:20])

'xxbos xxmaj xxunk xxmaj rickman & xxmaj xxunk xxmaj xxunk give good performances with xxunk / xxmaj new xxmaj xxunk'

In [83]:
' '.join(num.vocab[o] for o in y[0][:20])

'xxmaj xxunk xxmaj rickman & xxmaj xxunk xxmaj xxunk give good performances with xxunk / xxmaj new xxmaj xxunk accents'

## -- Training a Text Classifier --

- 1 - fine-tune our `language model` trained on Wikipedia
- 2 - use that model to train our `classifier`

### Language Model Using DataBlock

In [84]:
get_imdb = partial(
    get_text_files,
    folders = ['train', 'test', 'unsup']
)

In [85]:
# FileNotFoundError: [Errno 2] No such file or directory: 
# '.fastai/data/imdb_tok/counter.pkl'
# This can occur if we cancel this tok process : 
# - it'll cache in a malformed
# - when we rerun, it only checks that the 'imdb_tok' folder exists
# - and tries to load the `counter.pkl` that never actually got completed
dls_lm = DataBlock(
    # @audit : Explain is_lm
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb, 
    splitter=RandomSplitter(0.1)
# ).dataloaders(path, path=path, bs=128, seq_len=80)
# stepping down to fit on M1 mac
).dataloaders(path, path=path, bs=32, seq_len=32)

In [86]:
dls_lm.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"xxbos xxmaj if xxmaj french director xxmaj jaques xxmaj tati were born in xxmaj italy , had a much smaller budget and was somewhat of a pervert he probably would have directed","xxmaj if xxmaj french director xxmaj jaques xxmaj tati were born in xxmaj italy , had a much smaller budget and was somewhat of a pervert he probably would have directed films"
1,xxbos xxmaj what would you say about a man who was about to get married and was having his bachelor party with some of his closest friends at a xxmaj hawaiian guy,xxmaj what would you say about a man who was about to get married and was having his bachelor party with some of his closest friends at a xxmaj hawaiian guy bar


### Fine-Tuning the Language Model



In [87]:
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    drop_mult=0.3,
    metrics=[accuracy, Perplexity()]
).to_fp16()

#### [x] Load Epoch else ~ 1 day of GPU poor training

In [88]:
# @note : takes 940 minutes to train
# epoch	train_loss	valid_loss	accuracy	perplexity	time
#     0	4.004853	3.903127	0.300107	49.557171	15:40:19

# @note : takes 351 minutes to train
# epoch	train_loss	valid_loss	accuracy	perplexity	time
#     0	4.211511	4.084476	0.288052	59.410797	5:51:50
# learn.fit_one_cycle(1, 2e-2)

# Path('/Users/mton/.fastai/data/imdb/models/1epoch.pth')
# learn.save('1epoch')

# Let's load our model instead of performing that MASSIVE training
learn = learn.load('1epoch')

In [27]:
# Finetune after initial model training done
# @audit-ok : 3931 minutes of training lol

# learn.unfreeze()
# learn.fit_one_cycle(10, 2e-3)



epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.914853,3.895737,0.308965,49.192291,6:45:36
1,3.99027,3.962281,0.304625,52.577129,4:51:24
2,3.994842,3.95032,0.30616,51.952,4:32:56
3,3.931293,3.920458,0.309763,50.423538,5:51:19
4,3.829621,3.87581,0.314348,48.221737,4:42:34
5,3.826797,3.819576,0.319951,45.584888,20:24:10
6,3.744042,3.75642,0.326571,42.794952,5:05:19
7,3.695207,3.704155,0.332006,40.61573,4:38:18
8,3.607924,3.672139,0.336082,39.335972,4:18:45
9,3.586068,3.668146,0.336638,39.179214,4:21:06


#### [x] Load Encoder else ~ 3 days of Mac compute lol

In [89]:
# learn.save_encoder('finetuned')
learn.load_encoder('finetuned')

<fastai.text.learner.LMLearner at 0x17c8c9150>

### Text Generation

In [90]:
TEXT = "I liked this movie because"
N_WORDS = 40
N_SENTENCES = 2

preds = [
    learn.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES)
]

print("\n".join(preds))

i liked this movie because i read reviews posted elsewhere beforehand . i rented it because i hate rochon movies . i hated musicals . i saw movies dressed porter sings songs struggling with india . i laughed carefully sometimes . i just wanted to
i liked this movie because it lionel graveyards of egomaniacal cahoots between placard and communists slicing together souls unabated . i love insincere lizabeth polarity zinger gft dictation cookies etc blah blah blah blah blah blah blah blah blah blah blah blah blah blah blah


### Fine-Tuning the Classifier Model

We're now moving :
- from Language Model Finetuning
- to Classifier Finetuning

The Fundamental Difference is :
- Classifier predicts an EXTERNAL LABEL
- Language Model predicts next token (char/word)

In [91]:
dls_clas = DataBlock(
    blocks=(
        # is_lm isn't explicitly set here, so DEFAULTS to FALSE
        # effectively configures TextBlock to use regular labeled data as 
        # opposed to next tokens as labels
        # @audit : Explain like I am 5 ^
        # Sorting and padding is AUTOMATICALLY done by Fast AI data block API
        # when `is_lm` = FALSE
        TextBlock.from_folder(
            path,
            # vocab created for language model fine tuning is passed in here
            # BECAUSE we want to make sure to use the same correspondence to
            # token
            # @audit : Explain like I am 5 ^
            vocab=dls_lm.vocab,
        ), 
        CategoryBlock
    ),
    get_y = parent_label,
    get_items=partial(get_text_files, folders=['train', 'test']),
    splitter=GrandparentSplitter(valid_name='test')
# ).dataloaders(path, path=path, bs=128, seq_len=72)
).dataloaders(path, path=path, bs=64, seq_len=64) # addressing out of memory

#### [x] MPS memory limit issue

```sh
RuntimeError: MPS backend out of memory (MPS allocated: 2.16 GB, other  
allocations: 6.95 GB, max allowed: 9.07 GB). Tried to allocate 2.44 MB on  
private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit  
for memory allocations (may cause system failure).
```

Going to :
- reduce bs=128 > bs=64
- reduce seq_len=72 > seq_len=64

##### ==[ chatGPT ]==

> Yes, reducing both the batch size (bs) and the sequence length (seq_len) can  
help resolve out-of-memory (OOM) errors when training neural networks, including  
those you are working with in FastAI. Here's how these changes can alleviate  
memory issues:

> Reducing Batch Size:
- From bs=128 to bs=64: The batch size determines how many examples you process  
at once. Reducing the batch size decreases the amount of GPU memory required  
during training because fewer data points means less information needs to be  
stored in memory at any one time.

> Reducing Sequence Length:
- From seq_len=72 to seq_len=64: The sequence length in natural language  
processing tasks dictates the length of the sequences that your model processes.  
By shortening the maximum sequence length, each input tensor contains fewer  
tokens, which in turn reduces the memory footprint for both the model inputs and  
the intermediate activations within the model.

> By making these changes, you're instructing FastAI to load less data into  
memory for each training step, which should help you stay within the memory  
limits of your MPS backend.

> However, keep in mind that reducing the batch size might impact the gradient  
estimates during training, potentially affecting the convergence and stability  
of the training process. Similarly, reducing the sequence length might impact  
the context the model can see, which could affect performance if the model  
relies on longer sequences to make predictions. It's often a balance between  
resource constraints and model performance.

> If you apply these changes and still encounter memory issues, you might need  
to consider further reducing the batch size or sequence length, optimizing your  
model architecture, or using a machine with more memory resources.

> Remember to watch out for any changes in model performance as you make these  
adjustments, and validate that the model still learns effectively with the new  
settings.

In [92]:
dls_clas.show_batch(max_n=3)

Unnamed: 0,text,category
0,"xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules of the match , both opponents have to go through tables in order to get the win . xxmaj benoit and xxmaj guerrero heated up early on by taking turns hammering first xxmaj spike and then xxmaj bubba xxmaj ray . a xxmaj german xxunk by xxmaj benoit to xxmaj bubba took the wind out of the xxmaj dudley brother . xxmaj spike tried to help his brother , but the referee restrained him while xxmaj benoit and xxmaj guerrero",pos
1,"xxbos xxmaj warning : xxmaj does contain spoilers . \n\n xxmaj open xxmaj your xxmaj eyes \n\n xxmaj if you have not seen this film and plan on doing so , just stop reading here and take my word for it . xxmaj you have to see this film . i have seen it four times so far and i still have n't made up my mind as to what exactly happened in the film . xxmaj that is all i am going to say because if you have not seen this film , then stop reading right now . \n\n xxmaj if you are still reading then i am going to pose some questions to you and maybe if anyone has any answers you can email me and let me know what you think . \n\n i remember my xxmaj grade 11 xxmaj english teacher quite well . xxmaj",pos
2,"xxbos i thought that xxup rotj was clearly the best out of the three xxmaj star xxmaj wars movies . i find it surprising that xxup rotj is considered the weakest installment in the xxmaj trilogy by many who have voted . xxmaj to me it seemed like xxup rotj was the best because it had the most profound plot , the most suspense , surprises , most xxunk the ending ) and definitely the most episodic movie . i personally like the xxmaj empire xxmaj strikes xxmaj back a lot also but i think it is slightly less good than than xxup rotj since it was slower - moving , was not as episodic , and i just did not feel as much suspense or emotion as i did with the third movie . \n\n xxmaj it also seems like to me that after reading these surprising reviews that",pos


In [94]:
nums_samp = toks200[:10].map(num)

nums_samp.map(len)

(#10) [139,152,226,605,262,242,179,101,373,309]

In [95]:
learn = text_classifier_learner(
    dls_clas,
    AWD_LSTM,
    drop_mult=0.5,
    metrics=accuracy,
).to_fp16()

### FineTuning the Classifier 

In [97]:
learn = learn.load_encoder('finetuned')

In [98]:
learn.fit_one_cycle(1, 2e-2)

# epoch	train_loss	valid_loss	accuracy	time
#     0	0.320355	0.216716	0.914800	4:48:23
# @ 288 minutes

@audit : How did it go from 4.5 hours to 16 minutes?
# epoch	train_loss	valid_loss	accuracy	time
#     0	0.358545	0.228075	0.915920	16:24

epoch,train_loss,valid_loss,accuracy,time
0,0.358545,0.228075,0.91592,16:24


loc("outputTensor"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/f84c363d-9006-11ee-8578-1ae9d66b0597/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":745:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<1x64x1x1401xi1>'
loc("outputTensor"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/f84c363d-9006-11ee-8578-1ae9d66b0597/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":745:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<1x64x1x95xi1>'
loc("outputTensor"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/f84c363d-9006-11ee-8578-1ae9d66b0597/Library/Caches/com.apple.xbs/Sources/MetalPer

#### -- freeze -2 --

In [99]:
# learn.freeze_to(-2)

In [100]:
# learn.fit_one_cycle(1, slice(1e-2/(2.6**4)), 1e-2)

# epoch	train_loss	valid_loss	accuracy	time
#     0	0.265503	0.185434	0.932520	12:54:16
# 774 minutes

epoch,train_loss,valid_loss,accuracy,time
0,0.265503,0.185434,0.93252,12:54:16


loc("outputTensor"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/f84c363d-9006-11ee-8578-1ae9d66b0597/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":745:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<1x64x1x323xi1>'
loc("outputTensor"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/f84c363d-9006-11ee-8578-1ae9d66b0597/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":745:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<1x64x1x537xi1>'
loc("outputTensor"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/f84c363d-9006-11ee-8578-1ae9d66b0597/Library/Caches/com.apple.xbs/Sources/MetalPer

In [101]:
# learn.save_encoder('category_tuned_f-2')
learn.load_encoder('category_tuned_f-2')

#### -- freeze -3 --

In [102]:
learn.freeze_to(-3)

In [103]:
learn.fit_one_cycle(1, slice(5e-3/(2.6**4), 5e-3))

epoch,train_loss,valid_loss,accuracy,time


RuntimeError: MPS backend out of memory (MPS allocated: 2.86 GB, other allocations: 6.38 GB, max allowed: 9.07 GB). Tried to allocate 20.25 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).