<a href="https://colab.research.google.com/github/lkarjun/fastai-huggingface-workouts/blob/main/notebook3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Packages

In [3]:
!pip install -qq transformers[sentencepiece] datasets
!pip install -qq fastai ohmeow-blurr

[?25l[K     |▎                               | 10 kB 16.1 MB/s eta 0:00:01[K     |▋                               | 20 kB 23.1 MB/s eta 0:00:01[K     |▉                               | 30 kB 28.2 MB/s eta 0:00:01[K     |█▏                              | 40 kB 32.7 MB/s eta 0:00:01[K     |█▌                              | 51 kB 22.0 MB/s eta 0:00:01[K     |█▊                              | 61 kB 17.6 MB/s eta 0:00:01[K     |██                              | 71 kB 14.3 MB/s eta 0:00:01[K     |██▎                             | 81 kB 15.7 MB/s eta 0:00:01[K     |██▋                             | 92 kB 15.2 MB/s eta 0:00:01[K     |███                             | 102 kB 13.9 MB/s eta 0:00:01[K     |███▏                            | 112 kB 13.9 MB/s eta 0:00:01[K     |███▌                            | 122 kB 13.9 MB/s eta 0:00:01[K     |███▉                            | 133 kB 13.9 MB/s eta 0:00:01[K     |████                            | 143 kB 13.9 MB/s eta 0:

In [6]:
!pip install -qq wandb

## Fine-tuning with **Blurr**

In [8]:
import wandb

from fastai.text.all import *
from fastai.callback.wandb import *

from datasets import load_dataset, concatenate_datasets
from transformers import AutoModelForSequenceClassification

from blurr.utils import BLURR
from blurr.data.core import HF_TextBlock
from blurr.modeling.core import Blearner, HF_BaseModelWrapper, HF_BaseModelCallback, hf_splitter

In [9]:
# Authenticate this session with Weights & Biases.
# The recorded output shows the API key being appended to /root/.netrc and the call returning True.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [24]:
# Name of the pretrained checkpoint to fine-tune, and the transformers auto-class
# (sequence classification) that will be used to instantiate it.
checkpoint = 'bert-base-uncased'
model_cls = AutoModelForSequenceClassification

In [11]:
# Batch sizes -- presumably for the training and validation dataloaders (not used in the cells shown here).
bsz = 8
val_bsz = 16

# Run metadata for Weights & Biases; judging by the name, meant to be passed to `wandb.init(**...)`.
# NOTE(review): 'entity' is set to 'ohmeow' -- confirm this is an account/team you can log runs to.
wandb_init_kwargs = dict(
    reinit=True,
    project='blurr',
    entity='ohmeow',
    group='glue-mrpc',
    name='glue-mrpc-bert-base-uncased',
    notes='Finetuning glue-mrpc with Blurr',
    tags=['bert', 'glue-mrpc', 'blurr'],
)

### Using the low- / mid-level API

In [12]:
# Fetch the MRPC configuration of the GLUE benchmark (downloaded on first use, then cached locally).
raw_datasets = load_dataset(path='glue', name='mrpc')

Downloading:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading: 0.00B [00:00, ?B/s]

Downloading: 0.00B [00:00, ?B/s]

Downloading: 0.00B [00:00, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
# Show, in order: the available splits, the first training example, and the column schema.
# Each item is followed by a blank line.
for summary in (
    raw_datasets,
    raw_datasets['train'][0],
    raw_datasets['train'].features,
):
    print(summary, end="\n\n")

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}

{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None), 'idx': Value(dtype='int32', id=None)}



## Data Preparation

In [16]:
# Row counts of the train and validation splits; the next cell turns these into index ranges.
n_train = raw_datasets['train'].num_rows
n_valid = raw_datasets['validation'].num_rows

In [17]:
# Row positions of each split inside the concatenated dataset built below:
# training rows occupy 0 .. n_train-1 and validation rows follow directly after.
# Fix: each range now gets its own L(...). The previous form, L(range(...), L(range(...))),
# built one 2-element L that merely happened to unpack into two names, leaving
# train_idxs as a bare `range` instead of an `L`.
train_idxs, valid_idxs = L(range(n_train)), L(range(n_train, n_train + n_valid))

# Stack train and validation (train first, matching the index ranges above) so a single
# DataBlock with an index-based splitter can serve both splits.
raw_ds = concatenate_datasets([raw_datasets['train'], raw_datasets['validation']])

In [19]:
# Count the distinct label values actually present in the combined train + validation data.
n_lbls = len({example['label'] for example in raw_ds})
n_lbls

2

In [29]:
# Ask blurr for the four Hugging Face objects tied to `checkpoint`, unpacked here as
# architecture, config, tokenizer and model. The config is told how many labels the task has.
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(
    checkpoint,
    model_cls=model_cls,
    config_kwargs={'num_labels': n_lbls},
)

In [None]:
# Dump the loaded model's text representation for inspection.
# NOTE(review): this cell has no execution count (In [None]) and its output is not saved in the notebook.
print(hf_model)

In [42]:
# Inputs are handled by blurr's Hugging Face text block; targets are plain categories.
blocks = (
    HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model),
    CategoryBlock(),
)

# Each row supplies the (sentence1, sentence2) pair as x and its 'label' as y.
# The splitter is given valid_idxs, i.e. the positions of the validation rows in the concatenated dataset.
dblock = DataBlock(
    blocks=blocks,
    get_x=itemgetter('sentence1', 'sentence2'),
    get_y=itemgetter('label'),
    splitter=IndexSplitter(valid_idxs),
)