In [1]:
!pip install fasthugs

Collecting fasthugs
  Downloading fasthugs-0.0.1-py3-none-any.whl (15 kB)
Collecting typing-extensions
  Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
Installing collected packages: typing-extensions, fasthugs
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.2.0
    Uninstalling typing_extensions-4.2.0:
      Successfully uninstalled typing_extensions-4.2.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-io 0.21.0 requires tensorflow-io-gcs-filesystem==0.21.0, which is not installed.
tensorflow 2.6.4 requires absl-py~=0.10, but you have absl-py 1.0.0 which is incompatible.
tensorflow 2.6.4 requires numpy~=1.19.2, but you have numpy 1.21.6 which is incompatible.
tensorflow 2.6.4 requires six~=1.15.0, but you have six 1.16.0 which is incompatible.
tensorflow 2.6.4 requires wrapt~=1.12

In [2]:
from transformers import AutoModelForSequenceClassification
from fastai.text.all import *
from fastai.callback.wandb import *

from fasthugs.learner import TransLearner
from fasthugs.data import TransformersTextBlock, TextGetter, get_splits, PreprocCategoryBlock

from datasets import load_dataset, concatenate_datasets

import random 
import numpy as np
import torch

def random_seed(seed_value):
    """Seed the random number generators used in this notebook.

    The same value is handed, in turn, to NumPy's global generator, to
    PyTorch and to Python's built-in ``random`` module.

    Args:
        seed_value: Seed passed unchanged to each of the three libraries.
    """
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed_value)


random_seed(42)

In [3]:
# --- Experiment configuration ---
# Dataset collection and pretrained checkpoint to fine-tune.
ds_name = 'glue'
model_name = "AnonymousSub/rule_based_roberta_hier_triplet_epochs_1_shard_1"

# NOTE(review): max_len is not referenced by any cell visible here.
max_len = 512

# Batch sizes: the validation batch is twice the training batch.
bs = 32
val_bs = 2 * bs

# Learning rate passed to fit_one_cycle.
lr = 3e-5

In [4]:
GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]
def validate_task(task_name=None):
    """Check that a task name is one of the names listed in ``GLUE_TASKS``.

    Args:
        task_name: Name to check. When omitted, the notebook-level ``task``
            variable is used, so the original zero-argument call keeps working.

    Raises:
        AssertionError: If the name is not in ``GLUE_TASKS``. The message
            names the rejected value and lists the accepted ones.
        NameError: If ``task_name`` is omitted and ``task`` is not defined yet.
    """
    if task_name is None:
        # Fall back to the notebook global, as the zero-argument form always did.
        task_name = task
    if task_name not in GLUE_TASKS:
        # Raised explicitly rather than via a bare ``assert`` so the check is
        # not stripped under ``python -O`` and the failure explains itself.
        raise AssertionError(
            f"Unknown GLUE task {task_name!r}; expected one of {GLUE_TASKS}"
        )

In [5]:
from fastai.metrics import MatthewsCorrCoef, F1Score, PearsonCorrCoef, SpearmanCorrCoef

In [6]:
# Metrics reported for each GLUE task, keyed by task name.
# 'mnli-mm' is included so that every name accepted by GLUE_TASKS has an entry
# and `glue_metrics[task]` cannot fail with a KeyError for a validated task.
glue_metrics = {
    'cola':[MatthewsCorrCoef()],
    'sst2':[accuracy],
    'mrpc':[F1Score(), accuracy],
    'stsb':[PearsonCorrCoef(), SpearmanCorrCoef()],
    'qqp' :[F1Score(), accuracy],
    'mnli':[accuracy],
    'mnli-mm':[accuracy],
    'qnli':[accuracy],
    'rte' :[accuracy],
    'wnli':[accuracy],
}

In [7]:
# Choose the GLUE task for this run and check it against GLUE_TASKS.
task = 'stsb'
validate_task()

In [8]:
# Download (or reuse the cached copy of) the dataset for the selected task.
# NOTE(review): 'mnli-mm' passes validate_task but may not be a config name this
# loader accepts — TODO confirm before running with that task.
ds = load_dataset(ds_name, task)

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/stsb (download: 784.05 KiB, generated: 1.09 MiB, post-processed: Unknown size, total: 1.86 MiB) to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/803k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
ds.keys()

dict_keys(['train', 'validation', 'test'])

In [10]:
len(ds['train']), len(ds['validation'])

(5749, 1500)

In [11]:
# get_splits (from fasthugs.data) returns the train and validation indices.
# NOTE(review): presumably these are positions in train+validation concatenated
# in that order — the displayed valid_idx starts at len(ds['train']); confirm.
train_idx, valid_idx = get_splits(ds)
valid_idx

(#1500) [5749,5750,5751,5752,5753,5754,5755,5756,5757,5758...]

In [12]:
# Stack the train and validation splits into one dataset; the DataBlock's
# IndexSplitter(valid_idx) separates them again.
train_ds = concatenate_datasets([ds['train'], ds['validation']])

In [13]:
train_ds[0]

{'sentence1': 'A plane is taking off.',
 'sentence2': 'An air plane is taking off.',
 'label': 5.0,
 'idx': 0}

In [14]:
# x: the 'sentence1' / 'sentence2' pair, handled by the transformers text block.
# y: the 'label' field as a single regression target.
# Validation rows are exactly those listed in valid_idx.
dblock = DataBlock(
    blocks=[
        TransformersTextBlock(pretrained_model_name=model_name),
        RegressionBlock(1),
    ],
    get_x=TextGetter('sentence1', 'sentence2'),
    get_y=ItemGetter('label'),
    splitter=IndexSplitter(valid_idx),
)

Downloading:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [15]:
%%time
# Build the DataLoaders from the combined dataset with the configured batch sizes.
dls = dblock.dataloaders(train_ds, bs=bs, val_bs=val_bs)

CPU times: user 6.91 s, sys: 1.43 s, total: 8.35 s
Wall time: 12.1 s


In [16]:
dls.show_batch(max_n=5)

Unnamed: 0,text,text_,text__
0,A plane is taking off.,An air plane is taking off.,5.0
1,Sony Xperia Z2 and Nokia’s X series unveiled,Court orders political ban on Italy’s Berlusconi,0.0
2,He's not wealthy because he's successful.,Why hate people because they are successful?,0.800000011920929
3,Three more US soldiers killed in Afghanistan,NATO Soldier Killed in Afghanistan,1.7999999523162842
4,"The songs are on offer for 99 cents each, or $9.99 for an album.",The company will offer songs for 99 cents and albums for $9.95.,3.3329999446868896


In [17]:
import wandb

# Metadata attached to the Weights & Biases run started in the next cell.
WANDB_NAME = f'{ds_name}-{task}-{model_name}'
GROUP = f'{ds_name}-{task}-{model_name}-{lr:.0e}'
# NOTE(review): the notes and tags mention RAdam, but no optimizer is selected in
# the visible cells — confirm which optimizer TransLearner uses by default.
NOTES = f'finetuning {model_name} with RAdam lr={lr:.0e}'
# Record the hyperparameters with the run so it can be compared and reproduced;
# an empty config leaves the run without any of these values.
CONFIG = {
    'ds_name': ds_name,
    'task': task,
    'model_name': model_name,
    'max_len': max_len,
    'bs': bs,
    'val_bs': val_bs,
    'lr': lr,
}
TAGS =[model_name, ds_name, 'radam']

In [18]:
# Start a W&B run using the metadata defined above; reinit=True is passed so the
# cell can be re-run. The trailing `;` suppresses the cell's output repr.
wandb.init(reinit=True, project="fasthugs", entity="fastai_community",
           name=WANDB_NAME, group=GROUP, notes=NOTES, tags=TAGS, config=CONFIG);

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
# num_labels=1 requests a single-output head, matching RegressionBlock(1).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
# Metrics for the selected task.
metrics = glue_metrics[task]
# Wrap the data and model in a learner, then switch it to fp16.
learn = TransLearner(dls, model, metrics=metrics)
learn = learn.to_fp16()

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the model checkpoint at AnonymousSub/rule_based_roberta_hier_triplet_epochs_1_shard_1 were not used when initializing RobertaForSequenceClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at AnonymousSub/rule_based_roberta_hier_triplet_epochs_1_shard_1 and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_p

In [20]:
# Attach the W&B callback so the run opened by wandb.init receives the training
# metrics; with an empty callback list nothing is logged to that run.
# Prediction and model uploads are switched off.
cbs = [WandbCallback(log_preds=False, log_model=False)]
# Train for 10 epochs with the one-cycle schedule, using `lr` from the config cell.
learn.fit_one_cycle(10, lr, cbs=cbs)

epoch,train_loss,valid_loss,pearsonr,spearmanr,time
0,1.818572,0.679656,0.85031,0.847685,00:49
1,0.568844,0.464313,0.894114,0.894649,00:52
2,0.389603,0.453627,0.904373,0.903994,00:52
3,0.279905,0.492652,0.905581,0.904163,00:51
4,0.205784,0.505553,0.901399,0.89822,00:52
5,0.173306,0.473948,0.909177,0.906299,00:51
6,0.126259,0.422385,0.906482,0.904919,00:52
7,0.105504,0.439509,0.909816,0.906624,00:51
8,0.083578,0.419202,0.909292,0.906407,00:52
9,0.082968,0.426244,0.909239,0.906327,00:52


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/to

In [21]:
learn.show_results()

Unnamed: 0,text,text_,text__,text___
0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,5.0,"(5.20703125,)"
1,"The woman felt threatened and went to the magistrate's office, police said.",The woman reported that she felt threatened and obtained a warrant for Stackhouse's arrest from the local magistrate's office.,3.25,"(3.505859375,)"
2,We always put it crushed up in a bit of butter or on a sardine.,"We had a cat that required daily medication, and it was a struggle to give the cat a pill.",1.399999976158142,"(1.61328125,)"
3,Aristotle didn't put the world at the center of the universe per se.,Why did Aristotle place the earth at the centre of an infinite universe?,2.799999952316284,"(2.1796875,)"
4,Webster's New World Rhyming Dictionary: Clement Wood's Updated This is the rhyming dictionary I turn to first.,"http://www.rhymezone.com/ You just type in a word, then select one of the following.",1.2000000476837158,"(2.625,)"
5,Indian police round up all five suspects in Mumbai rape case,Mumbai police arrest fifth suspect in gang-rape case,3.799999952316284,"(4.52734375,)"
6,The puppy is outdoor.,A man in printed board shorts is doing a yoga pose on the beach.,0.0,"(0.0799560546875,)"
7,A person is scaling a rock wall.,A person and a horse are above a fence.,0.2000000029802322,"(0.625,)"
8,"My experience with Danios has that they always have been ""bossy"" fish.","Breed Zebra Danios are extremely hardy fish, they are almost difficult to kill.",3.0,"(2.65234375,)"
