In [2]:
!pip install fasthugs

Collecting fasthugs
  Downloading fasthugs-0.0.1-py3-none-any.whl (15 kB)
Collecting typing-extensions
  Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
Installing collected packages: typing-extensions, fasthugs
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.2.0
    Uninstalling typing_extensions-4.2.0:
      Successfully uninstalled typing_extensions-4.2.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-io 0.21.0 requires tensorflow-io-gcs-filesystem==0.21.0, which is not installed.
tensorflow 2.6.4 requires absl-py~=0.10, but you have absl-py 1.0.0 which is incompatible.
tensorflow 2.6.4 requires numpy~=1.19.2, but you have numpy 1.21.6 which is incompatible.
tensorflow 2.6.4 requires six~=1.15.0, but you have six 1.16.0 which is incompatible.
tensorflow 2.6.4 requires wrapt~=1.12

In [3]:
from transformers import AutoModelForSequenceClassification
from fastai.text.all import *
from fastai.callback.wandb import *

from fasthugs.learner import TransLearner
from fasthugs.data import TransformersTextBlock, TextGetter, get_splits, PreprocCategoryBlock

from datasets import load_dataset, concatenate_datasets

import random 
import numpy as np
import torch

def random_seed(seed_value):
    """Seed the NumPy, PyTorch and stdlib `random` generators with `seed_value`."""
    # Seeded in turn: NumPy global RNG, torch, then the stdlib generator.
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed_value)


# Fix the seeds up front so the cells that follow start from a known RNG state.
random_seed(42)

In [4]:
# --- Experiment configuration ---
ds_name = "glue"  # dataset name handed to load_dataset
model_name = "AnonymousSub/rule_based_roberta_hier_triplet_epochs_1_shard_1"  # checkpoint used for tokenizer and model

# NOTE(review): max_len is not referenced by any cell visible in this notebook — confirm it is still needed.
max_len = 512
bs = 32  # training batch size
val_bs = 2 * bs  # validation batch size

lr = 3e-5  # learning rate passed to fit_one_cycle

In [5]:
# Task names accepted by validate_task.
# NOTE(review): "mnli-mm" has no entry in glue_metrics and `task` is passed verbatim to
# load_dataset — confirm that name is usable before selecting it.
GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]


def validate_task(task_name=None):
    """Check that a task name is one of GLUE_TASKS.

    Args:
        task_name: Name to check. When omitted, the notebook-level `task`
            variable is read at call time, so a bare `validate_task()` still works.

    Raises:
        AssertionError: if the name is not in GLUE_TASKS; the message lists the valid names.
        NameError: if no name is given and `task` has not been defined yet.
    """
    if task_name is None:
        task_name = task  # fall back to the notebook global
    assert task_name in GLUE_TASKS, (
        f"unknown GLUE task {task_name!r}; expected one of {GLUE_TASKS}"
    )

In [6]:
from fastai.metrics import MatthewsCorrCoef, F1Score, PearsonCorrCoef, SpearmanCorrCoef

In [7]:
# Metrics reported for each GLUE task, keyed by task name.
# The training cell monitors the first metric of the selected task when saving the model.
glue_metrics = dict(
    cola=[MatthewsCorrCoef()],
    sst2=[accuracy],
    mrpc=[F1Score(), accuracy],
    stsb=[PearsonCorrCoef(), SpearmanCorrCoef()],
    qqp=[F1Score(), accuracy],
    mnli=[accuracy],
    qnli=[accuracy],
    rte=[accuracy],
    wnli=[accuracy],
)

In [8]:
# Choose the GLUE task for this run; validate_task() reads this global and asserts it is a known task.
task = 'mrpc'
validate_task()

In [9]:
# Load the selected GLUE task (downloaded on first use, reused from the local cache afterwards).
ds = load_dataset(ds_name, task)

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# List the splits that were loaded.
ds.keys()

dict_keys(['train', 'validation', 'test'])

In [11]:
# Number of examples in the train and validation splits.
len(ds['train']), len(ds['validation'])

(3668, 408)

In [12]:
# Row-index lists for the two splits; valid_idx is handed to IndexSplitter when the DataBlock is built.
train_idx, valid_idx = get_splits(ds)
valid_idx

(#408) [3668,3669,3670,3671,3672,3673,3674,3675,3676,3677...]

In [13]:
# Stack train and validation into one dataset (train rows first) so it can be split by index later.
train_ds = concatenate_datasets([ds['train'], ds['validation']])

In [14]:
# Peek at one raw example to see the available fields.
train_ds[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [15]:
# Class names of the label column.
vocab = train_ds.features['label'].names

# Input block (tokenizer taken from the pretrained checkpoint) and target block.
text_block = TransformersTextBlock(pretrained_model_name=model_name)
label_block = PreprocCategoryBlock(vocab)

# x is the sentence pair, y is the label; rows listed in valid_idx form the validation set.
dblock = DataBlock(
    blocks=[text_block, label_block],
    get_x=TextGetter('sentence1', 'sentence2'),
    get_y=ItemGetter('label'),
    splitter=IndexSplitter(valid_idx),
)

Downloading:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [16]:
%%time
# Build the train/validation DataLoaders from the combined dataset using the configured batch sizes.
dls = dblock.dataloaders(train_ds, bs=bs, val_bs=val_bs)

CPU times: user 8.17 s, sys: 1.76 s, total: 9.93 s
Wall time: 14.2 s


In [17]:
# Sanity-check a few decoded sentence pairs and their labels.
dls.show_batch(max_n=5)

Unnamed: 0,text,text_,category
0,"Amrozi accused his brother, whom he called "" the witness "", of deliberately distorting his evidence.","Referring to him as only "" the witness "", Amrozi accused his brother of deliberately distorting his evidence.",equivalent
1,"Most of the alleged spammers engaged in fraudulent or deceptive practices, said Brad Smith, Microsoft's senior VP and general counsel.",""" Spam knows no borders, "" said Brad Smith, Microsoft's senior vice-president and general counsel.",not_equivalent
2,"Yesterday, Taiwan reported 35 new infections, bringing the total number of cases to 418.","The island reported another 35 probable cases yesterday, taking its total to 418.",equivalent
3,"A month ago, the Commerce Department estimated that GDP had grown at a 7.2 percent rate in the third quarter.","A month ago, the Commerce Department said GDP grew at a 7.2 percent rate.",equivalent
4,Gillespie sent a letter to CBS President Leslie Moonves asking for a historical review or a disclaimer.,Republican National Committee Chairman Ed Gillespie issued a letter Friday to CBS Television President Leslie Moonves.,not_equivalent


In [18]:
import wandb

# Run metadata passed to wandb.init in the next cell.
WANDB_NAME = f'{ds_name}-{task}-{model_name}'
GROUP = f'{ds_name}-{task}-{model_name}-{lr:.0e}'  # group runs that share dataset, task, model and learning rate
# NOTE(review): the notes and tags say RAdam, but no opt_func is passed when the learner is
# created in this notebook — confirm which optimizer is actually used.
NOTES = f'finetuning {model_name} with RAdam lr={lr:.0e}'
CONFIG = {}  # no extra config values are supplied here
TAGS =[model_name, ds_name, 'radam']

In [19]:
# Start the W&B run with the metadata defined above; the trailing `;` hides the cell's return value.
# As the captured output shows, this prompts for an API key when none is stored.
wandb.init(reinit=True, project="fasthugs", entity="fastai_community",
           name=WANDB_NAME, group=GROUP, notes=NOTES, tags=TAGS, config=CONFIG);

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
# Size the classification head from the label vocabulary instead of relying on the checkpoint
# config, so a task with a different number of classes gets a matching head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(vocab))
# Metrics registered for the selected task.
metrics = glue_metrics[task]
# Wrap the dataloaders and model in a learner and switch it to mixed-precision training.
learn = TransLearner(dls, model, metrics=metrics).to_fp16()

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the model checkpoint at AnonymousSub/rule_based_roberta_hier_triplet_epochs_1_shard_1 were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at AnonymousSub/rule_based_roberta_hier_triplet_epochs_1_shard_1 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.de

In [21]:
# Monitor the learner's first metric when saving the model. `learn.metrics` holds the
# metrics after fastai has wrapped them into Metric objects, which expose `.name`; a plain
# function such as `accuracy` has no `.name`, so reading it from the raw `metrics` list
# fails for every task whose first metric is `accuracy`.
cbs = [WandbCallback(log_preds=False, log_model=False), SaveModelCallback(monitor=learn.metrics[0].name)]
# 10 epochs with the one-cycle schedule at the configured learning rate.
learn.fit_one_cycle(10, lr, cbs=cbs)

Could not gather input dimensions


epoch,train_loss,valid_loss,f1_score,accuracy,time
0,0.607585,0.56215,0.812227,0.683824,00:37
1,0.480501,0.360215,0.892794,0.85049,00:36
2,0.342549,0.294814,0.898917,0.862745,00:36
3,0.224088,0.312952,0.902622,0.872549,00:35
4,0.13599,0.357275,0.926746,0.894608,00:36
5,0.063998,0.379349,0.930973,0.904412,00:35
6,0.03244,0.415722,0.925,0.897059,00:35
7,0.023815,0.46187,0.927944,0.89951,00:35
8,0.013648,0.504336,0.929078,0.901961,00:37
9,0.012766,0.506984,0.923351,0.894608,00:36


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/to

In [22]:
# Show a sample of inputs next to their target and predicted labels.
learn.show_results()

Unnamed: 0,text,text_,category,category_
0,He said the foodservice pie business doesn 't fit the company's long-term growth strategy.,""" The foodservice pie business does not fit our long-term growth strategy.",equivalent,equivalent
1,"About 1,500 police will be deployed for the visit.","Around 1,500 police are to be deployed at Niigata for the ferry's visit.",equivalent,equivalent
2,Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war.,"His wife said he was "" 100 percent behind George Bush "" and looked forward to using his years of training in the war.",not_equivalent,not_equivalent
3,Federal Agriculture Minister Warren Truss said the Government still did not know the real reason the sheep were rejected at the Saudi port of Jeddah on August 21.,He said the Government still did not know the real reason the original Saudi buyer pulled out on August 21.,equivalent,equivalent
4,"BREAST cancer cases in the UK have hit an all-time high with more than 40,000 women diagnosed with the disease each year, Cancer Re-search UK revealed yesterday.","Cases of breast cancer in Britain have reached a record high, with the number of women diagnosed with the disease passing the 40,000 mark for the first time.",equivalent,equivalent
5,"With these assets, Funny Cide has a solid chance to become the first Triple Crown winner since Affirmed in 1978.",Funny Cide is looking to become horse racing's first Triple Crown winner in a generation.,not_equivalent,not_equivalent
6,"Gilead had earnings of $ 73.1 million, or 33 cents a share, compared with $ 20.8 million, or 10 cents, in the year-ago quarter.","Quarterly profit climbed to $ 73.1 million, or 33 cents a share, from $ 20.8 million, or 10 cents, a year earlier, the company said.",equivalent,equivalent
7,"Mr. Clinton's national security adviser, Sandy Berger, said that the White House wasn 't informed of the FBI activities.","Clinton ’ s national security adviser, Sandy Berger, said in an interview that the White House was not informed of the FBI activities.",equivalent,equivalent
8,"He said : "" For the first time there is an easy and affordable way of making this treasure trove of BBC content available to all. """,""" For the first time, there is an easy and affordable way of making this treasure trove of BBC content available to all, "" Dyke said.",equivalent,equivalent
