In [1]:
!pip install fasthugs

Collecting fasthugs
  Downloading fasthugs-0.0.1-py3-none-any.whl (15 kB)
Collecting typing-extensions
  Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
Installing collected packages: typing-extensions, fasthugs
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.2.0
    Uninstalling typing_extensions-4.2.0:
      Successfully uninstalled typing_extensions-4.2.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-io 0.21.0 requires tensorflow-io-gcs-filesystem==0.21.0, which is not installed.
tensorflow 2.6.4 requires absl-py~=0.10, but you have absl-py 1.0.0 which is incompatible.
tensorflow 2.6.4 requires numpy~=1.19.2, but you have numpy 1.21.6 which is incompatible.
tensorflow 2.6.4 requires six~=1.15.0, but you have six 1.16.0 which is incompatible.
tensorflow 2.6.4 requir

In [2]:
from transformers import AutoModelForSequenceClassification
from fastai.text.all import *
from fastai.callback.wandb import *

from fasthugs.learner import TransLearner
from fasthugs.data import TransformersTextBlock, TextGetter, get_splits, PreprocCategoryBlock

from datasets import load_dataset, concatenate_datasets

import random 
import numpy as np
import torch

def random_seed(seed_value):
    """Seed the global NumPy, PyTorch and Python `random` generators.

    Args:
        seed_value: Seed forwarded unchanged to each of the three libraries.

    Returns:
        None.
    """
    # Same order as before: NumPy, then PyTorch, then the stdlib generator.
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed_value)


# Fix the seed once, up front, for the rest of the notebook.
random_seed(42)

In [3]:
# --- Experiment configuration -------------------------------------------
# Dataset family and the pretrained checkpoint that gets fine-tuned.
ds_name = "glue"
model_name = "AnonymousSub/rule_based_roberta_hier_triplet_epochs_1_shard_1"

# Sequence / batch sizing. The validation batch is twice the training batch.
# NOTE(review): max_len is not referenced by any other cell visible in this
# notebook - confirm whether it was meant to be passed to the tokenizer.
max_len = 512
bs = 32
val_bs = 2 * bs

# Learning rate handed to fit_one_cycle and embedded in the run metadata.
lr = 3e-5

In [4]:
# Task identifiers this notebook accepts.
GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]


def validate_task(task_name=None):
    """Check that a task identifier is one of the entries in GLUE_TASKS.

    Args:
        task_name: Task identifier to check. When omitted, the notebook-level
            global `task` is used, which keeps the existing zero-argument
            call `validate_task()` working.

    Raises:
        AssertionError: If the task is not listed in GLUE_TASKS. It is raised
            explicitly (instead of via an `assert` statement) so the check
            still runs when Python is started with -O, and so the message
            names the offending value.
        NameError: If no argument is given and the global `task` has not
            been defined yet.
    """
    if task_name is None:
        # Backward-compatible path: read the global set in a later cell.
        task_name = task
    if task_name not in GLUE_TASKS:
        raise AssertionError(
            f"Unknown GLUE task {task_name!r}; expected one of {GLUE_TASKS}"
        )

In [5]:
from fastai.metrics import MatthewsCorrCoef, F1Score, PearsonCorrCoef, SpearmanCorrCoef

In [6]:
# Metrics reported for each task, keyed by the identifiers in GLUE_TASKS.
# Every identifier accepted by validate_task() needs an entry here, because
# the learner cell looks the metrics up with glue_metrics[task].
glue_metrics = {
    'cola':[MatthewsCorrCoef()],
    'sst2':[accuracy],
    'mrpc':[F1Score(), accuracy],
    'stsb':[PearsonCorrCoef(), SpearmanCorrCoef()],
    'qqp' :[F1Score(), accuracy],
    'mnli':[accuracy],
    # 'mnli-mm' is listed in GLUE_TASKS but previously had no entry, so the
    # lookup raised KeyError; it is scored with accuracy like 'mnli'.
    'mnli-mm':[accuracy],
    'qnli':[accuracy],
    'rte' :[accuracy],
    'wnli':[accuracy],
}

In [7]:
# Pick the sub-task for this run and check it against GLUE_TASKS right away,
# before any data is downloaded.
task = "qqp"
validate_task()

In [8]:
# Download (or reuse from the local cache) the dataset for the chosen task;
# `task` is forwarded unchanged as the configuration name.
# NOTE(review): 'mnli-mm' from GLUE_TASKS would be passed straight through
# here - confirm it is a configuration name that `glue` actually provides.
ds = load_dataset(ds_name, task)

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/qqp (download: 39.76 MiB, generated: 106.55 MiB, post-processed: Unknown size, total: 146.32 MiB) to /root/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/363846 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/40430 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/390965 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# Show which splits the loaded dataset provides.
ds.keys()

dict_keys(['train', 'validation', 'test'])

In [10]:
# Number of examples in the train and validation splits.
len(ds['train']), len(ds['validation'])

(363846, 40430)

In [11]:
# Index lists for the train / validation items, as computed by fasthugs'
# get_splits. In the recorded output valid_idx has 40430 entries and starts
# at 363846, i.e. at len(ds['train']) - presumably positions within the
# train + validation concatenation built in the next cell; verify against
# get_splits.
train_idx, valid_idx = get_splits(ds)
valid_idx

(#40430) [363846,363847,363848,363849,363850,363851,363852,363853,363854,363855...]

In [12]:
# Despite the name, train_ds holds the train rows followed by the validation
# rows; the DataBlock's IndexSplitter(valid_idx) separates them again later.
train_ds = concatenate_datasets([ds['train'], ds['validation']])

In [13]:
# Peek at the first record to see the available fields.
train_ds[0]

{'question1': 'How is the life of a math student? Could you describe your own experiences?',
 'question2': 'Which level of prepration is enough for the exam jlpt5?',
 'label': 0,
 'idx': 0}

In [14]:
# Label names as listed by the dataset's `label` feature.
vocab = train_ds.features['label'].names

# Data pipeline description:
#   * inputs  - the two question columns, processed by the text block built
#               for the `model_name` checkpoint;
#   * targets - the `label` column, mapped through `vocab`;
#   * split   - rows whose position is in valid_idx form the validation set.
# The column names 'question1' / 'question2' match the record shown above for
# this task; other tasks may need different columns.
dblock = DataBlock(
    blocks=[
        TransformersTextBlock(pretrained_model_name=model_name),
        PreprocCategoryBlock(vocab),
    ],
    get_x=TextGetter('question1', 'question2'),
    get_y=ItemGetter('label'),
    splitter=IndexSplitter(valid_idx),
)

Downloading:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [15]:
%%time
dls = dblock.dataloaders(train_ds, bs=bs, val_bs=val_bs)

CPU times: user 4min 42s, sys: 2.35 s, total: 4min 45s
Wall time: 4min 52s


In [16]:
# Sanity check: display up to five items from a batch with their category.
dls.show_batch(max_n=5)

Unnamed: 0,text,text_,category
0,How is the life of a math student? Could you describe your own experiences?,Which level of prepration is enough for the exam jlpt5?,not_duplicate
1,How vulnerable are paratroopers on their descent? If they are fired at can they fire back?,Are conventional paratroopers obsolete?,not_duplicate
2,Examples of sole proprietorship? The,How are sole proprietorships started?,not_duplicate
3,Why don't I get answers for some of my questions on Quora?,Why do some questions get more answers here in Quora?,duplicate
4,Do Kashmiris wants to join Pakistan?,Do the people of Kashmir want to join Pakistan or India? Why?,not_duplicate


In [17]:
import wandb

# Run metadata intended for Weights & Biases.
# NOTE(review): none of the cells visible in this notebook pass these values
# to wandb (the training cell uses an empty callback list) - confirm whether
# logging was meant to be enabled.
WANDB_NAME = f'{ds_name}-{task}-{model_name}'
GROUP = f'{ds_name}-{task}-{model_name}-{lr:.0e}'
NOTES = f'finetuning {model_name} with RAdam lr={lr:.0e}'

TAGS = [
    model_name,
    ds_name,
    'radam',
]
CONFIG = {}

In [18]:
# Load the pretrained encoder with a classification head. The recorded
# loading output reports the classifier weights as newly initialized, so the
# head is trained from scratch.
# The head size is set explicitly from the label vocabulary instead of
# relying on the checkpoint's default, so tasks with more than two classes
# (e.g. MNLI) get a matching number of outputs. For QQP this is 2.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(vocab),
)

# Metrics registered for the selected task.
metrics = glue_metrics[task]

# Wrap data + model in the fasthugs learner and switch to fp16 training.
learn = TransLearner(dls, model, metrics=metrics).to_fp16()

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the model checkpoint at AnonymousSub/rule_based_roberta_hier_triplet_epochs_1_shard_1 were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at AnonymousSub/rule_based_roberta_hier_triplet_epochs_1_shard_1 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_pro

In [19]:
# No extra callbacks are attached for this run.
cbs = []

# One-cycle schedule: 10 epochs with `lr` as the peak learning rate.
# Observations from the recorded run (about 46 minutes per epoch):
#   * valid_loss is lowest at epoch 3 (0.2287) and climbs to 0.4451 by
#     epoch 9, while train_loss keeps falling to 0.0201;
#   * f1_score / accuracy still improve slightly and peak at the final epoch
#     (0.8908 / 0.9184).
# The output also contains repeated huggingface/tokenizers fork warnings,
# which suggest setting TOKENIZERS_PARALLELISM explicitly.
learn.fit_one_cycle(n_epoch=10, lr_max=lr, cbs=cbs)

epoch,train_loss,valid_loss,f1_score,accuracy,time
0,0.308227,0.275385,0.844744,0.88187,46:34
1,0.263267,0.245858,0.861727,0.896414,46:25
2,0.220615,0.251487,0.861781,0.901014,46:28
3,0.184759,0.228706,0.876176,0.909201,46:41
4,0.148701,0.242946,0.878632,0.91207,46:36
5,0.112244,0.257837,0.883113,0.912021,46:33
6,0.072675,0.305723,0.888947,0.916003,46:29
7,0.046299,0.346715,0.890048,0.917586,46:26
8,0.035707,0.403134,0.890226,0.917858,46:28
9,0.020131,0.445124,0.890822,0.918353,46:34


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [20]:
# Display sample rows after training; the table has two category columns
# (`category` and `category_`), presumably target and prediction - verify
# which is which against fastai's show_results.
learn.show_results()

Unnamed: 0,text,text_,category,category_
0,Why are African-Americans so beautiful?,Why are hispanics so beautiful?,not_duplicate,not_duplicate
1,If energy is created and not conserved in an expanding universe (that expands without limit) can infinite energy be created then?,"If vacuum energy is created as space expands, can infinite energy be created?",duplicate,duplicate
2,Is universe expanding because there is potential energy that transforms into kinetic energy all the time?,"As universe expands without limit, dark/vacuum energies are created too so is the energy that can be created (potential energy/potentiality) infinite?",not_duplicate,duplicate
3,What are the niches?,What are niches?,duplicate,duplicate
4,What perks at your startup have been most valuable for increasing your productivity?,What perks at Facebook have been most valuable for increasing productivity?,not_duplicate,not_duplicate
5,Is India better than Pakistan?,Is Pakistan better than India?,duplicate,not_duplicate
6,Which are the good colleges for MBA in tourism in India?,Are there any good colleges for pursuing MBA in IT in India?,not_duplicate,not_duplicate
7,How difficult is it to get a canadian tourist visa?,Why is it so hard for Russians to get a Canadian tourist visa?,not_duplicate,not_duplicate
8,Was David Bowie gay?,Did David Bowie inspire any punk rockers?,not_duplicate,not_duplicate
