In [None]:
!pip install torch transformers huggingface_hub datasets evaluate accelerate bitsandbytes > /dev/null

In [None]:
from huggingface_hub import notebook_login
from datasets import load_dataset, load_metric
from torch.utils.data import Dataset, DataLoader
import torch
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

In [None]:
# Load the MNLI configuration of the GLUE benchmark.
mnli_dataset = load_dataset('glue', 'mnli')
# Each example states whether a sentence entails, contradicts, or is
# unrelated to a given hypothesis.

In [None]:
# Preview the first five examples of the 'validation_matched' split.
mnli_dataset['validation_matched'][:5]

In [None]:
# Display the loaded dataset object.
mnli_dataset

In [None]:
# Distinct label values that occur in the training split.
label_set = set(mnli_dataset['train']['label'])
label_set

In [None]:
# Check each of the dataset parts and find the unique labels; this cell
# covers the 'test_mismatched' split.
# NOTE(review): GLUE test splits usually ship without gold labels
# (placeholder values) - confirm against the output of this cell.
test_set = set(mnli_dataset['test_mismatched']['label'])
test_set

In [None]:
# Preview the first five examples of the 'test_matched' split.
mnli_dataset['test_matched'][:5]

In [None]:
# Preview the first five examples of the 'test_mismatched' split.
mnli_dataset['test_mismatched'][:5]

In [None]:
# Preview the first five examples of the 'validation_mismatched' split.
mnli_dataset['validation_mismatched'][:5]

In [None]:
# Review the MRPC dataset: load the MRPC configuration of GLUE and display
# the resulting dataset object.
mrpc_dataset = load_dataset('glue', 'mrpc')
mrpc_dataset

In [None]:
# Column names of the loaded dataset.
mrpc_dataset.column_names

In [None]:
# Preview the first five training examples.
mrpc_dataset['train'][:5]

In [None]:
# Checkpoint identifier shared by the tokenizer and the classifier below.
model_cp = "nghuyong/ernie-2.0-large-en"

# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_cp)

In [None]:
# NOTE(review): from here on the name `logging` refers to transformers'
# logging utilities, not the standard-library module.
from transformers.utils import logging

# Raise transformers' log verbosity to debug level for the cells that follow.
logging.set_verbosity_debug()

In [None]:
# Report whether a CUDA device is visible to PyTorch.
torch.cuda.is_available()

In [None]:
# Load the checkpoint with a sequence-classification head sized for two labels.
model_ernie = AutoModelForSequenceClassification.from_pretrained(model_cp,
                                                                 num_labels=2,)

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# this cell does not crash on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_ernie.to(device)

In [None]:
token_sentence = tokenizer(mrpc_dataset['test'][0]['sentence1'] + mrpc_dataset['test'][0]['sentence2'], return_tensors='pt')

In [None]:
pred_class = model_ernie(**token_sentence)
pred_class

In [None]:
def concat_tokenize(example):
  """Tokenize one MRPC example as a sentence pair.

  Args:
    example: mapping with string fields 'sentence1' and 'sentence2'.

  Returns:
    The same mapping with 'input_ids' and 'attention_mask' added (and
    'token_type_ids' when the tokenizer produces them), as plain Python
    lists.
  """
  # Pass both sentences separately so the tokenizer inserts the separator
  # token between them; plain string concatenation would fuse the end of
  # sentence1 with the start of sentence2. Truncation keeps over-long pairs
  # within the model's maximum input length.
  encoded = tokenizer(example['sentence1'], example['sentence2'],
                      truncation=True)
  # No tensors / device placement here: `Dataset.map` stores plain lists, and
  # batching plus device transfer are handled later at training time.
  example['input_ids'] = encoded['input_ids']
  if 'token_type_ids' in encoded:
    # Segment ids tell the model which tokens belong to which sentence.
    example['token_type_ids'] = encoded['token_type_ids']
  example['attention_mask'] = encoded['attention_mask']
  return example

In [None]:
# Run concat_tokenize over the dataset, one example at a time.
mrpc_dataset = mrpc_dataset.map(concat_tokenize)

In [None]:
# Drop the raw text and index columns now that the token columns exist.
# NOTE(review): `mrpc_dataset` is rebound in place, so re-running the map
# cell after this one would presumably fail - concat_tokenize reads
# 'sentence1' and 'sentence2'.
mrpc_dataset = mrpc_dataset.remove_columns(['sentence1', 'sentence2', 'idx'])

In [None]:
# Inspect the Python type in which the stored token ids are returned.
type(mrpc_dataset['train'][0]['input_ids'])

In [None]:
# `datasets.load_metric` is deprecated (and removed in recent `datasets`
# releases); the `evaluate` package installed above is its replacement and
# returns an object with the same `.compute(predictions=..., references=...)`
# interface.
import evaluate

metric = evaluate.load('glue', 'mrpc')

In [None]:
# Smoke-test the metric with a single prediction that equals its reference.
single_eval_test = metric.compute(predictions=[1],
               references=[1])

In [None]:
# Training configuration for the Trainer.
# Evaluation runs once per epoch for two epochs; nothing is pushed to the Hub
# and no external experiment tracker is used (report_to='none').
train_args = TrainingArguments("/content/ernie_model",
                               evaluation_strategy="epoch",
                               num_train_epochs=2,
                               # save_strategy='epoch',
                               learning_rate=2e-5,
                               per_device_train_batch_size=8,
                               per_device_eval_batch_size=8,
                               weight_decay=0.01,
                               load_best_model_at_end=False,
                               # Must be the *name* of a value returned by
                               # compute_metrics (the GLUE/MRPC metric reports
                               # 'accuracy' and 'f1'), never the metric object.
                               metric_for_best_model="f1",
                               push_to_hub=False,
                               report_to='none',
                               skip_memory_metrics=True)
# NOTE: an earlier version passed the metric object itself as
# metric_for_best_model. TrainingArguments is serialised with torch.save()
# when the Trainer writes a checkpoint (transformers/trainer.py ->
# torch/serialization.py), and the metric object cannot be pickled - the
# likely cause of the "ThreadFileContext" error seen at that point.

In [None]:
import numpy as np

def compute_metric(eval_pred):
  """Score one evaluation pass with the module-level `metric`.

  Args:
    eval_pred: a pair that unpacks into (per-class scores, reference labels).

  Returns:
    Whatever `metric.compute` returns for the arg-max class of every row.
  """
  scores, gold_labels = eval_pred
  # The predicted class is the index of the highest score along axis 1.
  predicted_labels = np.argmax(scores, axis=1)
  result = metric.compute(references=gold_labels,
                          predictions=predicted_labels)
  return result

In [None]:
# Display the current state of the dataset before it is handed to the Trainer.
mrpc_dataset

In [None]:
# Wire the model, training configuration, data splits and metric callback
# into a Trainer.
# NOTE(review): per-epoch evaluation uses the 'test' split, while the final
# evaluator cell uses 'validation' - confirm this is intended.
trainer = Trainer(
    model=model_ernie,
    args=train_args,
    train_dataset=mrpc_dataset['train'],
    eval_dataset=mrpc_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metric,
)

In [None]:
# Run the training loop with the Trainer built in the previous cell.
trainer.train()

In [None]:
from evaluate import evaluator

# Create an evaluator for the "text-classification" task.
task_evaluator = evaluator("text-classification")

In [None]:
# The evaluator builds its own text-classification pipeline and tokenizes the
# inputs itself, so it needs the *raw* sentence columns. Those columns were
# removed from `mrpc_dataset` earlier, so load an untouched copy of the
# validation split instead of reusing the tokenized one.
mrpc_validation_raw = load_dataset('glue', 'mrpc', split='validation')

results = task_evaluator.compute(
    model_or_pipeline=model_ernie,
    data=mrpc_validation_raw,
    tokenizer=tokenizer,
    metric="accuracy",
    # MRPC is a sentence-pair task: name both text columns explicitly
    # (the default input column is "text", which this dataset does not have).
    input_column="sentence1",
    second_input_column="sentence2",
    label_column="label",
    # Map the pipeline's string labels back to the dataset's class ids.
    label_mapping={"LABEL_0": 0.0, "LABEL_1": 1.0},
    # Bootstrap resampling to get a confidence interval for the score.
    strategy="bootstrap",
    n_resamples=10,
    random_state=0
)
results