In [None]:
# Notebook dives into basic Text Classification fine-tuning process

- Bringing in the Datasets
    > Review the datasets, its features and names
    If you are loading from csv, json or parquet ensure the 
    columns are clean, and you know the column names
    > Need to work on the data, based on the task at hand. 
    (Need to complete other tutorial NBs in HF) 
    > Load the dataset based on the splits
    > Create a Dataloader, Iterator out of the dataset

- Bringing in the Tokenisers
    > Decide on the type of tokenizer that best suits
    > Practice creating new tokenizers and training them using own corpus
    > Setup the function that tokenizes and returns the ids
        + Review the padding, max_length, truncate options, review output

- Preprocessing functions:
    > Tokenise the input sequences, and remove the text data 
    > To process the input_ids for the task, write/ import the 
    functions, depending on the task 
    > Map the imported functions on the dataset.

- Setup Training:  
    > Instantiate the Training Arguments
    > Instantiate DataCollators if required
    > Instantiate the post-processing collator to support trainer
    > Build the Trainer, with datasets and collators. 
    > Start the training

- Work on Post Processing:
    > Instantiate the metrics
    > Write post-processing function for evaluation
    > Run the evaluation and get the results

In [None]:
from huggingface_hub import notebook_login
from datasets import load_dataset, load_metric
from torch.utils.data import Dataset, DataLoader
import torch
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

#### First, let's understand the datasets that are available for Text Classification.

The most famous one is GLUE, which contains many datasets. The GLUE Benchmark is a group of nine classification tasks on sentences or pairs of sentences, which are:

CoLA (Corpus of Linguistic Acceptability) Determine if a sentence is grammatically correct or not. It is a dataset containing sentences labeled as grammatically correct or not.

MNLI (Multi-Genre Natural Language Inference) Determine if a sentence entails, contradicts or is unrelated to a given hypothesis. (This dataset has two versions, one with the validation and test set coming from the same distribution, another called mismatched where the validation and test use out-of-domain data.)

MRPC (Microsoft Research Paraphrase Corpus) Determine if two sentences are paraphrases of one another or not.

QNLI (Question-answering Natural Language Inference) Determine if the answer to a question is in the second sentence or not. (This dataset is built from the SQuAD dataset.)

QQP (Quora Question Pairs) Determine if two questions are semantically equivalent or not.

RTE (Recognizing Textual Entailment) Determine if a sentence entails a given hypothesis or not.

SST-2 (Stanford Sentiment Treebank) Determine if the sentence has a positive or negative sentiment.

STS-B (Semantic Textual Similarity Benchmark) Determine the similarity of two sentences with a score from 1 to 5.

WNLI (Winograd Natural Language Inference) Determine if a sentence with an anonymous pronoun and a sentence with this pronoun replaced are entailed or not. (This dataset is built from the Winograd Schema Challenge dataset.)

Metrics are:

for CoLA: Matthews Correlation Coefficient

for MNLI (matched or mismatched): Accuracy

for MRPC: Accuracy and F1 score

for QNLI: Accuracy

for QQP: Accuracy and F1 score

for RTE: Accuracy

for SST-2: Accuracy

for STS-B: Pearson Correlation Coefficient and Spearman's Rank Correlation Coefficient

for WNLI: Accuracy

Other datasets to work on TextClassification:

Decided to check the Datasets page on HF, and looked at the most downloaded datasets.

- IMDB (Already having it locally)

- ccdv/arxiv-classification (2.1GB so dropping it due to its size)

- ccdv/patent-classification (downloads to 285MB, so thinking of taking it)

- jackhhao/jailbreak-classification (2MB, 2K rows)

- knowledgator/events_classification_biotech (seems like a small 10~15MB dataset)

Let's begin the process of pre-processing the dataset.

In [None]:
# Task identifiers used in this notebook for the GLUE benchmark.
# "mnli-mm" stands for the mismatched evaluation variant of MNLI.
glue_tasks = [
    "cola",
    "mnli",
    "mnli-mm",
    "mrpc",
    "qnli",
    "qqp",
    "rte",
    "sst2",
    "stsb",
    "wnli",
]

In [None]:
# Notebook-wide settings that the remaining cells read.
# GLUE task to fine-tune on.
task = "cola"
# Pretrained checkpoint used for classification.
model_cp = "distilbert-base-uncased"
# Per-device batch size for both training and evaluation.
batch_size = 16

In [None]:
# Load the GLUE subset for the selected task together with its metric.
# "mnli-mm" is not a configuration of its own: it reuses the "mnli" data
# and only differs in which validation split gets evaluated.
# NOTE(review): `load_metric` has been deprecated in newer `datasets`
# releases in favour of `evaluate.load` — confirm against the installed version.

if task == 'mnli-mm':
    actual_task = "mnli"
else:
    actual_task = task

dataset = load_dataset('glue', actual_task)
metric = load_metric('glue', actual_task)

In [None]:
# Peek at the first validation example to see the available columns.
dataset['validation'][0]

In [None]:
# Display the loaded metric object to inspect what was loaded.
metric

In [None]:
# Sanity-check the metric API on random binary labels before any training.
# A seeded generator keeps this cell's output identical across
# "Restart Kernel -> Run All" executions.
rng = np.random.default_rng(0)
fake_preds = rng.integers(0, 2, size=(64,))
fake_refs = rng.integers(0, 2, size=(64,))
metric.compute(predictions=fake_preds,
               references=fake_refs)

In [None]:
# Pulls only the tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_cp)

# Encode a sentence pair to inspect the fields the tokenizer returns.
tokenizer(
    text="this is 1st sentence",
    text_pair="followed by spidey sense of tropical tree top",
)

In [None]:
# Column names that hold the input text for each GLUE task, stored as
# (first_text_column, second_text_column). Single-sentence tasks use None
# for the second column. The same lookup can be extended with other
# classification datasets with minor modification.
#
# Keys are spelled out instead of being taken from positions in
# `glue_tasks`, so reordering that list cannot silently remap a task to
# the wrong columns.
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

In [None]:
# Look up the text column(s) of the selected task and print them for the
# first training example.
sentence1_key, sentence2_key = task_to_keys[task]
first_example = dataset['train'][0]
if sentence2_key is None:
    print(f"Sentence: {first_example[sentence1_key]}")
else:
    # Each label is paired with its own column (the "Sentence1" line used to
    # print the second column as well).
    print(f"Sentence1: {first_example[sentence1_key]}")
    print(f"Sentence2: {first_example[sentence2_key]}")

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples for the currently selected task.

    Reads the notebook-level `tokenizer`, `sentence1_key` and
    `sentence2_key`. Inputs longer than the model can handle are truncated.

    Args:
        examples: Mapping from column name to the column's values, as handed
            over by `dataset.map(..., batched=True)`.

    Returns:
        The tokenizer's output for the batch. For single-sentence tasks only
        the first text column is encoded; for sentence-pair tasks both
        columns are encoded together in one call, so that each pair becomes
        a single model input.
    """
    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True)
    # One call with both texts: returning two separate encodings as a tuple
    # is not a valid `map` result and would not pair the sentences.
    return tokenizer(examples[sentence1_key], examples[sentence2_key],
                     truncation=True)

In [None]:
# Apply the tokenisation to the loaded dataset; with batched=True the
# function receives batches of examples instead of one example at a time.
tokenised_ds = dataset.map(function=preprocess_function, batched=True)

In [None]:
from transformers import Trainer, TrainingArguments

# Number of model outputs: 3 for the MNLI variants, 1 for STS-B (a single
# similarity score), 2 for every other task.
if task.startswith('mnli'):
    num_labels = 3
elif task == 'stsb':  # previously misspelled 'sstb', so this branch never matched
    num_labels = 1
else:
    num_labels = 2
num_labels  # expecting 2 as it is the CoLA task

In [None]:
# Load the checkpoint for sequence classification with the required number
# of labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_cp,
    num_labels=num_labels,
)

In [None]:
# Name of the headline metric for the selected task.
if task == 'stsb':  # previously misspelled 'sstb', so STS-B fell through to "accuracy"
    metric_name = "pearson"
elif task == 'cola':
    metric_name = 'matthews_correlation'
else:
    metric_name = "accuracy"
metric_name

In [None]:
# Trainer here is basically a wrapper around the training loop that we
# created using Torch, Tensors and Wine_dataset; TrainingArguments collects
# the settings it runs with.

args = TrainingArguments(
    "/home/kamal/training_files/prac/",  # output directory (machine-specific absolute path)
    evaluation_strategy='epoch',
    num_train_epochs=2,
    # save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    load_best_model_at_end=False,
    # Expects the metric's name (a string), not the metric object itself.
    metric_for_best_model=metric_name,
    push_to_hub=False,
    report_to="none",
    hub_model_id=f"kamaljp/{model_cp}-finetuned-{task}",
    skip_memory_metrics=True  # this is for avoiding the threadlock error
    # https://github.com/huggingface/transformers/issues/17696
)

In [None]:
def compute_metric(eval_pred):
    """Turn raw model outputs into the task metric.

    Reads the notebook-level `task` and `metric`.

    Args:
        eval_pred: Pair of (model outputs, reference labels). The outputs
            are indexed as a 2-D array (rows = examples).

    Returns:
        Whatever `metric.compute` returns for the derived predictions and
        the references.
    """
    outputs, refs = eval_pred
    if task == 'stsb':
        # Regression task: the single output column is the prediction itself.
        # (The task name used to be misspelled 'sstb', which sent STS-B
        # through the argmax branch.)
        predictions = outputs[:, 0]
    else:
        # Classification: pick the highest-scoring class per example.
        predictions = np.argmax(outputs, axis=1)

    return metric.compute(predictions=predictions,
                          references=refs)

In [None]:
# Choose the evaluation split: the two MNLI variants use their matched /
# mismatched validation sets, every other task uses "validation".
if task == "mnli-mm":
    validation_key = "validation_mismatched"
elif task == "mnli":
    validation_key = "validation_matched"
else:
    validation_key = "validation"

# The tokenizer is passed in because it is required for padding; if no
# tokenizer is provided a length mismatch occurs, leading to an error.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenised_ds['train'],
    eval_dataset=tokenised_ds[validation_key],
    compute_metrics=compute_metric,
    tokenizer=tokenizer,
)

In [None]:
# Start fine-tuning with the settings collected in `args`.
trainer.train()

In [None]:
# Test using evaluator
from pprint import pprint

from evaluate import evaluator

task_evaluator = evaluator("text-classification")

# The evaluator is given the raw (untokenised) dataset and told which
# columns hold the text. The labelled validation split is used.
# NOTE(review): GLUE test splits are presumably unlabelled — confirm before
# switching this to dataset['test'].
test_ds = dataset[validation_key]

results = task_evaluator.compute(
    model_or_pipeline=model,
    data=test_ds,
    tokenizer=tokenizer,
    metric="accuracy",
    # GLUE has no "text" column, so the task's own column names are passed.
    input_column=sentence1_key,
    second_input_column=sentence2_key,
    # Assumes a two-class task such as the default "cola".
    label_mapping={"LABEL_0": 0.0, "LABEL_1": 1.0},
    strategy="bootstrap",
    n_resamples=10,
    random_state=0
)

pprint(results)

In [None]:
# Evaluate with the Trainer, then upload the model to the Hub.
# NOTE(review): push_to_hub presumably needs an authenticated Hub session;
# `notebook_login` is imported but never called in the visible cells — confirm.
trainer.evaluate()
trainer.push_to_hub()