<a href="https://colab.research.google.com/github/mrjunos/machine_learning/blob/main/NLP-fine_tunning-hugging_face_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fine-tuning a Hugging Face NLP Model

In [None]:
%%capture

# Install the libraries used below; %%capture hides the installer output.
# NOTE(review): no versions are pinned, so a fresh run installs whatever
# releases are current at that time.
%pip install datasets transformers[torch] torch evaluate ipywidgets huggingface_hub

In [None]:
from huggingface_hub import interpreter_login

# Log in to the Hugging Face Hub up front — presumably required because the
# training arguments set push_to_hub=True and the model is pushed at the end.
interpreter_login()

## Loading Dataset

In [None]:
from datasets import load_dataset

# Take the 'train' split of the Hub dataset and hold out 20% of it as a
# 'test' split; the fixed seed keeps the split the same across runs.
ds = (
    load_dataset("mrjunos/depression-reddit-cleaned")['train']
    .train_test_split(test_size=0.2, seed=42)
)

In [None]:
# Display the train/test splits produced above.
ds

In [None]:
# Inspect a single training record (index 2).
ds['train'][2]

In [None]:
# Look at the 'label' feature of the training split.
# NOTE: `labels` is reassigned to the list of class names (`.names`) in the
# model-loading cell further down, so this value is only used for display here.
labels = ds['train'].features['label']
labels

## Loading Tokenizer

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for "distilroberta-base", the same checkpoint name the
# classification model is loaded from later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

In [None]:
# Tokenize the text of one training record (index 2) as a sanity check.
tokenized_example = tokenizer(ds['train'][2]['text'])

In [None]:
# Display the tokenizer output for the sample record.
tokenized_example

In [None]:
# Convert the sample's input ids back to tokens to see how the text was split.
tokenizer.convert_ids_to_tokens(tokenized_example['input_ids'])

In [None]:
def tokenize_fn(example):
    """Tokenize the ``text`` field of ``example`` with the notebook's tokenizer.

    Truncation is enabled; no padding is requested here.

    Returns:
        The tokenizer's output, unchanged.
    """
    texts = example['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Run tokenize_fn over the dataset in batches (batched=True), so each call
# receives a batch of records rather than a single one.
prepared_ds = ds.map(tokenize_fn, batched=True)

In [None]:
# Display the tokenized dataset.
prepared_ds

## Adding Padding to the Dataset

In [None]:
from transformers import DataCollatorWithPadding

# tokenize_fn truncates but does not pad, so padding is left to this collator,
# which is handed to the Trainer as `data_collator` (presumably padding each
# batch as it is assembled — confirm against the transformers docs).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Adding Methods for training and validation

In [None]:
import evaluate
import numpy as np

from functools import lru_cache


@lru_cache(maxsize=None)
def _load_accuracy_metric():
    """Load the "accuracy" metric once and reuse it for every evaluation pass."""
    return evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the argmax over the last axis of ``logits``.

    Returns:
        The result of the accuracy metric's ``compute`` for the predicted
        classes against the reference ``labels``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # The metric is cached by the helper above instead of being re-loaded on
    # every evaluation step.
    return _load_accuracy_metric().compute(predictions=predictions, references=labels)

In [None]:
from transformers import AutoModelForSequenceClassification

# Class names as listed by the 'label' feature of the training split; the
# position of each name in the list is used as its class id.
labels = ds['train'].features['label'].names

# The model config documents id2label as int -> name and label2id as
# name -> int, so build both mappings with integer ids rather than strings.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilroberta-base",
    num_labels=len(labels),
    id2label=dict(enumerate(labels)),
    label2id={name: idx for idx, name in enumerate(labels)},
)

In [None]:
from transformers import TrainingArguments

from dataclasses import fields

# Newer transformers releases call this argument `eval_strategy`; older ones
# only know `evaluation_strategy`. The install cell does not pin a version, so
# use whichever name the installed TrainingArguments actually defines.
_eval_strategy_arg = (
    "eval_strategy"
    if any(f.name == "eval_strategy" for f in fields(TrainingArguments))
    else "evaluation_strategy"
)

# Evaluate every N steps, train for 3 epochs, push to the Hub, and reload the
# best checkpoint once training finishes.
training_args = TrainingArguments(
    output_dir="./depression-reddit-distilroberta-base",
    num_train_epochs=3,
    push_to_hub=True,
    load_best_model_at_end=True,
    **{_eval_strategy_arg: "steps"},
)

## Trainer

In [None]:
from transformers import Trainer

# Wire together the model, the training arguments, the tokenized splits, the
# padding collator and the accuracy callback.
# NOTE(review): the 'test' split is passed as eval_dataset here and is also the
# split scored in the "Model Evaluation" cell. With load_best_model_at_end=True
# it appears to drive checkpoint selection as well — consider holding out a
# separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=prepared_ds['train'],
    eval_dataset=prepared_ds['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

## Training Model

In [None]:
# Run the fine-tuning, save the resulting model, then log and persist the
# training metrics under the "train" split name.
train_results = trainer.train()
train_metrics = train_results.metrics

trainer.save_model()
trainer.log_metrics("train", train_metrics)
trainer.save_metrics("train", train_metrics)

## Model Evaluation

In [None]:
# Score the trained model on the 'test' split, then log and persist the
# resulting metrics under the "eval" split name.
metrics = trainer.evaluate(eval_dataset=prepared_ds['test'])

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

## Pushing to HuggingFace

In [None]:
# Metadata passed along with the push: the base checkpoint, task, dataset and
# tags describing this fine-tuned model.
kwargs = {
    # Use the public `name_or_path` property instead of the private
    # `_name_or_path` attribute it wraps.
    "finetuned_from": model.config.name_or_path,
    "tasks": "text-classification",
    "dataset": "mrjunos/depression-reddit-cleaned",
    "tags": ["text-classification", "depression", "reddit"],
}

trainer.push_to_hub(**kwargs)

## How to use it?

In [None]:
from transformers import pipeline
# Build a text-classification pipeline from the published checkpoint and try
# it on one sample sentence; the prediction is the cell's output.
predict_task = pipeline(
    task="text-classification",
    model="mrjunos/depression-reddit-distilroberta-base",
)
sample_text = "Stop listing your issues here, use forum instead or open ticket."
predict_task(sample_text)