<a href="https://colab.research.google.com/github/mrjunos/machine_learning/blob/main/NLP-fine_tunning-hugging_face_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fine-tuning a Hugging Face NLP Model

In [None]:
%%capture

# Install the libraries imported by the cells below; the %%capture cell magic
# above silences the installer output.
# NOTE(review): versions are unpinned, so a fresh run may pick up different
# releases — consider pinning for reproducibility.
%pip install datasets transformers[torch] evaluate

In [None]:
from datasets import load_dataset

# Only the 'train' split of the Hub dataset is used; 20% of it is held out
# with a fixed seed so the split is reproducible.
ds = load_dataset("mrjunos/depression-reddit-cleaned")
ds = ds['train'].train_test_split(test_size=0.2, seed=42)

# train_test_split() names the held-out part 'test', but the Trainer and
# evaluation cells further down index the 'validation' split. Rename it here
# so those lookups do not fail with a KeyError.
ds['validation'] = ds.pop('test')

In [None]:
# Display the split dataset (bare last expression -> rich cell output).
ds

In [None]:
# Peek at a single training example (index 2) to see the raw record.
ds['train'][2]

In [None]:
# Inspect the feature definition of the target column 'is_depression'.
# NOTE(review): `labels` is rebound to a list of class names in the model
# cell further down — here it holds the feature object itself.
labels = ds['train'].features['is_depression']
labels

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the same checkpoint ("distilroberta-base") that the
# model cell fine-tunes later on.
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

In [None]:
# Tokenize the 'clean_text' field of one training example (index 2) as a sanity check.
tokenized_example = tokenizer(ds['train'][2]['clean_text'])

In [None]:
# Display the tokenizer output for the sample example.
tokenized_example

In [None]:
# Map the sample's input ids back to token strings to see how the text was split.
tokenizer.convert_ids_to_tokens(tokenized_example['input_ids'])

In [None]:
def tokenize_fn(example):
    """Tokenize the 'clean_text' field of an example (or batch of examples).

    Uses the notebook-level `tokenizer` with truncation enabled and returns
    whatever the tokenizer call produces.
    """
    texts = example['clean_text']
    return tokenizer(texts, truncation=True)

In [None]:
# Apply tokenize_fn to the dataset in batches (batched=True), so each call
# receives a batch of examples rather than a single one.
prepared_ds = ds.map(tokenize_fn, batched=True)

In [None]:
# Display the tokenized dataset to check the result of the map() call.
prepared_ds

In [None]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer below; it is built from the same tokenizer so
# padding matches the tokenization (examples were tokenized without padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate
import numpy as np

# Load the metric once instead of on every evaluation call. The identifier
# must name a metric ("accuracy"), not the dataset.
_accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for a batch of evaluation predictions.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of the accuracy metric's ``compute`` call for the
        arg-max class predictions against the reference labels.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the highest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return _accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import AutoModelForSequenceClassification

# Class names of the target column.
# NOTE(review): assumes 'is_depression' is a ClassLabel feature exposing `.names` — confirm.
labels = ds['train'].features['is_depression'].names

# The model config documents id2label as {int: str} and label2id as
# {str: int}, so both mappings use integer class ids rather than strings.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilroberta-base",
    num_labels=len(labels),
    id2label=dict(enumerate(labels)),
    label2id={name: idx for idx, name in enumerate(labels)},
)

In [None]:
from transformers import TrainingArguments

# Training configuration, grouped by concern.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints and outputs are written.
    output_dir="./depression-reddit-distilroberta-base",
    # Training length.
    num_train_epochs=3,
    # Evaluate periodically by step count and restore the best checkpoint at the end.
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    # Publish to the Hugging Face Hub (requires being logged in).
    push_to_hub=True,
)

In [None]:
# Log in to the Hugging Face Hub from the shell; needed because the training
# arguments set push_to_hub=True.
!huggingface-cli login

In [None]:
from transformers import Trainer

# Assemble the Trainer from the pieces built in the previous cells.
# NOTE(review): this indexes prepared_ds['train'] and prepared_ds['validation'];
# both keys must exist in the tokenized dataset.
trainer = Trainer(
    # What to train and how.
    model=model,
    args=training_args,
    # Batch construction and metric reporting.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Data splits.
    train_dataset=prepared_ds['train'],
    eval_dataset=prepared_ds['validation'],
)

In [None]:
# Run fine-tuning, then save the model and record the training metrics.
# Order matters: the metrics come from the object returned by train().
train_results = trainer.train()
trainer.save_model()
# Log the training metrics and also save them under the "train" name.
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)

In [None]:
# Evaluate the trained model on the 'validation' split, then log and save the
# resulting metrics under the "eval" name.
# NOTE(review): requires prepared_ds to contain a 'validation' key.
metrics = trainer.evaluate(prepared_ds['validation'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)