# DistilBERT Text Classification Example

_Developed from Hugging Face example_

_https://huggingface.co/docs/transformers/tasks/sequence_classification_

This notebook helps you fine-tune a DistilBERT model using the public IMDB dataset of movie reviews **(input)**, each labeled positive or negative **(label/target)**.

You can then run inference using the fine-tuned model.

## Install libraries

In [None]:
pip install transformers datasets evaluate rouge_score

## Load and preprocess data

In [None]:
from datasets import load_dataset

# Fetch the "imdb" dataset by name; later cells index its "train" and "test" splits
imdb = load_dataset(path="imdb")

In [None]:
# View data
# Display the first record of the "test" split to see what a single example looks like
imdb["test"][0]

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the DistilBERT checkpoint used in this notebook
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert/distilbert-base-uncased"
)

In [None]:
# Create function to tokenize
def preprocess_function(examples):
    """Tokenize the "text" field of ``examples`` with the notebook-level tokenizer.

    Inputs that are too long are truncated (``truncation=True``); no padding is
    applied at this stage.
    """
    review_texts = examples["text"]
    return tokenizer(review_texts, truncation=True)

In [None]:
# Tokenize
# Apply preprocess_function across the dataset, handing it batches of examples per call
tokenized_imdb = imdb.map(function=preprocess_function, batched=True)

## Evaluation Function

In [None]:
import evaluate

# Load the "accuracy" metric; compute_metrics uses it to score predictions
accuracy = evaluate.load(path="accuracy")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``accuracy`` metric.

    ``eval_pred`` unpacks into (model outputs, reference labels). The predicted
    class for each row is the index of the largest value along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Train Model

In [None]:
# Map IDs to labels
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
# Reverse lookup, derived from id2label so the two mappings always agree
label2id = {label: class_id for class_id, label in id2label.items()}

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Load model
# Pretrained DistilBERT checkpoint set up for two labels, with the id <-> label
# mappings defined earlier in the notebook passed along
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
from transformers import DataCollatorWithPadding

# Pad each batch to the length of its longest sequence. The tokenization step only
# truncates, so examples still have different lengths when they are batched.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Set up params
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the two strategies must match for
    # load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # NOTE: pushing to the Hub requires being authenticated with Hugging Face.
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train model
train_result = trainer.train()

# Write the final model (and tokenizer) to output_dir; the inference section loads
# the model from "./results".
trainer.save_model()

# Keep the training summary as the cell's displayed output
train_result

## Use model for inference

In [None]:
# Create test sample
# A single review, split across lines for readability (adjacent literals are concatenated)
text = (
    "This was a masterpiece. "
    "Not completely faithful to the books, but enthralling from beginning to end. "
    "Might be my favorite of the three."
)

#### Pipeline Method

In [None]:
from transformers import pipeline

# Build a sentiment-analysis pipeline from the files in ./results and score the sample
classifier = pipeline(task="sentiment-analysis", model="./results")
classifier(text)

#### Manual Method

In [None]:
from transformers import AutoTokenizer

# Tokenize and format
# Load the tokenizer from this notebook's training output (the same "./results"
# directory the pipeline example uses) rather than from an unrelated Hub repository.
tokenizer = AutoTokenizer.from_pretrained("./results")
# return_tensors="pt" makes the tokenizer return PyTorch tensors
inputs = tokenizer(text, return_tensors="pt")

In [None]:
import torch

# Infer
# Run the model on the tokenized sample to obtain the logits. Gradients are not needed
# for inference, and the input tensors must be on the same device as the model.
model.eval()
model_inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
with torch.no_grad():
    logits = model(**model_inputs).logits

# Pick the highest-scoring class id and translate it to its label name
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]