# 📘 Simple Fine-Tuning of an Open-Source LLM with Hugging Face

In [None]:
!pip install transformers datasets evaluate --quiet

## Load dataset (IMDB 1% sample)

In [None]:
from datasets import load_dataset

# Load a small sentiment dataset for demonstration.
# IMDB's train split is stored grouped by label, so a leading slice such as
# 'train[:1%]' would contain a single class. Shuffle with a fixed seed first,
# then keep 1% of the rows so both sentiments are represented.
SEED = 42
full_train = load_dataset("imdb", split="train")
sample_size = max(1, len(full_train) // 100)
dataset = (
    full_train.shuffle(seed=SEED)
    .select(range(sample_size))
    .train_test_split(test_size=0.2, seed=SEED)  # seeded for reproducible splits
)
dataset

## Preprocess the data

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded later.
model_checkpoint = "distilbert-base-uncased"

# Fetch the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
)

def tokenize_function(example):
    """Tokenize the "text" field of ``example`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens, so all
    encoded rows have the same length.

    Args:
        example: Mapping with a "text" entry (a batch of strings when called
            through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's encoding for the given text.
    """
    texts = example["text"]
    encoded = tokenizer(
        texts,
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize every split in batches, then drop the raw "text" column, which is
# no longer needed once the text has been encoded.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
).remove_columns(["text"])

# Have the datasets return PyTorch tensors when indexed.
tokenized_datasets.set_format("torch")

## Load model and prepare training

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

# Accuracy metric used by compute_metrics during evaluation.
metric = evaluate.load("accuracy")

# Sequence-classification model built from the same checkpoint as the
# tokenizer, configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)

def compute_metrics(eval_pred):
    """Score a batch of evaluation outputs with the module-level ``metric``.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predictions are
            reduced to class ids by taking the argmax along axis 1.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids and
        the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = scores.argmax(axis=1)
    return metric.compute(
        predictions=predicted_classes,
        references=references,
    )

import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old keyword, so passing the old name raises TypeError
# there. Pick whichever keyword the installed TrainingArguments accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Training configuration: evaluate once per epoch, batches of 8 per device,
# two epochs, checkpoints under ./results and logs under ./logs.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=10,
    **{_eval_strategy_key: "epoch"},
)

## Fine-tune the model

In [None]:
# Collect the Trainer inputs first so the wiring is easy to read: the model,
# its training configuration, the train/test splits, and the metric callback.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_datasets["train"],
    "eval_dataset": tokenized_datasets["test"],
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

## Evaluate the model

In [None]:
# Evaluate on the dataset passed to the Trainer as eval_dataset and display
# the resulting metrics as the cell output.
eval_results = trainer.evaluate()
eval_results