* In this notebook we fine-tune a language model so that it can recognize whether the description (caption) of an Instagram post is easy to read for people who use a screen reader.

* Descriptions cause problems when they contain too many emojis, fonts other than the default one, too many hashtags, or hashtags and mentions scattered throughout the text instead of grouped on the last line. These problems affect either how the screen reader reads the description or how it sounds to blind people.

* The model that we use is XLM-RoBERTa (`xlm-roberta-base`), a multilingual BERT-style model from Hugging Face. It was pre-trained on text in about 100 languages, so it can be used in many different cases.

In [None]:
# Install required libraries
!pip install -q transformers datasets scikit-learn peft bitsandbytes accelerate

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torch.optim import AdamW

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    get_scheduler,
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tqdm import tqdm
import os

In [None]:
# Load and prepare the CSV that includes the training data.
# Only the first two columns are used: the post description and its 0/1 label.
CSV_PATH = "the CSV file"  # placeholder: point this at the real training CSV

raw_df = pd.read_csv(CSV_PATH).iloc[:, :2]
raw_df.columns = ["description", "label"]

# Coerce labels to numbers BEFORE filtering. If the column contains any junk value,
# pandas reads it as strings, and `isin([0, 1])` on strings would silently drop
# every row. Values that are not numeric become NaN and are removed by dropna().
numeric_labels = pd.to_numeric(raw_df["label"], errors="coerce")
df = raw_df.assign(label=numeric_labels).dropna()

# Keep only the two valid classes and give both columns a definite dtype
# (the tokenizer needs text, the loss needs integer class ids).
df = df[df["label"].isin([0, 1])]
df = df.astype({"description": str, "label": int})

if df["label"].nunique() < 2:
    raise ValueError(
        f"Expected rows for both classes 0 and 1 in {CSV_PATH!r}, "
        f"found labels: {sorted(df['label'].unique().tolist())}"
    )

# Shuffle once with a fixed seed so the row order is reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Pretrained checkpoint shared by the tokenizer and by every model built below
checkpoint = "xlm-roberta-base"

# Run on the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer for the checkpoint, plus a collator that pads batches and returns PyTorch tensors
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(return_tensors="pt", tokenizer=tokenizer)

def tokenize(example):
    """Tokenize the ``description`` field of a dataset example (or batch of examples).

    Texts are truncated to at most 128 tokens. No padding is applied here:
    padding is left to the ``DataCollatorWithPadding`` used by the data loaders,
    which pads each training batch on its own. Padding here as well would pad
    every row to the longest text of the whole ``map`` batch and make that
    per-batch padding pointless.

    Args:
        example: Mapping with a ``"description"`` entry holding one text or a
            list of texts (the latter when called via ``Dataset.map(batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    return tokenizer(example["description"], truncation=True, max_length=128)

# 5-Fold Cross-validation
#
# For every fold: fine-tune a fresh LoRA-wrapped classifier on the training split,
# keep the adapter from the epoch with the best validation F1 (early stopping),
# then reload that adapter and collect its predictions on the validation split.
#
# NOTE(review): the validation split is used both to pick the best epoch and to
# report the final metrics, so the reported scores are likely optimistic.
N_SPLITS = 5
SEED = 42
BATCH_SIZE = 32
EPOCHS = 30
PATIENCE = 3          # epochs without a validation-F1 improvement before stopping
LEARNING_RATE = 2e-5


def collect_predictions(clf, loader):
    """Run ``clf`` over ``loader`` without gradients.

    Args:
        clf: Sequence-classification model already placed on ``device``.
        loader: DataLoader whose batches contain the model inputs and ``labels``.

    Returns:
        Tuple ``(predictions, labels)`` of two equally long lists of ints.
    """
    clf.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            # Labels are only needed for scoring, not for the forward pass.
            batch_labels = batch.pop("labels")
            logits = clf(**batch).logits
            predictions.extend(logits.argmax(dim=-1).cpu().tolist())
            targets.extend(batch_labels.cpu().tolist())
    return predictions, targets


skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
fold_preds, fold_labels = [], []

for fold, (train_idx, val_idx) in enumerate(skf.split(df, df["label"])):
    print(f"\n=== Fold {fold+1}/{N_SPLITS} ===")

    # Seed torch per fold so head/LoRA initialisation, dropout and batch
    # shuffling are reproducible across runs.
    torch.manual_seed(SEED + fold)

    train_df = df.iloc[train_idx].reset_index(drop=True)
    val_df = df.iloc[val_idx].reset_index(drop=True)

    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    tokenized_train = train_dataset.map(tokenize, batched=True, remove_columns=["description"])
    tokenized_val = val_dataset.map(tokenize, batched=True, remove_columns=["description"])

    train_loader = DataLoader(tokenized_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=data_collator)
    val_loader = DataLoader(tokenized_val, batch_size=BATCH_SIZE, collate_fn=data_collator)

    # Fresh base model + LoRA adapter on the attention projections for this fold
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=["query", "key", "value"]
    )
    model = get_peft_model(model, lora_config)
    model.to(device)

    # Class weights: inverse class frequency in this fold's training split
    class_counts = train_df["label"].value_counts().to_dict()
    total = sum(class_counts.values())
    weights = [total / class_counts[i] for i in range(2)]
    class_weights = torch.tensor(weights, dtype=torch.float, device=device)
    loss_fn = CrossEntropyLoss(weight=class_weights)

    # Only the parameters left trainable by PEFT receive gradients, so only
    # those are handed to the optimizer.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable_params, lr=LEARNING_RATE)
    scheduler = get_scheduler(
        "linear", optimizer=optimizer, num_warmup_steps=0,
        num_training_steps=EPOCHS * len(train_loader)
    )

    adapter_dir = f"fold_{fold+1}_adapter"
    # Start below any possible F1 so the first epoch always writes a checkpoint.
    # Starting at 0.0 with a strict `>` left no adapter on disk (or a stale one
    # from an earlier run) whenever validation F1 stayed at 0.
    best_f1, no_improve = float("-inf"), 0
    for epoch in range(EPOCHS):
        model.train()
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
        for batch in loop:
            batch = {k: v.to(device) for k, v in batch.items()}
            # The weighted loss is computed here, so the labels are kept out of
            # the forward pass (otherwise the model also computes its own,
            # unweighted loss that is never used).
            labels = batch.pop("labels")
            logits = model(**batch).logits
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            loop.set_postfix(loss=loss.item())

        # Validation
        val_preds, val_labels = collect_predictions(model, val_loader)
        f1 = f1_score(val_labels, val_preds)
        print(f"Validation F1: {f1:.4f}")

        if f1 > best_f1:
            best_f1 = f1
            no_improve = 0
            model.save_pretrained(adapter_dir)
        else:
            no_improve += 1
            if no_improve >= PATIENCE:
                print("Early stopping.")
                break

    # Load best adapter and evaluate on validation again
    base_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    peft_model = PeftModel.from_pretrained(base_model, adapter_dir)
    peft_model = peft_model.merge_and_unload().to(device)

    preds, labels = collect_predictions(peft_model, val_loader)
    fold_preds.extend(preds)
    fold_labels.extend(labels)

# Pooled metrics over the validation predictions gathered from all folds
acc, f1 = (metric(fold_labels, fold_preds) for metric in (accuracy_score, f1_score))
summary_line = f"\nFinal Accuracy: {acc:.4f} | F1 Score: {f1:.4f}"
print(summary_line)

# Per-class precision / recall / F1 breakdown
per_class_report = classification_report(fold_labels, fold_preds)
print(per_class_report)

# Save LoRA adapter
#
# Export the best checkpoint of the last fold, i.e. the adapter directory that
# the training loop wrote whenever validation F1 improved. The in-memory `model`
# must not be used here: it holds the weights of the last epoch that was run,
# which after early stopping come from epochs that did NOT improve.
#
# NOTE(review): this exports a model trained on the last fold's training split only;
# for deployment consider retraining on the full dataset.
adapter_save_path = "lora_adapter"
best_adapter_dir = f"fold_{fold+1}_adapter"

base_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
peft_model = PeftModel.from_pretrained(base_model, best_adapter_dir)
peft_model.save_pretrained(adapter_save_path)

# Merge adapter into base model and save (together with the tokenizer files)
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("merged_model")
tokenizer.save_pretrained("merged_model")