* In this notebook we fine-tune a language model to recognize whether the hashtags used in an Instagram post are accessible to people who use a screen reader.

* Hashtags that do not use PascalCase, or that contain slang, emojis, or distorted words (e.g. #Heloooo), are difficult for screen reader users to understand.

* The model we use is multilingual BERT (`bert-base-multilingual-cased`) from Hugging Face. It was pre-trained on text in more than 100 languages, so it can be applied to hashtags written in many different languages.

In [None]:
# Install
!pip install -q transformers datasets scikit-learn peft bitsandbytes accelerate

In [None]:
# Imports
import copy

import pandas as pd
import torch
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    pipeline,
)

In [None]:
# Load & preprocess the CSV that contains the data used to fine-tune the model.
# You can use the provided CSV or one that you build on your own: the first
# column must hold the hashtag text and the second column the 0/1 label.

df = pd.read_csv('the CSV file')

# Keep only the first two columns and give them fixed names. Any extra column
# would otherwise (a) make dropna() discard rows because of unrelated missing
# values and (b) be carried into the tokenized dataset, where the padding
# collator cannot turn it into tensors.
df = df.iloc[:, :2].set_axis(["text", "label"], axis=1)

# Drop incomplete rows, then shuffle with a fixed seed for reproducibility
df = df.dropna().sample(frac=1, random_state=42).reset_index(drop=True)

# The tokenizer needs strings (a purely numeric hashtag would be parsed as a
# number by read_csv) and the model needs integer class ids
df = df.astype({"text": str, "label": int})

In [None]:
# Sanity check: list the distinct values found in the label column.
# Only 0 and 1 should show up here; anything else will cause problems
# for the 2-class model later on.

df['label'].unique()

In [None]:
# Keep only the rows whose label is 0 or 1, drop any row that still has a
# missing value, and store the label column as int

df = df.loc[df['label'].isin([0, 1])].dropna()
df = df.astype({'label': int})

In [None]:
# Create dataset & split (80% train / 20% test)

# preserve_index=False: after rows have been filtered out, the DataFrame index
# may no longer be a plain RangeIndex; from_pandas would then add an
# `__index_level_0__` column, which would end up in every batch and be passed
# to the model as an unexpected keyword argument.
dataset = Dataset.from_pandas(df, preserve_index=False)

# Fixed seed so the train/test split is the same on every run
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Tokenizer & tokenization (truncation only; padding is applied per batch
# by the data collator)

model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_fn(examples):
    """Tokenize a batch of examples from the dataset.

    Args:
        examples: batch passed by `Dataset.map(..., batched=True)`; its
            'text' entry holds the hashtag strings.

    Returns:
        The tokenizer output for the batch, truncated to at most 32 tokens.
        No padding is added here: padding inside a batched `map` pads every
        example to the longest one of the whole map batch, which wastes
        compute. `DataCollatorWithPadding` pads each training batch to its
        own longest sequence instead.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=32
    )

tokenized_dataset = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=['text']
)

In [None]:
# The collator pads every batch to a common length and returns PyTorch
# tensors; the two DataLoaders feed those batches to the training and
# evaluation loops.

data_collator = DataCollatorWithPadding(tokenizer, return_tensors="pt")

# Training batches are reshuffled on every pass over the data
train_loader = DataLoader(
    tokenized_dataset["train"],
    collate_fn=data_collator,
    batch_size=32,
    shuffle=True,
)

# Evaluation batches keep the dataset order
test_loader = DataLoader(
    tokenized_dataset["test"],
    collate_fn=data_collator,
    batch_size=32,
)

In [None]:
# Training every parameter of BERT would need a lot of computational power,
# so LoRA is used to train only a small percentage of them.

# LoRA settings: adapters on the attention "query" and "value" modules,
# configured for sequence classification in training mode
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value"],
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
)

# Base model with a 2-label classification head, wrapped with the adapters
model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2),
    lora_config,
)

# Report how many parameters are actually trainable
model.print_trainable_parameters()

In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU,
# and move the model to that device

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
# AdamW optimizer over the model parameters, with learning rate 2e-4

optimizer = AdamW(params=model.parameters(), lr=2e-4)

In [None]:
# Training loop

model.train()
epochs = 10
for epoch in range(epochs):
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    for batch in loop:
        # Move every tensor of the batch to the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}

        # The batch includes the labels, so the model output carries the loss
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Show the loss of the current batch in the progress bar
        loop.set_postfix(loss=loss.item())

# Persist the fine-tuned LoRA adapter (and the tokenizer next to it).
# The merge step at the end of this notebook loads the adapter from this
# directory, so without this save the notebook cannot run top to bottom
# on a fresh kernel.
model.save_pretrained("Hashtag_LLM_Model")
tokenizer.save_pretrained("Hashtag_LLM_Model")

In [None]:
# Final evaluation on the 20% test split: collect the predicted and the true
# class of every test example, then report accuracy and F1

model.eval()
all_preds, all_labels = [], []
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in test_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # Predicted class = index of the largest logit
        predicted = torch.argmax(model(**batch).logits, dim=-1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

acc = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)
print(f"Test Accuracy: {acc:.4f}   |   F1 Score: {f1:.4f}")

In [None]:
# Quick interactive predictions

# Build the classifier from the model fine-tuned above instead of loading a
# checkpoint directory that this notebook never writes, so the cell works on
# a fresh Restart & Run All.
# The LoRA weights are merged into a deep copy: `model` itself stays an
# un-merged PEFT model, and the pipeline receives a plain
# sequence-classification model in eval mode.
inference_model = copy.deepcopy(model).merge_and_unload().eval()

classifier = pipeline(
    "text-classification",
    model=inference_model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# The model config uses the default label names LABEL_0 / LABEL_1
label_map = {"LABEL_0": "Inaccessible", "LABEL_1": "Accessible"}

# Greek example hashtags, alternating PascalCase and all-lowercase spellings
example_tags = [
    "#ΨηφιακήΚαινοτομία", "#καθημερινότητάμου", "#ΈξυπνηΖωή", "#οικονομικάνεανων", "#ΔημιουργικήΣκέψη",
    "#τεχνολογίασήμερα", "#ΕπιχειρηματικέςΙδέες", "#πράσινηενέργεια", "#ΕλληνικήΚουζίνα", "#ταξίδιαελλάδα",
    "#ΚαινοτόμεςΛύσεις", "#ευεξίακαιζωή", "#ΨηφιακάΕργαλεία", "#εργασίαεξαποστάσεως", "#ΕκπαίδευσηΣτοΔιαδίκτυο",
    "#μαγειρεύουμεμαζί", "#ΝέεςΙδέες", "#τεχνολογικάνεα", "#ΕπαγγελματικήΑνάπτυξη", "#καλύτερηζωή",
    "#ΑυτόματηΛύση", "#περιβαλλοντικήδράση", "#ΔιαδικτυακήΜάθηση", "#ομορφιάφυσικά", "#ΠολιτιστικήΚληρονομιά",
    "#οικογενειακέςστιγμές", "#ΔημιουργικόΠεριεχόμενο", "#καθημερινήενέργεια", "#ΠράσινεςΤεχνολογίες", "#ευκαιρίεςεργασίας",
    "#ΖούμεΨηφιακά", "#τεχνολογίαστηζωή", "#ΑνάπτυξηΔεξιοτήτων", "#προσωπικήεξέλιξη", "#ΕργαλείαΜάρκετινγκ",
    "#κουλτούρακαιτέχνη", "#Ελλάδα2025", "#φιλοξενίαμεψυχή", "#ιδέεςγιατοσπίτι", "#Επιχειρηματικότητα",
    "#στυλιστικέςεπιλογές", "#ΨηφιακήΕποχή", "#καινοτομικαπροϊόντα", "#ΥγείαΚαιΕυεξία", "#αγοράκαιτεχνολογία",
    "#ΝεανικήΚαινοτομία", "#παιδείατουμέλλοντος", "#ΔιαδίκτυοΤωνΠραγμάτων", "#τέχνηστουςδρόμους", "#πολιτιστικάγεγονότα",
]

print("\n🔍 Example Predictions:")
for tag in example_tags:
    # The pipeline returns a list with one {'label', 'score'} dict per input
    res = classifier(tag)[0]
    print(f"{tag:25} → {label_map[res['label']]} ({res['score']:.2f})")

In [None]:
# Detailed metrics. Run this after the evaluation cell, which fills
# all_labels & all_preds.

# Passing labels=[0, 1] explicitly keeps the report and the confusion matrix
# at two classes even if one class happens to be missing from the test split;
# otherwise the matrix would be 1x1 and would not fit the 2x2 DataFrame below.
class_ids = [0, 1]
report = classification_report(
    all_labels,
    all_preds,
    labels=class_ids,
    target_names=["Inaccessible","Accessible"],
    output_dict=True,
)
cm = confusion_matrix(all_labels, all_preds, labels=class_ids)

# Turn into DataFrames for readability
report_df = pd.DataFrame(report).transpose()
cm_df     = pd.DataFrame(cm,        index=["True Inac","True Acc"], columns=["Pred Inac","Pred Acc"])

print("Classification Report:\n", report_df)
print("\nConfusion Matrix:\n", cm_df)


In [None]:
# Read the adapter configuration from the saved adapter directory
peft_model_path = "Hashtag_LLM_Model"
config = PeftConfig.from_pretrained(peft_model_path)

# Re-create the base model named in the adapter config (2 labels), attach the
# saved adapter to it and fold the adapter weights into the base weights
base_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path,
    num_labels=2,
)
model = PeftModel.from_pretrained(base_model, peft_model_path).merge_and_unload()

# Write the merged model and the tokenizer to the same output directory
model.save_pretrained("merged_model")
tokenizer.save_pretrained("merged_model")
