In [None]:
!pip install kagglehub[hf-datasets] torch transformers accelerate huggingface_hub trackio evaluate scikit-learn sentencepiece protobuf

In [None]:
import huggingface_hub
# Authenticate against the Hugging Face Hub; the fine-tuning step further down
# creates its TrainingArguments with push_to_hub=True.
huggingface_hub.login()

In [None]:
import torch
# Show (as the cell output) whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [None]:
import datasets as ds
import kagglehub, logging

def load_dataset():
    """
    Load the English support tickets from Kaggle and split them for training.

    Rows whose ``language`` is ``"en"`` are kept, ``subject`` and ``body`` are
    joined (missing values become empty strings) into a single ``text`` column,
    and ``queue`` is class-encoded into an integer ``labels`` column. The data
    is then split with stratification on ``labels``: 25% goes to the test set
    and 10% of the remainder to the validation set.

    Returns:
        tuple: ``(dataset_dict, queue_labels, id2label, label2id)`` where
            ``dataset_dict`` is a ``datasets.DatasetDict`` with the keys
            ``"train"``, ``"validation"`` and ``"test"``, ``queue_labels`` is
            the list of queue names, and ``id2label`` / ``label2id`` map
            between class indices and queue names.
    """
    logging.basicConfig(
        level=logging.INFO,
        format= '[%(asctime)s] {%(pathname)s:%(lineno)d} %(levelname)s - %(message)s',
        datefmt='%H:%M:%S'
    )
    dataset = "tobiasbueck/multilingual-customer-support-tickets"
    subset = "aa_dataset-tickets-multi-lang-5-2-50-version.csv"
    # Download latest version
    kagglehub.dataset_download(dataset)

    # Load the selected CSV of the dataset as a Hugging Face Dataset
    df: ds.Dataset = kagglehub.dataset_load(
        adapter = kagglehub.KaggleDatasetAdapter.HUGGING_FACE,
        handle = dataset,
        path = subset
    )
    seed = 10
    # df = df.to_iterable_dataset()

    df_en = df.filter(lambda x: x["language"] == "en")
    df_en = df_en.select_columns(["subject", "body", "queue"])
    # Replace missing subject/body values with empty strings so they can be concatenated
    df_en = df_en.map(lambda x: {
        "subject": x.get("subject", "") or "",
        "body": x.get("body", "") or "",
        "queue": x.get("queue")
    })
    df_en = df_en.map(lambda x: {
        "ticket": x.get("subject") + " " + x.get("body")
        })

    df_en = df_en.class_encode_column("queue")
    df_en = df_en.select_columns(["ticket", "queue"]).rename_columns({"ticket": "text", "queue": "labels"})
    logging.info(df_en.to_pandas())

    ## Creating label mappings
    label_names = df_en.features["labels"].names
    id2label = {i: label for i, label in enumerate(label_names)}
    label2id = {label: i for i, label in enumerate(label_names)}
    queue_labels = list(label2id.keys())

    ## Splitting dataset into train, validation and test sets
    train_valid, test = df_en.train_test_split(test_size=0.25, stratify_by_column="labels", seed=seed).values()
    train_valid = train_valid.shuffle(seed=seed)
    train, valid = train_valid.train_test_split(test_size=0.1, stratify_by_column="labels", seed=seed).values()

    ## Verifying distribution of class labels in train and validation datasets
    ## (each split is converted to pandas once; counts are logged in ascending label order)
    for split_name, split in (("Train", train), ("Validation", valid)):
        label_counts = split.to_pandas()["labels"].value_counts().sort_index()
        for l, count in label_counts.items():
            logging.info(f"[{split_name}] Label {l}: {count} occurrences")

    dataset_dict = ds.DatasetDict({
        "train": train,
        "validation": valid,
        "test": test
    })

    return dataset_dict, queue_labels, id2label, label2id


def oversample_with_interleave(train_dataset, num_labels, boost_classes=None, boost_factor=2.0, seed=42):
    """
    Resample a training set by interleaving one sub-dataset per class.

    The base sampling probability of each class is proportional to its size;
    only the classes listed in ``boost_classes`` have their probability
    multiplied by ``boost_factor`` before the probabilities are re-normalized.
    Sampling uses the ``"all_exhausted"`` stopping strategy.

    Args:
        train_dataset: datasets.Dataset (with "labels" column)
        num_labels: number of unique labels; classes are ``range(num_labels)``
        boost_classes: list of class indices to oversample more strongly
        boost_factor: multiplier for boost_classes probabilities
        seed: random seed

    Returns:
        The interleaved ``datasets.Dataset``.

    Raises:
        ValueError: if no example carries a label in ``range(num_labels)``.
    """
    ## Split into one dataset per class
    class_datasets = []
    class_sizes = []
    for c in range(num_labels):
        ds_c = train_dataset.filter(lambda x: x["labels"] == c)
        class_datasets.append(ds_c)
        class_sizes.append(len(ds_c))

    ## Base probabilities: proportional to dataset sizes
    total = sum(class_sizes)
    if total == 0:
        raise ValueError("Cannot resample: no example has a label in range(num_labels).")
    probs = [size / total for size in class_sizes]

    ## Optionally boost certain classes (e.g. underrepresented)
    if boost_classes is not None:
        for c in boost_classes:
            probs[c] *= boost_factor

    ## Normalize to sum to 1
    s = sum(probs)
    probs = [p / s for p in probs]

    ## The message is fully formatted here: passing the dict as a separate logging
    ## argument without a placeholder would leave it out of the emitted record.
    rounded_probs = {c: round(p, 3) for c, p in enumerate(probs)}
    logging.info(f"Sampling probabilities: {rounded_probs}")

    ## A class without examples has probability 0 and nothing to contribute, so it
    ## is left out of the interleave call.
    ## NOTE(review): an empty source that can never be drawn looked like it could keep
    ## "all_exhausted" from terminating - confirm against the installed datasets version.
    non_empty = [c for c, size in enumerate(class_sizes) if size > 0]

    ## Interleave with oversampling
    interleaved = ds.interleave_datasets(
        [class_datasets[c] for c in non_empty],
        probabilities=[probs[c] for c in non_empty],
        seed=seed,
        stopping_strategy="all_exhausted"
    )
    return interleaved

In [None]:
import warnings, logging, os
from typing import Any
import datasets as ds
from transformers import (
    pipeline,
    get_scheduler,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoConfig,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    EvalPrediction
)
import torch
import numpy as np
import evaluate

class TicketTriageModel:
    """
    Wraps a ``zero-shot-classification`` pipeline to label tickets with one of
    the candidate ``labels``.

    NOTE(review): the evaluation cell passes the fine-tuned sequence-classification
    checkpoint to this zero-shot pipeline - confirm that this is intended rather
    than a "text-classification" pipeline.
    """
    def __init__(
        self,
        labels: list,
        model_name = "microsoft/deberta-v3-base",
        id2label: dict[int, Any] = None
    ):
        """
        Args:
            labels: candidate label names handed to the pipeline.
            model_name: checkpoint name or path loaded by the pipeline.
            id2label: mapping from class index to label name; when missing or
                empty, the mapping is read from the checkpoint's config.
        """
        warnings.filterwarnings("ignore")
        logging.basicConfig(
           level=logging.INFO,
            format= '[%(asctime)s] {%(pathname)s:%(lineno)d} %(levelname)s - %(message)s',
            datefmt='%H:%M:%S'
        )
        self.model_name = model_name
        if torch.cuda.is_available():
            self.device = "cuda"
            torch.cuda.empty_cache()
        else:
            self.device = "cpu"
        self.classifier = pipeline("zero-shot-classification", model=self.model_name, device=self.device)
        self.labels = labels
        if not id2label:
            self.config = AutoConfig.from_pretrained(self.model_name)
            self.ids2labels = self.config.id2label
        else:
            self.ids2labels = id2label

    def get_predictions_from_dataset(self, dataset: ds.Dataset, batch_size: int = 32) -> ds.Dataset:
        """
        Run batch inference on a Hugging Face Dataset and add predictions as a column.

        Args:
            dataset: dataset with a ``text`` column and an integer ``labels`` column.
            batch_size: batch size used both for ``map`` and for the pipeline call.

        Returns:
            The dataset with two added columns: ``pred_labels`` (first label of each
            pipeline result) and ``labels_str`` (the name of the true label).
        """
        def add_label_names(batch):
            return {"labels_str" : [self.ids2labels[_id] for _id in batch["labels"]]}

        def predict(batch):
            outputs = self.classifier(batch["text"], self.labels,
                                      multi_label=False, batch_size=batch_size)
            if isinstance(outputs, dict):
                # A single result dict is wrapped so that the column value stays a
                # list with one entry per example, as a batched map requires.
                outputs = [outputs]
            return {"pred_labels": [out["labels"][0] for out in outputs]}

        dataset = dataset.map(predict, batched=True, batch_size=batch_size)
        dataset = dataset.map(add_label_names, batched=True, batch_size=batch_size)
        return dataset

In [None]:
class BaseModel:
    """
    Fine-tunes a sequence-classification checkpoint on the ticket dataset and
    reports accuracy, macro F1 and per-class F1 during evaluation.
    """
    def __init__(
        self,
        labels: list,
        model_name :str = "microsoft/deberta-v3-base",
        max_length: int = 512
    ):
        """
        Args:
            labels: list of class names; its length is the number of labels.
            model_name: checkpoint to fine-tune.
            max_length: token length every example is padded/truncated to in
                ``preprocess_data``. NOTE(review): 512 is an assumed default -
                tune it to the model limit and the available memory.
        """
        logging.basicConfig(
            level=logging.INFO,
            format= '[%(asctime)s] {%(pathname)s:%(lineno)d} %(levelname)s - %(message)s',
            datefmt='%H:%M:%S'
        )
        warnings.filterwarnings("ignore")
        self.model_name = model_name
        self.labels = labels
        # Read by preprocess_data; it has to exist before finetune_model is called.
        self.max_length = max_length
        # Loaded lazily on the first preprocess_data call.
        self.tokenizer = None
        self.f1_metric = evaluate.load("f1")
        self.accuracy_metric = evaluate.load("accuracy")

    def compute_metrics(self, eval_preds: EvalPrediction):
        """
        Compute accuracy, macro F1 and one F1 score per class.

        Args:
            eval_preds: pair of ``(logits, labels)``; predictions are the argmax
                of the logits over the last axis.

        Returns:
            dict with the keys ``accuracy``, ``f1_macro`` and ``f1_class_<i>``.
        """
        logits, labels = eval_preds
        preds = np.argmax(logits, -1)
        ## using f1-macro to address class imbalance
        ## See https://www.numberanalytics.com/blog/f1-score-imbalanced-classes-guide
        macro_f1 = self.f1_metric.compute(predictions=preds, references=labels, average="macro")

        ## Accuracy
        acc = self.accuracy_metric.compute(predictions=preds, references=labels)

        ## Per-class f1 score
        per_class_f1 = self.f1_metric.compute(predictions=preds, references=labels, average=None)
        per_class_f1_dict = {f"f1_class_{i}": score for i, score in enumerate(per_class_f1["f1"])}

        return {
            "accuracy": acc["accuracy"],
            "f1_macro": macro_f1["f1"],
            **per_class_f1_dict
        }

    def preprocess_data(self, dataset: ds.Dataset):
        """
        Tokenize the ``text`` column and return the dataset in torch format.

        Every example is padded and truncated to ``self.max_length`` tokens and
        the ``text`` column is removed.
        """
        ## Load tokenizer (only once; train and validation share it)
        if getattr(self, "tokenizer", None) is None:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        def tokenize(examples):
            # https://huggingface.co/docs/transformers/pad_truncation
            return self.tokenizer(examples["text"], padding="max_length", max_length=self.max_length, truncation=True, return_tensors="pt")
        dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
        dataset = dataset.with_format("torch")
        return dataset

    def finetune_model(
        self,
        train_dataset : ds.Dataset,
        validation_dataset: ds.Dataset,
        id2label : dict,
        label2id : dict,
        batch_size : int = 8,
        num_train_epochs : int = 5,
        oversample : bool = False
    ):
        """
        Fine-tune ``self.model_name`` on ``train_dataset``.

        Evaluation and checkpointing happen once per epoch, the best model by
        ``f1_macro`` is restored at the end, and training stops early after two
        evaluations without improvement. The output directory name is built from
        the notebook-level global ``HF_USER``, which must be defined before this
        method is called.

        Args:
            train_dataset: training split with ``text`` and ``labels`` columns.
            validation_dataset: validation split with the same columns.
            id2label: class index -> label name, stored in the model config.
            label2id: label name -> class index, stored in the model config.
            batch_size: per-device training batch size.
            num_train_epochs: maximum number of epochs.
            oversample: when True, resample the training split with
                ``oversample_with_interleave`` before tokenizing.
        """
        # torch.xpu is looked up defensively so that a PyTorch build without
        # that attribute does not fail here.
        xpu = getattr(torch, "xpu", None)
        xpu_available = xpu is not None and xpu.is_available()
        logging.info(f"XPU: {xpu_available}")
        if xpu_available:
            xpu.empty_cache()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        ## Load model
        logging.info("Loading model...")
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            problem_type="single_label_classification", # problem_type (str, optional) — Problem type for XxxForSequenceClassification models. Can be one of "regression", "single_label_classification" or "multi_label_classification".
            num_labels=len(self.labels),
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )

        if oversample:
            ## Oversampling (underrepresented) classes
            logging.info("Oversampling (underrepresented) classes...")
            train_dataset = oversample_with_interleave(
                train_dataset,
                num_labels=len(self.labels),
                seed=10
            )

        ## Tokenize and prepare the training dataset for training
        logging.info("Tokenizing and prepare the training dataset for training...")
        self.encoded_train_dataset = self.preprocess_data(train_dataset)
        self.encoded_validation_dataset = self.preprocess_data(validation_dataset)
        logging.info(self.encoded_train_dataset)

        ## Shared between TrainingArguments and the hand-built scheduler below
        gradient_accumulation_steps = 2
        warmup_ratio = 0.06

        ## Initialize training args
        args = TrainingArguments(
            f"{HF_USER}/ticket_triage_{self.model_name.replace('/','_')}_finetuned",
            save_strategy = "epoch",
            eval_strategy="epoch",
            num_train_epochs=num_train_epochs,
            warmup_ratio=warmup_ratio,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps, ## Effective batch size will be batch_size * gradient_accumulation_steps
            max_grad_norm=0.5,
            metric_for_best_model="f1_macro",
            greater_is_better=True,
            load_best_model_at_end=True,
            lr_scheduler_type="linear",
            report_to="trackio",
            remove_unused_columns=False,
            push_to_hub=True
        )

        ## Custom optimizer and lr_scheduler
        ## Following hyperparams from: https://huggingface.co/MoritzLaurer/ModernBERT-large-zeroshot-v2.0
        optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=9e-06,
            betas=(0.9, 0.999),
            eps=1e-08,
            weight_decay=0.01
        )
        ## The scheduler advances once per optimizer update, i.e. once every
        ## gradient_accumulation_steps batches, so the step count is derived from
        ## updates rather than from batches.
        ## NOTE(review): both divisions round up; the Trainer's own rounding may differ
        ## by at most one update per epoch depending on the transformers version.
        num_examples = len(self.encoded_train_dataset)
        batches_per_epoch = (num_examples + args.train_batch_size - 1) // args.train_batch_size
        updates_per_epoch = max(
            (batches_per_epoch + gradient_accumulation_steps - 1) // gradient_accumulation_steps,
            1
        )
        num_training_steps = updates_per_epoch * num_train_epochs
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_training_steps=num_training_steps,
            num_warmup_steps=int(warmup_ratio * num_training_steps)
        )

        ## Initialize the trainer
        self.trainer = Trainer(
            self.model,
            args,
            train_dataset=self.encoded_train_dataset,
            eval_dataset=self.encoded_validation_dataset,
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics,
            optimizers=(optimizer, lr_scheduler),
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        # fine tune the model
        self.trainer.train()

In [None]:
# Configure logging for the notebook's top-level cells.
logging.basicConfig(
  level=logging.INFO,
  format= '[%(asctime)s] {%(pathname)s:%(lineno)d} %(levelname)s - %(message)s',
  datefmt='%H:%M:%S'
)
# Hugging Face user name; read as a global by BaseModel.finetune_model to build the
# output directory name and by the evaluation cell to build the model name.
HF_USER = "marquesafonso"
## Prepare datasets
logging.info("Prepare datasets...")
dataset, queue_labels, id2label, label2id = load_dataset()
train_dataset, validation_dataset, test_dataset = dataset["train"], dataset["validation"], dataset["test"]

## Prepare base model
logging.info("Preparing base model...")
basemodel_name = "microsoft/deberta-v3-base"
basemodel = BaseModel(
  model_name=basemodel_name,
  labels=queue_labels
)

In [None]:
## finetune the model
## (per-device batch size 16, at most 10 epochs, no resampling of the training split)
logging.info("Finetuning the model...")
basemodel.finetune_model(
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    id2label=id2label,
    label2id=label2id,
    batch_size=16,
    num_train_epochs=10,
    oversample=False
)


In [None]:
import trackio
# Show the trackio view for the project "huggingface".
# NOTE(review): presumably this is the project the Trainer logged to via
# report_to="trackio" - confirm the project name.
trackio.show(project="huggingface")

In [None]:
## Evaluate on test set
logging.info("Evaluating on test set...")
# "/"-free model slug, built outside the f-strings: reusing the enclosing quote
# character inside an f-string expression is a syntax error before Python 3.12.
model_slug = basemodel_name.replace("/", "_")
finetuned_model_name = f"{HF_USER}/ticket_triage_{model_slug}_finetuned"
finetuned_model = TicketTriageModel(
    model_name=finetuned_model_name,
    labels=queue_labels,
    id2label=id2label
)

finetuned_dataset = finetuned_model.get_predictions_from_dataset(test_dataset, batch_size=64)
# Make sure the target directory exists before writing the predictions file.
os.makedirs("output", exist_ok=True)
preds_path = f"output/{model_slug}_finetuned_preds.parquet"
finetuned_dataset.to_parquet(preds_path)

# From a saved predictions file
# import datasets as ds
# finetuned_dataset = ds.Dataset.from_parquet(preds_path)

logging.info(finetuned_dataset.to_pandas().head())

# Accuracy in percent: share of rows whose predicted label equals the true label name.
finetuned_accuracy = finetuned_dataset.filter(lambda x: x["pred_labels"] == x["labels_str"]).num_rows * 100 / finetuned_dataset.num_rows


In [None]:
# Test-set accuracy in percent, as computed in the evaluation cell.
print(finetuned_accuracy)