# Training a multilingual topic classification model for Q&A pairs

In [29]:
import os
import torch
import pandas as pd
from torch import nn
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
from sklearn.metrics import f1_score
# from webfaq.config import *

In [30]:
pretained_model_name = "xlm-roberta-base"
finetuned_model_name = "webfaq-topic-classification_2"

resources_dir = os.path.join("..", "..", "..", "..", "resources")
model_dir = os.path.join("..", "..", "..", "..", "models", finetuned_model_name)

## Dataset

### Load dataset

In [31]:
# Load annotations file
annotations_path = os.path.join(resources_dir, "tc_annotations.jsonl")

# Add test split
dataset = Dataset.from_json(annotations_path)
dataset = dataset.shuffle(seed=42)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Add validation split
dataset_validation_test = dataset["test"].train_test_split(test_size=0.5, seed=42)
dataset["validation"] = dataset_validation_test["train"]
dataset["test"] = dataset_validation_test["test"]

dataset

DatasetDict({
    train: Dataset({
        features: ['language', 'label', 'question', 'answer', 'title', 'description'],
        num_rows: 63720
    })
    test: Dataset({
        features: ['language', 'label', 'question', 'answer', 'title', 'description'],
        num_rows: 7965
    })
    validation: Dataset({
        features: ['language', 'label', 'question', 'answer', 'title', 'description'],
        num_rows: 7965
    })
})

### Statistics

In [32]:
df_dataset_all = pd.concat([dataset["train"].to_pandas(), dataset["validation"].to_pandas(), dataset["test"].to_pandas()])
df_dataset_all.head()

Unnamed: 0,language,label,question,answer,title,description
0,spa,7,¿22 bet es legal en Argentina?,"Sí. La marca no cuenta con una licencia local,...",22 Bet: La mejor casa de apuestas deportivas y...,Descubra la emoción de las apuestas deportivas...
1,jpn,2,Guangzhou Baiyun International Airportからホテルまでの...,Vaperse Hotel Guangzhou 空港へ28.4km。,Vaperse Hotel Guangzhou - 予約ウェブサイト,"Vaperse Hotel Guangzhou, 広州のCBD珠江新城の中心地、金穂路にあり..."
2,ukr,4,Як приєднатися до партнерської програми?,Щоб приєднатися до партнерської програми BetWi...,Партнерство в BetWinner - ваше нове джерело до...,Реєструйся в партнерській програмі BetWinner і...
3,dan,7,Hvem skal have styr på årsregnskabets frist?,Generelt er det ledelsen af virksomheden som s...,Årsregnskab frist - Hold styr på regler ang. å...,"Har du svært ved at holde styr på regler, såso..."
4,bul,3,Мога ли да резервирам онлайн и кога ще получа ...,"Резервацията се прави в няколко лесни стъпки, ...",Балеаж — Модерно боядисване и кичури • Reserva...,"Балеаж с професионални фризьори! Виж цени, про..."


In [33]:
df_dataset_all["label"].value_counts().sort_index()

label
1    23170
2    11076
3     5756
4    23103
5     4739
6     5087
7     3583
8     3136
Name: count, dtype: int64

In [34]:
df_dataset_all["language"].value_counts().sort_index()

language
afr     119
ara    2000
aze     457
bel     111
ben     626
       ... 
ukr    2000
urd     227
uzb     312
vie    2000
zho    2000
Name: count, Length: 63, dtype: int64

In [35]:
languages_100_scheme_hosts = df_dataset_all["language"].value_counts().index.tolist()

result = ""
for i, language in enumerate(sorted(languages_100_scheme_hosts)):
    result += f"\"{language}\""
    if i + 1 == len(languages_100_scheme_hosts):
        pass
    elif (i + 1) % 10 != 0:
        result += ", "
    else:
        result += "\n"
print(result)

"afr", "ara", "aze", "bel", "ben", "bos", "bul", "cat", "ces", "dan"
"deu", "ell", "eng", "est", "eus", "fas", "fin", "fra", "glg", "guj"
"hbs", "heb", "hin", "hrv", "hun", "hye", "ind", "isl", "ita", "jpn"
"kat", "kaz", "kor", "lat", "lav", "lit", "mar", "mkd", "msa", "nld"
"nno", "nor", "pol", "por", "ron", "rus", "slk", "slv", "spa", "sqi"
"srp", "swa", "swe", "tam", "tel", "tgl", "tha", "tur", "ukr", "urd"
"uzb", "vie", "zho"


### Transform dataset

In [36]:
def concat_qa(example):
    text = f"{example['question']} ### {example['answer']}"
    if "title" in example and example["title"]:
        text += f" ### Title: {example['title']}"
    if "description" in example and example["description"]:
        text += f" ### Description: {example['description']}"
    return {"text": text}

# def concat_qa(example):
#     text = f"{example['question']}"
#     return {"text": text}

def adapt_label(example):
    example["label"] = example["label"] - 1
    return example

# Apply transformations to all splits
dataset = dataset.map(concat_qa)
dataset = dataset.map(adapt_label)

# Remove columns
dataset = dataset.remove_columns(["language", "question", "answer", "title", "description"])
# dataset = dataset.remove_columns(["language", "question"])

# Rename label to labels
dataset = dataset.rename_column("label", "labels")

dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 63720
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 7965
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 7965
    })
})

In [37]:
dataset["train"][0]

{'labels': 6,
 'text': '¿22 bet es legal en Argentina? ### Sí. La marca no cuenta con una licencia local, pero gracias a las regulaciones vigentes, puede operar sin problemas con su licencia internacional. ### Title: 22 Bet: La mejor casa de apuestas deportivas y eventos deportivos ### Description: Descubra la emoción de las apuestas deportivas con la aplicación, las ofertas de bonos y las completas líneas de apuestas de 22 Bet. ¡Vive la emoción de tus eventos favoritos!'}

### Tokenize dataset

In [38]:
tokenizer = AutoTokenizer.from_pretrained(pretained_model_name)

tokenizer(dataset["train"]["text"][:1])

{'input_ids': [[0, 3936, 4015, 1600, 198, 8437, 22, 34170, 32, 6, 187284, 22683, 5, 239, 7098, 110, 9472, 158, 220, 95280, 4000, 4, 1788, 21376, 10, 576, 15913, 3825, 124483, 7, 4, 5171, 6264, 42, 880, 12234, 158, 166, 95280, 17228, 5, 6, 187284, 48962, 12, 1039, 6300, 12, 239, 8114, 2349, 8, 177842, 7, 103643, 7, 113, 44938, 157823, 7, 6, 187284, 70643, 12, 6, 177633, 11, 21, 196835, 8, 576, 177842, 7, 103643, 7, 158, 21, 36050, 4, 576, 96722, 8, 337, 2245, 113, 576, 13627, 7, 49975, 7, 8, 177842, 7, 8, 1039, 6300, 5, 14701, 6609, 272, 21, 196835, 8, 15875, 44938, 95333, 7, 38, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [39]:
def tokenize_text(examples):
    return tokenizer(examples["text"], padding=True, truncation=True, max_length=512)

dataset = dataset.map(tokenize_text, batched=True)
dataset

Map:   0%|          | 0/7965 [00:00<?, ? examples/s]

Map: 100%|██████████| 7965/7965 [00:02<00:00, 3494.18 examples/s]


DatasetDict({
    train: Dataset({
        features: ['labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 63720
    })
    test: Dataset({
        features: ['labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 7965
    })
    validation: Dataset({
        features: ['labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 7965
    })
})

### Dealing with imbalanced classes

In [40]:
df_dataset = dataset["train"].to_pandas()

In [41]:
df_dataset["labels"].value_counts(normalize=True).sort_index()

labels
0    0.291274
1    0.139140
2    0.071767
3    0.290458
4    0.058632
5    0.063606
6    0.045778
7    0.039344
Name: proportion, dtype: float64

In [42]:
class_weights = (1 - df_dataset["labels"].value_counts(normalize=True).sort_index()).values
class_weights

array([0.70872567, 0.86086001, 0.92823289, 0.70954175, 0.94136849,
       0.9363936 , 0.95422159, 0.96065599])

In [43]:
class_weights = torch.tensor(class_weights, dtype=torch.float32).cuda()
class_weights

tensor([0.7087, 0.8609, 0.9282, 0.7095, 0.9414, 0.9364, 0.9542, 0.9607],
       device='cuda:0')

## Training

In [54]:
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Feed inputs to model and extract logits
        outputs = model(**inputs)
        logits = outputs.logits

        # Extract labels
        labels = inputs.get("labels")

        # Define loss function with class weights
        loss_fn = nn.CrossEntropyLoss(weight=class_weights)

        # Compute loss
        loss = loss_fn(logits, labels)

        # Return loss and outputs
        return (loss, outputs) if return_outputs else loss

In [55]:
model = AutoModelForSequenceClassification.from_pretrained(
    pretained_model_name,
    num_labels=8,
    # num_labels=10,
    # id2label=id2label,
    # label2id=label2id,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
batch_size = 64

# Log the training loss at each epoch
logging_steps = len(dataset["train"]) // batch_size

training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_steps=logging_steps,
    fp16=True,
    # push_to_hub=True,
)

In [57]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    return {"f1": f1}

In [58]:
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
)

  trainer = WeightedLossTrainer(


In [59]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,No log,0.448183,0.860083
2,No log,0.405204,0.870866
3,No log,0.372235,0.88026
4,No log,0.367988,0.882646
5,No log,0.367115,0.880735




TrainOutput(global_step=625, training_loss=0.43798857421875, metrics={'train_runtime': 1246.4869, 'train_samples_per_second': 255.598, 'train_steps_per_second': 0.501, 'total_flos': 8.38316981403648e+16, 'train_loss': 0.43798857421875, 'epoch': 5.0})

## Using the fine-tuned model

In [27]:
pretrained_model_name = os.path.join("..", "..", "..", "..", "models", "webfaq-topic-classification", "checkpoint-375")
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline("text-classification", model=pretrained_model_name, truncation=True, max_length=512, device=device)

Device set to use cuda


In [28]:
pipe("Hampton Inn & Suites Santa Monica ha una piscina? ### S\u00ec, la struttura dispone di una piscina all'aperto.")

[{'label': 'LABEL_1', 'score': 0.9905927777290344}]