# Training a multilingual topic classification model for Q&A pairs

In [None]:
import os
import torch
import pandas as pd
from torch import nn
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
from sklearn.metrics import f1_score
from webfaq.config import *

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
# Base checkpoint to fine-tune, and the folder name the fine-tuned run is written under.
pretained_model_name = "xlm-roberta-base"
finetuned_model_name = "webfaq-question-topic-classification_3"

# Both folders are resolved four directory levels above the working directory.
_parent_hops = [".."] * 4
resources_dir = os.path.join(*_parent_hops, RESOURCES_FOLDER)
model_dir = os.path.join(*_parent_hops, MODELS_FOLDER, finetuned_model_name)

## Dataset

### Load dataset

In [23]:
dataset_name = "wdc"

# Annotated questions, stored as JSON lines
annotations_path = os.path.join(resources_dir, dataset_name, "qtc_annotations.jsonl")

# Load, shuffle, and hold out 20% of the rows (fixed seeds keep the split stable)
dataset = (
    Dataset.from_json(annotations_path)
    .shuffle(seed=42)
    .train_test_split(test_size=0.2, seed=42)
)

# Halve the held-out rows into a validation and a test split
dataset_validation_test = dataset["test"].train_test_split(test_size=0.5, seed=42)
dataset["validation"], dataset["test"] = (
    dataset_validation_test["train"],
    dataset_validation_test["test"],
)

dataset

DatasetDict({
    train: Dataset({
        features: ['language', 'label', 'question'],
        num_rows: 29906
    })
    test: Dataset({
        features: ['language', 'label', 'question'],
        num_rows: 3739
    })
    validation: Dataset({
        features: ['language', 'label', 'question'],
        num_rows: 3738
    })
})

### Statistics

In [24]:
# Stack all three splits into one frame for whole-corpus statistics
# (row indices restart per split and are not reset).
df_dataset_all = pd.concat(
    [dataset[split_name].to_pandas() for split_name in ("train", "validation", "test")]
)
df_dataset_all.head()

Unnamed: 0,language,label,question
0,nld,6,Waarom is online marketing belangrijk?
1,spa,7,¿Cómo me registro en 22Bet?
2,tha,9,ฉันสามารถดาวน์โหลดรูปภาพจากสไลด์โชว์ TikTok ได...
3,jpn,7,Chengdu Shuangliu International Airportからホテルまで...
4,slv,8,Ali deluje povezava Bet365 Slovenija?


In [25]:
# Examples per topic label across all splits (labels are still 1-based at this point)
df_dataset_all["label"].value_counts().sort_index()

label
1     6656
2     1864
3     1686
4     3623
5     1039
6     2462
7     4948
8     5102
9     2830
10    7173
Name: count, dtype: int64

In [26]:
# Examples per language code across all splits
df_dataset_all["language"].value_counts().sort_index()

language
ara    1000
aze     269
ben     518
bul    1000
cat     425
ces    1000
dan    1000
deu    1000
ell    1000
eng    1000
est     514
fas    1000
fin    1000
fra    1000
heb    1000
hin    1000
hrv     572
hun    1000
ind    1000
isl     108
ita    1000
jpn    1000
kat     107
kaz     151
kor    1000
lav     461
lit     797
mar     127
msa     444
nld    1000
nor    1000
pol    1000
por    1000
ron    1000
rus    1000
slk    1000
slv     788
spa    1000
sqi     112
srp     558
swe    1000
tgl     167
tha    1000
tur    1000
ukr    1000
urd     119
uzb     146
vie    1000
zho    1000
Name: count, dtype: int64

In [27]:
languages_100_scheme_hosts = df_dataset_all["language"].value_counts().index.tolist()

# Print the sorted language codes as quoted strings, ten per line.
# Codes within a line are comma-separated; lines are separated by a bare newline.
quoted_codes = [f"\"{language}\"" for language in sorted(languages_100_scheme_hosts)]
code_rows = [
    ", ".join(quoted_codes[start:start + 10])
    for start in range(0, len(quoted_codes), 10)
]
result = "\n".join(code_rows)
print(result)

"ara", "aze", "ben", "bul", "cat", "ces", "dan", "deu", "ell", "eng"
"est", "fas", "fin", "fra", "heb", "hin", "hrv", "hun", "ind", "isl"
"ita", "jpn", "kat", "kaz", "kor", "lav", "lit", "mar", "msa", "nld"
"nor", "pol", "por", "ron", "rus", "slk", "slv", "spa", "sqi", "srp"
"swe", "tgl", "tha", "tur", "ukr", "urd", "uzb", "vie", "zho"


### Transform dataset

In [28]:
# def concat_qa(example):
#     text = f"{example['question']} ### {example['answer']}"
#     return {"text": text}

def concat_qa(example):
    """Return the model input text for one example.

    Only the question is used (formatted as a string); the result is a
    one-key mapping ``{"text": ...}``.
    """
    return {"text": format(example["question"])}

def adapt_label(example):
    """Shift the example's ``label`` down by one and return the same example.

    The mapping is updated in place; the annotations use labels starting at 1,
    so this makes them start at 0.
    """
    shifted_label = example["label"] - 1
    example["label"] = shifted_label
    return example

# For every split: build the "text" field, shift labels to start at 0,
# drop the raw columns, and rename "label" to "labels".
dataset = (
    dataset
    .map(concat_qa)
    .map(adapt_label)
    .remove_columns(["language", "question"])
    .rename_column("label", "labels")
)

dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 29906
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 3739
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 3738
    })
})

In [29]:
dataset["train"][0]

{'labels': 5, 'text': 'Waarom is online marketing belangrijk?'}

### Tokenize dataset

In [30]:
# Tokenizer that matches the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(pretained_model_name)

# Sanity check: tokenize the first training text
tokenizer(dataset["train"][:1]["text"])

{'input_ids': [[0, 94865, 83, 1118, 7481, 54446, 32, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1]]}

In [31]:
def tokenize_text(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated to at most 512 tokens and deliberately left
    unpadded. Padding inside a batched ``map`` would stretch every row to the
    longest text of its chunk; padding is better done per training batch
    (the Trainer does this when it is given the tokenizer).
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Tokenize every split, passing examples to tokenize_text in batches
dataset = dataset.map(tokenize_text, batched=True)
dataset

Map: 100%|██████████| 3739/3739 [00:00<00:00, 14794.56 examples/s]


DatasetDict({
    train: Dataset({
        features: ['labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 29906
    })
    test: Dataset({
        features: ['labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 3739
    })
    validation: Dataset({
        features: ['labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 3738
    })
})

### Dealing with imbalanced classes

In [32]:
# Training split as a DataFrame; the class weights below are derived from it
df_dataset = dataset["train"].to_pandas()

In [33]:
# Share of each label in the training split, ordered by label id
df_dataset["labels"].value_counts(normalize=True).sort_index()

labels
0    0.180934
1    0.049823
2    0.045208
3    0.097071
4    0.027152
5    0.064402
6    0.131445
7    0.136127
8    0.075771
9    0.192068
Name: proportion, dtype: float64

In [34]:
# Weight each class by one minus its share of the training split,
# so rarer classes get a larger weight. Ordered by label id.
label_shares = df_dataset["labels"].value_counts(normalize=True).sort_index()
class_weights = (1 - label_shares).values
class_weights

array([0.81906641, 0.95017722, 0.95479168, 0.90292918, 0.97284826,
       0.93559821, 0.86855481, 0.86387347, 0.92422925, 0.80793152])

In [35]:
# Convert the weights to a float32 tensor on the GPU when one is available,
# otherwise on the CPU (a bare `.cuda()` fails on CPU-only machines).
# `as_tensor` also accepts an existing tensor, so re-running this cell is safe.
class_weights = torch.as_tensor(
    class_weights,
    dtype=torch.float32,
    device="cuda" if torch.cuda.is_available() else "cpu",
)
class_weights

tensor([0.8191, 0.9502, 0.9548, 0.9029, 0.9728, 0.9356, 0.8686, 0.8639, 0.9242,
        0.8079], device='cuda:0')

## Training

In [36]:
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping passed to the model as keyword arguments;
                its ``"labels"`` entry holds the target class ids.
            return_outputs: If true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Extra keyword arguments from the caller; accepted and ignored.

        Returns:
            The loss, or a ``(loss, outputs)`` tuple when ``return_outputs`` is true.
        """
        # Forward pass; the logits are scored against the labels below
        outputs = model(**inputs)
        logits = outputs.logits
        labels = inputs.get("labels")

        # Keep the weights on the same device as the logits, so the loss does not
        # depend on where the global tensor was created.
        loss_fn = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [37]:
# Base checkpoint with a classification head for the 10 topic labels (ids 0-9)
model = AutoModelForSequenceClassification.from_pretrained(pretained_model_name, num_labels=10)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
batch_size = 64

# Log the training loss once per epoch. Ceiling division counts the final
# partial batch; floor division came out one step short of an epoch, so the
# log points drifted ahead of the epoch boundaries.
# NOTE(review): assumes a single device and no gradient accumulation — confirm.
logging_steps = -(-len(dataset["train"]) // batch_size)

training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_steps=logging_steps,
    fp16=True,
    # push_to_hub=True,
)

In [39]:
def compute_metrics(pred):
    """Return the weighted F1 score for a set of predictions.

    ``pred.predictions`` is reduced to class ids with an argmax over the last
    axis and compared against ``pred.label_ids``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {"f1": f1_score(pred.label_ids, predicted_classes, average="weighted")}

In [40]:
# Train on the training split and evaluate on the validation split.
# The tokenizer is passed as `processing_class`, because the `tokenizer=`
# keyword is deprecated in recent transformers releases.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    processing_class=tokenizer,
)

  trainer = WeightedLossTrainer(


In [41]:
# Run fine-tuning; validation metrics are computed after every epoch (eval_strategy="epoch")
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,1.2327,0.797994,0.741793
2,0.7576,0.680312,0.781345
3,0.653,0.660538,0.785645


TrainOutput(global_step=1404, training_loss=0.8807794730208198, metrics={'train_runtime': 208.8572, 'train_samples_per_second': 429.566, 'train_steps_per_second': 6.722, 'total_flos': 1.290590240852388e+16, 'train_loss': 0.8807794730208198, 'epoch': 3.0})

## Using the fine-tuned model

In [43]:
# NOTE(review): this path points at a checkpoint of the "webfaq-topic-classification_2"
# run, not at the model trained above — confirm that this is the intended model.
pretrained_model_name = os.path.join(
    "..", "..", "..", "..",
    MODELS_FOLDER, "webfaq-topic-classification_2", "checkpoint-936",
)

# Use the GPU when one is available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

pipe = pipeline(
    "text-classification",
    model=pretrained_model_name,
    truncation=True,
    max_length=512,
    device=device,
)

Device set to use cuda


In [44]:
# Classify one example. The input has the form "question ### answer", whereas the
# training cells above use the question only.
# NOTE(review): presumably the loaded checkpoint was trained on question+answer text — confirm.
pipe("Hampton Inn & Suites Santa Monica ha una piscina? ### S\u00ec, la struttura dispone di una piscina all'aperto.")

[{'label': 'LABEL_1', 'score': 0.9893352389335632}]