##Multilingual

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer
from collections import Counter
import random
import torch
from transformers import DataCollatorWithPadding, EarlyStoppingCallback
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, WeightedRandomSampler
from pathlib import Path
import numpy as np

#Load Data

In [None]:
base_dir = '/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/data'
train_en_path = f'{base_dir}/english/train_en.tsv'
train_it_path = f'{base_dir}/italian/train_it.tsv'
train_bg_path = f'{base_dir}/bulgarian/train_bg.tsv'
train_ar_path = f'{base_dir}/arabic/train_ar.tsv'
train_de_path = f'{base_dir}/german/train_de.tsv'
dev_path   = f'{base_dir}/multilingual/dev_test_multilingual.tsv'
test_path = f'{base_dir}/multilingual/test_multilingual_labeled.tsv'
test_unlabeled_path = f'{base_dir}/multilingual/test_multilingual_unlabeled.tsv'

In [None]:
train_en_df = pd.read_csv(train_en_path, sep='\t')
train_it_df = pd.read_csv(train_it_path, sep='\t')
train_bg_df = pd.read_csv(train_bg_path, sep='\t')
train_ar_df = pd.read_csv(train_ar_path, sep='\t')
train_de_df = pd.read_csv(train_de_path, sep='\t')
dev_df   = pd.read_csv(dev_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')
test_unlabeled_df = pd.read_csv(test_unlabeled_path, sep='\t')

##Pre Processing

In [None]:
le = LabelEncoder()
train_en_df['label_id'] = le.fit_transform(train_en_df['label'])
train_it_df['label_id'] = le.fit_transform(train_it_df['label'])
train_bg_df['label_id'] = le.fit_transform(train_bg_df['label'])
train_ar_df['label_id'] = le.fit_transform(train_ar_df['label'])
train_de_df['label_id'] = le.fit_transform(train_de_df['label'])
dev_df['label_id'] = le.fit_transform(dev_df['label'])
test_df['label_id'] = le.fit_transform(test_df['label'])


for df in (train_en_df, train_it_df, train_bg_df, train_ar_df, train_de_df, dev_df, test_df):
    df.drop(columns=['label'], inplace=True)

for df in (train_en_df, train_it_df, train_bg_df, train_ar_df, train_de_df, dev_df, test_df):
    df.rename(columns={'label_id':'labels'}, inplace=True)


print("Mapped classes:", dict(enumerate(le.classes_)))


Mapped classes: {0: 'OBJ', 1: 'SUBJ'}


In [None]:
train_en_ds    = Dataset.from_pandas(train_en_df[['sentence','labels']])
train_it_ds    = Dataset.from_pandas(train_it_df[['sentence','labels']])
train_bg_ds    = Dataset.from_pandas(train_bg_df[['sentence','labels']])
train_ar_ds    = Dataset.from_pandas(train_ar_df[['sentence','labels']])
train_de_ds    = Dataset.from_pandas(train_de_df[['sentence','labels']])
dev_ds      = Dataset.from_pandas(dev_df[['sentence','labels']])
test_ds = Dataset.from_pandas(test_df[['sentence','labels']])
test_unlabeled_ds = Dataset.from_pandas(test_unlabeled_df[['sentence']])

Create train set with merge all 5 languages train set

In [None]:
train_multilingual = concatenate_datasets([train_en_ds, train_de_ds, train_it_ds, train_ar_ds, train_bg_ds])

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Create Balance train set with 60% OBJ and 40% SUBJ

In [None]:
balanced_splits = []
for ds in [train_en_ds, train_de_ds, train_it_ds, train_ar_ds, train_bg_ds]:
    # split OBJ / SUBJ
    ds_obj  = ds.filter(lambda x: x['labels']==0)
    ds_subj = ds.filter(lambda x: x['labels']==1)

    # undersample ds_obj to 1.5×|SUBJ| → 60/40
    target_obj = int(len(ds_subj) * 1.5)
    ds_obj_small = ds_obj.shuffle(seed=42).select(range(min(target_obj, len(ds_obj))))

    # re-combine and shuffle per-language
    balanced_splits.append(concatenate_datasets([ds_obj_small, ds_subj]).shuffle(seed=42))

# now merge all languages into one multilingual set
train_multi_balanced = concatenate_datasets(balanced_splits).shuffle(seed=42)


Filter:   0%|          | 0/830 [00:00<?, ? examples/s]

Filter:   0%|          | 0/830 [00:00<?, ? examples/s]

Filter:   0%|          | 0/800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1613 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1613 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2446 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2446 [00:00<?, ? examples/s]

Filter:   0%|          | 0/691 [00:00<?, ? examples/s]

Filter:   0%|          | 0/691 [00:00<?, ? examples/s]

##Model

We use the mode : **mdeberta-v3-base**

In [None]:
model_name = "microsoft/mdeberta-v3-base"
tokenizer  = AutoTokenizer.from_pretrained(model_name)

max_len = 100

def tokenize(batch):
    return tokenizer(batch['sentence'],
                     padding='max_length',
                     truncation=True,
                     max_length=max_len)

train_multilingual   = train_multilingual.map(tokenize, batched=True)
train_multi_balanced    = train_multi_balanced.map(tokenize, batched=True)
dev_ds      = dev_ds.map(tokenize, batched=True)
test_ds     = test_ds.map(tokenize, batched=True)
test_unlabeled_ds = test_unlabeled_ds.map(tokenize, batched=True)


cols = ['input_ids','attention_mask','labels']
train_multilingual    = train_multilingual.remove_columns([c for c in train_multilingual.column_names if c not in cols])
train_multi_balanced   = train_multi_balanced.remove_columns([c for c in train_multi_balanced.column_names if c not in cols])
dev_ds      = dev_ds.remove_columns([c for c in dev_ds.column_names if c not in cols])
test_ds     = test_ds.remove_columns([c for c in test_ds.column_names if c not in cols])
test_unlabeled_ds = test_unlabeled_ds.remove_columns(
    [c for c in test_unlabeled_ds.column_names if c not in ['input_ids','attention_mask']])



Map:   0%|          | 0/6380 [00:00<?, ? examples/s]

Map:   0%|          | 0/5607 [00:00<?, ? examples/s]

Map:   0%|          | 0/2224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1982 [00:00<?, ? examples/s]

Map:   0%|          | 0/1982 [00:00<?, ? examples/s]

 Define a data collator for dynamic padding and a metrics function to compute per-class precision, recall, F1, and macro F1 score.


In [12]:
data_collator = DataCollatorWithPadding(tokenizer)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, labels=[0,1], zero_division=0
    )
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision_OBJ': precision[0],
        'recall_OBJ':    recall[0],
        'f1_OBJ':        f1[0],
        'precision_SUBJ':precision[1],
        'recall_SUBJ':   recall[1],
        'f1_SUBJ':       f1[1],
        'macro_f1':      f1.mean()
    }


##Train for unbalanced train set

 Use WeightedRandomSampler to balance class sampling in each batch, and customize Trainer to use this sampler during training.(For unbalances train set)


In [None]:
train_labels = train_multilingual["labels"]


counts = Counter(train_labels)
total  = counts[0] + counts[1]
# weight for OBJ = total/counts[0], for SUBJ = total/counts[1]
weights = [ total / counts[label] for label in train_labels ]

sampler = WeightedRandomSampler(
    weights      = weights,
    num_samples  = len(weights),
    replacement  = True
)


class SamplerTrainer(Trainer):
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            sampler      = sampler,
            batch_size   = self.args.per_device_train_batch_size,
            collate_fn   = self.data_collator,
            num_workers  = self.args.dataloader_num_workers,
            pin_memory   = True,
        )

 Initialize model **(mdeberta-v3-base)** and training configuration with gradient checkpointing and early stopping.

 Uses a custom SamplerTrainer to address class imbalance, and selects the best model based on macro F1 score.


In [None]:
torch.cuda.empty_cache()
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir= "/content/results_multi",
    eval_strategy = 'epoch',
    save_strategy       = 'epoch',
    learning_rate       = 5e-5,
    per_device_train_batch_size = 32,
    gradient_accumulation_steps   = 2,
    per_device_eval_batch_size  = 64,
    num_train_epochs    = 4,
    weight_decay        = 0.3,
    warmup_steps        = 500,
    lr_scheduler_type = "linear",
    fp16                          = True,
    load_best_model_at_end=True,
    metric_for_best_model = 'macro_f1',
    greater_is_better   = True,
    logging_dir         = './logs_multi',
    logging_steps       = 50,
    logging_strategy = 'epoch' ,
)

trainer = SamplerTrainer(
    model           = model,
    args            = training_args,
    train_dataset   = train_multilingual,
    eval_dataset    = dev_ds,
    tokenizer       = tokenizer,
    data_collator   = data_collator,
    compute_metrics = compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)




Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = SamplerTrainer(


Train and fine tuning model and save the model.

In [None]:
trainer.train()
output_dir = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/models/Multilingual"
Path(output_dir).mkdir(parents=True, exist_ok=True)
trainer.save_model(output_dir)

print(f"Final model saved to {output_dir}")


Epoch,Training Loss,Validation Loss,Accuracy,Precision Obj,Recall Obj,F1 Obj,Precision Subj,Recall Subj,F1 Subj,Macro F1
1,0.689,0.630918,0.647932,0.752967,0.692833,0.721649,0.486301,0.561265,0.521101,0.621375
2,0.5455,0.533539,0.724371,0.807359,0.763823,0.784988,0.587112,0.648221,0.616155,0.700572
3,0.4484,0.558001,0.714029,0.806356,0.74471,0.774308,0.570608,0.654809,0.609816,0.692062


Final model saved to /content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/models/Multilingual


In [None]:
for record in trainer.state.log_history:
    if 'eval_loss' in record or 'loss' in record:
        print(record)

train_metrics = trainer.evaluate(train_multilingual)
val_metrics   = trainer.evaluate(dev_ds)
print("Train macro-F1:", train_metrics['eval_macro_f1'])
print("Val   macro-F1:", val_metrics['eval_macro_f1'])


{'loss': 0.689, 'grad_norm': 2.714303970336914, 'learning_rate': 9.900000000000002e-06, 'epoch': 1.0, 'step': 100}
{'eval_loss': 0.6309181451797485, 'eval_accuracy': 0.647931654676259, 'eval_precision_OBJ': 0.7529673590504451, 'eval_recall_OBJ': 0.6928327645051194, 'eval_f1_OBJ': 0.7216494845360825, 'eval_precision_SUBJ': 0.4863013698630137, 'eval_recall_SUBJ': 0.5612648221343873, 'eval_f1_SUBJ': 0.5211009174311927, 'eval_macro_f1': 0.6213752009836375, 'eval_runtime': 4.8678, 'eval_samples_per_second': 456.879, 'eval_steps_per_second': 7.19, 'epoch': 1.0, 'step': 100}
{'loss': 0.5455, 'grad_norm': 5.9713358879089355, 'learning_rate': 1.9900000000000003e-05, 'epoch': 2.0, 'step': 200}
{'eval_loss': 0.5335390567779541, 'eval_accuracy': 0.7243705035971223, 'eval_precision_OBJ': 0.8073593073593074, 'eval_recall_OBJ': 0.7638225255972696, 'eval_f1_OBJ': 0.7849877236057524, 'eval_precision_SUBJ': 0.5871121718377088, 'eval_recall_SUBJ': 0.6482213438735178, 'eval_f1_SUBJ': 0.6161552911709455, '

Train macro-F1: 0.7484437108369542
Val   macro-F1: 0.700571507388349


Load Model

In [None]:

output_dir = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/models/Multilingual"
model     = AutoModelForSequenceClassification.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)


data_collator = DataCollatorWithPadding(tokenizer)


trainer = Trainer(
    model            = model,
    tokenizer        = tokenizer,
    data_collator   = data_collator,
    compute_metrics = compute_metrics
)

  trainer = Trainer(


##Result

Result for test data(labeled): **Macro F1: 0.68308**



In [None]:
metrics = trainer.evaluate(test_ds)

print("Result of test data")
print(f"OBJ – Precision: {metrics['eval_precision_OBJ']:.5f}, Recall: {metrics['eval_recall_OBJ']:.5f}, F1: {metrics['eval_f1_OBJ']:.5f}")
print(f"SUBJ – Precision: {metrics['eval_precision_SUBJ']:.5f}, Recall: {metrics['eval_recall_SUBJ']:.5f}, F1: {metrics['eval_f1_SUBJ']:.5f}")
print(f"Macro‐F1: {metrics['eval_macro_f1']:.5f}")


Result of test data
OBJ – Precision: 0.88194, Recall: 0.64123, F1: 0.74257
SUBJ – Precision: 0.50656, Recall: 0.81099, F1: 0.62360
Macro‐F1: 0.68308


Prediction for test unlabeled data and save it

In [None]:
pred_out = trainer.predict(test_unlabeled_ds)
logits   = pred_out.predictions
pred_ids = logits.argmax(axis=-1)

pred_labels = le.inverse_transform(pred_ids)


df = pd.DataFrame({
    'sentence': test_unlabeled_df['sentence'],
    'prediction': pred_labels
})
save_path = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/multilingual/multilingual_predictions.tsv"
df.to_csv(save_path, sep='\t', index=False)

print(f"Saved predictions to {save_path}")


Saved predictions to /content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/multilingual/multilingual_predictions.tsv


##Train for balanced train set

In [None]:
train_labels = train_multi_balanced["labels"]


counts = Counter(train_labels)
total  = counts[0] + counts[1]
# weight for OBJ = total/counts[0], for SUBJ = total/counts[1]
weights = [ total / counts[label] for label in train_labels ]

sampler = WeightedRandomSampler(
    weights      = weights,
    num_samples  = len(weights),
    replacement  = True
)


class SamplerTrainer(Trainer):
    def get_train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            sampler      = sampler,
            batch_size   = self.args.per_device_train_batch_size,
            collate_fn   = self.data_collator,
            num_workers  = self.args.dataloader_num_workers,
            pin_memory   = True,
        )

We use the model **mdeberta-v3-base**

In [None]:
torch.cuda.empty_cache()
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir= "/content/results_multi",
    eval_strategy = 'epoch',
    save_strategy       = 'epoch',
    learning_rate       = 5e-5,
    per_device_train_batch_size = 32,
    gradient_accumulation_steps   = 2,
    per_device_eval_batch_size  = 64,
    num_train_epochs    = 6,
    weight_decay        = 0.3,
    warmup_steps        = 500,
    lr_scheduler_type = "linear",
    fp16                          = True,
    load_best_model_at_end=True,
    metric_for_best_model = 'macro_f1',
    greater_is_better   = True,
    logging_dir         = './logs_multi',
    logging_steps       = 50,
    logging_strategy = 'epoch' ,
)

trainer = SamplerTrainer(
    model           = model,
    args            = training_args,
    train_dataset   = train_multi_balanced,
    eval_dataset    = dev_ds,
    tokenizer       = tokenizer,
    data_collator   = data_collator,
    compute_metrics = compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)




Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = SamplerTrainer(


Train and fine tuning

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision Obj,Recall Obj,F1 Obj,Precision Subj,Recall Subj,F1 Subj,Macro F1
1,0.691,0.67024,0.671313,0.781874,0.694881,0.735815,0.515184,0.625823,0.56514,0.650477
2,0.6099,0.649215,0.708183,0.800442,0.74198,0.770103,0.56351,0.642951,0.600615,0.685359
3,0.5338,0.543591,0.736061,0.783592,0.827986,0.805178,0.627219,0.55863,0.590941,0.698059
4,0.4354,0.573393,0.734263,0.772784,0.845051,0.807304,0.635048,0.520422,0.572049,0.689676


TrainOutput(global_step=352, training_loss=0.5675324960188433, metrics={'train_runtime': 645.6598, 'train_samples_per_second': 52.105, 'train_steps_per_second': 0.818, 'total_flos': 1152570425428800.0, 'train_loss': 0.5675324960188433, 'epoch': 4.0})

Save the best model

In [None]:
output_dir = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/models/Multilingual_balanced"
Path(output_dir).mkdir(parents=True, exist_ok=True)
trainer.save_model(output_dir)

print(f"Final model saved to {output_dir}")

Final model saved to /content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/models/Multilingual_balanced


Load model

In [None]:

output_dir = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/models/Multilingual_balanced"
model     = AutoModelForSequenceClassification.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)


data_collator = DataCollatorWithPadding(tokenizer)


trainer = Trainer(
    model            = model,
    tokenizer        = tokenizer,
    data_collator   = data_collator,
    compute_metrics = compute_metrics
)

  trainer = Trainer(


##Result

Result for test data(labeled): **Macro F1: 0.72472**



In [None]:

metrics = trainer.evaluate(test_ds)

print("Result of test data")
print(f"OBJ – Precision: {metrics['eval_precision_OBJ']:.5f}, Recall: {metrics['eval_recall_OBJ']:.5f}, F1: {metrics['eval_f1_OBJ']:.5f}")
print(f"SUBJ – Precision: {metrics['eval_precision_SUBJ']:.5f}, Recall: {metrics['eval_recall_SUBJ']:.5f}, F1: {metrics['eval_f1_SUBJ']:.5f}")
print(f"Macro‐F1: {metrics['eval_macro_f1']:.5f}")


Result of test data
OBJ – Precision: 0.83246, Recall: 0.81658, F1: 0.82444
SUBJ – Precision: 0.61240, Recall: 0.63813, F1: 0.62500
Macro‐F1: 0.72472


In [None]:
pred_out = trainer.predict(test_unlabeled_ds)
logits   = pred_out.predictions
pred_ids = logits.argmax(axis=-1)

pred_labels = le.inverse_transform(pred_ids)

df = pd.DataFrame({
    'sentence': test_unlabeled_df['sentence'],
    'prediction': pred_labels
})
save_path = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/multilingual/multilingual_balanced_predictions.tsv"
df.to_csv(save_path, sep='\t', index=False)

print(f"Saved predictions to {save_path}")


Saved predictions to /content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/multilingual/multilingual_balanced_predictions.tsv


##Test for each language

In this part, we evaluate and predict our best model for each Test data from the each language

In [15]:
base_dir = '/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/data'
test_en_path = f'{base_dir}/english/test_en_labeled.tsv'
test_it_path = f'{base_dir}/italian/test_it_labeled.tsv'
dev_test_bg_path = f'{base_dir}/bulgarian/dev_test_bg.tsv'
test_ar_path = f'{base_dir}/arabic/test_ar_labeled.tsv'
test_de_path = f'{base_dir}/german/test_de_labeled.tsv'


In [16]:
test_en_df = pd.read_csv(test_en_path, sep='\t')
test_it_df = pd.read_csv(test_it_path, sep='\t')
dev_test_bg_df = pd.read_csv(dev_test_bg_path, sep='\t')
test_ar_df = pd.read_csv(test_ar_path, sep='\t')
test_de_df = pd.read_csv(test_de_path, sep='\t')

In [17]:
le = LabelEncoder()
test_en_df['label_id'] = le.fit_transform(test_en_df['label'])
test_it_df['label_id'] = le.fit_transform(test_it_df['label'])
test_de_df['label_id'] = le.fit_transform(test_de_df['label'])
test_ar_df['label_id'] = le.fit_transform(test_ar_df['label'])
dev_test_bg_df['label_id'] = le.fit_transform(dev_test_bg_df['label'])



for df in (test_en_df, test_de_df, test_ar_df, test_it_df, dev_test_bg_df):
    df.drop(columns=['label'], inplace=True)

for df in (test_en_df, test_de_df, test_ar_df, test_it_df, dev_test_bg_df):
    df.rename(columns={'label_id':'labels'}, inplace=True)

print("Mapped classes:", dict(enumerate(le.classes_)))


Mapped classes: {0: 'OBJ', 1: 'SUBJ'}


In [18]:
test_en_ds = Dataset.from_pandas(test_en_df[['sentence','labels']])
test_it_ds = Dataset.from_pandas(test_it_df[['sentence','labels']])
test_de_ds = Dataset.from_pandas(test_de_df[['sentence','labels']])
test_ar_ds = Dataset.from_pandas(test_ar_df[['sentence','labels']])
dev_test_bg_ds = Dataset.from_pandas(dev_test_bg_df[['sentence','labels']])

In [19]:
model_name = "microsoft/mdeberta-v3-base"
tokenizer  = AutoTokenizer.from_pretrained(model_name)

max_len = 100

def tokenize(batch):
    return tokenizer(batch['sentence'],
                     padding='max_length',
                     truncation=True,
                     max_length=max_len)

test_en_ds   = test_en_ds.map(tokenize, batched=True)
test_it_ds  = test_it_ds.map(tokenize, batched=True)
test_ar_ds   = test_ar_ds.map(tokenize, batched=True)
test_de_ds  = test_de_ds.map(tokenize, batched=True)
dev_test_bg_ds = dev_test_bg_ds.map(tokenize, batched=True)


cols = ['input_ids','attention_mask','labels']
test_en_ds    = test_en_ds.remove_columns([c for c in test_en_ds.column_names if c not in cols])
test_it_ds   = test_it_ds.remove_columns([c for c in test_it_ds.column_names if c not in cols])
test_ar_ds    = test_ar_ds.remove_columns([c for c in test_ar_ds.column_names if c not in cols])
test_de_ds   = test_de_ds.remove_columns([c for c in test_de_ds.column_names if c not in cols])
dev_test_bg_ds = dev_test_bg_ds.remove_columns([c for c in dev_test_bg_ds.column_names if c not in cols])



Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

Map:   0%|          | 0/1036 [00:00<?, ? examples/s]

Map:   0%|          | 0/347 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Load the best model

In [20]:

output_dir = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/models/Multilingual_balanced"
model     = AutoModelForSequenceClassification.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)


data_collator = DataCollatorWithPadding(tokenizer)


trainer = Trainer(
    model            = model,
    tokenizer        = tokenizer,
    data_collator   = data_collator,
    compute_metrics = compute_metrics
)

  trainer = Trainer(


##English

Result for english test data(labeled): **Macro F1: 0.69961**



In [21]:
metrics = trainer.evaluate(test_en_ds)

print("Result of test data")
print(f"OBJ – Precision: {metrics['eval_precision_OBJ']:.5f}, Recall: {metrics['eval_recall_OBJ']:.5f}, F1: {metrics['eval_f1_OBJ']:.5f}")
print(f"SUBJ – Precision: {metrics['eval_precision_SUBJ']:.5f}, Recall: {metrics['eval_recall_SUBJ']:.5f}, F1: {metrics['eval_f1_SUBJ']:.5f}")
print(f"Macro‐F1: {metrics['eval_macro_f1']:.5f}")


Result of test data
OBJ – Precision: 0.85128, Recall: 0.77209, F1: 0.80976
SUBJ – Precision: 0.53333, Recall: 0.65882, F1: 0.58947
Macro‐F1: 0.69961


##Italian

Result for italian test data(labeled): **Macro F1: 0.78619**



In [22]:
metrics = trainer.evaluate(test_it_ds)

print("Result of test data")
print(f"OBJ – Precision: {metrics['eval_precision_OBJ']:.5f}, Recall: {metrics['eval_recall_OBJ']:.5f}, F1: {metrics['eval_f1_OBJ']:.5f}")
print(f"SUBJ – Precision: {metrics['eval_precision_SUBJ']:.5f}, Recall: {metrics['eval_recall_SUBJ']:.5f}, F1: {metrics['eval_f1_SUBJ']:.5f}")
print(f"Macro‐F1: {metrics['eval_macro_f1']:.5f}")


Result of test data
OBJ – Precision: 0.88304, Recall: 0.78646, F1: 0.83196
SUBJ – Precision: 0.67969, Recall: 0.81308, F1: 0.74043
Macro‐F1: 0.78619


##German

Result for germany test data(labeled): **Macro F1: 0.78619**



In [23]:
metrics = trainer.evaluate(test_de_ds)

print("Result of test data")
print(f"OBJ – Precision: {metrics['eval_precision_OBJ']:.5f}, Recall: {metrics['eval_recall_OBJ']:.5f}, F1: {metrics['eval_f1_OBJ']:.5f}")
print(f"SUBJ – Precision: {metrics['eval_precision_SUBJ']:.5f}, Recall: {metrics['eval_recall_SUBJ']:.5f}, F1: {metrics['eval_f1_SUBJ']:.5f}")
print(f"Macro‐F1: {metrics['eval_macro_f1']:.5f}")


Result of test data
OBJ – Precision: 0.83004, Recall: 0.91703, F1: 0.87137
SUBJ – Precision: 0.79787, Recall: 0.63559, F1: 0.70755
Macro‐F1: 0.78946


##Arabic

Result for arabic test data(labeled): **Macro F1: 0.68439**



In [24]:
metrics = trainer.evaluate(test_ar_ds)

print("Result of test data")
print(f"OBJ – Precision: {metrics['eval_precision_OBJ']:.5f}, Recall: {metrics['eval_recall_OBJ']:.5f}, F1: {metrics['eval_f1_OBJ']:.5f}")
print(f"SUBJ – Precision: {metrics['eval_precision_SUBJ']:.5f}, Recall: {metrics['eval_recall_SUBJ']:.5f}, F1: {metrics['eval_f1_SUBJ']:.5f}")
print(f"Macro‐F1: {metrics['eval_macro_f1']:.5f}")


Result of test data
OBJ – Precision: 0.81513, Recall: 0.80055, F1: 0.80777
SUBJ – Precision: 0.54969, Recall: 0.57282, F1: 0.56101
Macro‐F1: 0.68439


##Bulgarian

Result for dev test data: **0.72036**

In [25]:
metrics = trainer.evaluate(dev_test_bg_ds)

print("Result of test data")
print(f"OBJ – Precision: {metrics['eval_precision_OBJ']:.5f}, Recall: {metrics['eval_recall_OBJ']:.5f}, F1: {metrics['eval_f1_OBJ']:.5f}")
print(f"SUBJ – Precision: {metrics['eval_precision_SUBJ']:.5f}, Recall: {metrics['eval_recall_SUBJ']:.5f}, F1: {metrics['eval_f1_SUBJ']:.5f}")
print(f"Macro‐F1: {metrics['eval_macro_f1']:.5f}")


Result of test data
OBJ – Precision: 0.74051, Recall: 0.81818, F1: 0.77741
SUBJ – Precision: 0.71739, Recall: 0.61682, F1: 0.66332
Macro‐F1: 0.72036
