# Zero-shot

In [None]:
# Mount Google Drive into the Colab filesystem at the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer
from collections import Counter
import random
import torch
from transformers import DataCollatorWithPadding, EarlyStoppingCallback
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, WeightedRandomSampler
from pathlib import Path

In [None]:
# Root folder that holds one sub-folder per language.
base_dir = '/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/data'

# For each language: (labeled test file, unlabeled test file).
test_gr_path, test_gr_unlabeled_path = (
    f'{base_dir}/greek/test_gr_labeled.tsv',
    f'{base_dir}/greek/test_gr_unlabeled.tsv',
)
test_pol_path, test_pol_unlabeled_path = (
    f'{base_dir}/polish/test_pol_labeled.tsv',
    f'{base_dir}/polish/test_pol_unlabeled.tsv',
)
test_ro_path, test_ro_unlabeled_path = (
    f'{base_dir}/romanian/test_ro_labeled.tsv',
    f'{base_dir}/romanian/test_ro_unlabeled.tsv',
)
test_ukr_path, test_ukr_unlabeled_path = (
    f'{base_dir}/ukrainian/test_ukr_labeled.tsv',
    f'{base_dir}/ukrainian/test_ukr_unlabeled.tsv',
)


In [None]:
# Read every test split as a tab-separated table.
# Files are read in the same order as the target names on the left.
(
    test_gr_df,
    test_gr_unlabeled_df,
    test_pol_df,
    test_pol_unlabeled_df,
    test_ro_df,
    test_ro_unlabeled_df,
    test_ukr_df,
    test_ukr_unlabeled_df,
) = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        test_gr_path,
        test_gr_unlabeled_path,
        test_pol_path,
        test_pol_unlabeled_path,
        test_ro_path,
        test_ro_unlabeled_path,
        test_ukr_path,
        test_ukr_unlabeled_path,
    )
)

In [None]:
# Fixed label vocabulary shared by all languages.
# The encoder is fitted ONCE on this vocabulary instead of being refitted on
# each language file: refitting makes the integer ids depend on whichever
# labels happen to occur in that file, and leaves `le` (used later for
# `inverse_transform`) fitted on the last file only.
LABEL_CLASSES = ['OBJ', 'SUBJ']

# Id given to rows whose label is missing or not in LABEL_CLASSES, so they can
# be filtered out before evaluation instead of silently shifting the mapping.
UNKNOWN_LABEL_ID = -1

le = LabelEncoder()
le.fit(LABEL_CLASSES)
label_to_id = {name: idx for idx, name in enumerate(le.classes_)}

for df in (test_gr_df, test_pol_df, test_ukr_df, test_ro_df):
    if 'label' not in df.columns and 'labels' in df.columns:
        # Frame was already converted by a previous run of this cell.
        continue
    # Replace the string column `label` with the integer column `labels`.
    df['labels'] = (
        df['label']
        .map(label_to_id)
        .fillna(UNKNOWN_LABEL_ID)
        .astype('int64')
    )
    df.drop(columns=['label'], inplace=True)

print("Mapped classes:", dict(enumerate(le.classes_)))


Mapped classes: {0: 'OBJ', 1: 'SUBJ'}


In [None]:
# Labeled splits keep the sentence text and its integer label.
test_gr_ds, test_pol_ds, test_ro_ds, test_ukr_ds = (
    Dataset.from_pandas(labeled_frame[['sentence','labels']])
    for labeled_frame in (test_gr_df, test_pol_df, test_ro_df, test_ukr_df)
)

# Unlabeled splits only carry the sentence text.
(
    test_gr_unlabeled_ds,
    test_pol_unlabeled_ds,
    test_ro_unlabeled_ds,
    test_ukr_unlabeled_ds,
) = (
    Dataset.from_pandas(unlabeled_frame[['sentence']])
    for unlabeled_frame in (
        test_gr_unlabeled_df,
        test_pol_unlabeled_df,
        test_ro_unlabeled_df,
        test_ukr_unlabeled_df,
    )
)

In [None]:
model_name = "microsoft/mdeberta-v3-base"
max_len = 100  # every sentence is padded/truncated to this many tokens

tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the 'sentence' field of a batch.

    Each sentence is truncated and padded to exactly `max_len` tokens.
    Returns whatever the tokenizer call returns for the batch.
    """
    encode_options = dict(
        padding='max_length',
        truncation=True,
        max_length=max_len,
    )
    return tokenizer(batch['sentence'], **encode_options)

# Tokenize the labeled splits (batched), in the order Greek, Polish, Romanian, Ukrainian.
test_gr_ds, test_pol_ds, test_ro_ds, test_ukr_ds = (
    labeled_split.map(tokenize, batched=True)
    for labeled_split in (test_gr_ds, test_pol_ds, test_ro_ds, test_ukr_ds)
)

# Tokenize the unlabeled splits in the same language order.
(
    test_gr_unlabeled_ds,
    test_pol_unlabeled_ds,
    test_ro_unlabeled_ds,
    test_ukr_unlabeled_ds,
) = (
    unlabeled_split.map(tokenize, batched=True)
    for unlabeled_split in (
        test_gr_unlabeled_ds,
        test_pol_unlabeled_ds,
        test_ro_unlabeled_ds,
        test_ukr_unlabeled_ds,
    )
)


# Columns kept on labeled splits; unlabeled splits keep the same minus 'labels'.
cols = ['input_ids','attention_mask','labels']
unlabeled_cols = ['input_ids','attention_mask']


def _keep_only(dataset, wanted):
    """Return `dataset` without any column whose name is not in `wanted`."""
    surplus = [name for name in dataset.column_names if name not in wanted]
    return dataset.remove_columns(surplus)


test_gr_ds = _keep_only(test_gr_ds, cols)
test_pol_ds = _keep_only(test_pol_ds, cols)
test_ro_ds = _keep_only(test_ro_ds, cols)
test_ukr_ds = _keep_only(test_ukr_ds, cols)

test_gr_unlabeled_ds = _keep_only(test_gr_unlabeled_ds, unlabeled_cols)
test_pol_unlabeled_ds = _keep_only(test_pol_unlabeled_ds, unlabeled_cols)
test_ro_unlabeled_ds = _keep_only(test_ro_unlabeled_ds, unlabeled_cols)
test_ukr_unlabeled_ds = _keep_only(test_ukr_unlabeled_ds, unlabeled_cols)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/284 [00:00<?, ? examples/s]

Map:   0%|          | 0/351 [00:00<?, ? examples/s]

Map:   0%|          | 0/206 [00:00<?, ? examples/s]

Map:   0%|          | 0/297 [00:00<?, ? examples/s]

Map:   0%|          | 0/284 [00:00<?, ? examples/s]

Map:   0%|          | 0/351 [00:00<?, ? examples/s]

Map:   0%|          | 0/206 [00:00<?, ? examples/s]

Map:   0%|          | 0/297 [00:00<?, ? examples/s]

In [None]:
data_collator = DataCollatorWithPadding(tokenizer)


def compute_metrics(eval_pred):
    """Compute accuracy plus per-class and macro precision/recall/F1.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    argmax over the last axis of the logits. Per-class scores are computed for
    class ids 0 and 1 (reported as OBJ and SUBJ respectively), with undefined
    scores set to 0. 'macro_f1' is the mean of the two per-class F1 values.
    """
    raw_scores, gold = eval_pred
    predicted = raw_scores.argmax(axis=-1)

    per_class_precision, per_class_recall, per_class_f1, _support = (
        precision_recall_fscore_support(
            gold, predicted, labels=[0,1], zero_division=0
        )
    )

    obj_id, subj_id = 0, 1
    results = {'accuracy': accuracy_score(gold, predicted)}
    results.update({
        'precision_OBJ': per_class_precision[obj_id],
        'recall_OBJ':    per_class_recall[obj_id],
        'f1_OBJ':        per_class_f1[obj_id],
        'precision_SUBJ': per_class_precision[subj_id],
        'recall_SUBJ':    per_class_recall[subj_id],
        'f1_SUBJ':        per_class_f1[subj_id],
    })
    results['macro_f1'] = per_class_f1.mean()
    return results


## Load the best multilingual model

In [None]:

# Folder holding the fine-tuned multilingual checkpoint (model + tokenizer).
output_dir = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/models/Multilingual_balanced"
model     = AutoModelForSequenceClassification.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

data_collator = DataCollatorWithPadding(tokenizer)

# This notebook only evaluates and predicts. Explicit arguments are passed so
# that no experiment tracker (e.g. Weights & Biases) is started for these runs;
# all other settings stay at their defaults.
eval_args = TrainingArguments(
    output_dir="tmp_trainer",
    report_to="none",
)

# The tokenizer is not handed to the Trainer: batching is fully handled by the
# explicit `data_collator`, and the `tokenizer=` keyword is deprecated in
# recent transformers releases.
trainer = Trainer(
    model           = model,
    args            = eval_args,
    data_collator   = data_collator,
    compute_metrics = compute_metrics,
)

  trainer = Trainer(


In [None]:
def _has_binary_label(example):
    """True when the example's label id is one of the two class ids (0 or 1)."""
    return example["labels"] in [0, 1]


# Drop Greek test examples whose label id is neither 0 nor 1.
test_gr_ds = test_gr_ds.filter(_has_binary_label)


Filter:   0%|          | 0/284 [00:00<?, ? examples/s]

## Greek result

Result on the labeled test data: **Macro F1: 0.77467**



In [None]:
# Score the fine-tuned model on the labeled Greek test split.
metrics = trainer.evaluate(test_gr_ds)

# Per-class precision/recall/F1 followed by the macro-averaged F1.
print("\n".join([
    "Result of test data",
    f"OBJ – Precision: {metrics['eval_precision_OBJ']:.5f}, Recall: {metrics['eval_recall_OBJ']:.5f}, F1: {metrics['eval_f1_OBJ']:.5f}",
    f"SUBJ – Precision: {metrics['eval_precision_SUBJ']:.5f}, Recall: {metrics['eval_recall_SUBJ']:.5f}, F1: {metrics['eval_f1_SUBJ']:.5f}",
    f"Macro‐F1: {metrics['eval_macro_f1']:.5f}",
]))


[34m[1mwandb[0m: Currently logged in as: [33mmehreganmohseni[0m ([33mmehreganmohseni-universit-di-bologna[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Result of test data
OBJ – Precision: 0.92766, Recall: 0.92373, F1: 0.92569
SUBJ – Precision: 0.61702, Recall: 0.63043, F1: 0.62366
Macro‐F1: 0.77467


In [None]:
# Run the model on the unlabeled Greek split and take the highest-scoring class.
pred_out = trainer.predict(test_gr_unlabeled_ds)
logits   = pred_out.predictions
pred_ids = logits.argmax(axis=-1)

# Turn class ids back into label strings with the fitted encoder.
pred_labels = le.inverse_transform(pred_ids)

df = pd.DataFrame({
    'sentence': test_gr_unlabeled_df['sentence'],
    'prediction': pred_labels
})
save_path = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/zeroshot/greek_predictions.tsv"
# Create the destination folder if needed so the write cannot fail after
# inference has already been run.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(save_path, sep='\t', index=False)

print(f"Saved predictions to {save_path}")


Saved predictions to /content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/zeroshot/greek_predictions.tsv


## Romanian result

Result on the labeled test data: **Macro F1: 0.72798**



In [None]:
# Score the fine-tuned model on the labeled Romanian test split.
metrics = trainer.evaluate(test_ro_ds)

# Per-class precision/recall/F1 followed by the macro-averaged F1.
print("\n".join([
    "Result of test data",
    f"OBJ – Precision: {metrics['eval_precision_OBJ']:.5f}, Recall: {metrics['eval_recall_OBJ']:.5f}, F1: {metrics['eval_f1_OBJ']:.5f}",
    f"SUBJ – Precision: {metrics['eval_precision_SUBJ']:.5f}, Recall: {metrics['eval_recall_SUBJ']:.5f}, F1: {metrics['eval_f1_SUBJ']:.5f}",
    f"Macro‐F1: {metrics['eval_macro_f1']:.5f}",
]))


Result of test data
OBJ – Precision: 0.92000, Recall: 0.74675, F1: 0.82437
SUBJ – Precision: 0.51852, Recall: 0.80769, F1: 0.63158
Macro‐F1: 0.72798


In [None]:
# Run the model on the unlabeled Romanian split and take the highest-scoring class.
pred_out = trainer.predict(test_ro_unlabeled_ds)
logits   = pred_out.predictions
pred_ids = logits.argmax(axis=-1)

# Turn class ids back into label strings with the fitted encoder.
pred_labels = le.inverse_transform(pred_ids)

df = pd.DataFrame({
    'sentence': test_ro_unlabeled_df['sentence'],
    'prediction': pred_labels
})
save_path = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/zeroshot/romainian_predictions.tsv"
# Create the destination folder if needed so the write cannot fail after
# inference has already been run.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(save_path, sep='\t', index=False)

print(f"Saved predictions to {save_path}")


Saved predictions to /content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/zeroshot/romainian_predictions.tsv


## Ukrainian result

Result on the labeled test data: **Macro F1: 0.64025**



In [None]:
# Score the fine-tuned model on the labeled Ukrainian test split.
metrics = trainer.evaluate(test_ukr_ds)

# Per-class precision/recall/F1 followed by the macro-averaged F1.
print("\n".join([
    "Result of test data",
    f"OBJ – Precision: {metrics['eval_precision_OBJ']:.5f}, Recall: {metrics['eval_recall_OBJ']:.5f}, F1: {metrics['eval_f1_OBJ']:.5f}",
    f"SUBJ – Precision: {metrics['eval_precision_SUBJ']:.5f}, Recall: {metrics['eval_recall_SUBJ']:.5f}, F1: {metrics['eval_f1_SUBJ']:.5f}",
    f"Macro‐F1: {metrics['eval_macro_f1']:.5f}",
]))


Result of test data
OBJ – Precision: 0.82039, Recall: 0.77169, F1: 0.79529
SUBJ – Precision: 0.45055, Recall: 0.52564, F1: 0.48521
Macro‐F1: 0.64025


In [None]:

# Run the model on the unlabeled Ukrainian split and take the highest-scoring class.
pred_out = trainer.predict(test_ukr_unlabeled_ds)
logits   = pred_out.predictions
pred_ids = logits.argmax(axis=-1)

# Turn class ids back into label strings with the fitted encoder.
pred_labels = le.inverse_transform(pred_ids)

df = pd.DataFrame({
    'sentence': test_ukr_unlabeled_df['sentence'],
    'prediction': pred_labels
})
save_path = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/zeroshot/ukrainian_predictions.tsv"
# Create the destination folder if needed so the write cannot fail after
# inference has already been run.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(save_path, sep='\t', index=False)

print(f"Saved predictions to {save_path}")


Saved predictions to /content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/zeroshot/ukrainian_predictions.tsv


## Polish result

Result on the labeled test data: **Macro F1: 0.64251**



In [None]:
# Score the fine-tuned model on the labeled Polish test split.
metrics = trainer.evaluate(test_pol_ds)

# Per-class precision/recall/F1 followed by the macro-averaged F1.
print("\n".join([
    "Result of test data",
    f"OBJ – Precision: {metrics['eval_precision_OBJ']:.5f}, Recall: {metrics['eval_recall_OBJ']:.5f}, F1: {metrics['eval_f1_OBJ']:.5f}",
    f"SUBJ – Precision: {metrics['eval_precision_SUBJ']:.5f}, Recall: {metrics['eval_recall_SUBJ']:.5f}, F1: {metrics['eval_f1_SUBJ']:.5f}",
    f"Macro‐F1: {metrics['eval_macro_f1']:.5f}",
]))


Result of test data
OBJ – Precision: 0.64041, Recall: 0.98421, F1: 0.77593
SUBJ – Precision: 0.94915, Recall: 0.34783, F1: 0.50909
Macro‐F1: 0.64251


In [None]:

# Run the model on the unlabeled Polish split and take the highest-scoring class.
pred_out = trainer.predict(test_pol_unlabeled_ds)
logits   = pred_out.predictions
pred_ids = logits.argmax(axis=-1)

# Turn class ids back into label strings with the fitted encoder.
pred_labels = le.inverse_transform(pred_ids)

df = pd.DataFrame({
    'sentence': test_pol_unlabeled_df['sentence'],
    'prediction': pred_labels
})
save_path = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/zeroshot/polish_predictions.tsv"
# Create the destination folder if needed so the write cannot fail after
# inference has already been run.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(save_path, sep='\t', index=False)

print(f"Saved predictions to {save_path}")


Saved predictions to /content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/zeroshot/polish_predictions.tsv
