# Monolingual_English

In [None]:
# Mount Google Drive at /content/drive; the data and model paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer
from collections import Counter
import random
import torch
from transformers import DataCollatorWithPadding, EarlyStoppingCallback
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, WeightedRandomSampler
from pathlib import Path

## Load Data

In [None]:
# Root folder of the English split on the mounted Drive.
base_dir = '/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/data/english'

# Derive every split path from its file name; the tuple order matches the targets.
train_path, dev_path, dev_test_path, test_path, test_unlabeled_path = (
    f'{base_dir}/{file_name}'
    for file_name in (
        'train_en.tsv',
        'dev_en.tsv',
        'dev_test_en.tsv',
        'test_en_labeled.tsv',
        'test_en_unlabeled.tsv',
    )
)

In [None]:
# Every split is a tab-separated file; read them in the same order as the paths.
train_df, dev_df, dev_test_df, test_df, test_unlabeled_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (train_path, dev_path, dev_test_path, test_path, test_unlabeled_path)
)

## Preprocessing

In [None]:
# Fit the encoder on the training labels only, then reuse it for every labeled split.
le = LabelEncoder()
le.fit(train_df['label'])

# Replace the string 'label' column with its integer encoding, stored as 'labels'.
for df in (train_df, dev_df, dev_test_df, test_df):
    df['label_id'] = le.transform(df['label'])
    df.drop(columns=['label'], inplace=True)
    df.rename(columns={'label_id': 'labels'}, inplace=True)

print("Mapped classes:", dict(enumerate(le.classes_)))


Mapped classes: {0: 'OBJ', 1: 'SUBJ'}


In [None]:
# Labeled splits keep the text and its integer label; the unlabeled split has text only.
labeled_columns = ['sentence', 'labels']
train_ds, dev_ds, dev_test_ds, test_ds = (
    Dataset.from_pandas(frame[labeled_columns])
    for frame in (train_df, dev_df, dev_test_df, test_df)
)
test_unlabeled_ds = Dataset.from_pandas(test_unlabeled_df[['sentence']])

In [None]:
# Class balance of the training split (label ids: 0 = OBJ, 1 = SUBJ per the encoder mapping).
counts = Counter(train_ds['labels'])
n_obj, n_subj = (counts[class_id] for class_id in (0, 1))
print(f"Original counts → OBJ: {n_obj}, SUBJ: {n_subj}")

Original counts → OBJ: 532, SUBJ: 298


# Tokenize

To tokenize the data, we use the **mDeBERTaV3_base** tokenizer.

In [None]:
model_name = "microsoft/mdeberta-v3-base"
tokenizer  = AutoTokenizer.from_pretrained(model_name)

# Upper bound on tokens per sentence; longer inputs are truncated.
max_len = 100


def tokenize(batch):
    """Tokenize the ``sentence`` field of a batch, truncating to ``max_len`` tokens.

    Padding is intentionally NOT applied here. The notebook hands a
    ``DataCollatorWithPadding`` to the Trainer, which pads each batch to its
    longest sequence; padding everything to ``max_len`` up front would make
    that collator a no-op and spend compute on padding tokens.

    Args:
        batch: mapping with a ``'sentence'`` list of strings (as passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch (unpadded, truncated encodings).
    """
    return tokenizer(batch['sentence'],
                     truncation=True,
                     max_length=max_len)


def _tokenize_and_keep(ds, keep):
    """Tokenize ``ds`` and drop every column whose name is not in ``keep``."""
    tokenized = ds.map(tokenize, batched=True)
    extra = [c for c in tokenized.column_names if c not in keep]
    return tokenized.remove_columns(extra)


# Columns the model consumes; everything else (raw text, token_type_ids, ...) is dropped.
cols = ['input_ids','attention_mask','labels']
train_ds    = _tokenize_and_keep(train_ds, cols)
dev_ds      = _tokenize_and_keep(dev_ds, cols)
dev_test_ds = _tokenize_and_keep(dev_test_ds, cols)
test_ds     = _tokenize_and_keep(test_ds, cols)
# The unlabeled split has no 'labels' column to keep.
test_unlabeled_ds = _tokenize_and_keep(test_unlabeled_ds, ['input_ids','attention_mask'])



Map:   0%|          | 0/830 [00:00<?, ? examples/s]

Map:   0%|          | 0/462 [00:00<?, ? examples/s]

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

# Model

 Define a data collator for dynamic padding and a metrics function to compute per-class precision, recall, F1, and macro F1 score.


In [None]:
# Collator that pads the examples of a batch to a common length using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)


def compute_metrics(eval_pred):
    """Compute accuracy, per-class precision/recall/F1 and macro-F1.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        dict mapping metric name to value for class 0 (OBJ) and class 1 (SUBJ),
        plus ``accuracy`` and ``macro_f1`` (mean of the two per-class F1 scores).
    """
    logits, labels = eval_pred
    predicted = logits.argmax(axis=-1)

    OBJ, SUBJ = 0, 1
    prec, rec, f1_scores, _support = precision_recall_fscore_support(
        labels, predicted, labels=[OBJ, SUBJ], zero_division=0
    )

    scores = {'accuracy': accuracy_score(labels, predicted)}
    scores['precision_OBJ'] = prec[OBJ]
    scores['recall_OBJ'] = rec[OBJ]
    scores['f1_OBJ'] = f1_scores[OBJ]
    scores['precision_SUBJ'] = prec[SUBJ]
    scores['recall_SUBJ'] = rec[SUBJ]
    scores['f1_SUBJ'] = f1_scores[SUBJ]
    scores['macro_f1'] = f1_scores.mean()
    return scores


 Use WeightedRandomSampler to balance class sampling in each batch, and customize Trainer to use this sampler during training.


In [None]:
# Extract train labels (0 or 1)
train_labels = train_ds["labels"]  # a list or array of 0/1

counts = Counter(train_labels)
# Use the real dataset size so the weights stay correct for any set of class ids,
# not only when the labels are exactly {0, 1}.
total  = len(train_labels)
# Inverse-frequency weight per example: total / (size of that example's class),
# so every class contributes the same total sampling mass.
weights = [total / counts[label] for label in train_labels]

# Sampler that samples N = len(train) items with replacement
sampler = WeightedRandomSampler(
    weights      = weights,
    num_samples  = len(weights),
    replacement  = True
)


class SamplerTrainer(Trainer):
    """Trainer whose training DataLoader draws examples through the module-level ``sampler``.

    The sampler indexes into ``train_ds``; pass that same dataset as
    ``train_dataset`` so the sampled indices line up with the data.
    """

    def get_train_dataloader(self):
        """Build the training DataLoader using the class-balancing ``sampler``.

        Returns:
            A ``DataLoader`` over ``self.train_dataset``.

        Raises:
            ValueError: if the trainer was created without a ``train_dataset``.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")
        return DataLoader(
            self.train_dataset,
            sampler      = sampler,
            batch_size   = self.args.per_device_train_batch_size,
            collate_fn   = self.data_collator,
            num_workers  = self.args.dataloader_num_workers,
            # Honour the TrainingArguments instead of hard-coding the values.
            pin_memory   = self.args.dataloader_pin_memory,
            drop_last    = self.args.dataloader_drop_last,
        )

 Initialize model **(mDeBERTaV3_base)** and training configuration with gradient checkpointing and early stopping.

 Uses a custom SamplerTrainer to address class imbalance, and selects the best model based on macro F1 score.


In [None]:
torch.cuda.empty_cache()

# Seed the global RNGs BEFORE loading the model: the classification head is not part
# of the checkpoint and is randomly initialized at load time, so without a seed its
# starting weights (and therefore the final scores) differ between runs.
seed = 42
random.seed(seed)
torch.manual_seed(seed)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Trade extra compute for lower activation memory during training.
model.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir                  = "/content/results_en",
    eval_strategy               = 'epoch',
    save_strategy               = 'epoch',   # must match eval_strategy for load_best_model_at_end
    learning_rate               = 3e-5,
    per_device_train_batch_size = 32,
    gradient_accumulation_steps = 2,         # effective train batch size: 32 * 2
    per_device_eval_batch_size  = 64,
    num_train_epochs            = 6,
    weight_decay                = 0.01,
    warmup_steps                = 6,
    lr_scheduler_type           = "linear",
    fp16                        = True,
    load_best_model_at_end      = True,
    metric_for_best_model       = 'macro_f1',
    greater_is_better           = True,
    logging_dir                 = './logs_en',
    # Logs are written once per epoch; `logging_steps` only applies to the
    # 'steps' strategy, so the previous logging_steps=50 had no effect.
    logging_strategy            = 'epoch',
    seed                        = seed,
)

trainer = SamplerTrainer(
    model            = model,
    args             = training_args,
    train_dataset    = train_ds,
    eval_dataset     = dev_ds,
    # `tokenizer=` is deprecated on Trainer (it emits a FutureWarning);
    # `processing_class` is its replacement.
    processing_class = tokenizer,
    data_collator    = data_collator,
    compute_metrics  = compute_metrics,
    # Stop as soon as macro-F1 fails to improve for one evaluation.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)




Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = SamplerTrainer(


Train the model and save the best model.

In [None]:
trainer.train()

# Persist the trained model on Drive, creating the target folder if needed.
output_dir = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/models/Monolingual_english"
model_dir = Path(output_dir)
model_dir.mkdir(parents=True, exist_ok=True)
trainer.save_model(output_dir)

print(f"Final model saved to {output_dir}")


Epoch,Training Loss,Validation Loss,Accuracy,Precision Obj,Recall Obj,F1 Obj,Precision Subj,Recall Subj,F1 Subj,Macro F1
1,0.7024,0.686395,0.519481,0.0,0.0,0.0,0.519481,1.0,0.683761,0.34188
2,0.6312,0.622013,0.694805,0.630225,0.882883,0.73546,0.827815,0.520833,0.639386,0.687423
3,0.5355,0.55695,0.742424,0.679443,0.878378,0.766208,0.845714,0.616667,0.713253,0.739731
4,0.4563,0.542312,0.748918,0.690647,0.864865,0.768,0.836957,0.641667,0.726415,0.747208
5,0.3899,0.525243,0.768398,0.712177,0.869369,0.782961,0.848168,0.675,0.75174,0.767351
6,0.3513,0.517332,0.785714,0.746988,0.837838,0.789809,0.830986,0.7375,0.781457,0.785633


Final model saved to /content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/models/Monolingual_english


Print the training and evaluation loss logs for each epoch, then evaluate and display the final macro F1 scores on the train and validation sets.


In [None]:
# Show only the log entries that carry a training or evaluation loss.
loss_keys = {'eval_loss', 'loss'}
for record in trainer.state.log_history:
    if loss_keys & record.keys():
        print(record)

# Compare fit on the training split against the validation split.
train_metrics = trainer.evaluate(train_ds)
val_metrics   = trainer.evaluate(dev_ds)
for split_label, split_metrics in (("Train macro-F1:", train_metrics),
                                   ("Val   macro-F1:", val_metrics)):
    print(split_label, split_metrics['eval_macro_f1'])


{'loss': 0.7024, 'grad_norm': 1.4550509452819824, 'learning_rate': 2.75e-05, 'epoch': 1.0, 'step': 13}
{'eval_loss': 0.6863954067230225, 'eval_accuracy': 0.5194805194805194, 'eval_precision_OBJ': 0.0, 'eval_recall_OBJ': 0.0, 'eval_f1_OBJ': 0.0, 'eval_precision_SUBJ': 0.5194805194805194, 'eval_recall_SUBJ': 1.0, 'eval_f1_SUBJ': 0.6837606837606838, 'eval_macro_f1': 0.3418803418803419, 'eval_runtime': 1.1458, 'eval_samples_per_second': 403.202, 'eval_steps_per_second': 6.982, 'epoch': 1.0, 'step': 13}
{'loss': 0.6312, 'grad_norm': 2.718979835510254, 'learning_rate': 2.25e-05, 'epoch': 2.0, 'step': 26}
{'eval_loss': 0.622012734413147, 'eval_accuracy': 0.6948051948051948, 'eval_precision_OBJ': 0.6302250803858521, 'eval_recall_OBJ': 0.8828828828828829, 'eval_f1_OBJ': 0.7354596622889306, 'eval_precision_SUBJ': 0.8278145695364238, 'eval_recall_SUBJ': 0.5208333333333334, 'eval_f1_SUBJ': 0.639386189258312, 'eval_macro_f1': 0.6874229257736213, 'eval_runtime': 1.1099, 'eval_samples_per_second': 41

Train macro-F1: 0.9198612172927558
Val   macro-F1: 0.7856329354199182


Load model

In [None]:
# Reload the fine-tuned model and its tokenizer from the folder they were saved to.
output_dir = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/models/Monolingual_english"
model     = AutoModelForSequenceClassification.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

data_collator = DataCollatorWithPadding(tokenizer)

# Evaluation-only Trainer: no training arguments or datasets are attached.
trainer = Trainer(
    model            = model,
    # `tokenizer=` is deprecated on Trainer (it emits a FutureWarning);
    # `processing_class` is its replacement.
    processing_class = tokenizer,
    data_collator    = data_collator,
    compute_metrics  = compute_metrics
)

  trainer = Trainer(


# Result

Result for the labeled test data: **Macro F1: 0.71735**



In [None]:
# Evaluate on labeled test set
metrics = trainer.evaluate(test_ds)

print("Result of test data")
# One line per class, in the order OBJ then SUBJ.
for class_name in ("OBJ", "SUBJ"):
    prec_val = metrics[f'eval_precision_{class_name}']
    rec_val = metrics[f'eval_recall_{class_name}']
    f1_val = metrics[f'eval_f1_{class_name}']
    print(f"{class_name} – Precision: {prec_val:.5f}, Recall: {rec_val:.5f}, F1: {f1_val:.5f}")
print(f"Macro‐F1: {metrics['eval_macro_f1']:.5f}")


Result of test data
OBJ – Precision: 0.84951, Recall: 0.81395, F1: 0.83135
SUBJ – Precision: 0.57447, Recall: 0.63529, F1: 0.60335
Macro‐F1: 0.71735


Result for dev test data

In [None]:
# Evaluate on the dev_test split
metrics = trainer.evaluate(dev_test_ds)

print("Result of dev_test data")
# One line per class, in the order OBJ then SUBJ.
for class_name in ("OBJ", "SUBJ"):
    prec_val = metrics[f'eval_precision_{class_name}']
    rec_val = metrics[f'eval_recall_{class_name}']
    f1_val = metrics[f'eval_f1_{class_name}']
    print(f"{class_name} – Precision: {prec_val:.5f}, Recall: {rec_val:.5f}, F1: {f1_val:.5f}")
print(f"Macro‐F1: {metrics['eval_macro_f1']:.5f}")


Result of dev_test data
OBJ – Precision: 0.82850, Recall: 0.94751, F1: 0.88402
SUBJ – Precision: 0.72857, Recall: 0.41803, F1: 0.53125
Macro‐F1: 0.70764


Predict labels for the unlabeled test data and save them.

In [None]:
# prediction on the unlabeled test set
pred_out = trainer.predict(test_unlabeled_ds)
logits   = pred_out.predictions
pred_ids = logits.argmax(axis=-1)

pred_labels = le.inverse_transform(pred_ids)

df = pd.DataFrame({
    'sentence': test_unlabeled_df['sentence'],
    'prediction': pred_labels
})
save_path = "/content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/english/english_predictions.tsv"
df.to_csv(save_path, sep='\t', index=False)

print(f"Saved predictions to {save_path}")


Saved predictions to /content/drive/MyDrive/clef2025-checkthat-lab-main-task1/task1/unlabeld_predict/english/english_predictions.tsv
