<h1 align="center">EXIST 2025: Sexism Identification in Twitter</h1>
<h2 align="center">Task 3. Fine-tuning for multi-label classification</h2>
<h3 style="display:block; margin-top:5px;" align="center">ETSInf. Universitat Politècnica de València</h3>
<br>

## Libraries

In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import  AutoTokenizer, AutoModelForSequenceClassification,  Trainer, TrainingArguments,  EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import random
import os
import pandas as pd
import json
import sys
import tempfile
import time
import re
import emoji

#Importing the required modules to use the ICM measure
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.metrics.metricfactory import MetricFactory
from pyevall.reports.reports import PyEvALLReport
from pyevall.utils.utils import PyEvALLUtils
from functools import partial

from readerEXIST2025 import EXISTReader
from create_submision_folder import create_submision_file

## Read data

In [2]:
# Regex fragment for http/https URLs.
url = r"\bhttps?://[\w.-]+(?:\.[a-zA-Z]{2,})?(?:/\S*/?)?"

# Every key of emoji.EMOJI_DATA, longest first, so that in the alternation
# below a longer sequence is tried before any shorter one it starts with.
emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
pattern_emoji = "({})".format("|".join(map(re.escape, emojis)))

# Single alternation: the URL fragment first, then the emoji group.
pattern_all = "|".join((url, pattern_emoji))

# Compiled once here; the data-loading cell passes it to the readers as `regular_exp`.
re_all = re.compile(pattern_all, flags=re.UNICODE)

In [3]:
# Locations of the three EXIST 2025 splits.
file_train = "data/EXIST 2025 Tweets Dataset/training/EXIST2025_training.json"
file_dev = "data/EXIST 2025 Tweets Dataset/dev/EXIST2025_dev.json"
file_test = "data/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json"

# One reader per split; the test reader is created with is_test=True.
reader_train = EXISTReader(file_train)
reader_dev = EXISTReader(file_dev)
reader_test = EXISTReader(file_test, is_test=True)

# English subsets for subtask 3. Later cells index each result as
# [0] ids, [1] texts, [2] label lists.
EnTrainTask3 = reader_train.get(lang="EN", subtask="3", regular_exp=re_all, preprocess=True)
EnDevTask3 = reader_dev.get(lang="EN", subtask="3", regular_exp=re_all, preprocess=True)
EnTestTask3 = reader_test.get(lang="EN", subtask="3", regular_exp=re_all, preprocess=True)

# Spanish subsets, same options.
SpTrainTask3 = reader_train.get(lang="ES", subtask="3", regular_exp=re_all, preprocess=True)
SpDevTask3 = reader_dev.get(lang="ES", subtask="3", regular_exp=re_all, preprocess=True)
SpTestTask3 = reader_test.get(lang="ES", subtask="3", regular_exp=re_all, preprocess=True)

In [4]:
# Preview the first label lists of the English training split
# (element 2 of the reader output holds the subtask-3 labels).
EnTrainTask3[2].head()

1    [NO, MISOGYNY-NON-SEXUAL-VIOLENCE, OBJECTIFICA...
2    [NO, MISOGYNY-NON-SEXUAL-VIOLENCE, OBJECTIFICA...
5    [IDEOLOGICAL-INEQUALITY, MISOGYNY-NON-SEXUAL-V...
6            [OBJECTIFICATION, STEREOTYPING-DOMINANCE]
7                         [NO, IDEOLOGICAL-INEQUALITY]
Name: label3, dtype: object

The label set now has 6 classes: the five sexism categories plus `NO`.

## ICM Wrapper

In [4]:
def ICMWrapper(pred, labels, multi=False,ids=None):
    """
    Compute the ICM metric with PyEvALL for a set of predictions.

    Predictions and gold labels are each written to a temporary JSON file
    (records orientation, columns ``test_case`` / ``id`` / ``value``), which is
    the input PyEvALL's ``evaluate`` is given here. Both files are removed
    afterwards, even when the evaluation raises.

    Parameters:
    pred: sequence of predictions, one entry per instance
    labels: sequence of gold labels, aligned with ``pred``
    multi: if True, each entry is a collection of labels and the evaluation
           uses the category hierarchy below with an "embedded" report;
           otherwise each entry is converted with ``str`` and a "simple"
           report is requested
    ids: optional instance ids; defaults to 0..len(labels)-1
    Return: the ``average_per_test_case`` ICM value as a float, or None when
            the report does not contain an ICM entry
    """
    test = PyEvALLEvaluation()
    metrics = [MetricFactory.ICM.value]
    params = {}
    if multi:
        params[PyEvALLUtils.PARAM_REPORT] = "embedded"
        # NOTE(review): the hierarchy roots are "True"/"False", while the label
        # lists shown in this notebook contain "NO" — confirm PyEvALL treats
        # labels outside the hierarchy as intended.
        hierarchy = {"True": ['IDEOLOGICAL-INEQUALITY', 'STEREOTYPING-DOMINANCE', 'MISOGYNY-NON-SEXUAL-VIOLENCE', 'OBJECTIFICATION', 'SEXUAL-VIOLENCE'],
                     "False": []}
        params[PyEvALLUtils.PARAM_HIERARCHY] = hierarchy
        # An empty label collection is reported as the negative root.
        fillLabel = lambda x: ["False"] if len(x) == 0 else x
    else:
        params[PyEvALLUtils.PARAM_REPORT] = "simple"
        fillLabel = str

    if ids is None:
        ids = list(range(len(labels)))

    def _dump(values):
        """Serialise one side (gold or predicted) to a temp JSON file; return its path."""
        frame = pd.DataFrame({'test_case': ['EXIST2025'] * len(values),
                              'id': [str(x) for x in ids],
                              'value': [fillLabel(x) for x in values]})
        if multi:
            frame = frame.astype('object')
        # delete=False: the file has to outlive this handle so PyEvALL can open it by name.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, encoding='utf-8') as handle:
            handle.write(frame.to_json(orient="records"))
            return handle.name

    truth_name, predict_name = None, None
    try:
        truth_name = _dump(labels)
        predict_name = _dump(pred)
        report = test.evaluate(predict_name, truth_name, metrics, **params)
    finally:
        # Always clean up, whichever step failed.
        for path in (truth_name, predict_name):
            if path is not None:
                os.unlink(path)

    icm = None
    if 'metrics' in report.report:
        if 'ICM' in report.report["metrics"]:
            icm = float(report.report["metrics"]['ICM']["results"]["average_per_test_case"])
    return icm

## Set seed

In [5]:
def set_seed(seed=1234):
    """
    Sets the seed to make everything deterministic, for reproducibility of experiments
    Parameters:
    seed: the number to set the seed to
    Return: None
    """
    # Python's built-in RNG
    random.seed(seed)
    # Numpy global RNG
    np.random.seed(seed)
    # Torch CPU RNG and every CUDA device (a no-op when CUDA is unavailable)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuDNN: force deterministic kernels and disable the auto-tuner, which may
    # pick different algorithms from run to run and so defeats reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Hash seed: Python reads this at interpreter start-up, so setting it here
    # only takes effect in processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)

## Dataset class

In [6]:
class SexismDatasetMulti(Dataset):
    """
    Torch dataset that tokenises one text per item for multi-label training.

    Parameters:
    texts: object with a ``tolist()`` method yielding the raw texts
    labels: indexable collection with one label vector per text
    ids: indexable collection with one integer id per text
    tokenizer: object exposing ``encode_plus``
    max_len, pad, trunc, rt: forwarded to ``encode_plus`` as ``max_length``,
        ``padding``, ``truncation`` and ``return_tensors``
    """

    def __init__(self, texts, labels, ids, tokenizer, max_len=128, pad="max_length", trunc=True,rt='pt'):
        # Data
        self.texts = texts.tolist()
        self.labels = labels
        self.ids = ids
        # Tokenisation settings
        self.tokenizer = tokenizer
        self.max_len, self.pad, self.trunc, self.rt = max_len, pad, trunc, rt

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text, its label vector (float) and its id (long)."""
        encoded = self.tokenizer.encode_plus(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_len,
            padding=self.pad,
            truncation=self.trunc,
            return_tensors=self.rt,
        )

        # The tokenizer output carries a leading dimension; flatten it away.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        sample['id'] = torch.tensor(self.ids[idx], dtype=torch.long)
        return sample

## Metrics

In [7]:
def compute_metrics_3(pred, lencoder):
    """
    Metrics callback for the multi-label pipeline.

    Parameters:
    pred: prediction object exposing ``label_ids`` (gold indicator matrix) and
          ``predictions`` (raw logits)
    lencoder: fitted label binarizer; its ``inverse_transform`` turns the
              indicator matrices back into label collections for ICM
    Return: dict with macro precision / recall / F1 and the ICM score
    """
    labels = pred.label_ids
    # Logits -> probabilities -> hard decisions at a 0.5 threshold.
    preds = torch.sigmoid(torch.tensor(pred.predictions)).numpy()
    preds_binary = (preds >= 0.5).astype(int)

    # Per-label scores; the macro figures are their plain means.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds_binary, average=None, zero_division=0
    )
    icm = ICMWrapper(lencoder.inverse_transform(preds_binary), lencoder.inverse_transform(labels), multi=True)

    return {
        'precision_macro': np.mean(precision),
        'recall_macro': np.mean(recall),
        'f1_macro': np.mean(f1),
        'ICM': icm
    }

## Pipeline

In [8]:
def sexism_classification_pipeline_task3(trainInfo, devInfo, testInfo=None, model_name='roberta-base', nlabels=5, ptype="multi_label_classification", **args):
    """
    Fine-tune a sequence-classification model for the multi-label task and,
    optionally, write test-set predictions to a CSV file.

    Parameters:
    trainInfo, devInfo: indexable triples used as [0] ids (convertible to int),
        [1] texts, [2] label collections
    testInfo: same layout as above (labels are not read); when None no
        predictions are produced
    model_name: checkpoint name passed to the tokenizer/model loaders
    nlabels: number of output labels of the classification head
    ptype: ``problem_type`` forwarded to the model
    **args: optional overrides for the training arguments, plus
        ``ignore_mismatched_sizes``, ``early_stopping_patience``,
        ``output_type`` ("hard" or "soft", default "hard") and ``language``
        (used in the output file name, default "english").
        Note: a key explicitly set to None is forwarded as None, it is not
        replaced by the default.
    Return: (model, eval_results) — the fine-tuned model and the validation
        metrics returned by ``trainer.evaluate()``
    Raises: ValueError if ``testInfo`` is given and ``output_type`` is neither
        "hard" nor "soft"
    """
    # Fail before training rather than after it if the output type is unusable.
    output_type = args.get("output_type", "hard")
    if testInfo is not None and output_type not in ("hard", "soft"):
        raise ValueError(f"output_type must be 'hard' or 'soft', got {output_type!r}")

    # Model and Tokenizer
    labelEnc = MultiLabelBinarizer()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype,
        ignore_mismatched_sizes=args.get("ignore_mismatched_sizes", False)
        )

    # Prepare datasets (the binarizer is fitted on the training labels only)
    train_dataset = SexismDatasetMulti(trainInfo[1], labelEnc.fit_transform(trainInfo[2]), [int(x) for x in trainInfo[0]], tokenizer)
    val_dataset = SexismDatasetMulti(devInfo[1], labelEnc.transform(devInfo[2]), [int(x) for x in devInfo[0]], tokenizer)

    # Training Arguments
    training_args = TrainingArguments(
        report_to="none", # alt: "wandb", "tensorboard" "comet_ml" "mlflow" "clearml"
        output_dir= args.get('output_dir', './results'),
        num_train_epochs= args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_steps=args.get('warmup_steps', 500),
        weight_decay=args.get('weight_decay',0.01),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        eval_strategy=args.get('eval_strategy','epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        save_total_limit=args.get('save_total_limit', 1),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model',"ICM")
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=partial(compute_metrics_3, lencoder=labelEnc),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience",3))]
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    if testInfo is not None:
        # Placeholder labels for the unlabeled test set. Their width must equal
        # the number of model outputs, otherwise the shapes disagree when the
        # trainer computes loss/metrics during predict().
        dummy_labels = [[0] * nlabels] * len(testInfo[1])
        test_dataset = SexismDatasetMulti(testInfo[1], dummy_labels, [int(x) for x in testInfo[0]], tokenizer)

        # Predict test set labels
        predictions = trainer.predict(test_dataset)
        probabilities = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()

        if output_type == "hard":
            predicted_labels = (probabilities >= 0.5).astype(int)
            # inverse_transform yields one tuple of label names per row; store lists.
            list_labels = [list(t) for t in labelEnc.inverse_transform(predicted_labels)]

            submission_df = pd.DataFrame({
                'id': testInfo[0],
                'label': list_labels,
                "test_case": ["EXIST2025"]*len(predicted_labels)
            })

        else:  # "soft" (validated above)
            # One {class name: probability} mapping per instance.
            list_probabilities = [
                dict(zip(labelEnc.classes_, row.tolist())) for row in probabilities
            ]
            submission_df = pd.DataFrame({
                'id': testInfo[0],
                "test_case": ["EXIST2025"] * len(probabilities),
                "value": list_probabilities
            })

        language = args.get("language", "english")
        submission_df.to_csv(f'sexism_predictions_task3_{output_type}_{language}.csv', index=False)
        print(f"Prediction TASK3 completed. Results saved to sexism_predictions_task3_{output_type}_{language}.csv")
    return model, eval_results

## Run id

Before training, we specify the `run_id` for the predictions.

In [13]:
# Identifier of this submission run.
# NOTE(review): no later cell visible here references `run_id`; the
# create_submision_file calls pass literal values — confirm whether it should be wired in.
run_id = 1

## Training soft

In [None]:
# English run, soft (per-class probability) output.
modelname = "cardiffnlp/twitter-roberta-base-sentiment-latest"
params = dict(
    num_train_epochs=20,
    learning_rate=0.0001,
    per_device_train_batch_size=64,
    warmup_steps=200,
    early_stopping_patience=5,
    ignore_mismatched_sizes=True,
    logging_dir=None,
    output_dir=None,
    language="english",
    output_type="soft",
)

_, eval_results = sexism_classification_pipeline_task3(
    trainInfo=EnTrainTask3,
    devInfo=EnDevTask3,
    testInfo=EnTestTask3,
    model_name=modelname,
    nlabels=6,
    ptype="multi_label_classification",
    **params,
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Epoch,Training Loss,Validation Loss,Precision Macro,Recall Macro,F1 Macro,Icm
1,0.666,0.643364,0.559085,0.563092,0.530898,-0.43754
2,0.6119,0.592635,0.695663,0.708991,0.693488,0.13825
3,0.5548,0.59508,0.747913,0.596079,0.658422,-1.112738
4,0.4994,0.584585,0.712171,0.743613,0.724938,0.25976
5,0.4359,0.621352,0.740727,0.633646,0.676475,-0.382674
6,0.3694,0.675786,0.758082,0.644586,0.688733,-0.528495
7,0.2843,0.723909,0.751633,0.643302,0.691272,-0.166248
8,0.2049,0.823686,0.714713,0.717676,0.713221,0.234415
9,0.1556,0.961109,0.713626,0.624492,0.662815,-0.221616


2025-05-09 09:33:02,794 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:33:02,836 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
cargado 29
2025-05-09 09:33:35,503 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:33:35,542 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-09 09:34:08,579 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:34:08,622 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-09 09:34:40,800 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:34:40,842 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-09 09:35:13,656 - pyevall.evaluation 

2025-05-09 09:37:30,631 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:37:30,672 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Validation Results: {'eval_loss': 0.5845847725868225, 'eval_precision_macro': 0.712170684134154, 'eval_recall_macro': 0.7436130905369929, 'eval_f1_macro': 0.7249379178396337, 'eval_ICM': 0.2597598187451168, 'eval_runtime': 1.7079, 'eval_samples_per_second': 197.899, 'eval_steps_per_second': 3.513, 'epoch': 9.0}
2025-05-09 09:37:35,344 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:37:35,440 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Prediction TASK3 completed. Results saved to sexism_predictions_task3_soft_english.csv


In [None]:
# Spanish run, soft (per-class probability) output.
modelname = "pysentimiento/robertuito-sentiment-analysis"
params = dict(
    num_train_epochs=20,
    learning_rate=0.0001,
    per_device_train_batch_size=64,
    warmup_steps=200,
    early_stopping_patience=5,
    logging_dir=None,
    output_dir=None,
    ignore_mismatched_sizes=True,
    language="spanish",
    output_type="soft",
)

_, eval_results = sexism_classification_pipeline_task3(
    trainInfo=SpTrainTask3,
    devInfo=SpDevTask3,
    testInfo=SpTestTask3,
    model_name=modelname,
    nlabels=6,
    ptype="multi_label_classification",
    **params,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Macro,Recall Macro,F1 Macro,Icm
1,0.6434,0.659251,0.490233,0.689636,0.566027,-0.252275
2,0.5878,0.600795,0.69255,0.703808,0.668361,0.069275
3,0.539,0.5607,0.736369,0.743344,0.729875,0.241841
4,0.4397,0.596911,0.763316,0.688277,0.712062,0.025317
5,0.3416,0.650066,0.732839,0.725982,0.718368,0.213611
6,0.2202,0.717247,0.728261,0.735594,0.726644,0.300343
7,0.1279,0.861503,0.735377,0.664599,0.691491,0.057289
8,0.0714,0.978028,0.739234,0.679625,0.698341,0.106592
9,0.0425,1.041119,0.709557,0.717575,0.711975,0.240034
10,0.0276,1.123157,0.716698,0.697476,0.701412,0.197172


2025-05-09 09:38:16,282 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:38:16,328 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-09 09:38:54,787 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:38:54,831 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-09 09:39:32,274 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:39:32,317 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-09 09:40:09,658 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:40:09,702 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-09 09:40:47,680 - pyevall.evaluation - INFO -   

2025-05-09 09:44:38,035 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:44:38,085 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Validation Results: {'eval_loss': 0.7172467708587646, 'eval_precision_macro': 0.7282610845154825, 'eval_recall_macro': 0.7355940816195249, 'eval_f1_macro': 0.7266436446666374, 'eval_ICM': 0.3003432369659737, 'eval_runtime': 1.897, 'eval_samples_per_second': 207.169, 'eval_steps_per_second': 3.69, 'epoch': 11.0}
2025-05-09 09:44:43,116 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:44:43,223 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Prediction TASK3 completed. Results saved to sexism_predictions_task3_soft_spanish.csv


## Save predictions

In [12]:
# Package the soft predictions; report success only when the helper returns a truthy value.
soft_saved = create_submision_file("ArPa Project", 1, "soft", 1, 3, ".", "exist2025_ArPa Project")
if soft_saved:
    print("Predictions saved")

Predictions saved


## Training hard

In [None]:
# English run, hard (thresholded label) output.
modelname = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# num_train_epochs was 0 here, which skips training entirely and would make the
# pipeline evaluate and predict with an untrained classification head; use the
# same 20-epoch budget (with early stopping) as the other three runs.
params = {"num_train_epochs": 20,
          "learning_rate": 0.0001,
          "per_device_train_batch_size": 64,
          "warmup_steps": 200,
          "early_stopping_patience": 5,
          "ignore_mismatched_sizes": True,
          "output_dir": None,
          "logging_dir": None,
          "language": "english"
          }

_, eval_results = sexism_classification_pipeline_task3(EnTrainTask3, EnDevTask3, EnTestTask3, modelname, 6, "multi_label_classification", **params)

KeyboardInterrupt: 

In [None]:
# Spanish run, hard (thresholded label) output — "output_type" is left at the pipeline default.
modelname = "pysentimiento/robertuito-sentiment-analysis"
params = dict(
    num_train_epochs=20,
    learning_rate=0.0001,
    per_device_train_batch_size=64,
    warmup_steps=200,
    early_stopping_patience=5,
    output_dir=None,
    logging_dir=None,
    ignore_mismatched_sizes=True,
    language="spanish",
)

_, eval_results = sexism_classification_pipeline_task3(
    trainInfo=SpTrainTask3,
    devInfo=SpDevTask3,
    testInfo=SpTestTask3,
    model_name=modelname,
    nlabels=6,
    ptype="multi_label_classification",
    **params,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision Macro,Recall Macro,F1 Macro,Icm
1,0.6488,0.658545,0.484574,0.711421,0.572352,-0.212001
2,0.5826,0.592964,0.700108,0.698671,0.669933,0.075833
3,0.5348,0.557457,0.719699,0.74094,0.719817,0.243792
4,0.4365,0.57078,0.731828,0.723512,0.723467,0.211905
5,0.3478,0.620014,0.714015,0.744476,0.722195,0.278668
6,0.2345,0.702498,0.740536,0.698693,0.713919,0.158825
7,0.1422,0.800688,0.719236,0.707004,0.710646,0.092316
8,0.0726,0.95355,0.727712,0.684123,0.691749,0.068952
9,0.0426,1.044353,0.696136,0.713857,0.703277,0.130324
10,0.0322,1.107942,0.702621,0.707898,0.701564,0.091612


2025-05-09 09:51:22,429 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:51:22,477 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-09 09:51:59,663 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:51:59,710 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-09 09:52:37,712 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:52:37,758 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-09 09:53:16,288 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:53:16,333 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-09 09:53:54,719 - pyevall.evaluation - INFO -   

2025-05-09 09:57:10,954 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:57:11,011 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Validation Results: {'eval_loss': 0.6200137734413147, 'eval_precision_macro': 0.714015422673501, 'eval_recall_macro': 0.7444756519749317, 'eval_f1_macro': 0.7221948546623337, 'eval_ICM': 0.2786683851113127, 'eval_runtime': 2.0058, 'eval_samples_per_second': 195.933, 'eval_steps_per_second': 3.49, 'epoch': 10.0}
2025-05-09 09:57:16,246 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM']
2025-05-09 09:57:16,348 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
Prediction TASK3 completed. Results saved to sexism_predictions_task3_hard_spanish.csv


## Save predictions

In [15]:
# Package the hard predictions; report success only when the helper returns a truthy value.
hard_saved = create_submision_file("ArPa Project", 1, "hard", 1, 3, ".", "exist2025_ArPa Project")
if hard_saved:
    print("Predictions saved")

Predictions saved
