<h1 align="center">EXIST 2025: Sexism Identification in Twitter</h1>
<h2 align="center">Task 1. Fine-tuning for binary classification</h2>
<h3 style="display:block; margin-top:5px;" align="center">ETSInf. Universitat Politècnica de València</h3>
<br>

## Libraries

In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import  AutoTokenizer, AutoModelForSequenceClassification,  Trainer, TrainingArguments,  EarlyStoppingCallback, AutoConfig
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
import random
import os
import pandas as pd
from readerEXIST2025 import EXISTReader
import re
import emoji
from create_submision_folder import create_submision_file

## Read data

In [2]:
# Regular expression for http(s) URLs: host, optional TLD-like suffix, optional path.
url = r"\bhttps?://[\w.-]+(?:\.[a-zA-Z]{2,})?(?:/\S*/?)?"

# Longest emoji strings come first so that, inside the alternation below,
# a long emoji sequence is tried before any shorter emoji it starts with.
emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
pattern_emoji = "(" + "|".join(map(re.escape, emojis)) + ")"

# One alternation: URL first, then any emoji.
pattern_all = "|".join((url, pattern_emoji))

# Compiled once; passed to EXISTReader.get as `regular_exp` when the data is read.
re_all = re.compile(pattern_all, re.UNICODE)

In [3]:
# Locations of the three EXIST 2025 partitions (relative to the notebook).
file_train = "data/EXIST 2025 Tweets Dataset/training/EXIST2025_training.json"
file_dev = "data/EXIST 2025 Tweets Dataset/dev/EXIST2025_dev.json"
file_test = "data/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json"

# One reader per partition; only the test reader is built with is_test=True.
reader_train = EXISTReader(file_train)
reader_dev = EXISTReader(file_dev)
reader_test = EXISTReader(file_test, is_test=True)

# Options shared by every Task 1 extraction below.
_task1_opts = dict(subtask="1", regular_exp=re_all, preprocess=True)

# English partitions
EnTrainTask1 = reader_train.get(lang="EN", **_task1_opts)
EnDevTask1 = reader_dev.get(lang="EN", **_task1_opts)
EnTestTask1 = reader_test.get(lang="EN", **_task1_opts)

# Spanish partitions
SpTrainTask1 = reader_train.get(lang="ES", **_task1_opts)
SpDevTask1 = reader_dev.get(lang="ES", **_task1_opts)
SpTestTask1 = reader_test.get(lang="ES", **_task1_opts)

## Set seed

In [4]:
def set_seed(seed=1234):
    """
    Seeds every random number generator used here and puts cuDNN in
    deterministic mode, for reproducibility of experiments.

    Parameters:
    seed: the number to set the seed to (default 1234)
    Return: None
    """
    # Python's built-in RNG
    random.seed(seed)
    # NumPy's global RNG
    np.random.seed(seed)
    # PyTorch: CPU generator plus every CUDA device
    # (the CUDA call is silently ignored when no GPU is available)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuDNN: use deterministic kernels and switch the auto-tuner OFF.
    # With benchmark=True cuDNN may select a different algorithm on each run,
    # which defeats the purpose of seeding.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Hash randomization of the running interpreter is fixed at start-up, so this
    # only takes effect in processes spawned afterwards (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)

## Dataset class

In [5]:
class SexismDataset(Dataset):
    """
    Torch dataset that tokenizes one text per item, on demand.

    Parameters:
    texts: the input texts; a pandas Series / NumPy array (anything exposing
           `.tolist()`) or a plain Python sequence
    labels: integer class index per text, indexable by position
    ids: integer identifier per text, indexable by position
    tokenizer: callable tokenizer (Hugging Face style) returning a mapping with
               'input_ids' and 'attention_mask'
    max_len: value forwarded as `max_length` to the tokenizer
    pad: value forwarded as `padding` to the tokenizer
    trunc: value forwarded as `truncation` to the tokenizer
    rt: value forwarded as `return_tensors`; the items are flattened with
        `.flatten()`, so tensor outputs are expected
    """

    def __init__(self, texts, labels, ids, tokenizer, max_len=128, pad="max_length", trunc=True,rt='pt'):
        # Accept array-like containers as well as plain lists/tuples
        # (a bare `texts.tolist()` raised AttributeError on a Python list).
        self.texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad = pad
        self.trunc = trunc
        self.rt = rt

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Tokenizes the text at position `idx`.

        Return: dict with 'input_ids', 'attention_mask' (both flattened to 1-D),
        'labels' and 'id' (both scalar long tensors).
        """
        text = str(self.texts[idx])
        # Calling the tokenizer directly is the supported replacement for the
        # deprecated `encode_plus`; for a single string both produce the same encoding.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding=self.pad,
            truncation=self.trunc,
            return_tensors=self.rt
        )

        return {
            # The tokenizer returns a leading batch dimension of 1; drop it.
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
            'id': torch.tensor(self.ids[idx], dtype=torch.long)
        }

## Metrics

In [6]:
def compute_metrics_1(pred):
    """
    Computes accuracy, F1, precision and recall for a set of predictions.

    Parameters:
    pred: object exposing `label_ids` (reference labels) and `predictions`
          (per-class scores; the arg-max over the last axis is the predicted class)
    Return: dict with the keys 'accuracy', 'f1', 'precision' and 'recall'
    """
    gold = pred.label_ids
    guessed = pred.predictions.argmax(-1)
    # average='binary' scores the positive class only; zero_division=0 yields 0
    # instead of a warning when a denominator is empty.
    prec, rec, f_score, _support = precision_recall_fscore_support(
        gold, guessed, average='binary', zero_division=0
    )
    return dict(
        accuracy=accuracy_score(gold, guessed),
        f1=f_score,
        precision=prec,
        recall=rec,
    )

## Pipeline

In [None]:
def sexism_classification_pipeline_task1(trainInfo, devInfo, testInfo=None, model_name='roberta-base', nlabels=2, ptype="single_label_classification", **args):
    """
    Fine-tunes a sequence-classification model for Task 1 and, when a test
    partition is given, writes its predictions to a CSV file.

    Parameters:
    trainInfo, devInfo: indexable containers read as [0] ids (converted with int()),
                        [1] texts and [2] labels
    testInfo: optional container with the same layout; only [0] and [1] are read
    model_name: checkpoint name handed to the tokenizer and model `from_pretrained`
    nlabels: number of output labels of the classification head
    ptype: `problem_type` handed to the model
    **args: optional overrides, each read with `args.get(name, default)`:
            force_download, ignore_mismatched_sizes, output_dir, num_train_epochs,
            learning_rate, per_device_train_batch_size, per_device_eval_batch_size,
            warmup_steps, weight_decay, logging_dir, logging_steps, eval_strategy,
            save_strategy, load_best_model_at_end, metric_for_best_model,
            early_stopping_patience, output_type ("hard" or "soft"), language.
            NOTE: a key passed explicitly as None (e.g. output_dir=None) is forwarded
            as None; the default only applies when the key is absent.
    Return:
    (model, submission_df) when `testInfo` is given, otherwise (model, eval_results).
    Raises:
    ValueError: if `testInfo` is given and output_type is neither "hard" nor "soft".
    """
    # Validate before the expensive training run: an unknown output type used to
    # surface only at the very end, as an UnboundLocalError on `submission_df`.
    output_type = args.get("output_type", "hard")
    if testInfo is not None and output_type not in ("hard", "soft"):
        raise ValueError(f"output_type must be 'hard' or 'soft', got {output_type!r}")

    # Model and Tokenizer
    labelEnc = LabelEncoder()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype,
        force_download=args.get("force_download", False),
        ignore_mismatched_sizes=args.get("ignore_mismatched_sizes", False)
    )

    # Prepare datasets (the label encoder is fitted on the training labels only)
    train_dataset = SexismDataset(trainInfo[1], labelEnc.fit_transform(trainInfo[2]), [int(x) for x in trainInfo[0]], tokenizer)
    val_dataset = SexismDataset(devInfo[1], labelEnc.transform(devInfo[2]), [int(x) for x in devInfo[0]], tokenizer)

    # Training Arguments
    training_args = TrainingArguments(
        report_to="none",  # alt: "wandb", "tensorboard" "comet_ml" "mlflow" "clearml"
        output_dir=args.get('output_dir', './results'),  # Change it to keep every model instead of overwriting earlier ones.
        num_train_epochs=args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_steps=args.get('warmup_steps', 500),
        weight_decay=args.get('weight_decay', 0.01),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        eval_strategy=args.get('eval_strategy', 'epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model', "f1")
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_1,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience", 3))]
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    # Without a test partition only the validation metrics are returned
    if testInfo is None:
        return model, eval_results

    # Prepare test dataset for prediction. The labels are zero placeholders,
    # so any metric computed during predict() is meaningless.
    test_dataset = SexismDataset(testInfo[1], [0] * len(testInfo[1]), [int(x) for x in testInfo[0]], tokenizer)
    predictions = trainer.predict(test_dataset)

    if output_type == "hard":
        # One label per instance: arg-max mapped back to the original label names
        predicted_labels = np.argmax(predictions.predictions, axis=1)
        submission_df = pd.DataFrame({
            'id': testInfo[0],
            'label': labelEnc.inverse_transform(predicted_labels),
            "test_case": ["EXIST2025"] * len(predicted_labels)
        })
    else:  # "soft" (validated above)
        # One {label: probability} dict per instance
        probabilities = torch.softmax(torch.tensor(predictions.predictions), dim=1)
        # Column index -> original label name, resolved once instead of per cell
        class_names = labelEnc.inverse_transform(np.arange(probabilities.shape[1]))
        list_probabilities = [
            dict(zip(class_names, row)) for row in probabilities.tolist()
        ]
        submission_df = pd.DataFrame({
            'id': testInfo[0],
            "test_case": ["EXIST2025"] * len(probabilities),
            "value": list_probabilities
        })

    language = args.get("language", "english")
    submission_df.to_csv(f'sexism_predictions_task1_{output_type}_{language}.csv', index=False)
    print(f"Prediction for TASK 1 completed. Results saved to sexism_predictions_task1_{output_type}_{language}.csv")
    return model, submission_df

## Run id

Before training, we set the `run_id` that is later passed to `create_submision_file` when the predictions are saved.

In [3]:
# Passed to create_submision_file in the "Save predictions" cells below.
run_id = 1

## Training soft

In [9]:
# English run, soft output (one probability per class).
modelname = "cardiffnlp/twitter-roberta-base-sentiment-latest"
params = dict(
    num_train_epochs=20,
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    warmup_steps=200,
    early_stopping_patience=5,
    ignore_mismatched_sizes=True,  # the checkpoint head has 3 outputs, this task has 2
    logging_dir=None,
    output_dir=None,
    language="english",
    output_type="soft",
)

# NOTE: a test partition is passed, so the second returned value is the
# submission DataFrame, not the validation metrics (name kept as-is).
_, eval_results = sexism_classification_pipeline_task1(
    EnTrainTask1,
    EnDevTask1,
    EnTestTask1,
    model_name=modelname,
    nlabels=2,
    ptype="single_label_classification",
    **params,
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6496,0.624496,0.61036,0.356877,0.64,0.247423
2,0.5664,0.525529,0.75,0.751678,0.664032,0.865979
3,0.3978,0.403775,0.817568,0.799007,0.770335,0.829897
4,0.356,0.366895,0.84009,0.816537,0.818653,0.814433
5,0.2726,0.381544,0.844595,0.816,0.845304,0.78866
6,0.2414,0.385558,0.842342,0.829268,0.787037,0.876289
7,0.227,0.41708,0.844595,0.824427,0.81407,0.835052
8,0.1015,0.479274,0.84009,0.825553,0.788732,0.865979
9,0.1239,0.508737,0.849099,0.834568,0.800948,0.871134
10,0.0791,0.557598,0.846847,0.828283,0.811881,0.845361


Validation Results: {'eval_loss': 0.5087371468544006, 'eval_accuracy': 0.8490990990990991, 'eval_f1': 0.8345679012345679, 'eval_precision': 0.8009478672985783, 'eval_recall': 0.8711340206185567, 'eval_runtime': 1.869, 'eval_samples_per_second': 237.561, 'eval_steps_per_second': 3.745, 'epoch': 14.0}
         id  test_case                                              value
0    600001  EXIST2025  {'NO': 0.008463903330266476, 'YES': 0.99153614...
1    600002  EXIST2025  {'NO': 0.008908777497708797, 'YES': 0.99109125...
2    600003  EXIST2025  {'NO': 0.9847129583358765, 'YES': 0.0152871217...
3    600004  EXIST2025  {'NO': 0.7240634560585022, 'YES': 0.2759366035...
4    600005  EXIST2025  {'NO': 0.008483437821269035, 'YES': 0.99151659...
..      ...        ...                                                ...
973  600974  EXIST2025  {'NO': 0.009189260192215443, 'YES': 0.99081075...
974  600975  EXIST2025  {'NO': 0.008204150944948196, 'YES': 0.99179583...
975  600976  EXIST2025  {'NO': 0.

In [10]:
# Spanish run, soft output (one probability per class).
modelname = "pysentimiento/robertuito-sentiment-analysis"
params = dict(
    num_train_epochs=20,
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    warmup_steps=200,
    early_stopping_patience=5,
    logging_dir=None,
    ignore_mismatched_sizes=True,  # the checkpoint head has 3 outputs, this task has 2
    language="spanish",
    output_type="soft",
)

# NOTE: a test partition is passed, so the second returned value is the
# submission DataFrame, not the validation metrics (name kept as-is).
_, eval_results = sexism_classification_pipeline_task1(
    SpTrainTask1,
    SpDevTask1,
    SpTestTask1,
    model_name=modelname,
    nlabels=2,
    ptype="single_label_classification",
    **params,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6918,0.679481,0.6,0.668919,0.598187,0.758621
2,0.6141,0.62502,0.65102,0.682746,0.661871,0.704981
3,0.5312,0.530365,0.734694,0.759259,0.734767,0.785441
4,0.3984,0.383883,0.832653,0.838583,0.862348,0.816092
5,0.3325,0.342287,0.853061,0.858824,0.879518,0.83908
6,0.2405,0.350639,0.857143,0.86,0.899582,0.823755
7,0.2192,0.352436,0.855102,0.862136,0.874016,0.850575
8,0.1507,0.369983,0.85102,0.857143,0.876,0.83908
9,0.1077,0.462856,0.842857,0.842536,0.903509,0.789272
10,0.0607,0.47673,0.846939,0.849699,0.890756,0.812261


Validation Results: {'eval_loss': 0.352435827255249, 'eval_accuracy': 0.8551020408163266, 'eval_f1': 0.8621359223300971, 'eval_precision': 0.8740157480314961, 'eval_recall': 0.8505747126436781, 'eval_runtime': 1.8473, 'eval_samples_per_second': 265.254, 'eval_steps_per_second': 4.331, 'epoch': 12.0}
          id  test_case                                              value
978   500001  EXIST2025  {'NO': 0.9940873384475708, 'YES': 0.0059126578...
979   500002  EXIST2025  {'NO': 0.9719235301017761, 'YES': 0.0280765071...
980   500003  EXIST2025  {'NO': 0.9319759011268616, 'YES': 0.0680240616...
981   500004  EXIST2025  {'NO': 0.05927068367600441, 'YES': 0.940729379...
982   500005  EXIST2025  {'NO': 0.9882194995880127, 'YES': 0.0117804622...
...      ...        ...                                                ...
2071  501094  EXIST2025  {'NO': 0.09400786459445953, 'YES': 0.905992090...
2072  501095  EXIST2025  {'NO': 0.008214369416236877, 'YES': 0.99178558...
2073  501096  EXIST2025 

## Save predictions

In [11]:
# Success is reported only when create_submision_file returns a truthy value.
predictions_saved = create_submision_file(
    "ArPa Project", 1, "soft", run_id, 1, ".", "exist2025_ArPa Project"
)
if predictions_saved:
    print("Predictions saved")

Predictions saved


## Training hard

In [12]:
# English run, hard output: no "output_type" key, so the pipeline default ("hard") applies.
modelname = "cardiffnlp/twitter-roberta-base-sentiment-latest"
params = dict(
    num_train_epochs=20,
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    warmup_steps=200,
    early_stopping_patience=5,
    ignore_mismatched_sizes=True,  # the checkpoint head has 3 outputs, this task has 2
    logging_dir=None,
    output_dir=None,
    language="english",
)

# NOTE: a test partition is passed, so the second returned value is the
# submission DataFrame, not the validation metrics (name kept as-is).
_, eval_results = sexism_classification_pipeline_task1(
    EnTrainTask1,
    EnDevTask1,
    EnTestTask1,
    model_name=modelname,
    nlabels=2,
    ptype="single_label_classification",
    **params,
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6484,0.630305,0.581081,0.162162,0.642857,0.092784
2,0.5706,0.534911,0.75,0.748299,0.668016,0.850515
3,0.4004,0.396462,0.813063,0.793017,0.768116,0.819588
4,0.3396,0.352449,0.84009,0.819338,0.809045,0.829897
5,0.2694,0.377413,0.844595,0.819843,0.830688,0.809278
6,0.2385,0.385575,0.842342,0.830918,0.781818,0.886598
7,0.2197,0.417208,0.853604,0.836272,0.817734,0.85567
8,0.1025,0.474006,0.844595,0.830467,0.793427,0.871134
9,0.1269,0.504938,0.855856,0.838384,0.821782,0.85567
10,0.0787,0.600974,0.858108,0.842893,0.816425,0.871134


Validation Results: {'eval_loss': 0.6009740233421326, 'eval_accuracy': 0.8581081081081081, 'eval_f1': 0.8428927680798005, 'eval_precision': 0.8164251207729468, 'eval_recall': 0.8711340206185567, 'eval_runtime': 1.6759, 'eval_samples_per_second': 264.935, 'eval_steps_per_second': 4.177, 'epoch': 15.0}
Prediction for TASK 1 completed. Results saved to sexism_predictions_task1_hard_english.csv


In [13]:
# Spanish run, hard output: no "output_type" key, so the pipeline default ("hard") applies.
modelname = "pysentimiento/robertuito-sentiment-analysis"
params = dict(
    num_train_epochs=20,
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    warmup_steps=200,
    early_stopping_patience=5,
    logging_dir=None,
    ignore_mismatched_sizes=True,  # the checkpoint head has 3 outputs, this task has 2
    language="spanish",
)

# NOTE: a test partition is passed, so the second returned value is the
# submission DataFrame, not the validation metrics (name kept as-is).
_, eval_results = sexism_classification_pipeline_task1(
    SpTrainTask1,
    SpDevTask1,
    SpTestTask1,
    model_name=modelname,
    nlabels=2,
    ptype="single_label_classification",
    **params,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6837,0.666569,0.614286,0.649351,0.629496,0.670498
2,0.6028,0.609191,0.679592,0.692759,0.708,0.678161
3,0.496,0.49492,0.781633,0.8,0.781022,0.819923
4,0.3745,0.37296,0.830612,0.839458,0.847656,0.831418
5,0.314,0.340186,0.838776,0.846004,0.861111,0.831418
6,0.2251,0.351636,0.838776,0.841046,0.885593,0.800766
7,0.2,0.359282,0.842857,0.850485,0.862205,0.83908
8,0.1375,0.379405,0.84898,0.856031,0.869565,0.842912
9,0.1099,0.449648,0.834694,0.837675,0.878151,0.800766
10,0.0734,0.464168,0.846939,0.851485,0.881148,0.823755


Validation Results: {'eval_loss': 0.37940484285354614, 'eval_accuracy': 0.8489795918367347, 'eval_f1': 0.8560311284046692, 'eval_precision': 0.8695652173913043, 'eval_recall': 0.842911877394636, 'eval_runtime': 1.8521, 'eval_samples_per_second': 264.562, 'eval_steps_per_second': 4.319, 'epoch': 13.0}
Prediction for TASK 1 completed. Results saved to sexism_predictions_task1_hard_spanish.csv


## Save predictions

In [4]:
# Success is reported only when create_submision_file returns a truthy value.
predictions_saved = create_submision_file(
    "ArPa Project", 1, "hard", run_id, 1, ".", "exist2025_ArPa Project"
)
if predictions_saved:
    print("Predictions saved")

Predictions saved
