<h1 align="center">EXIST 2025: Sexism Identification in Twitter</h1>
<h2 align="center">Task 1, subtask 2. Fine-tuning for multi-class classification</h2>
<h3 style="display:block; margin-top:5px;" align="center">ETSInf. Universitat Politècnica de València</h3>
<br>

## Libraries

In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import  AutoTokenizer, AutoModelForSequenceClassification,  Trainer, TrainingArguments,  EarlyStoppingCallback, AutoConfig
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
import random
import os
import pandas as pd
from readerEXIST2025 import EXISTReader
import re
import emoji
from create_submision_folder import create_submision_file

## Read data

In [2]:
# URL matcher: http(s) scheme, host characters, optional dotted suffix, optional path.
url = r"\bhttps?://[\w.-]+(?:\.[a-zA-Z]{2,})?(?:/\S*/?)?"

# Longest emoji strings come first so that, in the alternation below, a long
# sequence is tried before any shorter emoji it starts with.
emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
pattern_emoji = "({})".format("|".join(map(re.escape, emojis)))

# Single pattern matching either a URL or an emoji.
pattern_all = f"{url}|{pattern_emoji}"

re_all = re.compile(pattern_all, flags=re.UNICODE)

In [3]:
# Locations of the three dataset splits.
file_train = "data/EXIST 2025 Tweets Dataset/training/EXIST2025_training.json"
file_dev = "data/EXIST 2025 Tweets Dataset/dev/EXIST2025_dev.json"
file_test = "data/EXIST 2025 Tweets Dataset/test/EXIST2025_test_clean.json"

# One reader per split; only the test reader is flagged with is_test=True.
reader_train = EXISTReader(file_train)
reader_dev = EXISTReader(file_dev)
reader_test = EXISTReader(file_test, is_test=True)

# Subtask-2 data per language, queried in train -> dev -> test order with
# identical extraction options for every split.
EnTrainTask2, EnDevTask2, EnTestTask2 = [
    split_reader.get(lang="EN", subtask="2", regular_exp=re_all, preprocess=True)
    for split_reader in (reader_train, reader_dev, reader_test)
]
SpTrainTask2, SpDevTask2, SpTestTask2 = [
    split_reader.get(lang="ES", subtask="2", regular_exp=re_all, preprocess=True)
    for split_reader in (reader_train, reader_dev, reader_test)
]

In [4]:
# Distinct label values in element [2] of the English training data
# (the same element the pipeline later feeds to the LabelEncoder).
EnTrainTask2[2].unique()

array(['REPORTED', 'NO', 'JUDGEMENTAL', 'DIRECT'], dtype=object)

We now have 4 labels, since the `NO` class is included as well.

In [5]:
# Sanity check: report whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

## Set seed

In [6]:
def set_seed(seed=1234):
    """
    Seeds every random number generator used here, for reproducibility of experiments.

    Parameters:
    seed: the number used to seed Python's `random`, NumPy and PyTorch
          (CPU generator and all CUDA devices)
    Return: None
    """
    # Only affects hashing in Python processes started after this point;
    # the already-running interpreter keeps its hash seed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Python and NumPy generators
    random.seed(seed)
    np.random.seed(seed)
    # Torch generators: CPU plus every CUDA device (safe to call without a GPU)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Ask cuDNN for deterministic algorithms ...
    torch.backends.cudnn.deterministic = True
    # ... and disable its auto-tuner, which may select a different algorithm
    # from one run to the next and so breaks run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False

## Dataset class

In [7]:
class SexismDataset(Dataset):
    """
    Torch dataset that tokenizes one text per item for sequence classification.

    Parameters:
    texts: the raw texts; either an object exposing `.tolist()` (e.g. a pandas
           Series or NumPy array) or any plain iterable of texts
    labels: integer class index per text, indexable by position
    ids: integer identifier per text, indexable by position
    tokenizer: callable tokenizer whose result provides `input_ids` and
               `attention_mask` tensors
    max_len, pad, trunc, rt: forwarded to the tokenizer as `max_length`,
               `padding`, `truncation` and `return_tensors`
    """

    def __init__(self, texts, labels, ids, tokenizer, max_len=128, pad="max_length", trunc=True,rt='pt'):
        # Accept plain lists/tuples as well as Series/arrays.
        self.texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad = pad
        self.trunc = trunc
        self.rt = rt

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position `idx` and return it with its label and id."""
        text = str(self.texts[idx])
        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding=self.pad,
            truncation=self.trunc,
            return_tensors=self.rt,
        )

        return {
            # flatten() drops the leading batch dimension added by return_tensors
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
            'id': torch.tensor(self.ids[idx], dtype=torch.long)
        }

## Metrics

In [8]:
def compute_metrics_2(pred):
    """
    Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    Parameters:
    pred: object exposing `label_ids` (true class indices) and `predictions`
          (per-class scores; the highest score along the last axis is taken
          as the predicted class)
    Return: dict with the keys 'accuracy', 'f1', 'precision' and 'recall'
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # zero_division=0 scores a class with no predicted/true samples as 0
    # instead of emitting a warning.
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {}
    metrics['accuracy'] = accuracy
    metrics['f1'] = macro_f1
    metrics['precision'] = macro_precision
    metrics['recall'] = macro_recall
    return metrics

## Pipeline

In [9]:
def sexism_classification_pipeline_task2(trainInfo, devInfo, testInfo=None, model_name='bert-base-uncased', nlabels=3, ptype="single_label_classification", **args):
    """
    Fine-tune a sequence-classification model and optionally predict on a test set.

    Parameters:
    trainInfo, devInfo: indexable containers where [0] holds the ids, [1] the
        texts and [2] the string labels
    testInfo: same layout (only [0] and [1] are read); when None no predictions
        are produced
    model_name: checkpoint name passed to the tokenizer/model `from_pretrained`
    nlabels: number of output classes of the classification head
    ptype: `problem_type` forwarded to the model
    **args: optional overrides read with `args.get`: output_dir,
        num_train_epochs, learning_rate, per_device_train_batch_size,
        per_device_eval_batch_size, warmup_ratio, weight_decay, scheduler_type,
        logging_dir, logging_steps, eval_strategy, save_strategy,
        save_total_limit, load_best_model_at_end, metric_for_best_model,
        early_stopping_patience, ignore_mismatched_sizes,
        output_type ("hard" or "soft"), language.
        Note that a key present with value None overrides the default.
    Return:
        (model, submission_df) when testInfo is given, otherwise
        (model, eval_results) with the validation metrics dict.
    Raises:
        ValueError: if testInfo is given and output_type is not "hard" or "soft".
    """
    # Fail fast: previously an unknown output_type only surfaced after the
    # whole training run, as an unbound `submission_df`.
    output_type = args.get("output_type", "hard")
    if testInfo is not None and output_type not in ("hard", "soft"):
        raise ValueError(f"output_type must be 'hard' or 'soft', got {output_type!r}")

    # Model and Tokenizer
    labelEnc = LabelEncoder()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype,
        ignore_mismatched_sizes=args.get("ignore_mismatched_sizes", False)
    )

    # Prepare datasets: the encoder is fitted on the training labels only and
    # reused for the dev labels so both share the same class indices.
    train_dataset = SexismDataset(trainInfo[1], labelEnc.fit_transform(trainInfo[2]), [int(x) for x in trainInfo[0]], tokenizer)
    val_dataset = SexismDataset(devInfo[1], labelEnc.transform(devInfo[2]), [int(x) for x in devInfo[0]], tokenizer)

    # Training Arguments
    training_args = TrainingArguments(
        report_to="none", # alt: "wandb", "tensorboard" "comet_ml" "mlflow" "clearml"
        output_dir=args.get('output_dir', './results'),
        num_train_epochs=args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_ratio=args.get("warmup_ratio", 0.1),
        weight_decay=args.get('weight_decay', 0.01),
        lr_scheduler_type=args.get("scheduler_type", "linear"),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        eval_strategy=args.get('eval_strategy', 'epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        save_total_limit=args.get('save_total_limit', 1),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model', "f1")
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_2,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience", 3))]
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    # Without a test set there is nothing to predict.
    if testInfo is None:
        return model, eval_results

    # Prepare test dataset for prediction; the labels are placeholders only.
    test_dataset = SexismDataset(testInfo[1], [0] * len(testInfo[1]), [int(x) for x in testInfo[0]], tokenizer)
    predictions = trainer.predict(test_dataset)

    if output_type == "hard":
        # One label per sample: the class with the highest score.
        predicted_labels = np.argmax(predictions.predictions, axis=1)
        submission_df = pd.DataFrame({
            'id': testInfo[0],
            'label': labelEnc.inverse_transform(predicted_labels),
            "test_case": ["EXIST2025"]*len(predicted_labels)
        })
    else:  # "soft", guaranteed by the validation at the top
        # One {label: probability} dict per sample.
        probabilities = torch.softmax(torch.tensor(predictions.predictions), dim=1)
        # Decode the class indices once instead of once per sample and class.
        class_names = labelEnc.inverse_transform(np.arange(probabilities.shape[1]))
        list_probabilities = [
            dict(zip(class_names, row))
            for row in probabilities.tolist()
        ]
        submission_df = pd.DataFrame({
            'id': testInfo[0],
            "test_case": ["EXIST2025"] * len(probabilities),
            "value": list_probabilities
        })

    language = args.get("language", "english")
    submission_df.to_csv(f'sexism_predictions_task2_{output_type}_{language}.csv', index=False)
    print(f"Prediction TASK2 completed. Results saved to sexism_predictions_task2_{output_type}_{language}.csv")
    return model, submission_df


## Run id

Before training, we specify the `run_id` for the predictions.

In [10]:
# Run identifier, later passed to create_submision_file when saving predictions.
run_id = 3

## Training soft

In [11]:
modelname = "cardiffnlp/twitter-roberta-base-sentiment-latest"
params = {"num_train_epochs": 10,
          "learning_rate": 3e-5,
          "scheduler_type": "linear",
          "per_device_train_batch_size": 64,
          "warmup_ratio": 0.15,
          "early_stopping_patience": 5,
          "ignore_mismatched_sizes": True,
          "output_dir": None,
          "logging_dir": None,
          "language": "english",
          "output_type": "soft"
          }

# Seed all RNGs before the model is built: the 4-class head is newly
# initialised, so without this the starting weights differ on every run.
set_seed()

# A test set is passed, so the second return value is the submission
# DataFrame (not the validation metrics), despite the variable name.
_, eval_results = sexism_classification_pipeline_task2(EnTrainTask2, EnDevTask2, EnTestTask2, modelname, 4, "single_label_classification", **params)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.8491,0.859978,0.712121,0.345647,0.335255,0.365471
2,0.6334,0.67677,0.734848,0.475895,0.452461,0.523893
3,0.4952,0.669254,0.770202,0.483698,0.480643,0.490674
4,0.406,0.724687,0.729798,0.502441,0.522433,0.55957
5,0.3735,0.684532,0.765152,0.529355,0.540138,0.524119
6,0.218,0.867907,0.755051,0.51851,0.559958,0.531032
7,0.1459,0.928223,0.75,0.508224,0.525847,0.499132
8,0.1421,0.973566,0.752525,0.537099,0.538961,0.544385
9,0.1222,0.982749,0.752525,0.538542,0.552366,0.533787
10,0.0729,0.987804,0.760101,0.550725,0.563616,0.543363


Validation Results: {'eval_loss': 0.9878035187721252, 'eval_accuracy': 0.76010101010101, 'eval_f1': 0.5507246479390469, 'eval_precision': 0.5636160714285714, 'eval_recall': 0.5433627450980393, 'eval_runtime': 2.8465, 'eval_samples_per_second': 139.119, 'eval_steps_per_second': 2.459, 'epoch': 10.0}
Prediction TASK2 completed. Results saved to sexism_predictions_task2_soft_english.csv


In [12]:
modelname = "pysentimiento/robertuito-sentiment-analysis"
params = {"num_train_epochs": 10,
          "learning_rate": 3e-5,
          "scheduler_type": "linear",
          "per_device_train_batch_size": 64,
          "warmup_ratio": 0.15,
          "early_stopping_patience": 5, 
          "ignore_mismatched_sizes": True,
          "output_dir": None,
          "logging_dir": None,
          "language": "spanish",
          "output_type": "soft"
          }

_, eval_results = sexism_classification_pipeline_task2(SpTrainTask2, SpDevTask2, SpTestTask2, modelname, 4, "single_label_classification", **params)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.9835,1.076162,0.584862,0.308207,0.278284,0.346446
2,0.7006,0.77849,0.701835,0.420658,0.738648,0.452249
3,0.5581,0.668876,0.720183,0.537833,0.580084,0.539522
4,0.3265,0.708268,0.752294,0.602231,0.681216,0.580699
5,0.1982,0.80297,0.743119,0.565107,0.64056,0.553186
6,0.1334,0.805781,0.738532,0.590011,0.609761,0.588066
7,0.0985,0.858299,0.731651,0.585645,0.596692,0.586474
8,0.0558,0.941734,0.733945,0.593762,0.631564,0.582407
9,0.0492,0.975099,0.743119,0.585613,0.62062,0.57809


Validation Results: {'eval_loss': 0.7082683444023132, 'eval_accuracy': 0.7522935779816514, 'eval_f1': 0.6022310843082902, 'eval_precision': 0.6812157416923604, 'eval_recall': 0.5806986493588552, 'eval_runtime': 1.9767, 'eval_samples_per_second': 220.575, 'eval_steps_per_second': 3.541, 'epoch': 9.0}
Prediction TASK2 completed. Results saved to sexism_predictions_task2_soft_spanish.csv


## Save predictions

In [13]:
# Build the submission for the "soft" run. The meaning of each positional
# argument is defined in the create_submision_folder module (not shown here).
if create_submision_file(
    "ArPa Project", 1, "soft", run_id, 2, ".", "exist2025_ArPa Project"
):
    print("Predictions saved")

Predictions saved


## Training hard

In [14]:
modelname = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# No "output_type" key: the pipeline falls back to its default, "hard".
params = {"num_train_epochs": 10,
          "learning_rate": 3e-5,
          "scheduler_type": "linear",
          "per_device_train_batch_size": 64,
          "warmup_ratio": 0.15,
          "early_stopping_patience": 5,
          "ignore_mismatched_sizes": True,
          "output_dir": None,
          "logging_dir": None,
          "language": "english"
          }

# Seed all RNGs before the model is built: the 4-class head is newly
# initialised, so without this the starting weights differ on every run.
set_seed()

# A test set is passed, so the second return value is the submission
# DataFrame (not the validation metrics), despite the variable name.
_, eval_results = sexism_classification_pipeline_task2(EnTrainTask2, EnDevTask2, EnTestTask2, modelname, 4, "single_label_classification", **params)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.8482,0.829997,0.714646,0.348852,0.338776,0.368412
2,0.6396,0.669174,0.742424,0.473641,0.460286,0.515246
3,0.5028,0.671299,0.762626,0.462646,0.469053,0.465253
4,0.4058,0.747266,0.737374,0.564501,0.572892,0.608037
5,0.334,0.70114,0.772727,0.558233,0.570323,0.562843
6,0.2088,0.865906,0.752525,0.518156,0.540531,0.505682
7,0.1305,0.914676,0.747475,0.547845,0.547596,0.554632
8,0.1221,0.979074,0.744949,0.552096,0.555825,0.553523
9,0.0973,1.005615,0.744949,0.54246,0.54546,0.541561


Validation Results: {'eval_loss': 0.7472657561302185, 'eval_accuracy': 0.7373737373737373, 'eval_f1': 0.564501364775285, 'eval_precision': 0.5728924052991485, 'eval_recall': 0.6080366692131398, 'eval_runtime': 1.6418, 'eval_samples_per_second': 241.192, 'eval_steps_per_second': 4.263, 'epoch': 9.0}
Prediction TASK2 completed. Results saved to sexism_predictions_task2_hard_english.csv


In [15]:
modelname = "pysentimiento/robertuito-sentiment-analysis"
# No "output_type" key: the pipeline falls back to its default, "hard".
params = {"num_train_epochs": 10,
          "learning_rate": 3e-5,
          "scheduler_type": "linear",
          "per_device_train_batch_size": 64,
          "warmup_ratio": 0.15,
          "early_stopping_patience": 5,
          "ignore_mismatched_sizes": True,
          "output_dir": None,
          "logging_dir": None,
          "language": "spanish"
          }

# Seed all RNGs before the model is built: the 4-class head is newly
# initialised, so without this the starting weights differ on every run.
set_seed()

# A test set is passed, so the second return value is the submission
# DataFrame (not the validation metrics), despite the variable name.
_, eval_results = sexism_classification_pipeline_task2(SpTrainTask2, SpDevTask2, SpTestTask2, modelname, 4, "single_label_classification", **params)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.9974,1.075829,0.575688,0.287219,0.273707,0.32081
2,0.7074,0.795415,0.678899,0.407306,0.525359,0.438426
3,0.5563,0.687839,0.711009,0.514818,0.547572,0.52344
4,0.3431,0.733977,0.733945,0.572346,0.656579,0.54933
5,0.2124,0.770355,0.75,0.592712,0.650094,0.577639
6,0.1507,0.824805,0.745413,0.607194,0.631149,0.599296
7,0.1136,0.864053,0.747706,0.617053,0.632747,0.608009
8,0.0695,0.941696,0.754587,0.622907,0.662886,0.602031
9,0.0576,1.007053,0.743119,0.616354,0.656376,0.592494
10,0.0536,0.978838,0.75,0.623801,0.65263,0.604897


Validation Results: {'eval_loss': 0.9788379073143005, 'eval_accuracy': 0.75, 'eval_f1': 0.6238011502163793, 'eval_precision': 0.6526296371101322, 'eval_recall': 0.6048971813540956, 'eval_runtime': 1.8346, 'eval_samples_per_second': 237.658, 'eval_steps_per_second': 3.816, 'epoch': 10.0}
Prediction TASK2 completed. Results saved to sexism_predictions_task2_hard_spanish.csv


## Save predictions

In [16]:
# Build the submission for the "hard" run. The meaning of each positional
# argument is defined in the create_submision_folder module (not shown here).
if create_submision_file(
    "ArPa Project", 1, "hard", run_id, 2, ".", "exist2025_ArPa Project"
):
    print("Predictions saved")

Predictions saved
