In [None]:
# Model card: https://huggingface.co/am-azadi/EXIST2024_Task2_xlmRoberta_large_3_16

In [None]:
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# XLM-RoBERTa

In [2]:
# Load the fine-tuned checkpoint and its tokenizer from the Hugging Face Hub
# Task 1 checkpoint, kept for reference:
# model_name = "am-azadi/EXIST2024_Task1_xlmRoberta_large_3_16_merged_unshuffled"
model_name = "am-azadi/EXIST2024_Task2_xlmRoberta_large_3_16"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# This notebook only runs inference, so make evaluation mode explicit
model.eval()

# Load the dataset. The tweets contain non-ASCII characters, so read the file
# as UTF-8 explicitly instead of relying on the platform's default encoding.
with open("EXIST2024_dev.json", "r", encoding="utf-8") as file:
    dataset = json.load(file)

config.json:   0%|          | 0.00/991 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

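Aggregation method: a tweet is treated as sexist only when more than 3 annotators answered "YES" in Task 1; its soft label is then the proportion of occurrences of each Task 2 category among the annotators' labels (ignoring "-"). Every other tweet gets the label "NO" with proportion 1.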

In [3]:
# Build one soft label per tweet: a dictionary that maps every class
# ("NO", "DIRECT", "REPORTED", "JUDGEMENTAL") to a proportion between 0 and 1.
# A tweet with at most 3 "YES" votes in labels_task1 gets "NO" with proportion 1.
# Otherwise each proportion is the number of occurrences of the class in
# labels_task2 divided by the number of labels_task2 entries other than "-".

classes = ["NO", "DIRECT", "REPORTED", "JUDGEMENTAL"]
data = []
for tweet_info in dataset.values():
    yes_votes = tweet_info["labels_task1"].count("YES")
    if yes_votes > 3:
        labels = [vote for vote in tweet_info["labels_task2"] if vote != "-"]
        proportions = {}
        for class_name in classes:
            proportions[class_name] = labels.count(class_name) / len(labels)

        # Sanity check: the proportions must add up to (approximately) 1
        total = sum(proportions.values())
        assert 0.99 < total < 1.01
    else:
        proportions = {"NO": 1, "DIRECT": 0, "REPORTED": 0, "JUDGEMENTAL": 0}

    data.append({"id": tweet_info["id_EXIST"], "tweet": tweet_info["tweet"], "label": proportions})

print(data[1])
print(len(data))

{'id': '300002', 'tweet': '@anacaotica88 @MordorLivin No me acuerdo de los detalles de GamerGate, pero ella estuvo en el ojo del huracán recibiendo acoso de hombres indignados (y sus medios frikis) y creo que también acosaron a Brianna Wu, q es transfemenino. Seguramente tuvo eso que ver y quiso "cerrar filas".', 'label': {'NO': 0.0, 'DIRECT': 0.0, 'REPORTED': 0.4, 'JUDGEMENTAL': 0.6}}
1038


Useful functions

In [4]:
# Function to preprocess a tweet and predict as soft labels
def predict_sexism(tweet: str) -> dict:
    """Return the model's soft-label prediction for a single tweet.

    The tweet is tokenized (truncated to at most 128 tokens), passed through
    the module-level ``model``, and the logits are turned into probabilities
    with a softmax.

    Args:
        tweet: Text of the tweet to classify.

    Returns:
        A dictionary mapping each entry of the module-level ``classes`` list
        to the probability found at the same index of the model output.
        NOTE(review): this assumes the order of ``classes`` matches the label
        ids the checkpoint was trained with -- confirm against the model config.
    """
    inputs = tokenizer(tweet, return_tensors="pt", truncation=True, max_length=128).to(device)
    # Inference only: disable autograd so no computation graph is built per tweet
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1).tolist()[0]
    return {classes[i]: prob for i, prob in enumerate(probs)}

# Function to split result records into reference labels and predictions
def get_actual_predicted(res):
    """Separate the reference labels from the predictions.

    Args:
        res: Sequence of records, each with an "actual" and a "value" key.

    Returns:
        A tuple ``(actual, predicted)`` of two lists, in the order of ``res``.
    """
    actual = []
    for record in res:
        actual.append(record["actual"])

    predicted = []
    for record in res:
        predicted.append(record["value"])

    return actual, predicted

Running on the dataset

In [5]:
# Score every tweet and keep the reference label next to the prediction
result = [
    dict(
        test_case="EXIST2024",
        id=sample["id"],  # the id_EXIST value stored in data
        actual=sample["label"],
        value=predict_sexism(sample["tweet"]),
    )
    for sample in data
]

In [6]:
# Display the first record to check the structure of the results
result[0]

{'test_case': 'EXIST2024',
 'id': '300001',
 'actual': {'NO': 1, 'DIRECT': 0, 'REPORTED': 0, 'JUDGEMENTAL': 0},
 'value': {'NO': 0.9793201684951782,
  'DIRECT': 0.005903935991227627,
  'REPORTED': 0.006269863341003656,
  'JUDGEMENTAL': 0.008506054989993572}}

In [10]:
# Write all prediction records to disk as a single JSON document
with open("preds_task2_soft.json", "w") as out_file:
    out_file.write(json.dumps(result))