In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import json
from collections import Counter

import pandas as pd

import torch
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Mount Google Drive; every data/model path used in this notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Define file paths

In [3]:
# Root of the project on the mounted Google Drive. Every path below is derived
# from it, so relocating the project only requires changing this one line.
project_root = "/content/drive/MyDrive/EXIST2024"
dataset_dir = f"{project_root}/dataset"
fine_tune_dir = f"{project_root}/fine_tune_roberta"

# Used for input data
training_path = f"{dataset_dir}/training/EXIST2024_training.json"
validation_path = f"{dataset_dir}/dev/EXIST2024_dev.json"

# Used for training params (Trainer output directory and log directory)
training_out_path = f"{fine_tune_dir}/results"
training_log_path = f"{fine_tune_dir}/logs"

# Used to save and load model
model_path = f"{fine_tune_dir}/model"

# Used for evaluation
# NOTE(review): testing_path points at an EXIST2023 file and is not read by any
# cell visible in this notebook -- confirm whether it is still needed.
testing_path = f"{dataset_dir}/test/EXIST2023_test_clean.json"
results_path = f"{fine_tune_dir}/preds.json"
gold_path = f"{dataset_dir}/EXIST2024_dev_task1_gold_hard.json"

## Read data

In [None]:
# Load the pretrained XLM-RoBERTa tokenizer; the data generators and predict() use it to encode tweets
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [None]:
def _majority_vote_examples(path, tokenize):
    """Yield one tokenized, binary-labelled example per usable record in ``path``.

    The JSON file is expected to be a mapping of record id -> record dict.
    A record is used only when ``tweet`` is a string and ``labels_task1`` is a
    non-empty list of annotator votes; all other records are skipped.

    Args:
        path: Path of the JSON file to read.
        tokenize: Callable invoked as ``tokenize(text, padding="max_length",
            truncation=True)`` that returns a mutable mapping of model inputs.

    Yields:
        The mapping returned by ``tokenize`` with an added ``'label'`` key:
        1 when the most common vote is "YES", otherwise 0.
    """
    with open(path, 'r', encoding='utf-8') as file:
        records = json.load(file)

    for record in records.values():
        votes = record.get('labels_task1')
        tweet = record.get('tweet')
        # An empty vote list has no majority and would make most_common(1)[0]
        # raise IndexError, so it is skipped together with malformed records.
        if not (isinstance(votes, list) and votes and isinstance(tweet, str)):
            continue
        # NOTE(review): on a tie Counter.most_common returns the vote that
        # appears first in the list, so tied records get an order-dependent
        # label -- confirm whether tied records should be dropped instead.
        majority_label = Counter(votes).most_common(1)[0][0]
        # Fixed-length padding, so every yielded example has the same length
        inputs = tokenize(tweet, padding="max_length", truncation=True)
        inputs['label'] = 1 if majority_label == "YES" else 0
        yield inputs


def training_gen(path=None, tokenize=None):
    """Yield tokenized, majority-vote-labelled examples for the training split.

    Args:
        path: JSON file to read; defaults to the notebook's ``training_path``.
        tokenize: Tokenizer callable; defaults to the notebook's ``tokenizer``.
    """
    yield from _majority_vote_examples(
        training_path if path is None else path,
        tokenizer if tokenize is None else tokenize,
    )


def validation_gen(path=None, tokenize=None):
    """Yield tokenized, majority-vote-labelled examples for the validation split.

    Args:
        path: JSON file to read; defaults to the notebook's ``validation_path``.
        tokenize: Tokenizer callable; defaults to the notebook's ``tokenizer``.
    """
    yield from _majority_vote_examples(
        validation_path if path is None else path,
        tokenizer if tokenize is None else tokenize,
    )

In [None]:
# Build Hugging Face datasets from the generators, which read the JSON files
from datasets import Dataset

train_dataset = Dataset.from_generator(training_gen)
# NOTE: despite its name, test_dataset is built from validation_gen (the dev split)
test_dataset = Dataset.from_generator(validation_gen)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

## Prepare model and params

In [None]:
# Load the pre-trained XLM-RoBERTa model with a classification head.
# Two output classes, matching the 0/1 labels produced by the data generators.
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)  # Update num_labels for multi-class

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [None]:
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with keys 'accuracy', 'precision', 'recall' and 'f1'; precision,
        recall and F1 use sklearn's ``average='binary'`` setting.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(axis=-1)

    # Element 3 of the tuple (support) is not reported
    scores = precision_recall_fscore_support(gold, predicted, average='binary')

    return {
        'accuracy': accuracy_score(gold, predicted),
        'precision': scores[0],
        'recall': scores[1],
        'f1': scores[2],
    }

In [None]:
# The deprecated `evaluation_strategy` keyword used to be passed alongside
# `eval_strategy`; only the current name is kept.
training_args = TrainingArguments(
    output_dir=training_out_path,
    eval_strategy="epoch",             # Evaluate at the end of each epoch
    save_strategy="epoch",             # Checkpoint each epoch, same cadence as evaluation
    learning_rate=2e-5,                # Learning rate
    per_device_train_batch_size=16,    # Batch size for training
    per_device_eval_batch_size=64,     # Batch size for evaluation
    num_train_epochs=3,                # Number of training epochs
    weight_decay=0.01,                 # Weight decay
    logging_dir=training_log_path,
    logging_steps=10,                  # Log every 10 steps
    load_best_model_at_end=True,       # Reload the best checkpoint when training finishes
    metric_for_best_model='accuracy',  # "Best" is judged by the 'accuracy' value from compute_metrics
    report_to="none",                  # Do not report to any external experiment tracker
)

trainer = Trainer(
    model=model,                         # The pre-trained model
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=test_dataset,           # Validation dataset
    compute_metrics=compute_metrics      # Metrics function
)



## Train and save model

In [None]:
# Fine-tune the model using the Trainer configured above
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5222,0.473228,0.78131,0.801296,0.733202,0.765738
2,0.3684,0.46847,0.819846,0.828866,0.794466,0.811302
3,0.4525,0.492828,0.815029,0.82438,0.788538,0.806061


TrainOutput(global_step=1299, training_loss=0.4467965117594753, metrics={'train_runtime': 2124.5856, 'train_samples_per_second': 9.771, 'train_steps_per_second': 0.611, 'total_flos': 5462185509273600.0, 'train_loss': 0.4467965117594753, 'epoch': 3.0})

In [None]:
# Persist the model and the tokenizer to the same directory so both can be reloaded from model_path
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('/content/drive/MyDrive/EXIST2024/EXIST 2024 Tweets Dataset/sexism_detection_model/tokenizer_config.json',
 '/content/drive/MyDrive/EXIST2024/EXIST 2024 Tweets Dataset/sexism_detection_model/special_tokens_map.json',
 '/content/drive/MyDrive/EXIST2024/EXIST 2024 Tweets Dataset/sexism_detection_model/sentencepiece.bpe.model',
 '/content/drive/MyDrive/EXIST2024/EXIST 2024 Tweets Dataset/sexism_detection_model/added_tokens.json')

## Load model

In [None]:
# Reload the model and tokenizer saved under model_path (rebinds `model` and `tokenizer`)
model = XLMRobertaForSequenceClassification.from_pretrained(model_path, num_labels=2)
tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)

## Perform inferencing

In [None]:
# Move the reloaded model to the GPU when CUDA is available, otherwise keep it on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
def predict(text):
    """Classify a single text and return the predicted class index.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The raw string to classify.

    Returns:
        int: Index of the highest-scoring class (argmax over the logits).
    """
    # Make sure dropout is disabled so predictions are deterministic
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Move the inputs to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}
    # Inference only: skip autograd bookkeeping to save memory and time
    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.argmax(logits, dim=-1).item()

# Example usage: classify one sentence and map the class index to its label (non-zero -> "YES")
tweet = "This is an example"
prediction = "YES" if predict(tweet) else "NO"
print("Predicted Label:", prediction)

Predicted Label: NO


In [None]:
def inferencing_test(input_path=None, output_path=None):
    """Predict a YES/NO label for every tweet in a JSON file and save the results.

    The input file is expected to be a mapping of record id -> record dict.
    Records are used only when both ``tweet`` and ``id_EXIST`` are strings;
    all other records are skipped.

    Args:
        input_path: JSON file to read; defaults to the notebook's
            ``validation_path``.
        output_path: JSON file to write; defaults to the notebook's
            ``results_path``.

    Writes:
        A JSON list of ``{"test_case", "id", "value"}`` objects, where
        ``value`` is "YES" when ``predict`` returns 1 and "NO" otherwise.
    """
    input_path = validation_path if input_path is None else input_path
    output_path = results_path if output_path is None else output_path

    with open(input_path, 'r', encoding='utf-8') as file:
        records = json.load(file)

    results = []
    for record in records.values():
        tweet = record.get('tweet')
        tweet_id = record.get('id_EXIST')
        if isinstance(tweet, str) and isinstance(tweet_id, str):
            # predict() tokenizes the tweet itself; no separate encoding is needed here
            results.append({
                "test_case": "EXIST2024",
                "id": tweet_id,
                "value": "YES" if predict(tweet) == 1 else "NO"
            })

    # Write results to a JSON file
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(results, file, ensure_ascii=False, indent=4)

In [None]:
# Run inference and write the predictions file that the evaluation cells read
inferencing_test()

##  Evaluation

In [43]:
pip install pyevall

Collecting pyevall
  Downloading PyEvALL-0.1.71.tar.gz (38 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jsbeautifier==1.14.9 (from pyevall)
  Downloading jsbeautifier-1.14.9.tar.gz (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting jsonschema==4.21.1 (from pyevall)
  Downloading jsonschema-4.21.1-py3-none-any.whl.metadata (7.8 kB)
Collecting pandas==2.2.1 (from pyevall)
  Downloading pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting setuptools==68.0.0 (from pyevall)
  Downloading setuptools-68.0.0-py3-none-any.whl.metadata (6.4 kB)
Collecting editorconfig>=0.12.2 (from jsbeautifier==1.14.9->pyevall)
  Downloading EditorConfig-0.12.4.tar.gz (13 kB)
  Inst

In [1]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils
from pyevall.metrics.metricfactory import MetricFactory

# Evaluator instance used by the evaluation cells
test = PyEvALLEvaluation()

# Metrics computed for every report; add further MetricFactory members here
metrics = [
    metric.value
    for metric in (
        MetricFactory.ICM,
        MetricFactory.ICMNorm,
        MetricFactory.FMeasure,
        MetricFactory.Accuracy,
        MetricFactory.Precision,
        MetricFactory.Recall,
    )
]

Check here that there are no errors

In [4]:
# Evaluate the predictions against the gold file using the embedded report option
params = {PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED}
report = test.evaluate(results_path, gold_path, metrics, **params)
report.print_report()

2024-11-28 20:41:57,458 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure', 'Accuracy', 'Precision', 'Recall']
2024-11-28 20:41:58,552 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-11-28 20:42:01,609 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2024-11-28 20:42:01,643 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-11-28 20:42:04,528 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-11-28 20:42:07,257 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
2024-11-28 20:42:10,891 - pyevall.metrics.metrics - INFO -             evaluate() - Executing accuracy evaluation method
2024-11-28 20:42:10,922 - pyevall.metrics.metrics - INFO -             evaluate() - Executing precision e

Receive more readable results

In [None]:
# Evaluate the same predictions again, this time with the dataframe report option
params = {PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_DATAFRAME}
report = test.evaluate(results_path, gold_path, metrics, **params)
report.print_report()