# 0. Montar Google Drive

In [1]:
# Mount Google Drive at /content/drive so later cells can read the data and
# model files referenced under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1. CONFIGURACIÓN DEL ENTORNO

# Configuración (importar dependencias, librerías, ...)

In [2]:
# Install the third-party packages used by this notebook.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve newer
# releases than the ones shown in the captured output below.

# Required packages
!pip install sentencepiece
!pip install pytorch-lightning
!pip install --upgrade accelerate
!pip install emoji
!pip install framework-reproducibility
!pip install transformers datasets
!pip install contractions
!pip install textblob
!pip install PyEvALL

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.2.4-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.2/802.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.4.0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->pytorch-lightning)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->pytorch-lightning)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->pytorc

In [2]:
# Seed every random number generator so the experiments are reproducible.
# The seeding below runs before the datasets / sklearn / transformers imports
# at the end of this cell.

import random
import torch
import numpy as np
import os
from pytorch_lightning import seed_everything
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Single seed shared by every random number generator used in the notebook.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Request deterministic cuDNN kernels and turn off the cuDNN benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
os.environ["TF_DETERMINISTIC_OPS"] = "1" # See:https://github.com/NVIDIA/tensorflow-determinism#confirmed-current-gpu-specific-sources-of-non-determinism-with-solutions
# Pass seed_val rather than repeating the literal so the seed is defined in one place.
seed_everything(seed_val, workers=True)

from datasets import Dataset, DatasetDict, load_metric
import pandas as pd
import sklearn as sk
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, f1_score
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
 TrainingArguments, Trainer, pipeline, EarlyStoppingCallback

INFO:lightning_fabric.utilities.seed:Seed set to 42


In [3]:
# Pick the compute device: the first CUDA GPU when PyTorch can see one,
# otherwise the CPU. `device` is assigned on both branches so later code can
# rely on it being defined.
if torch.cuda.device_count() > 0:
    # If a GPU is available, print its name
    print(f'GPU detected. Currently using: "{torch.cuda.get_device_name(0)}"')
    # Set the device to GPU for accelerated computations
    device = torch.device("cuda")
else:
    # If no GPU is available, inform the user to change the runtime type
    print('Currently using CPU. To utilize GPU acceleration, change the runtime type in the \'runtime\' tab.')
    # Fall back to the CPU instead of leaving `device` undefined
    device = torch.device("cpu")

GPU detected. Currently using: "Tesla T4"


# Preparación de los datos

## Lectura de los ficheros

In [4]:
# Column names are kept in variables so the rest of the code is more portable.
nombre_etiqueta1, nombre_etiqueta2, campo_texto = 'value_task1', 'value', 'tweet'

# Formateo y etiquetado de los Datasets

In [4]:
# # Ruta del archivo CSV en Google Drive
# file_path = '/content/drive/MyDrive/MODELOS TASK 1/TEST/df_es_TEST.csv'

# # Leer el archivo CSV y cargarlo en un DataFrame
# test_df = pd.read_csv(file_path)

# test_dataset = Dataset.from_pandas(test_df)

# print(test_dataset)

Dataset({
    features: ['id_EXIST', 'lang', 'tweet', 'number_annotators', 'annotators', 'gender_annotators', 'age_annotators', 'ethnicities_annotators', 'study_levels_annotators', 'countries_annotators', 'split'],
    num_rows: 1098
})


In [5]:
# Location of the cleaned test split on Google Drive.
ruta_json = '/content/drive/MyDrive/MODELOS TASK 1/TEST/EXIST2023_test_clean.json'

# Read the JSON into a DataFrame, then transpose it so that each row of
# `test_df` is one record (see the printed Dataset below for the columns).
df = pd.read_json(ruta_json)
test_df = df.T

# Wrap the DataFrame in a Dataset object, the type the Transformers tooling works with.
test_dataset = Dataset.from_pandas(test_df)
print(test_dataset)

Dataset({
    features: ['id_EXIST', 'lang', 'tweet', 'number_annotators', 'annotators', 'gender_annotators', 'age_annotators', 'ethnicities_annotators', 'study_levels_annotators', 'countries_annotators', 'split', '__index_level_0__'],
    num_rows: 2076
})


In [6]:
def set_labels(records):
    """Convert a record's task-1 string label into an integer class id.

    Args:
        records: Mapping that holds the label under the key named by
            ``nombre_etiqueta1``; the value must be 'NO' or 'YES'.

    Returns:
        dict: ``{'labels': 0}`` for 'NO' and ``{'labels': 1}`` for 'YES'.

    Raises:
        KeyError: If the key is missing or the label is neither 'NO' nor 'YES'.
    """
    raw_label = records[nombre_etiqueta1]
    return {'labels': {'NO': 0, 'YES': 1}[raw_label]}

In [7]:
# Reset the dataset's output format so that later steps do not fail
test_dataset.reset_format()

# 2. CLASIFICACIÓN BINARIA ENTRE LAS CLASES DE LA TASK_1 YES/NO

## Crear tokenizador para tokenizar test y cargar modelo

In [8]:
# Hub checkpoint used to load the tokenizer and by model_init().
# To switch models, comment this line out and uncomment one of the alternatives below.
model_checkpoint_1 = 'FacebookAI/xlm-roberta-base'

#model_checkpoint_1 = 'microsoft/deberta-v3-base'

#model_checkpoint_1 = 'google-bert/bert-base-multilingual-uncased'

#model_checkpoint_1 = 'FacebookAI/roberta-base'

#model_checkpoint_1 = 'PlanTL-GOB-ES/roberta-base-bne'

#model_checkpoint_1 = 'dccuchile/bert-base-spanish-wwm-uncased'

In [9]:
# Load the tokenizer for the selected checkpoint.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# instead of being written into the notebook; when the variable is unset, None
# is passed and the download is anonymous.
# NOTE(review): the token that used to be hardcoded here has been exposed and
# should be revoked in the Hugging Face account settings.
tokenizer_1 = AutoTokenizer.from_pretrained(model_checkpoint_1, token=os.environ.get("HF_TOKEN"))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
# Number of output classes passed to the pretrained model.
n_labels = 2


def model_init():
    """Create a sequence-classification model from the base checkpoint.

    Keeping the load inside a function means the same initialisation can be
    reused across runs or training configurations, which makes repeating an
    experiment straightforward.

    Returns:
        The model loaded from ``model_checkpoint_1`` with ``num_labels`` set
        to ``n_labels``.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint_1,
        num_labels=n_labels,
    )
    return fresh_model

In [11]:
# Model name: the part of the checkpoint id after the last "/".
model_name_1 = model_checkpoint_1.rsplit("/", 1)[-1]
model_name_1

'xlm-roberta-base'

# Generación de Resultados para la competición

In [12]:
# Display the test DataFrame
test_df

Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,split
500001,500001,es,@Eurogamer_es Todo gamergate desde el desarrol...,6,"[Annotator_810, Annotator_811, Annotator_812, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, White or Caucasian, White ...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Italy, United States, Portugal...",TEST_ES
500002,500002,es,"@ArCaNgEl__23 @Benzenazi Hombre, no es compara...",6,"[Annotator_780, Annotator_816, Annotator_817, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, Black o...","[High school degree or equivalent, Bachelor’s ...","[Chile, Mexico, United States, Mexico, Mexico,...",TEST_ES
500003,500003,es,yo buscando las empresas metidas en el gamerga...,6,"[Annotator_821, Annotator_822, Annotator_823, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, Asian, ...","[Bachelor’s degree, Bachelor’s degree, Master’...","[Mexico, Mexico, VietNam, United States, Mexic...",TEST_ES
500004,500004,es,"@jordirico Primero fue internet, luego el game...",6,"[Annotator_827, Annotator_828, Annotator_829, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, White or Caucasian, Hispan...","[High school degree or equivalent, Bachelor’s ...","[Chile, Spain, Mexico, United Kingdom, Chile, ...",TEST_ES
500005,500005,es,@AlonsoQuijano12 Yo estuve metido en el gamerg...,6,"[Annotator_827, Annotator_828, Annotator_829, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, White or Caucasian, Hispan...","[High school degree or equivalent, Bachelor’s ...","[Chile, Spain, Mexico, United Kingdom, Chile, ...",TEST_ES
...,...,...,...,...,...,...,...,...,...,...,...
600974,600974,en,@AllyMae99 This straight up sounds like “you l...,6,"[Annotator_942, Annotator_943, Annotator_351, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Master’s de...","[South Africa, Spain, Portugal, United States,...",TEST_EN
600975,600975,en,Nathaniel is trying to help me with a new fake...,6,"[Annotator_997, Annotator_998, Annotator_999, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Bachelor’s ...","[South Africa, United Kingdom, Australia, Fran...",TEST_EN
600976,600976,en,walkin back from the gym &amp; an older lady s...,6,"[Annotator_997, Annotator_998, Annotator_999, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Bachelor’s ...","[South Africa, United Kingdom, Australia, Fran...",TEST_EN
600977,600977,en,You look like a whore of Babylon bc that’s the...,6,"[Annotator_1009, Annotator_1010, Annotator_101...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[White or Caucasian, White or Caucasian, Multi...","[High school degree or equivalent, Bachelor’s ...","[Poland, Portugal, United Kingdom, Greece, Gre...",TEST_EN


In [13]:
# Convert the DataFrame into a Dataset object and display its summary
test_dataset = Dataset.from_pandas(test_df)
test_dataset

Dataset({
    features: ['id_EXIST', 'lang', 'tweet', 'number_annotators', 'annotators', 'gender_annotators', 'age_annotators', 'ethnicities_annotators', 'study_levels_annotators', 'countries_annotators', 'split', '__index_level_0__'],
    num_rows: 2076
})

In [14]:
# Inspect a single record (index 5) of the test dataset
test_dataset[5]

{'id_EXIST': '500006',
 'lang': 'es',
 'tweet': '@MrSandman1954 @AKN4710 El Gamergate jodió al internet para siempre.',
 'number_annotators': 6,
 'annotators': ['Annotator_833',
  'Annotator_462',
  'Annotator_834',
  'Annotator_835',
  'Annotator_836',
  'Annotator_837'],
 'gender_annotators': ['F', 'F', 'F', 'M', 'M', 'M'],
 'age_annotators': ['18-22', '23-45', '46+', '46+', '23-45', '18-22'],
 'ethnicities_annotators': ['White or Caucasian',
  'White or Caucasian',
  'White or Caucasian',
  'White or Caucasian',
  'Hispano or Latino',
  'Hispano or Latino'],
 'study_levels_annotators': ['High school degree or equivalent',
  'Bachelor’s degree',
  'Master’s degree',
  'Master’s degree',
  'Bachelor’s degree',
  'High school degree or equivalent'],
 'countries_annotators': ['Portugal',
  'Poland',
  'Australia',
  'Germany',
  'Mexico',
  'Mexico'],
 'split': 'TEST_ES',
 '__index_level_0__': 500006}

### Predicciones

In [15]:
# Load the previously trained model from its directory on Google Drive
model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/MODELOS TASK 1/MODELOS BIN 1.0/XLM_ROBERTA_BASE')

In [16]:
# Build the text-classification pipeline used for prediction.
# Run on the first GPU when one is visible and on the CPU (device=-1)
# otherwise, so the cell also works on a CPU-only runtime.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer_1,
    device=0 if torch.cuda.device_count() > 0 else -1,
)

In [17]:
# Prediction helper applied to each record of the test set.

def get_predictions(records):
    """Classify one record's text and return the predicted class and its score.

    Args:
        records: Mapping that holds the text under the key named by
            ``campo_texto``.

    Returns:
        dict: ``pred_label`` (0 for 'LABEL_0', 1 for 'LABEL_1') and
        ``score_label`` (the score the pipeline reports for that label).

    Raises:
        ValueError: If the pipeline returns a label other than 'LABEL_0' or
            'LABEL_1'.
    """
    result = pipe(records[campo_texto], truncation=True)
    top_prediction = result[0]
    label_name = top_prediction['label']

    label_ids = {'LABEL_0': 0, 'LABEL_1': 1}
    if label_name not in label_ids:
        # Fail loudly: treating every non-'LABEL_0' name as class 1 would hide
        # a model whose label names differ from the expected ones.
        raise ValueError(f"Unexpected label from pipeline: {label_name!r}")

    return {'pred_label': label_ids[label_name], 'score_label': top_prediction['score']}

In [18]:
# Run the predictions over the test set and show the first predicted record
test_dataset_predicted = test_dataset.map(get_predictions)
test_dataset_predicted[0]

Map:   0%|          | 0/2076 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


{'id_EXIST': '500001',
 'lang': 'es',
 'tweet': '@Eurogamer_es Todo gamergate desde el desarrollo hasta los foros de juegos, clásico del mundo de los videojuegos.',
 'number_annotators': 6,
 'annotators': ['Annotator_810',
  'Annotator_811',
  'Annotator_812',
  'Annotator_813',
  'Annotator_814',
  'Annotator_815'],
 'gender_annotators': ['F', 'F', 'F', 'M', 'M', 'M'],
 'age_annotators': ['18-22', '23-45', '46+', '46+', '23-45', '18-22'],
 'ethnicities_annotators': ['Hispano or Latino',
  'White or Caucasian',
  'White or Caucasian',
  'Hispano or Latino',
  'White or Caucasian',
  'White or Caucasian'],
 'study_levels_annotators': ['High school degree or equivalent',
  'Master’s degree',
  'Master’s degree',
  'Bachelor’s degree',
  'Bachelor’s degree',
  'Bachelor’s degree'],
 'countries_annotators': ['Mexico',
  'Spain',
  'Italy',
  'United States',
  'Portugal',
  'Italy'],
 'split': 'TEST_ES',
 '__index_level_0__': 500001,
 'pred_label': 0,
 'score_label': 0.9829697608947754}

In [19]:
# Switch the dataset's output format to pandas
test_dataset_predicted.set_format('pandas')
# Slice out every row and display the result
df_test = test_dataset_predicted[:]
df_test

Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,split,__index_level_0__,pred_label,score_label
0,500001,es,@Eurogamer_es Todo gamergate desde el desarrol...,6,"[Annotator_810, Annotator_811, Annotator_812, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, White or Caucasian, White ...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Italy, United States, Portugal...",TEST_ES,500001,0,0.982970
1,500002,es,"@ArCaNgEl__23 @Benzenazi Hombre, no es compara...",6,"[Annotator_780, Annotator_816, Annotator_817, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, Black o...","[High school degree or equivalent, Bachelor’s ...","[Chile, Mexico, United States, Mexico, Mexico,...",TEST_ES,500002,0,0.978337
2,500003,es,yo buscando las empresas metidas en el gamerga...,6,"[Annotator_821, Annotator_822, Annotator_823, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, Asian, ...","[Bachelor’s degree, Bachelor’s degree, Master’...","[Mexico, Mexico, VietNam, United States, Mexic...",TEST_ES,500003,0,0.980297
3,500004,es,"@jordirico Primero fue internet, luego el game...",6,"[Annotator_827, Annotator_828, Annotator_829, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, White or Caucasian, Hispan...","[High school degree or equivalent, Bachelor’s ...","[Chile, Spain, Mexico, United Kingdom, Chile, ...",TEST_ES,500004,0,0.938972
4,500005,es,@AlonsoQuijano12 Yo estuve metido en el gamerg...,6,"[Annotator_827, Annotator_828, Annotator_829, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, White or Caucasian, Hispan...","[High school degree or equivalent, Bachelor’s ...","[Chile, Spain, Mexico, United Kingdom, Chile, ...",TEST_ES,500005,0,0.968572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2071,600974,en,@AllyMae99 This straight up sounds like “you l...,6,"[Annotator_942, Annotator_943, Annotator_351, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Master’s de...","[South Africa, Spain, Portugal, United States,...",TEST_EN,600974,1,0.932723
2072,600975,en,Nathaniel is trying to help me with a new fake...,6,"[Annotator_997, Annotator_998, Annotator_999, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Bachelor’s ...","[South Africa, United Kingdom, Australia, Fran...",TEST_EN,600975,1,0.954971
2073,600976,en,walkin back from the gym &amp; an older lady s...,6,"[Annotator_997, Annotator_998, Annotator_999, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Bachelor’s ...","[South Africa, United Kingdom, Australia, Fran...",TEST_EN,600976,1,0.945241
2074,600977,en,You look like a whore of Babylon bc that’s the...,6,"[Annotator_1009, Annotator_1010, Annotator_101...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[White or Caucasian, White or Caucasian, Multi...","[High school degree or equivalent, Bachelor’s ...","[Poland, Portugal, United Kingdom, Greece, Gre...",TEST_EN,600977,1,0.957481


In [20]:
# Second name bound to the same object as df_test (plain assignment, no copy is made)
df_test_bin = df_test
df_test_bin

Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,split,__index_level_0__,pred_label,score_label
0,500001,es,@Eurogamer_es Todo gamergate desde el desarrol...,6,"[Annotator_810, Annotator_811, Annotator_812, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, White or Caucasian, White ...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Italy, United States, Portugal...",TEST_ES,500001,0,0.982970
1,500002,es,"@ArCaNgEl__23 @Benzenazi Hombre, no es compara...",6,"[Annotator_780, Annotator_816, Annotator_817, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, Black o...","[High school degree or equivalent, Bachelor’s ...","[Chile, Mexico, United States, Mexico, Mexico,...",TEST_ES,500002,0,0.978337
2,500003,es,yo buscando las empresas metidas en el gamerga...,6,"[Annotator_821, Annotator_822, Annotator_823, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, Asian, ...","[Bachelor’s degree, Bachelor’s degree, Master’...","[Mexico, Mexico, VietNam, United States, Mexic...",TEST_ES,500003,0,0.980297
3,500004,es,"@jordirico Primero fue internet, luego el game...",6,"[Annotator_827, Annotator_828, Annotator_829, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, White or Caucasian, Hispan...","[High school degree or equivalent, Bachelor’s ...","[Chile, Spain, Mexico, United Kingdom, Chile, ...",TEST_ES,500004,0,0.938972
4,500005,es,@AlonsoQuijano12 Yo estuve metido en el gamerg...,6,"[Annotator_827, Annotator_828, Annotator_829, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, White or Caucasian, Hispan...","[High school degree or equivalent, Bachelor’s ...","[Chile, Spain, Mexico, United Kingdom, Chile, ...",TEST_ES,500005,0,0.968572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2071,600974,en,@AllyMae99 This straight up sounds like “you l...,6,"[Annotator_942, Annotator_943, Annotator_351, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Master’s de...","[South Africa, Spain, Portugal, United States,...",TEST_EN,600974,1,0.932723
2072,600975,en,Nathaniel is trying to help me with a new fake...,6,"[Annotator_997, Annotator_998, Annotator_999, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Bachelor’s ...","[South Africa, United Kingdom, Australia, Fran...",TEST_EN,600975,1,0.954971
2073,600976,en,walkin back from the gym &amp; an older lady s...,6,"[Annotator_997, Annotator_998, Annotator_999, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Bachelor’s ...","[South Africa, United Kingdom, Australia, Fran...",TEST_EN,600976,1,0.945241
2074,600977,en,You look like a whore of Babylon bc that’s the...,6,"[Annotator_1009, Annotator_1010, Annotator_101...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[White or Caucasian, White or Caucasian, Multi...","[High school degree or equivalent, Bachelor’s ...","[Poland, Portugal, United Kingdom, Greece, Gre...",TEST_EN,600977,1,0.957481


## Generar Hard Labels

In [None]:
# import json

# # Crear una lista para almacenar los objetos JSON de cada fila
# json_objects = []

# # Iterar sobre cada fila del DataFrame
# for index, row in df_test_bin.iterrows():
#     # Obtener el id del tweet/meme
#     tweet_id = str(row['id_EXIST'])

#     # Obtener el valor de pred_label y convertirlo a "YES" o "NO" según la correspondencia dada
#     pred_label = row['pred_label']
#     if pred_label == 0:
#         value = "NO"
#     elif pred_label == 1:
#         value = "YES"
#     else:
#         # Manejo de valores imprevistos
#         value = "UNKNOWN"

#     # Construir el objeto JSON para esta fila
#     json_obj = {
#         "id": tweet_id,
#         "value": value,
#         "test_case": "EXIST2024"
#     }

#     # Agregar el objeto JSON a la lista
#     json_objects.append(json_obj)

# # Escribir la lista de objetos JSON en un archivo JSON
# with open('/content/drive/MyDrive/TEST/predicciones/v1.35/EN_DEBERTA_V3_BASE_v_1_35_hard.json', 'w') as f:
#     json.dump(json_objects, f, indent=2)


## Generar Soft labels

In [22]:
import json

# Class names indexed by the integer class id stored in `pred_label`.
_CLASS_NAMES = ("NO", "YES")


def _soft_label_record(tweet_id, pred_label, score_label):
    """Build the soft-label JSON record for one prediction.

    The predicted (majority) class receives ``score_label`` and the other
    class receives ``1 - score_label``.

    Args:
        tweet_id: Identifier of the tweet/meme; stored as a string.
        pred_label: Predicted class id, 0 ("NO") or 1 ("YES").
        score_label: Score of the predicted class.

    Returns:
        dict: Record with the keys "id", "value" and "test_case".

    Raises:
        ValueError: If ``pred_label`` is not 0 or 1. Using one placeholder
            name for both classes would collapse the two probabilities into
            a single dictionary entry and write a malformed record.
    """
    if pred_label not in (0, 1):
        raise ValueError(f"Unexpected pred_label {pred_label!r} for id {tweet_id}")

    class_id = int(pred_label)
    majority_label = _CLASS_NAMES[class_id]
    minority_label = _CLASS_NAMES[1 - class_id]
    probability_majority = float(score_label)

    return {
        "id": str(tweet_id),
        "value": {
            majority_label: probability_majority,
            minority_label: 1 - probability_majority,
        },
        "test_case": "EXIST2024",
    }


# One JSON record per row of the predictions DataFrame.
json_objects = [
    _soft_label_record(row['id_EXIST'], row['pred_label'], row['score_label'])
    for _, row in df_test_bin.iterrows()
]

# Write the list of records to a JSON file
with open('/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v1.0_correct/xlm_roberta_base_v_1_0_soft.json', 'w') as f:
    json.dump(json_objects, f, indent=2)


### Concatenador de JSON por idioma

In [31]:
# Paths of the two JSON prediction files on Google Drive.
file_path1 = '/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v1.35_correct/ES_ROBERTA_BASE_BNE_v_1_35_soft.json'
file_path2 = '/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v1.35_correct/EN_DEBERTA_V3_BASE_v_1_35_soft.json'

# Parse both files, in order, into Python objects.
loaded_contents = []
for source_path in (file_path1, file_path2):
    with open(source_path, 'r') as file:
        loaded_contents.append(json.load(file))
content1, content2 = loaded_contents

# Concatenate the first file's content with the second's.
combined_content = content1 + content2

# Destination of the combined JSON file.
output_file_path = '/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v1.35_correct/pred_v1_35_soft.json'

# Save the combined content to the new JSON file.
with open(output_file_path, 'w') as file:
    json.dump(combined_content, file)

print("Archivos concatenados y guardados correctamente.")


Archivos concatenados y guardados correctamente.


In [32]:
import json

def guardar_json(datos, ruta):
    """
    Write ``datos`` to ``ruta`` as JSON indented by four spaces.

    Args:
    - datos (list): List of dictionaries holding the data to save.
    - ruta (str): Path of the JSON file to write.
    """
    with open(ruta, 'w') as archivo:
        archivo.write(json.dumps(datos, indent=4, separators=(',', ': ')))

# Paths of the two JSON prediction files on Google Drive.
file_path1 = '/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v1.35_correct/ES_ROBERTA_BASE_BNE_v_1_35_soft.json'
file_path2 = '/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v1.35_correct/EN_DEBERTA_V3_BASE_v_1_35_soft.json'

# Destination of the combined JSON file.
output_file_path = '/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v1.35_correct/pred_v1_35_soft.json'

# Read each file and parse its text into a Python object.
with open(file_path1, 'r') as file:
    content1 = json.loads(file.read())

with open(file_path2, 'r') as file:
    content2 = json.loads(file.read())

# Concatenate the two parsed contents and save them with guardar_json.
combined_content = content1 + content2
guardar_json(datos=combined_content, ruta=output_file_path)

print("Archivos concatenados y guardados correctamente.")


Archivos concatenados y guardados correctamente.


## Comprobar el formato JSON de la competición

In [None]:
import json
from google.colab import files
from jsonschema import validate
import io

# Property names allowed in every prediction record.
ID = "id"
TEST_CASE = "test_case"
VALUE = "value"

# Task identifiers.
TASK1 = "task1"
TASK2 = "task2"
TASK3 = "task3"
TASK4 = "task4"
TASK5 = "task5"
TASK6 = "task6"

# Label vocabularies per task.
LIST_LABELS_TASK1 = ["NO", "YES"]
LIST_LABELS_TASK2 = ["NO", "REPORTED", "JUDGEMENTAL", "DIRECT"]
LIST_LABELS_TASK3 = ["NO", "IDEOLOGICAL-INEQUALITY", "STEREOTYPING-DOMINANCE", "MISOGYNY-NON-SEXUAL-VIOLENCE", "SEXUAL-VIOLENCE", "OBJECTIFICATION"]
LIST_LABELS_TASK5 = ["NO", "JUDGEMENTAL", "DIRECT"]

# JSON Schema for a predictions file: an array of objects with exactly the
# properties "test_case", "id" and "value". "value" may be a string, a
# non-empty array of strings, an integer, or an object whose values are numbers.
FORMAT_JSON_SCHEMA = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "test_case": {"type": "string"},
            "id": {"type": "string"},
            "value": {
                "anyOf": [
                    {"type": "string"},
                    {"type": "array", "items": {"type": "string"}, "minItems": 1},
                    {"type": "integer"},
                    {
                        "type": "object",
                        "patternProperties": {
                            "^.*$": {"type": "number"},
                        },
                    },
                ]
            },
        },
        "required": ["test_case", "id", "value"],
        "additionalProperties": False,
    },
}


def parser_json(content):
    """Check that ``content`` is valid JSON that conforms to FORMAT_JSON_SCHEMA.

    Args:
        content (str): Raw JSON text of a predictions file.

    Returns:
        bool: True when the text parses and validates against the schema;
        False otherwise (the problem is printed).
    """
    try:
        data = json.loads(content)
    except ValueError as e:
        print(e)
        return False

    # Only `validate` is imported from jsonschema at cell level, so the
    # exception class is imported here; referring to it through the bare
    # `jsonschema` module name would raise NameError on a validation failure.
    from jsonschema.exceptions import ValidationError

    try:
        validate(instance=data, schema=FORMAT_JSON_SCHEMA)
    except ValidationError as e:
        print("Errors found in the JSON content.\n", e)
        return False

    return True


def process_format_runs_by_task(file_content):
    """Print an error for every record whose properties or labels are invalid.

    Checks performed on each record of the parsed JSON array:
      * only the properties "id", "test_case" and "value" are allowed;
      * a string value must be one of LIST_LABELS_TASK1;
      * a list value must contain only labels from LIST_LABELS_TASK3;
      * a dict value must have exactly two keys, both in LIST_LABELS_TASK1,
        and its values must not sum to more than 1.001;
      * any other value type is reported as a format error.

    Args:
        file_content (str): Raw JSON text of a predictions file.

    Returns:
        None. Findings are printed, followed by "Completed processing.". If
        the text is not valid JSON, the parse error is printed and the
        function returns early.
    """
    try:
        data = json.loads(file_content)
    except ValueError as e:
        print(e)
        return

    for instance in data:
        for prop in instance:
            if prop == ID or prop == TEST_CASE:
                continue
            if prop != VALUE:
                print("ERROR in json format, property not allowed: ", prop)
                continue

            value = instance[VALUE]
            if isinstance(value, str):
                if value not in LIST_LABELS_TASK1:
                    print("ERROR in label format: ", value)
            elif isinstance(value, list):
                # Check each label individually; testing the whole list for
                # membership in the label list can never succeed.
                if any(label not in LIST_LABELS_TASK3 for label in value):
                    print("ERROR in label format: ", value)
            elif isinstance(value, dict):
                if len(value) != 2:
                    print("ERROR in label format: ", value)
                total = 0
                for label in value:
                    total += float(value[label])
                    if label not in LIST_LABELS_TASK1:
                        print("ERROR in label format: ", value)
                # NOTE(review): only an upper bound is enforced; confirm whether
                # sums well below 1 should also be reported.
                if total > 1.001:
                    print("ERROR in label format: ", value)
            else:
                print("Error format value property.")

    print("Completed processing.")

In [None]:
# Path of the predictions file on Google Drive
# NOTE(review): this path is under 'MyDrive/TEST/...', whereas the other cells
# read and write under 'MyDrive/MODELOS TASK 1/TEST/...' — confirm it is the intended file.
file_path = '/content/drive/MyDrive/TEST/predicciones/v1.35/pred_v1_35_hard.json'

# Read the file content
with open(file_path, 'r') as file:
    content = file.read()

# Parse the JSON content and check its properties and labels
process_format_runs_by_task(content)

Completed processing.
