# 0. Montar GoogleDrive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1. CONFIGURACIÓN DEL ENTORNO

# Configuración (importar dependencias, librerías, ...)

In [None]:
# Para garantizar la reproducibilidad de nuestros experimentos
# Set the seed value all over the place to make this reproducible.

# Instalación de paquetes necesarios
!pip install sentencepiece
!pip install pytorch-lightning
!pip install --upgrade accelerate
!pip install emoji
!pip install framework-reproducibility
!pip install transformers datasets
!pip install contractions
!pip install textblob
!pip install PyEvALL

Collecting pytorch-lightning
  Using cached pytorch_lightning-2.2.4-py3-none-any.whl (802 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Using cached torchmetrics-1.4.0-py3-none-any.whl (868 kB)
Collecting lightning-utilities>=0.8.0 (from pytorch-lightning)
  Using cached lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->pytorch-lightning)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->pytorch-lightning)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->pytorch-lightning)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13.0->pytorch-lightning)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_

In [None]:
# Set the seed value all over the place to make this reproducible.
# Esto hay que ponerlo justo antes de importar para que los experimentos
# sean reproducibles.

import random
import torch
import numpy as np
import os
from pytorch_lightning import seed_everything
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Single source of truth for every random-number generator seeded below.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Request deterministic cuDNN kernels and turn off cuDNN benchmarking.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# See: https://github.com/NVIDIA/tensorflow-determinism#confirmed-current-gpu-specific-sources-of-non-determinism-with-solutions
os.environ["TF_DETERMINISTIC_OPS"] = "1"
# Reuse seed_val instead of repeating the literal so the seeds cannot drift apart.
seed_everything(seed_val, workers=True)

from datasets import Dataset, DatasetDict, load_metric
import pandas as pd
import sklearn as sk
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, f1_score
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
 TrainingArguments, Trainer, pipeline, EarlyStoppingCallback

INFO:lightning_fabric.utilities.seed:Seed set to 42


In [None]:
# Select the compute device: CUDA when PyTorch can see at least one GPU, CPU otherwise.
# `device` is bound on both branches so later cells can use it on either runtime type.
if torch.cuda.device_count() > 0:
    # If a GPU is available, print its name
    print(f'GPU detected. Currently using: "{torch.cuda.get_device_name(0)}"')
    device = torch.device("cuda")
else:
    # If no GPU is available, inform the user to change the runtime type
    print('Currently using CPU. To utilize GPU acceleration, change the runtime type in the \'runtime\' tab.')
    device = torch.device("cpu")

GPU detected. Currently using: "Tesla T4"


# Preparación de los datos

## Lectura de los ficheros

In [None]:
# Column names are kept in variables so the notebook is easier to port to other datasets.
nombre_etiqueta1 = 'value_task2'
nombre_etiqueta2 = 'value'  # label column read by set_labels
campo_texto = 'tweet'  # text column passed to the classification pipeline in get_predictions

# Formateo y etiquetado de los Datasets

In [None]:
# # Ruta del archivo CSV en Google Drive
# file_path = '/content/drive/MyDrive/TEST/df_en_TEST.csv'

# # Leer el archivo CSV y cargarlo en un DataFrame
# test_df = pd.read_csv(file_path)

# test_dataset = Dataset.from_pandas(test_df)

# print(test_dataset)

In [None]:
# Load the test split from JSON and wrap it in a Dataset object.

# Path of the test JSON on Google Drive
ruta_json = '/content/drive/MyDrive/MODELOS TASK 1/TEST/EXIST2023_test_clean.json'

# Read the JSON into a DataFrame
df = pd.read_json(ruta_json)

# Transpose the frame read above; the printed Dataset lists the tweet fields
# (id_EXIST, lang, tweet, ...) as features after this step.
test_df=df.transpose()

# from_pandas keeps the DataFrame index as the extra '__index_level_0__' feature
# (visible in the printed output).
test_dataset = Dataset.from_pandas(test_df)

print(test_dataset)

Dataset({
    features: ['id_EXIST', 'lang', 'tweet', 'number_annotators', 'annotators', 'gender_annotators', 'age_annotators', 'ethnicities_annotators', 'study_levels_annotators', 'countries_annotators', 'split', '__index_level_0__'],
    num_rows: 2076
})


In [None]:
# Load the hard predictions stored under "MODELOS TASK 1" (run v1.35).
# The recorded run of this cell failed with FileNotFoundError on
# '.../predicciones/v1.35/pred_v1_35_hard.json'; the path below is the one the merge cell
# later in this notebook reads for the same predictions.
# NOTE(review): confirm this is the intended file.
ruta_json1 = '/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v1.35_correct/PRED_V1_35_hard.json'

# Read the JSON into a DataFrame
df_yes = pd.read_json(ruta_json1)

print(df_yes)

FileNotFoundError: File /content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v1.35/pred_v1_35_hard.json does not exist

In [None]:
df_yes

In [None]:
# Keep only the rows whose predicted value is 'YES'. `.copy()` makes the result an
# independent frame, so column assignments on it do not trigger pandas'
# SettingWithCopyWarning.
df_auxiliar = df_yes[df_yes['value'] == 'YES'].copy()
df_auxiliar

In [None]:
test_df

In [None]:
# Cast the prediction ids to str so they can be matched against 'id_EXIST'
# (assumes 'id_EXIST' holds string ids — TODO confirm). `assign` builds a new frame
# instead of writing into what may be a filtered slice of df_yes.
df_auxiliar = df_auxiliar.assign(id=df_auxiliar['id'].astype(str))

# Left join: keep every row of df_auxiliar and attach the matching test_df columns,
# then drop the join key that duplicates 'id'.
df_unido = pd.merge(
    df_auxiliar, test_df, left_on='id', right_on='id_EXIST', how='left'
).drop(columns='id_EXIST')

In [None]:
df_unido

In [None]:
test_dataset=df_unido

In [None]:
# Converts the category stored in a record (under the column named by
# `nombre_etiqueta2`) into an integer class id, returned under the key "labels".

def set_labels(records):
    """Return ``{'labels': <int>}`` for one record.

    Mapping: 'DIRECT' -> 0, 'REPORTED' -> 1, 'JUDGEMENTAL' -> 2.
    Any other category raises KeyError.
    """
    clases = {'DIRECT': 0, 'REPORTED': 1, 'JUDGEMENTAL': 2}
    categoria = records[nombre_etiqueta2]
    return dict(labels=clases[categoria])

In [None]:
# # Reseteamos el formato para que no haya fallos
# test_dataset.reset_format()

# 2. CLASIFICACIÓN MULTICLASE DE LA TASK_2 (DIRECT / REPORTED / JUDGEMENTAL)

## Crear tokenizador para tokenizar test y cargar modelo

In [None]:
model_checkpoint_1 = 'FacebookAI/xlm-roberta-base'

#model_checkpoint_1 = 'microsoft/deberta-v3-base'

#model_checkpoint_1 = 'google-bert/bert-base-multilingual-uncased'

#model_checkpoint_1 = 'FacebookAI/roberta-base'

#model_checkpoint_1 = 'PlanTL-GOB-ES/roberta-base-bne'

#model_checkpoint_1 = 'dccuchile/bert-base-spanish-wwm-uncased'

In [None]:
# Never hard-code access tokens in a notebook: read the Hugging Face token from the
# HF_TOKEN environment variable instead. When it is unset, None is passed, which is
# sufficient for public checkpoints.
# NOTE(review): the token that was previously embedded in this cell should be revoked.
import os

tokenizer_1 = AutoTokenizer.from_pretrained(model_checkpoint_1, token=os.environ.get('HF_TOKEN'))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Number of target classes used when loading the pretrained model.
n_labels = 3

# A factory function makes it easy to re-create the model the same way in different
# runs or training configurations.

def model_init():
    """Load a sequence-classification model from ``model_checkpoint_1``.

    Reads the module-level ``model_checkpoint_1`` and ``n_labels`` at call time
    and passes ``n_labels`` as ``num_labels``.
    """
    checkpoint = model_checkpoint_1
    opciones = {'num_labels': n_labels}
    return AutoModelForSequenceClassification.from_pretrained(checkpoint, **opciones)

In [None]:
# Para saber el nombre del modelo
model_name_1 = model_checkpoint_1.split("/")[-1]
model_name_1

'xlm-roberta-base'

# Generación de Resultados para la competición

In [None]:
test_dataset_aux = test_dataset

In [None]:
# Convert the merged predictions into a Dataset. Building it from df_unido (rather than
# from test_dataset itself) keeps this cell idempotent: re-running it no longer feeds an
# already-converted Dataset back into from_pandas.
test_dataset = Dataset.from_pandas(df_unido)
test_dataset

Dataset({
    features: ['id', 'value', 'test_case', 'lang', 'tweet', 'number_annotators', 'annotators', 'gender_annotators', 'age_annotators', 'ethnicities_annotators', 'study_levels_annotators', 'countries_annotators', 'split'],
    num_rows: 999
})

In [None]:
test_dataset[5]

{'id': '500028',
 'value': 'YES',
 'test_case': 'EXIST2024',
 'lang': 'es',
 'tweet': 'Machistas, misóginos, mgtow y demás caterva que pulula por el mundo, sois igual que las feminazis, seres amargados cargados de malas experiencias que solo sabéis llorar y apretar los puñitos, es que no lo veis?',
 'number_annotators': 6,
 'annotators': ['Annotator_902',
  'Annotator_903',
  'Annotator_904',
  'Annotator_905',
  'Annotator_906',
  'Annotator_907'],
 'gender_annotators': ['F', 'F', 'F', 'M', 'M', 'M'],
 'age_annotators': ['18-22', '23-45', '46+', '46+', '23-45', '18-22'],
 'ethnicities_annotators': ['Hispano or Latino',
  'Multiracial',
  'Hispano or Latino',
  'White or Caucasian',
  'other',
  'White or Caucasian'],
 'study_levels_annotators': ['High school degree or equivalent',
  'Bachelor’s degree',
  'Bachelor’s degree',
  'Bachelor’s degree',
  'Master’s degree',
  'Bachelor’s degree'],
 'countries_annotators': ['Mexico',
  'Ecuador',
  'Dominican Republic',
  'Spain',
  'Mexico

### Predicciones

In [None]:
# Se carga el modelo que se ha entrenado
model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/MODELOS TASK 2/ESTRATEGIA 2 - ENSAMBLADOR ANNOTATORS/TIPO6/modelo')

In [None]:
# Text-classification pipeline around the loaded model and tokenizer. Use GPU 0 when
# CUDA is available and fall back to CPU (device=-1) otherwise, instead of assuming
# that GPU 0 always exists.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer_1,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
campo_texto

'tweet'

In [None]:
def get_predictions(records):
    """Classify one record's text and return its class id and score.

    Calls the module-level ``pipe`` on ``records[campo_texto]`` with truncation
    enabled and reads the first result. 'LABEL_0' maps to 0 (direct),
    'LABEL_1' to 1 (reported); any other label maps to 2 (judgemental).
    """
    primero = pipe(records[campo_texto], truncation=True)[0]
    etiqueta = primero['label']
    confianza = primero['score']

    # Anything that is not LABEL_0 / LABEL_1 falls through to class 2.
    clase = {'LABEL_0': 0, 'LABEL_1': 1}.get(etiqueta, 2)

    return {'pred_label': clase, 'score_label': confianza}

In [None]:
# Run the predictions over the test set: map() applies get_predictions to each record
# and adds the returned 'pred_label' / 'score_label' fields. The first record is shown as a check.
test_dataset_predicted = test_dataset.map(get_predictions)
test_dataset_predicted[0]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


{'id': '500004',
 'value': 'YES',
 'test_case': 'EXIST2024',
 'lang': 'es',
 'tweet': '@jordirico Primero fue internet, luego el gamergate, la manosfera y su misoginia extrema sin que las plataformas movieran un dedo, los incel-asesinatos, la extrema derecha capitalizando el odio, la mimetización de estas ideas con las de un debate respetable y finalmente aquí estamos',
 'number_annotators': 6,
 'annotators': ['Annotator_827',
  'Annotator_828',
  'Annotator_829',
  'Annotator_830',
  'Annotator_831',
  'Annotator_832'],
 'gender_annotators': ['F', 'F', 'F', 'M', 'M', 'M'],
 'age_annotators': ['18-22', '23-45', '46+', '46+', '23-45', '18-22'],
 'ethnicities_annotators': ['Hispano or Latino',
  'White or Caucasian',
  'Hispano or Latino',
  'White or Caucasian',
  'Hispano or Latino',
  'Hispano or Latino'],
 'study_levels_annotators': ['High school degree or equivalent',
  'Bachelor’s degree',
  'Bachelor’s degree',
  'Bachelor’s degree',
  'Bachelor’s degree',
  'Bachelor’s degree'],


In [None]:
# Switch the Dataset's output format to pandas (this changes test_dataset_predicted
# in place), then take a full slice to obtain the predictions as a DataFrame.
test_dataset_predicted.set_format('pandas')
df_test = test_dataset_predicted[:]
df_test

Unnamed: 0,id,value,test_case,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,split,pred_label,score_label
0,500004,YES,EXIST2024,es,"@jordirico Primero fue internet, luego el game...",6,"[Annotator_827, Annotator_828, Annotator_829, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, White or Caucasian, Hispan...","[High school degree or equivalent, Bachelor’s ...","[Chile, Spain, Mexico, United Kingdom, Chile, ...",TEST_ES,2,0.444543
1,500012,YES,EXIST2024,es,@dimplerrylover lo se pero literalmente la chi...,6,"[Annotator_833, Annotator_462, Annotator_834, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[White or Caucasian, White or Caucasian, White...","[High school degree or equivalent, Bachelor’s ...","[Portugal, Poland, Australia, Germany, Mexico,...",TEST_ES,0,0.827712
2,500020,YES,EXIST2024,es,"@EstefaniaVeloz ...Con el proceso legal, no pa...",6,"[Annotator_867, Annotator_868, Annotator_869, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, Hispano...","[High school degree or equivalent, Bachelor’s ...","[Mexico, Mexico, Mexico, United States, Spain,...",TEST_ES,2,0.648023
3,500022,YES,EXIST2024,es,@geeksterilia Desde la Olivia que anda trepado...,6,"[Annotator_879, Annotator_880, Annotator_881, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, White o...","[High school degree or equivalent, Bachelor’s ...","[Mexico, Mexico, United Kingdom, Mexico, Mexic...",TEST_ES,0,0.847365
4,500024,YES,EXIST2024,es,@Aracely54051891 Iniciamos otro movimiento #Me...,6,"[Annotator_780, Annotator_816, Annotator_817, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, Black o...","[High school degree or equivalent, Bachelor’s ...","[Chile, Mexico, United States, Mexico, Mexico,...",TEST_ES,2,0.467899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,600974,YES,EXIST2024,en,@AllyMae99 This straight up sounds like “you l...,6,"[Annotator_942, Annotator_943, Annotator_351, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Master’s de...","[South Africa, Spain, Portugal, United States,...",TEST_EN,1,0.511119
995,600975,YES,EXIST2024,en,Nathaniel is trying to help me with a new fake...,6,"[Annotator_997, Annotator_998, Annotator_999, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Bachelor’s ...","[South Africa, United Kingdom, Australia, Fran...",TEST_EN,1,0.456296
996,600976,YES,EXIST2024,en,walkin back from the gym &amp; an older lady s...,6,"[Annotator_997, Annotator_998, Annotator_999, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Bachelor’s ...","[South Africa, United Kingdom, Australia, Fran...",TEST_EN,1,0.452427
997,600977,YES,EXIST2024,en,You look like a whore of Babylon bc that’s the...,6,"[Annotator_1009, Annotator_1010, Annotator_101...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[White or Caucasian, White or Caucasian, Multi...","[High school degree or equivalent, Bachelor’s ...","[Poland, Portugal, United Kingdom, Greece, Gre...",TEST_EN,0,0.738760


In [None]:
df_test.to_csv('/content/drive/MyDrive/MODELOS TASK 2/ESTRATEGIA 2 - ENSAMBLADOR ANNOTATORS/TIPO6/predicciones_6.csv', index=False)  # index=False para evitar escribir el índice del DataFrame en el archivo

In [None]:
df_test_bin = df_test
df_test_bin

Unnamed: 0,id,value,test_case,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,split,pred_label,score_label
0,500007,YES,EXIST2024,es,@grupoeldeber @grupoeldeber sería bueno que ta...,6,"[Annotator_780, Annotator_816, Annotator_817, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, Black o...","[High school degree or equivalent, Bachelor’s ...","[Chile, Mexico, United States, Mexico, Mexico,...",TEST_ES,1,0.990401
1,500019,YES,EXIST2024,es,@Fistroman1 @ElioGatsby Sin pruebas??Qué fue e...,6,"[Annotator_861, Annotator_862, Annotator_863, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, White o...","[High school degree or equivalent, Bachelor’s ...","[Mexico, Mexico, Italy, Spain, Mexico, Portugal]",TEST_ES,0,0.979219
2,500020,YES,EXIST2024,es,"@EstefaniaVeloz ...Con el proceso legal, no pa...",6,"[Annotator_867, Annotator_868, Annotator_869, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, Hispano...","[High school degree or equivalent, Bachelor’s ...","[Mexico, Mexico, Mexico, United States, Spain,...",TEST_ES,2,0.970411
3,500021,YES,EXIST2024,es,@robertoantoniow @JessicaLBedoya Despedidos lo...,6,"[Annotator_873, Annotator_874, Annotator_875, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, White o...","[High school degree or equivalent, Bachelor’s ...","[Mexico, Mexico, Spain, United States, Mexico,...",TEST_ES,1,0.959543
4,500022,YES,EXIST2024,es,@geeksterilia Desde la Olivia que anda trepado...,6,"[Annotator_879, Annotator_880, Annotator_881, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Hispano or Latino, Hispano or Latino, White o...","[High school degree or equivalent, Bachelor’s ...","[Mexico, Mexico, United Kingdom, Mexico, Mexic...",TEST_ES,0,0.807902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,600974,YES,EXIST2024,en,@AllyMae99 This straight up sounds like “you l...,6,"[Annotator_942, Annotator_943, Annotator_351, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Master’s de...","[South Africa, Spain, Portugal, United States,...",TEST_EN,0,0.833788
966,600975,YES,EXIST2024,en,Nathaniel is trying to help me with a new fake...,6,"[Annotator_997, Annotator_998, Annotator_999, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Bachelor’s ...","[South Africa, United Kingdom, Australia, Fran...",TEST_EN,1,0.985770
967,600976,YES,EXIST2024,en,walkin back from the gym &amp; an older lady s...,6,"[Annotator_997, Annotator_998, Annotator_999, ...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[Black or African American, White or Caucasian...","[High school degree or equivalent, Bachelor’s ...","[South Africa, United Kingdom, Australia, Fran...",TEST_EN,1,0.977128
968,600977,YES,EXIST2024,en,You look like a whore of Babylon bc that’s the...,6,"[Annotator_1009, Annotator_1010, Annotator_101...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[White or Caucasian, White or Caucasian, Multi...","[High school degree or equivalent, Bachelor’s ...","[Poland, Portugal, United Kingdom, Greece, Gre...",TEST_EN,0,0.991512


In [None]:
# Prediction CSVs of the six TIPO<n> runs: <base>/TIPO<n>/predicciones_<n>.csv
_BASE_ENSAMBLADOR = "/content/drive/MyDrive/MODELOS TASK 2/ESTRATEGIA 2 - ENSAMBLADOR ANNOTATORS"

(ruta_archivo_1, ruta_archivo_2, ruta_archivo_3,
 ruta_archivo_4, ruta_archivo_5, ruta_archivo_6) = [
    f"{_BASE_ENSAMBLADOR}/TIPO{n}/predicciones_{n}.csv" for n in range(1, 7)
]

# Load each CSV into its own DataFrame, in the same order as the paths above.
df_1, df_2, df_3, df_4, df_5, df_6 = [
    pd.read_csv(ruta)
    for ruta in (ruta_archivo_1, ruta_archivo_2, ruta_archivo_3,
                 ruta_archivo_4, ruta_archivo_5, ruta_archivo_6)
]

In [None]:
df_1

Unnamed: 0,id,value,test_case,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,split,pred_label,score_label
0,500004,YES,EXIST2024,es,"@jordirico Primero fue internet, luego el game...",6,['Annotator_827' 'Annotator_828' 'Annotator_82...,['F' 'F' 'F' 'M' 'M' 'M'],['18-22' '23-45' '46+' '46+' '23-45' '18-22'],['Hispano or Latino' 'White or Caucasian' 'His...,['High school degree or equivalent' 'Bachelor’...,['Chile' 'Spain' 'Mexico' 'United Kingdom' 'Ch...,TEST_ES,2,0.410730
1,500012,YES,EXIST2024,es,@dimplerrylover lo se pero literalmente la chi...,6,['Annotator_833' 'Annotator_462' 'Annotator_83...,['F' 'F' 'F' 'M' 'M' 'M'],['18-22' '23-45' '46+' '46+' '23-45' '18-22'],['White or Caucasian' 'White or Caucasian' 'Wh...,['High school degree or equivalent' 'Bachelor’...,['Portugal' 'Poland' 'Australia' 'Germany' 'Me...,TEST_ES,1,0.806233
2,500020,YES,EXIST2024,es,"@EstefaniaVeloz ...Con el proceso legal, no pa...",6,['Annotator_867' 'Annotator_868' 'Annotator_86...,['F' 'F' 'F' 'M' 'M' 'M'],['18-22' '23-45' '46+' '46+' '23-45' '18-22'],['Hispano or Latino' 'Hispano or Latino' 'Hisp...,['High school degree or equivalent' 'Bachelor’...,['Mexico' 'Mexico' 'Mexico' 'United States' 'S...,TEST_ES,2,0.577306
3,500022,YES,EXIST2024,es,@geeksterilia Desde la Olivia que anda trepado...,6,['Annotator_879' 'Annotator_880' 'Annotator_88...,['F' 'F' 'F' 'M' 'M' 'M'],['18-22' '23-45' '46+' '46+' '23-45' '18-22'],['Hispano or Latino' 'Hispano or Latino' 'Whit...,['High school degree or equivalent' 'Bachelor’...,['Mexico' 'Mexico' 'United Kingdom' 'Mexico' '...,TEST_ES,1,0.535878
4,500024,YES,EXIST2024,es,@Aracely54051891 Iniciamos otro movimiento #Me...,6,['Annotator_780' 'Annotator_816' 'Annotator_81...,['F' 'F' 'F' 'M' 'M' 'M'],['18-22' '23-45' '46+' '46+' '23-45' '18-22'],['Hispano or Latino' 'Hispano or Latino' 'Blac...,['High school degree or equivalent' 'Bachelor’...,['Chile' 'Mexico' 'United States' 'Mexico' 'Me...,TEST_ES,2,0.590894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,600974,YES,EXIST2024,en,@AllyMae99 This straight up sounds like “you l...,6,['Annotator_942' 'Annotator_943' 'Annotator_35...,['F' 'F' 'F' 'M' 'M' 'M'],['18-22' '23-45' '46+' '46+' '23-45' '18-22'],['Black or African American' 'White or Caucasi...,['High school degree or equivalent' 'Master’s ...,['South Africa' 'Spain' 'Portugal' 'United Sta...,TEST_EN,0,0.462584
995,600975,YES,EXIST2024,en,Nathaniel is trying to help me with a new fake...,6,['Annotator_997' 'Annotator_998' 'Annotator_99...,['F' 'F' 'F' 'M' 'M' 'M'],['18-22' '23-45' '46+' '46+' '23-45' '18-22'],['Black or African American' 'White or Caucasi...,['High school degree or equivalent' 'Bachelor’...,['South Africa' 'United Kingdom' 'Australia' '...,TEST_EN,1,0.794991
996,600976,YES,EXIST2024,en,walkin back from the gym &amp; an older lady s...,6,['Annotator_997' 'Annotator_998' 'Annotator_99...,['F' 'F' 'F' 'M' 'M' 'M'],['18-22' '23-45' '46+' '46+' '23-45' '18-22'],['Black or African American' 'White or Caucasi...,['High school degree or equivalent' 'Bachelor’...,['South Africa' 'United Kingdom' 'Australia' '...,TEST_EN,1,0.765159
997,600977,YES,EXIST2024,en,You look like a whore of Babylon bc that’s the...,6,['Annotator_1009' 'Annotator_1010' 'Annotator_...,['F' 'F' 'F' 'M' 'M' 'M'],['18-22' '23-45' '46+' '46+' '23-45' '18-22'],['White or Caucasian' 'White or Caucasian' 'Mu...,['High school degree or equivalent' 'Bachelor’...,['Poland' 'Portugal' 'United Kingdom' 'Greece'...,TEST_EN,0,0.788913


In [None]:
# Start the combined frame from df_1: keep 'id', 'value', 'test_case' and rename its
# prediction/score columns to 'pred_label_1' / 'score_label_1'.
df_combinado = df_1[['id', 'value', 'test_case', 'pred_label', 'score_label']].rename(columns={'pred_label': 'pred_label_1', 'score_label': 'score_label_1'})

# Add the prediction/score columns of df_2 and df_3, suffixed with their index, joining on 'id'.
# merge() is called without `how`, so pandas' default inner join applies: only ids present
# in every frame survive. Only df_2 and df_3 are merged here, not df_4..df_6.
# NOTE: the loop variable rebinds the notebook-level name `df`.
for i, df in enumerate([df_2, df_3], start=2):
    df_renombrado = df.rename(columns={'pred_label': f'pred_label_{i}', 'score_label': f'score_label_{i}'})
    df_combinado = df_combinado.merge(df_renombrado[['id', f'pred_label_{i}', f'score_label_{i}']], on='id')


In [None]:
df_combinado

Unnamed: 0,id,value,test_case,pred_label_1,score_label_1,pred_label_2,score_label_2,pred_label_3,score_label_3
0,500004,YES,EXIST2024,2,0.410730,2,0.616415,2,0.782030
1,500012,YES,EXIST2024,1,0.806233,0,0.691388,0,0.977974
2,500020,YES,EXIST2024,2,0.577306,2,0.593807,2,0.849221
3,500022,YES,EXIST2024,1,0.535878,1,0.446448,0,0.887473
4,500024,YES,EXIST2024,2,0.590894,1,0.487648,2,0.444795
...,...,...,...,...,...,...,...,...,...
994,600974,YES,EXIST2024,0,0.462584,0,0.703426,1,0.746899
995,600975,YES,EXIST2024,1,0.794991,1,0.829980,1,0.958311
996,600976,YES,EXIST2024,1,0.765159,1,0.877745,1,0.965586
997,600977,YES,EXIST2024,0,0.788913,0,0.677848,0,0.979768


In [None]:
import numpy as np

def obtener_mayoritarios(row):
    """Majority vote over the per-model predictions stored in one row.

    The ``pred_label_<k>`` and ``score_label_<k>`` entries are discovered by
    prefix, so the vote covers however many models were merged into the row
    (previously exactly three were hard-coded). This matches how
    ``calcular_softlabel`` discovers its prediction columns.

    Args:
        row: a pandas Series (one row of the combined predictions frame).

    Returns:
        A Series indexed ``['value', 'score_value']``:
        ``value`` is the most frequent predicted label; ``score_value`` is the
        mean of ALL ``score_label_<k>`` entries, including those of models that
        voted for a different label.

    Raises:
        KeyError: if the row has no ``pred_label_*`` or no ``score_label_*`` entries.
    """
    pred_cols = [c for c in row.index if isinstance(c, str) and c.startswith('pred_label_')]
    score_cols = [c for c in row.index if isinstance(c, str) and c.startswith('score_label_')]
    if not pred_cols or not score_cols:
        raise KeyError("row has no 'pred_label_*' / 'score_label_*' entries")

    pred_labels = row[pred_cols]
    score_labels = row[score_cols]
    # mode() returns tied values in sorted order, so on a tie the lowest label wins.
    pred_label_mayoritario = pred_labels.mode()[0]
    score_label_mayoritario = np.mean(score_labels)
    return pd.Series([pred_label_mayoritario, score_label_mayoritario], index=['value', 'score_value'])

def generar_df_pred_mayoritarias(df_combinado):
    """Apply ``obtener_mayoritarios`` to every row and attach the identifiers.

    Returns a new DataFrame with the columns produced by
    ``obtener_mayoritarios`` followed by 'id' and 'test_case' copied from
    ``df_combinado``.
    """
    votos = df_combinado.apply(obtener_mayoritarios, axis=1)
    for columna in ('id', 'test_case'):
        votos[columna] = df_combinado[columna]
    return votos

# Uso de la función para generar el nuevo DataFrame
nuevo_df_pred_mayoritarias = generar_df_pred_mayoritarias(df_combinado)


In [None]:
nuevo_df_pred_mayoritarias

Unnamed: 0,value,score_value,id,test_case
0,2.0,0.603058,500004,EXIST2024
1,0.0,0.825198,500012,EXIST2024
2,2.0,0.673445,500020,EXIST2024
3,1.0,0.623266,500022,EXIST2024
4,2.0,0.507779,500024,EXIST2024
...,...,...,...,...
994,0.0,0.637636,600974,EXIST2024
995,1.0,0.861094,600975,EXIST2024
996,1.0,0.869497,600976,EXIST2024
997,0.0,0.815510,600977,EXIST2024


## Generar Hard Labels

In [None]:
import json

# Integer class id -> category name; any other value is written as "UNKNOWN".
_NOMBRES_CLASE = {0: "DIRECT", 1: "REPORTED", 2: "JUDGEMENTAL"}

# One {"id", "value", "test_case"} object per majority-vote row, in DataFrame order.
json_objects = [
    {
        "id": str(row['id']),
        "value": _NOMBRES_CLASE.get(row['value'], "UNKNOWN"),
        "test_case": "EXIST2024",
    }
    for _, row in nuevo_df_pred_mayoritarias.iterrows()
]

# Write the list of objects to the hard-labels JSON file.
with open('/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v2.2/LEWIDI_ANN_x3_FEM_TASK2_hard.json', 'w') as f:
    json.dump(json_objects, f, indent=2)


## Generar predicciones COMPLETAS para entregar en la tarea 2

In [None]:
import json

# Load both prediction files: f1 holds the entries whose 'YES' values get replaced below,
# f2 holds the hard labels written by the majority-vote export.
with open('/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v1.35_correct/PRED_V1_35_hard.json', 'r') as f1, open('/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v2.2/LEWIDI_ANN_x3_FEM_TASK2_hard.json', 'r') as f2:
    json_predicciones_1 = json.load(f1)
    json_predicciones_2 = json.load(f2)

# Map id -> category from the second file, skipping entries marked 'UNKNOWN'.
id_to_pred_class_yes = {prediccion['id']: prediccion['value'] for prediccion in json_predicciones_2 if prediccion['value'] != 'UNKNOWN'}

# Replace every 'YES' in the first file with the category found for the same id.
# The entries are modified in place. A 'YES' whose id is missing from the map
# (absent from the second file, or 'UNKNOWN' there) is left as 'YES'.
for prediccion in json_predicciones_1:
    if prediccion['value'] == 'YES':
        id_tweet = prediccion['id']
        if id_tweet in id_to_pred_class_yes:
            prediccion['value'] = id_to_pred_class_yes[id_tweet]

# Save the updated predictions to a new JSON file.
with open('/content/drive/MyDrive/exist2024_I2C-UHU/task2_hard_I2C-UHU_3.json', 'w') as f:
    json.dump(json_predicciones_1, f, indent=2)


In [None]:
import json
from collections import Counter

def analyze_json(json_path):
    """Summarise a predictions JSON file.

    The file must contain a list of objects, each with "value" and "id" keys.

    Returns:
        A tuple ``(value_counter, total_unique_ids, has_duplicates)``: a Counter
        of the "value" entries, the number of distinct "id" entries, and whether
        that number differs from the number of objects in the file.
    """
    with open(json_path, 'r') as handle:
        entries = json.load(handle)

    per_value = Counter(entry["value"] for entry in entries)
    distinct_ids = {entry["id"] for entry in entries}

    # Fewer distinct ids than entries means at least one id is repeated.
    return per_value, len(distinct_ids), len(distinct_ids) != len(entries)

In [None]:
# Ejemplo de uso
json_path = "/content/drive/MyDrive/exist2024_I2C-UHU/task2_hard_I2C-UHU_1.json"
value_counter, total_unique_ids, has_duplicates = analyze_json(json_path)

# Imprimir los resultados
print("Cantidad de cada tipo de 'value':", value_counter)
print("Total de IDs únicos:", total_unique_ids)
print("¿Hay IDs duplicados?", "Sí" if has_duplicates else "No")

Cantidad de cada tipo de 'value': Counter({'NO': 1106, 'DIRECT': 520, 'JUDGEMENTAL': 258, 'REPORTED': 192})
Total de IDs únicos: 2076
¿Hay IDs duplicados? No


In [None]:
# Ejemplo de uso
json_path = "/content/drive/MyDrive/exist2024_I2C-UHU/task2_hard_I2C-UHU_2.json"
value_counter, total_unique_ids, has_duplicates = analyze_json(json_path)

# Imprimir los resultados
print("Cantidad de cada tipo de 'value':", value_counter)
print("Total de IDs únicos:", total_unique_ids)
print("¿Hay IDs duplicados?", "Sí" if has_duplicates else "No")

Cantidad de cada tipo de 'value': Counter({'NO': 1077, 'DIRECT': 486, 'REPORTED': 261, 'JUDGEMENTAL': 252})
Total de IDs únicos: 2076
¿Hay IDs duplicados? No


In [None]:
# Ejemplo de uso
json_path = "/content/drive/MyDrive/exist2024_I2C-UHU/task2_hard_I2C-UHU_3.json"
value_counter, total_unique_ids, has_duplicates = analyze_json(json_path)

# Imprimir los resultados
print("Cantidad de cada tipo de 'value':", value_counter)
print("Total de IDs únicos:", total_unique_ids)
print("¿Hay IDs duplicados?", "Sí" if has_duplicates else "No")

Cantidad de cada tipo de 'value': Counter({'NO': 1077, 'REPORTED': 379, 'DIRECT': 341, 'JUDGEMENTAL': 279})
Total de IDs únicos: 2076
¿Hay IDs duplicados? No


## Generar Soft labels

In [None]:
def calcular_softlabel(df, num_classes=3):
    """Turn per-model hard predictions into per-class soft labels.

    For every row, the soft label of class ``k`` is the fraction of
    ``pred_label*`` columns whose value equals ``k``.

    Args:
        df: DataFrame with ``id`` and ``test_case`` columns plus one or more
            columns whose name starts with ``pred_label``. Each of those
            columns is compared against the integer class ids.
        num_classes: Number of classes; the class ids are
            ``0 .. num_classes - 1``. Defaults to 3, the value that used to
            be hard-coded.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index and the columns ``id``,
        ``test_case`` and ``softlabel_0`` .. ``softlabel_{num_classes - 1}``.
        A prediction outside the class range counts towards no class, so the
        soft labels of such a row sum to less than 1. An empty ``df`` yields
        an empty frame that still has all of these columns.

    Raises:
        ValueError: If ``df`` has no column starting with ``pred_label``
            (previously this surfaced as a bare ZeroDivisionError).
    """
    # Columns holding the prediction of each individual model
    pred_columns = [col for col in df.columns if col.startswith("pred_label")]
    if not pred_columns:
        raise ValueError(
            "calcular_softlabel needs at least one column starting with 'pred_label'"
        )

    # Work on positional copies so a non-unique or unsorted input index
    # cannot misalign the identifiers and the vote counts.
    softlabels_df = df[['id', 'test_case']].reset_index(drop=True)
    predictions = df[pred_columns].reset_index(drop=True)

    for class_id in range(num_classes):
        # Number of models that voted for this class, row by row
        votes = (predictions == class_id).sum(axis=1)
        # Soft label = share of the models that chose the class
        softlabels_df[f'softlabel_{class_id}'] = votes / len(pred_columns)

    return softlabels_df

In [None]:
# Usage example: compute the soft labels for the combined predictions and display them
softlabels_df = calcular_softlabel(df_combinado)
softlabels_df

Unnamed: 0,id,test_case,softlabel_0,softlabel_1,softlabel_2
0,500004,EXIST2024,0.000000,0.000000,1.000000
1,500012,EXIST2024,0.666667,0.333333,0.000000
2,500020,EXIST2024,0.000000,0.000000,1.000000
3,500022,EXIST2024,0.333333,0.666667,0.000000
4,500024,EXIST2024,0.000000,0.333333,0.666667
...,...,...,...,...,...
994,600974,EXIST2024,0.666667,0.333333,0.000000
995,600975,EXIST2024,0.000000,1.000000,0.000000
996,600976,EXIST2024,0.000000,1.000000,0.000000
997,600977,EXIST2024,1.000000,0.000000,0.000000


In [None]:
# Persist the soft labels to Google Drive as CSV, without writing the index column
softlabels_df.to_csv('/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v2.2/softlabels2.csv', index=False)

In [None]:
import pandas as pd

# Read the two soft-label DataFrames from their CSV paths
df1 = pd.read_csv('/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v2.2/softlabels_NO.csv')
df2 = pd.read_csv('/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v2.2/softlabels2.csv')

# Display the first table
df1


Unnamed: 0,id,test_case,softlabel_0,softlabel_1,softlabel_2
0,500001,EXIST2024,0.500000,0.000000,0.500000
1,500002,EXIST2024,0.666667,0.333333,0.000000
2,500003,EXIST2024,0.666667,0.166667,0.166667
3,500005,EXIST2024,0.333333,0.666667,0.000000
4,500006,EXIST2024,0.500000,0.333333,0.166667
...,...,...,...,...,...
1072,600950,EXIST2024,0.500000,0.333333,0.166667
1073,600951,EXIST2024,0.333333,0.666667,0.000000
1074,600952,EXIST2024,0.666667,0.333333,0.000000
1075,600953,EXIST2024,1.000000,0.000000,0.000000


In [None]:

# Concatenate the two soft-label DataFrames. ignore_index=True discards the
# per-file row labels, which would otherwise repeat (both files start at 0).
df_concatenado = pd.concat([df1, df2], ignore_index=True)

# Sort the rows by 'id' and rebuild a clean 0..n-1 index so that every row
# has a unique label for any later label-based access.
df_concatenado = df_concatenado.sort_values(by='id').reset_index(drop=True)

# Show the resulting DataFrame
print(df_concatenado)

         id  test_case  softlabel_0  softlabel_1  softlabel_2
0    500001  EXIST2024     0.500000     0.000000     0.500000
1    500002  EXIST2024     0.666667     0.333333     0.000000
2    500003  EXIST2024     0.666667     0.166667     0.166667
0    500004  EXIST2024     0.000000     0.000000     1.000000
3    500005  EXIST2024     0.333333     0.666667     0.000000
..      ...        ...          ...          ...          ...
994  600974  EXIST2024     0.666667     0.333333     0.000000
995  600975  EXIST2024     0.000000     1.000000     0.000000
996  600976  EXIST2024     0.000000     1.000000     0.000000
997  600977  EXIST2024     1.000000     0.000000     0.000000
998  600978  EXIST2024     1.000000     0.000000     0.000000

[2076 rows x 5 columns]


In [None]:
# Path of the JSON file with the soft predictions
ruta_json = "/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/v1.35_correct/pred_v1_35_soft.json"

# Read the JSON file into a DataFrame
dfv135 = pd.read_json(ruta_json)

# Display the DataFrame
dfv135

Unnamed: 0,id,value,test_case
0,500001,"{'NO': 0.954119682312011, 'YES': 0.04588031768...",EXIST2024
1,500002,"{'NO': 0.8894460201263421, 'YES': 0.1105539798...",EXIST2024
2,500003,"{'NO': 0.9364690780639641, 'YES': 0.0635309219...",EXIST2024
3,500004,"{'YES': 0.691380023956298, 'NO': 0.30861997604...",EXIST2024
4,500005,"{'NO': 0.944931864738464, 'YES': 0.05506813526...",EXIST2024
...,...,...,...
2071,600974,"{'YES': 0.9756750464439391, 'NO': 0.0243249535...",EXIST2024
2072,600975,"{'YES': 0.9730844497680661, 'NO': 0.0269155502...",EXIST2024
2073,600976,"{'YES': 0.974555790424346, 'NO': 0.02544420957...",EXIST2024
2074,600977,"{'YES': 0.975379824638366, 'NO': 0.02462017536...",EXIST2024


In [None]:
# Display the concatenated, id-sorted soft-label table
df_concatenado

Unnamed: 0,id,test_case,softlabel_0,softlabel_1,softlabel_2
0,500001,EXIST2024,0.500000,0.000000,0.500000
1,500002,EXIST2024,0.666667,0.333333,0.000000
2,500003,EXIST2024,0.666667,0.166667,0.166667
0,500004,EXIST2024,0.000000,0.000000,1.000000
3,500005,EXIST2024,0.333333,0.666667,0.000000
...,...,...,...,...,...
994,600974,EXIST2024,0.666667,0.333333,0.000000
995,600975,EXIST2024,0.000000,1.000000,0.000000
996,600976,EXIST2024,0.000000,1.000000,0.000000
997,600977,EXIST2024,1.000000,0.000000,0.000000


In [None]:
# Función para redistribuir la probabilidad de YES entre las clases DIRECT, REPORTED y JUDGEMENTAL
def redistribuir_probabilidades(df, softlabels_df):
    """Split each row's YES probability across DIRECT, REPORTED and JUDGEMENTAL.

    For every row of ``df`` the soft labels with the same ``id`` are looked up
    in ``softlabels_df`` and the ``'value'`` dict is replaced by::

        {'DIRECT':      P(YES) * softlabel_0,
         'REPORTED':    P(YES) * softlabel_1,
         'JUDGEMENTAL': P(YES) * softlabel_2,
         'NO':          P(NO)}

    Args:
        df: DataFrame with an ``id`` column and a ``value`` column whose cells
            are dicts containing the keys ``'YES'`` and ``'NO'``.
        softlabels_df: DataFrame with the columns ``id``, ``softlabel_0``,
            ``softlabel_1`` and ``softlabel_2``. If an id occurs more than
            once, its first row is used.

    Returns:
        A copy of ``df`` whose ``value`` column holds the redistributed
        dicts. ``df`` itself is not modified.

    Raises:
        KeyError: If an id of ``df`` has no row in ``softlabels_df``, or a
            ``value`` dict lacks ``'YES'`` / ``'NO'``.
    """
    # Build the id -> (softlabel_0, softlabel_1, softlabel_2) lookup once,
    # instead of scanning the whole soft-label table for every row.
    softlabels_by_id = {}
    for item_id, soft_0, soft_1, soft_2 in zip(
        softlabels_df['id'],
        softlabels_df['softlabel_0'],
        softlabels_df['softlabel_1'],
        softlabels_df['softlabel_2'],
    ):
        # setdefault keeps the first occurrence of a repeated id
        softlabels_by_id.setdefault(item_id, (soft_0, soft_1, soft_2))

    new_values = []
    for item_id, value in zip(df['id'], df['value']):
        if item_id not in softlabels_by_id:
            raise KeyError(f"No soft labels found for id {item_id!r}")
        soft_0, soft_1, soft_2 = softlabels_by_id[item_id]

        # Probability the binary prediction assigns to "YES"
        probabilidad_yes = value['YES']

        new_values.append({
            'DIRECT': probabilidad_yes * soft_0,
            'REPORTED': probabilidad_yes * soft_1,
            'JUDGEMENTAL': probabilidad_yes * soft_2,
            'NO': value['NO']
        })

    # Copy so the caller's DataFrame is left untouched; the column is assigned
    # positionally, so it works even when df has repeated index labels.
    df_nuevo = df.copy()
    df_nuevo['value'] = new_values

    return df_nuevo

# Apply the function to the dfv135 DataFrame, taking the soft labels from df_concatenado
dfv135_actualizado = redistribuir_probabilidades(dfv135, df_concatenado)

In [None]:
# Display the predictions after the YES probability was split across DIRECT, REPORTED and JUDGEMENTAL
dfv135_actualizado

Unnamed: 0,id,value,test_case
0,500001,"{'DIRECT': 0.022940158843994002, 'REPORTED': 0...",EXIST2024
1,500002,"{'DIRECT': 0.07370265324910466, 'REPORTED': 0....",EXIST2024
2,500003,"{'DIRECT': 0.04235394795735667, 'REPORTED': 0....",EXIST2024
3,500004,"{'DIRECT': 0.0, 'REPORTED': 0.0, 'JUDGEMENTAL'...",EXIST2024
4,500005,"{'DIRECT': 0.018356045087178335, 'REPORTED': 0...",EXIST2024
...,...,...,...
2071,600974,"{'DIRECT': 0.650450030962626, 'REPORTED': 0.32...",EXIST2024
2072,600975,"{'DIRECT': 0.0, 'REPORTED': 0.9730844497680661...",EXIST2024
2073,600976,"{'DIRECT': 0.0, 'REPORTED': 0.974555790424346,...",EXIST2024
2074,600977,"{'DIRECT': 0.975379824638366, 'REPORTED': 0.0,...",EXIST2024


In [None]:
# Save the DataFrame as human-readable JSON (one record per row, 4-space indentation)
dfv135_actualizado.to_json("/content/drive/MyDrive/exist2024_I2C-UHU/task2_soft_I2C-UHU_3.json", orient="records", indent=4)

In [None]:
# Show the column names of dfv135
print(dfv135.columns)

Index(['id', 'value', 'test_case'], dtype='object')


In [None]:
# import json

# # Crear una lista para almacenar los objetos JSON de cada fila
# json_objects = []

# # Iterar sobre cada fila del DataFrame
# for index, row in df_test_bin.iterrows():
#     # Obtener el id del tweet/meme
#     tweet_id = str(row['id_EXIST'])

#     # Obtener el valor de pred_label y establecer la mayoritaria y minoritaria
#     pred_label = row['pred_label']
#     if pred_label == 0:
#         majority_label = "NO"
#         minority_label = "YES"
#     elif pred_label == 1:
#         majority_label = "YES"
#         minority_label = "NO"
#     else:
#         # Manejo de valores imprevistos
#         majority_label = "UNKNOWN"
#         minority_label = "UNKNOWN"

#     # Obtener el valor de score_label y calcular la probabilidad correspondiente
#     score_label = row['score_label']
#     if majority_label == "YES":
#         probability_majority = score_label
#         probability_minority = 1 - score_label
#     else:
#         probability_minority = score_label
#         probability_majority = 1 - score_label

#     # Construir el objeto JSON con las probabilidades
#     json_obj = {
#         "id": tweet_id,
#         "value": {majority_label: probability_majority, minority_label: probability_minority},
#         "test_case": "EXIST2024"
#     }

#     # Agregar el objeto JSON a la lista
#     json_objects.append(json_obj)

# # Escribir la lista de objetos JSON en un archivo JSON
# with open('/content/drive/MyDrive/TEST/predicciones/v1.35/EN_DEBERTA_V3_BASE_v_1_35_soft.json', 'w') as f:
#     json.dump(json_objects, f, indent=2)


### Concatenador de los JSON de cada idioma

In [None]:
# # Rutas de los archivos JSON en Google Drive
# file_path1 = '/content/drive/MyDrive/TEST/predicciones/v1.35/ES_ROBERTA_BASE_BNE_v_1_35_soft.json'
# file_path2 = '/content/drive/MyDrive/TEST/predicciones/v1.35/EN_DEBERTA_V3_BASE_v_1_35_soft.json'

# # Leer los archivos JSON y cargarlos en objetos Python
# with open(file_path1, 'r') as file:
#     content1 = json.load(file)

# with open(file_path2, 'r') as file:
#     content2 = json.load(file)

# # Concatenar los objetos Python
# combined_content = content1 + content2

# # Ruta para guardar el archivo JSON combinado
# output_file_path = '/content/drive/MyDrive/TEST/predicciones/v1.35/pred_v1_35_soft.json'

# # Guardar el contenido combinado en un nuevo archivo JSON
# with open(output_file_path, 'w') as file:
#     json.dump(combined_content, file)

# print("Archivos concatenados y guardados correctamente.")


Archivos concatenados y guardados correctamente.


## Comprobar el formato JSON de la competición

In [None]:
import json
from google.colab import files
from jsonschema import validate
import io

# Names of the three properties every prediction object carries
ID= "id"
TEST_CASE="test_case"
VALUE = "value"
# Task identifiers
TASK1="task1"
TASK2="task2"
TASK3="task3"
TASK4="task4"
TASK5="task5"
TASK6="task6"

# Valid labels per task; process_format_runs_by_task only checks against LIST_LABELS_TASK2
LIST_LABELS_TASK1=["NO", "YES"]
LIST_LABELS_TASK2=["NO", "REPORTED", "JUDGEMENTAL", "DIRECT"]
LIST_LABELS_TASK3=["NO", "IDEOLOGICAL-INEQUALITY", "STEREOTYPING-DOMINANCE", "MISOGYNY-NON-SEXUAL-VIOLENCE", "SEXUAL-VIOLENCE", "OBJECTIFICATION"]
LIST_LABELS_TASK5=["NO", "JUDGEMENTAL", "DIRECT"]

# JSON Schema of a run file: an array of objects holding exactly the
# properties test_case, id and value (no additional properties allowed).
FORMAT_JSON_SCHEMA= {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "test_case": {"type": "string"},
            # ids must be serialised as strings, not numbers
            "id":{"type": "string"},
            # value is one of: a single label string, a non-empty list of label
            # strings, an integer, or an object mapping any key to a number
            "value": {
                "anyOf": [
                    {"type": "string"},
                    {"type": "array", "items": {"type": "string"},"minItems": 1},
                    {"type": "integer"},
                    {
                        "type": "object",
                        "patternProperties": {
                        "^.*$": {"type": "number"},    }
                    },
                ]
            },
        },
        "required": ["test_case", "id", "value"],
        "additionalProperties": False
    },

}

def parser_json(content):
    """Check that ``content`` is valid JSON that conforms to FORMAT_JSON_SCHEMA.

    Args:
        content: JSON text of a run file.

    Returns:
        True when the text parses as JSON and validates against
        ``FORMAT_JSON_SCHEMA``; False otherwise. The reason for a failure is
        printed to stdout.
    """
    try:
        data = json.loads(content)
    except ValueError as e:
        # json.JSONDecodeError is a subclass of ValueError
        print(e)
        return False

    # Imported locally: the notebook only does `from jsonschema import validate`,
    # so the bare module name `jsonschema` is not defined, and referring to
    # `jsonschema.exceptions.ValidationError` in the except clause raised a
    # NameError instead of reporting the validation failure.
    from jsonschema.exceptions import ValidationError

    try:
        validate(instance=data, schema=FORMAT_JSON_SCHEMA)
    except ValidationError as e:
        print("Errors found in the JSON content.\n", e)
        return False

    return True


def process_format_runs_by_task(file_content):
    """Check the labels of a Task 2 run file and print every problem found.

    ``file_content`` is parsed as JSON and each instance is inspected:

    * ``id`` and ``test_case`` are accepted without further checks;
    * ``value`` may be a single label, a list of labels, or a dict mapping
      labels to probabilities. Labels must belong to ``LIST_LABELS_TASK2``;
      a dict must have one entry per Task 2 label and its probabilities must
      not add up to more than 1 (with a 0.001 tolerance);
    * any other property is reported as not allowed.

    Args:
        file_content: JSON text of the run file (an array of instances).

    Returns:
        None. Findings are printed to stdout, followed by
        "Completed processing.". If the text is not valid JSON the decoding
        error is printed and nothing else is checked.
    """
    try:
        data = json.loads(file_content)
    except ValueError as e:
        print(e)
        return

    for instance in data:
        # `key` rather than `property`, which would shadow the builtin
        for key in instance:
            if key == ID or key == TEST_CASE:
                continue
            if key != VALUE:
                print("ERROR in json format, property not allowed: ", key)
                continue

            value = instance[VALUE]
            if isinstance(value, str):
                # Hard label: a single class name
                if value not in LIST_LABELS_TASK2:
                    print("ERROR in label format: ", value)
            elif isinstance(value, list):
                # List of labels: every element must be a valid label. Testing
                # the list itself for membership flagged every list as an error.
                if any(label not in LIST_LABELS_TASK2 for label in value):
                    print("ERROR in label format: ", value)
            elif isinstance(value, dict):
                # Soft labels: one probability per Task 2 class. The expected
                # size follows the label list instead of the literal 2 that
                # only fits the binary task.
                if len(value) != len(LIST_LABELS_TASK2):
                    print("ERROR in label format: ", value)
                total = 0
                for label, probability in value.items():
                    total += float(probability)
                    if label not in LIST_LABELS_TASK2:
                        print("ERROR in label format: ", value)
                # Small tolerance for floating-point rounding
                if total > 1.001:
                    print("ERROR in label format: ", value)
            else:
                print("Error format value property.")

    print("Completed processing.")

In [None]:
# Path of the run file in Google Drive
file_path = '/content/drive/MyDrive/MODELOS TASK 1/TEST/predicciones/FINALES_TASK2/ESTRATEGIA 1 TASK 2 FINAL/xlm_roberta_base_xlm_roberta_base_task2_hard.json'

# Read the file contents
with open(file_path, 'r') as file:
    content = file.read()

# Parse the JSON file and check its labels
process_format_runs_by_task(content)

Completed processing.
