In [48]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import pipeline
import torch
import numpy as np
import os 
from datetime import datetime
import textwrap
import random

In [49]:
# ---- Notebook configuration ----
RANDOM_STATE = 42
path_DATA = "../../../../data/spanish"

# Song dataset (CSV)
csv_path = path_DATA + "/dataset/oficialDatasetEAIM2026.csv"

# Folder holding the prediction files of the zero-shot experiments
paths_ = "../experimentation/s__zero_shot_classification"

# Zero-shot prediction files, one per split
path_df_zero_test = paths_ + "/deberta_test_predictions.csv"
path_df_zero_train = paths_ + "/deberta_train_predictions.csv"

# Prompting prediction files, one per split
prompting_path_test = paths_ + "/predicciones_test_llama_8B.csv"
prompting_path_train = paths_ + "/predicciones_train_llama_8B.csv"

# Flip to True for a quick run on a handful of rows
TESTING = False

# Row cap handed to pd.read_csv(nrows=...); None means "read everything"
NROWS = 50 if TESTING else None
    


In [77]:
# Load the dataset and rebuild the train/test splits from the stored indices

path_train_indices = f"{path_DATA}/train_indices_spanish.csv"
path_test_indices = f"{path_DATA}/test_indices_spanish.csv"

# NROWS only limits how many split rows are loaded (quick TESTING runs)
df_train_indices = pd.read_csv(path_train_indices, nrows=NROWS)
df_test_indices = pd.read_csv(path_test_indices, nrows=NROWS)

idx_train = df_train_indices['original_index'].values
idx_test = df_test_indices['original_index'].values

# A song must not appear in both splits
overlapping_indices = np.intersect1d(idx_train, idx_test)
assert overlapping_indices.size == 0, (
    f"{overlapping_indices.size} original_index values appear in both train and test"
)

# Always read the FULL dataset: `original_index` is used below as a row position
# (.iloc) in the complete file, so truncating it with nrows=NROWS makes the
# lookups go out of bounds as soon as TESTING is enabled.
df_original = pd.read_csv(csv_path)

df_song_train = df_original.iloc[idx_train].copy().reset_index(drop=True)
df_song_train['original_index'] = idx_train

df_song_test = df_original.iloc[idx_test].copy().reset_index(drop=True)
df_song_test['original_index'] = idx_test

display(df_song_train.head(2))
print("df_song_train shape:", df_song_train.shape)
display(df_song_test.head(2))
print("df_song_test shape:", df_song_test.shape)

display(df_original.head(2))

Unnamed: 0,_id,artist,genre,lyrics,composer,lyrics_word_count,title_songs_new,spotify_id,popularity,explicit_content,duration_ms,release_date,external_urls.spotify,letras_path,id_yt,original_index
0,689a8a9da437eda121bd54f1,Larry Hernandez,corridos,(Y el ventero que pida pariente\nY yo si se lo...,¿Sabes quién compuso esta canción? Envíanoslo.,318,Gente Vip,3g1GoAq0u4uDzrXFgvMII2,57,False,174586,2012-10-30,https://open.spotify.com/track/3g1GoAq0u4uDzrX...,Larry Hernandez/Gente Vip,f9WWYpC7kL0,2097
1,689a8ab7a437eda121bd6def,Conchita Bautista,bolero,"Qué bueno, qué bueno, qué bueno\nSaber que tú ...",Compuesta por: Antonio Figueroa Egea.,202,"¡Qué Bueno, Qué Bueno!",6ojK3fWtv67fPP3yMti3ZC,15,False,146471,2017-03-24,https://open.spotify.com/track/6ojK3fWtv67fPP3...,Conchita Bautista/Que Bueno Que Bueno,V9TbIw45Gec,4037


df_song_train shape: (5855, 16)


Unnamed: 0,_id,artist,genre,lyrics,composer,lyrics_word_count,title_songs_new,spotify_id,popularity,explicit_content,duration_ms,release_date,external_urls.spotify,letras_path,id_yt,original_index
0,689a8aafa437eda121bd65ae,Los Wawanco,mambo,"Se va el caimán, se va el caimán, (se va para ...",¿Sabes quién compuso esta canción? Envíanoslo.,298,Se Va El Caiman,6HXIJnjpxc05YZ47h8UzXq,11,False,61552,2021-12-17,https://open.spotify.com/track/6HXIJnjpxc05YZ4...,Los Wawanco/Se Va El Caiman,8qfTXhVCHDM,2988
1,689a8eaca437eda121bd84fe,Pimpinela,romantico,"Lucía: Hay amores que no se olvidan, \nAunque ...",Compuesta por: Joaquín Galán / Lucía Galán.,279,El Amor no se Puede Olvidar,5cwSxZHNXjWpR5KnC9S87k,57,False,294000,1993-01-01,https://open.spotify.com/track/5cwSxZHNXjWpR5K...,Pimpinela/318810,vy2rW_uvro8,6399


df_song_test shape: (1464, 16)


Unnamed: 0,_id,artist,genre,lyrics,composer,lyrics_word_count,title_songs_new,spotify_id,popularity,explicit_content,duration_ms,release_date,external_urls.spotify,letras_path,id_yt
0,689a8a83a437eda121bd38ce,Laura Pausini,pop,Ya no responde ni al teléfono\nPende de un hil...,Compuesta por: Federico Cavalli / Angelo Valsi...,445,Se Fue,5oQadhkuEdEhtdVn0QceyZ,71,False,240179,2024-11-15,https://open.spotify.com/track/5oQadhkuEdEhtdV...,Laura Pausini/30278,g-GBiuujmL8
1,689a8a83a437eda121bd389f,Shakira,pop,Loca\n(Loca)\nNo te ponga' bruto\n\nQue te la ...,Compuesta por: Pitbull / El Cata / Shakira / C...,360,Loca (part. El Cata),42k1KeBehAd83lrGt1okiC,76,False,183693,2010-10-19,https://open.spotify.com/track/42k1KeBehAd83lr...,Shakira/1735339,XAhTt60W7qo


In [62]:
# Zero-shot predictions, one frame per split
df_zero_shot_train = pd.read_csv(path_df_zero_train, nrows=NROWS)
df_zero_shot_test = pd.read_csv(path_df_zero_test, nrows=NROWS)

print("df_zero_shot_train shape:", df_zero_shot_train.shape)
print("df_zero_shot_test shape:", df_zero_shot_test.shape)

# Replace the raw score by its thresholded 0/1 version (1 when score >= 0.5)
train_is_explicit = df_zero_shot_train['explicit_score'].ge(0.5)
df_zero_shot_train['explicit_score'] = train_is_explicit.astype(int)

test_is_explicit = df_zero_shot_test['explicit_score'].ge(0.5)
df_zero_shot_test['explicit_score'] = test_is_explicit.astype(int)

display(df_zero_shot_train.head())
display(df_zero_shot_test.head())


df_zero_shot_train shape: (5855, 4)
df_zero_shot_test shape: (1464, 4)


Unnamed: 0,original_index,__id,explicit_score,prediccion
0,2097,689a8a9da437eda121bd54f1,0,0
1,4037,689a8ab7a437eda121bd6def,1,1
2,2416,689a8a9fa437eda121bd5723,0,0
3,5377,689a8ac1a437eda121bd7974,0,0
4,202,689a8a86a437eda121bd3c09,0,0


Unnamed: 0,original_index,__id,explicit_score,prediccion
0,2988,689a8aafa437eda121bd65ae,0,0
1,6399,689a8eaca437eda121bd84fe,0,0
2,4406,689a8abba437eda121bd71c4,0,0
3,6099,689a8ac8a437eda121bd81c1,0,0
4,1473,689a8a99a437eda121bd5171,0,0


In [63]:
# Prompting predictions, one frame per split
df_prompting_train = pd.read_csv(prompting_path_train, nrows=NROWS)
print("df_prompting_train shape:", df_prompting_train.shape)

df_prompting_test = pd.read_csv(prompting_path_test, nrows=NROWS)
print("df_prompting_test shape:", df_prompting_test.shape)

# Peek at the first five rows of each split
display(df_prompting_train.head(5))
display(df_prompting_test.head(5))

df_prompting_train shape: (5855, 3)
df_prompting_test shape: (1464, 3)


Unnamed: 0,original_index,__id,prediccion
0,2097,689a8a9da437eda121bd54f1,0
1,4037,689a8ab7a437eda121bd6def,0
2,2416,689a8a9fa437eda121bd5723,0
3,5377,689a8ac1a437eda121bd7974,0
4,202,689a8a86a437eda121bd3c09,1


Unnamed: 0,original_index,__id,prediccion
0,2988,689a8aafa437eda121bd65ae,0
1,6399,689a8eaca437eda121bd84fe,0
2,4406,689a8abba437eda121bd71c4,1
3,6099,689a8ac8a437eda121bd81c1,0
4,1473,689a8a99a437eda121bd5171,0


In [65]:
# Sanity check: the song frames and both prediction files must list the songs
# in exactly the same order, because the pseudo-labeling step pairs their rows
# by position.
if df_zero_shot_train['original_index'].equals(df_prompting_train['original_index']):
    if df_song_train['original_index'].equals(df_prompting_train['original_index']):
        print('Correct')
    else:
        print("Indexes do not match between the train songs and the prompting results. Please check.")

    # Compare the test songs against the prompting results. This check used to
    # compare df_song_test with itself, which is always True and verified nothing.
    if df_song_test['original_index'].equals(df_prompting_test['original_index']):
        print('Correct')
    else:
        print("Indexes do not match between the test songs and the prompting results. Please check.")

    if df_zero_shot_test['original_index'].equals(df_prompting_test['original_index']):
        print("Indexes match between zero-shot and prompting results for test and train set as well.")
    else:
        print("Indexes do not match between zero-shot and prompting results for test set. Please check.")

else:
    print("Indexes do not match between zero-shot and prompting results. Please check.")



Correct
Correct
Indexes match between zero-shot and prompting results for test and train set as well.


In [66]:
# Show only the first row of the full dataset as a quick column reference
display(df_original.iloc[:1])

Unnamed: 0,_id,artist,genre,lyrics,composer,lyrics_word_count,title_songs_new,spotify_id,popularity,explicit_content,duration_ms,release_date,external_urls.spotify,letras_path,id_yt
0,689a8a83a437eda121bd38ce,Laura Pausini,pop,Ya no responde ni al teléfono\nPende de un hil...,Compuesta por: Federico Cavalli / Angelo Valsi...,445,Se Fue,5oQadhkuEdEhtdVn0QceyZ,71,False,240179,2024-11-15,https://open.spotify.com/track/5oQadhkuEdEhtdV...,Laura Pausini/30278,g-GBiuujmL8


In [78]:
def _fuse_labels(df_label, zero_shot_label, prompting_label):
    """Combine the dataset label and the two model predictions into one 0/1 label.

    For 0/1 inputs the cascade is equivalent to
    ``df_label OR (zero_shot_label AND prompting_label)``: a song stays explicit
    whenever the dataset says so, and a non-explicit song is flipped to explicit
    only when both models agree that it is explicit.

    Always returns a plain ``int`` so the resulting column has a single dtype
    (the dataset label is a boolean, the predictions are integers).
    """
    if zero_shot_label == prompting_label and zero_shot_label == df_label:
        # All three sources agree: keep the original label
        return int(df_label)
    if zero_shot_label == 1 and prompting_label == 1:
        # Both models say explicit
        return 1
    if df_label == 1 and zero_shot_label == 0 and prompting_label == 0:
        # Dataset says explicit although both models disagree: trust the dataset.
        # NOTE(review): the previous comment here read "original is explicit and
        # one model says explicit", which is not what this condition tests.
        # Confirm this override of the majority vote is intended.
        return 1
    # Remaining cases: majority vote over the three sources
    votes = df_label + zero_shot_label + prompting_label
    return 1 if votes >= 2 else 0


def _votes_by_original_index(df_song, df_zero_shot, df_prompting):
    """Map every original_index of a split to (dataset, zero-shot, prompting) labels.

    The three frames are paired by row position. When an index is repeated,
    the first occurrence is kept.
    """
    if len(df_zero_shot) < len(df_song) or len(df_prompting) < len(df_song):
        raise ValueError("A predictions frame has fewer rows than its song frame.")

    rows = zip(
        df_song['original_index'].tolist(),
        df_song['explicit_content'].tolist(),
        df_zero_shot['prediccion'].tolist(),
        df_prompting['prediccion'].tolist(),
    )
    votes_by_index = {}
    for original_index, df_label, zero_shot_label, prompting_label in rows:
        votes_by_index.setdefault(original_index, (df_label, zero_shot_label, prompting_label))
    return votes_by_index


# One dictionary lookup per song instead of scanning the frames on every iteration
train_votes = _votes_by_original_index(df_song_train, df_zero_shot_train, df_prompting_train)
test_votes = _votes_by_original_index(df_song_test, df_zero_shot_test, df_prompting_test)

new_labeles = []

# The dataset is completely split, so every position i lives in train or in test
# (train is looked up first)
for i in range(len(df_song_train) + len(df_song_test)):
    votes = train_votes.get(i)
    if votes is None:
        votes = test_votes.get(i)
    if votes is None:
        raise ValueError(
            f"original_index {i} is in neither the train nor the test split; "
            "the splits must cover the whole dataset (is TESTING/NROWS truncating them?)."
        )
    new_labeles.append(_fuse_labels(*votes))

if len(new_labeles) != len(df_original):
    raise ValueError(
        f"Got {len(new_labeles)} pseudo-labels for {len(df_original)} dataset rows."
    )

# Save the final dataset
new_csv_path = f"{path_DATA}/dataset/oficialDatasetEAIM2026_pseudolabeling.csv"

df_original['pseudo_label_explicit'] = new_labeles
df_original.to_csv(new_csv_path, index=False)



In [80]:
# Count the rows whose pseudo-label differs from the dataset's own label
label_changed = df_original['explicit_content'].ne(df_original['pseudo_label_explicit'])
mismached_count = label_changed.sum()
total_rows = df_original.shape[0]
mismached_percent = (mismached_count / total_rows) * 100

print("df_original shape:", df_original.shape)
print("Mismatched indexes count:", mismached_count)
print(f"Mismatched percentage: {mismached_percent:.2f}%")


df_original shape: (7319, 16)
Mismatched indexes count: 688
Mismatched percentage: 9.40%
