In [2]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import pipeline
import torch
import numpy as np
import os 
from datetime import datetime
import textwrap
import random

In [3]:
# Seed value available to any stochastic step of the notebook
RANDOM_STATE = 42

# Root data folder, relative to the notebook location
path_DATA = "../../../data"

# Song dataset (CSV) and the folder holding the zero-shot outputs
csv_path = f"{path_DATA}/spotify_dataset_sin_duplicados_4.csv"
zero_shot_path = f"{path_DATA}/zero_outputs"

# CSV with the zero-shot results on the test set
path_df_zero = f"{zero_shot_path}/zero_shot_test_results.csv"

# CSV with the TF-IDF test predictions
tfidf_path = f"{zero_shot_path}/tfidf/test_predictions.csv"

# Set to True for a quick run on only a few rows
TESTING = False

# Row limit passed to the CSV readers; None means "read every row"
NROWS = 50 if TESTING else None
    


In [4]:
def get_array(path):
    """Read the JSON file at ``path`` and return its parsed content."""
    with open(path, "r") as handle:
        return json.load(handle)

def get_song_and_target(csv_path, faltantes_path = "faltantes_according_token.json", sample_size=None):
    """Load the songs CSV and return lyrics, binary explicit labels and row indices.

    Parameters
    ----------
    csv_path : str
        CSV file containing at least the columns ``text`` and ``Explicit``.
    faltantes_path : str
        JSON file holding the list of row indices to drop from the CSV.
    sample_size : int or None
        Forwarded to ``pd.read_csv(nrows=...)``; ``None`` reads the whole file.

    Returns
    -------
    tuple of pandas.Series
        ``(X, y, original_index)``: the ``text`` column, a 0/1 label that is 1
        when ``Explicit`` equals "yes" (case-insensitive), and the frame index
        after the optional drop/reset.
    """
    df = pd.read_csv(csv_path, nrows=sample_size)
    indices_to_remove = get_array(faltantes_path)

    # An empty removal list means there is nothing to drop.
    if indices_to_remove:
        # Check the largest index rather than the last element (the list is not
        # guaranteed to be sorted) and use >= because valid row labels stop at
        # len(df) - 1. When the list reaches past the loaded rows, the frame
        # was row-limited, so the removal is skipped.
        if max(indices_to_remove) >= df.shape[0]:
            print("You are executing in testing way")
        else:
            df = df.drop(indices_to_remove).reset_index(drop=True)

    df['original_index'] = df.index  # Correct indices after the removal
    X = df['text']
    df['Explicit_binary'] = (df['Explicit'].str.lower() == 'yes').astype(int)
    y = df['Explicit_binary']
    return X, y, df['original_index']

In [5]:
# Load the lyrics and the binary explicit labels; the third return value
# (the row indices) is not used in this cell.
df_song, y, _ = get_song_and_target(
    csv_path,
    faltantes_path="faltantes_according_token.json",
    sample_size=NROWS,
)
display(df_song.head(5))

0    Friends told her she was better off at the bot...
1    Well I heard it, playing soft From a drunken b...
2    [Verse 1: Bill] Yeah You don't got bars that d...
3    [Verse 1] As I walk through the valley where I...
4    [Intro] Everybody shut up! (Woo!) Everyone lis...
Name: text, dtype: object

In [6]:
# Read the zero-shot results (row-limited when NROWS is set) and preview them
df_zero_shot = pd.read_csv(path_df_zero, nrows=NROWS)
display(df_zero_shot.head(5))


Unnamed: 0,original_index,explicit_score
0,83131,0.999403
1,46547,0.996371
2,292,0.999687
3,22251,0.015336
4,81674,0.607372


In [7]:
# Read the TF-IDF test predictions (row-limited when NROWS is set) and preview them
df_metadata_tfidf = pd.read_csv(tfidf_path, nrows=NROWS)
display(df_metadata_tfidf.head(5))


Unnamed: 0,test_index,predicted_label
0,83131,1
1,46547,1
2,292,1
3,22251,0
4,81674,0


In [8]:
# Check that both prediction files list the same songs in the same order.
# Later cells compare the two frames row by row, so misaligned data would
# silently produce wrong mismatches; stop here instead of continuing.
indexes_aligned = df_zero_shot['original_index'].equals(df_metadata_tfidf['test_index'])
if not indexes_aligned:
    raise ValueError(
        "Zero-shot and TF-IDF dataframes are not aligned: "
        "'original_index' and 'test_index' differ."
    )
print("Indexes align correctly between zero-shot and TF-IDF dataframes.")



Indexes align correctly between zero-shot and TF-IDF dataframes.


In [None]:
# Binarise the zero-shot score: 1 when the score is at least 0.5, otherwise 0
df_zero_shot['thresholded_05'] = df_zero_shot['explicit_score'].ge(0.5).astype(int)

display(df_zero_shot.head(5))

Unnamed: 0,original_index,explicit_score,thresholded_05
0,83131,0.999403,1
1,46547,0.996371,1
2,292,0.999687,1
3,22251,0.015336,0
4,81674,0.607372,1


In [34]:
# Wrapper for better text display
wrapper = textwrap.TextWrapper(width=100)

# original_index of every song on which the two models disagree.
# The comparison is row by row, so it relies on both frames listing the
# songs in the same order.
indexes = df_zero_shot.loc[
    df_zero_shot['thresholded_05'] != df_metadata_tfidf['predicted_label'],
    'original_index'
].tolist()

print(f"Number of mismatched predictions: {len(indexes)}")

# Sample with a private, seeded generator so the same songs are shown on
# every run without touching the global random state.
k = 5
rng = random.Random(RANDOM_STATE)
random_indexes = rng.sample(indexes, min(k, len(indexes)))

# Rows of df_zero_shot that are always shown first. Rows beyond the end of
# the frame (row-limited run) are skipped instead of raising.
pinned_rows = [270, 7]
pinned_indexes = [
    int(df_zero_shot['original_index'].iloc[row])
    for row in pinned_rows
    if row < len(df_zero_shot)
]
# Pinned songs first, then the sample; dict.fromkeys removes repeats while
# keeping the order.
random_indexes = list(dict.fromkeys(pinned_indexes + random_indexes))

# Lookup tables keyed by song index, built once instead of scanning both
# frames on every iteration. The first occurrence is kept for repeated keys.
zero_shot_lookup = df_zero_shot.drop_duplicates(subset='original_index').set_index('original_index')
tfidf_lookup = df_metadata_tfidf.drop_duplicates(subset='test_index').set_index('test_index')

print(random_indexes)
for idx in random_indexes:
    print(f"\nIndex: {idx}")
    if idx >= len(df_song):
        # Happens when only the first rows of the dataset were loaded.
        print("Song not loaded in this run; skipping.")
        continue
    song_text = df_song.iloc[idx]
    wrapped_text = wrapper.fill(str(song_text))  # Split into lines of width 100
    print(f"Song Text:\n{wrapped_text}\n")
    print(f"True Label: {y.iloc[idx]}")
    # Scalar access per column keeps each column's dtype; reading a whole
    # mixed int/float row first would turn the integer label into a float.
    zero_shot_label = zero_shot_lookup.at[idx, 'thresholded_05']
    zero_shot_score = zero_shot_lookup.at[idx, 'explicit_score']
    tfidf_label = tfidf_lookup.at[idx, 'predicted_label']
    print(f"Zero-shot Prediction: {zero_shot_label} (Score: {zero_shot_score})")
    print(f"TF-IDF Prediction: {tfidf_label}")

Number of mismatched predictions: 5118
[80549, 45621, 98372, 11407, 34396, 67140, 106722]

Index: 80549
Song Text:
[Intro] Drum Dummie Man,  I feel like that too, like (DJ Swift on the track) I  could watch the
fuckin' sun come up, you know? Never sleep, like a lil' kid (Mook got the keys jumpin')  [Chorus]
As  nights turn into days I  wonder why things won't change, yeah (Won't change, won't change) The
more I stay, the more it hurts Before  it gets better, it's gon' get worse Through all of the issues
and all of the pain I find happiness again Through all of the issues and all of the pain They  say
sun shines after it rains So as nights turn into days I wanna fly away, uh, yeah  [Verse 1] While in
my Sprinter van, I'm starin' at the cars passin' Wonderin' if it's enough space for me in thug's
mansion Ayy, fuck a rap nigga, fuck a feature, feel my life was harder I went and got it out the
mud, that's on my mama's daughters Don't get along with none of these niggas, get that from my
fa