In [6]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import pipeline
import torch
import numpy as np
import os 
from datetime import datetime


In [7]:
# Shared constants for the notebook.
RANDOM_STATE = 42
path_DATA = "../../../data"
csv_path = f"{path_DATA}/spotify_dataset_sin_duplicados_4.csv"

# Folder holding the zero-shot artifacts, kept separate so they can be
# compared with the TF-IDF results.
zero_shot_path = f"{path_DATA}/zero_outputs"
os.makedirs(zero_shot_path, exist_ok=True)

# Set TESTING to True to limit NROWS to 50; None means no row limit.
TESTING = False
NROWS = 50 if TESTING else None
    


In [8]:
def get_array(path):
    """Load and return the JSON document stored at ``path``.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON text is UTF-8; stating the encoding explicitly keeps the result
    # independent of the platform's locale default.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def get_song_and_target(csv_path, faltantes_path = "faltantes_according_token.json", sample_size=None):
    """Load the lyrics and a binary explicit-content target from the dataset CSV.

    Args:
        csv_path: CSV file containing at least a ``text`` and an ``Explicit`` column.
        faltantes_path: JSON file with the list of row indices to discard.
        sample_size: Maximum number of rows to read; ``None`` reads every row.

    Returns:
        Tuple ``(X, y, original_index)``:
        ``X`` is the ``text`` column, ``y`` is a 0/1 series that is 1 where
        ``Explicit`` equals "yes" (case-insensitive), and ``original_index``
        holds the row positions after the removal step.
    """
    df = pd.read_csv(csv_path, nrows=sample_size)
    indices_to_remove = get_array(faltantes_path)

    # Skip the removal when any index falls outside the loaded frame (e.g. only
    # a sample of rows was read). Using max() with >= covers unsorted lists and
    # the boundary index len(df); the truthiness check covers an empty list.
    if indices_to_remove and max(indices_to_remove) >= len(df):
        print("You are executing in testing way")
    else:
        df = df.drop(indices_to_remove).reset_index(drop=True)

    # Row positions after the removal step.
    df['original_index'] = df.index
    df['Explicit_binary'] = (df['Explicit'].str.lower() == 'yes').astype(int)
    return df['text'], df['Explicit_binary'], df['original_index']

In [9]:
# Load the lyrics, the binary target and the row positions, then show their shapes.
X, y, index = get_song_and_target(
    csv_path,
    faltantes_path="faltantes_according_token.json",
    sample_size=NROWS,
)

for loaded_series in (X, y, index):
    print(loaded_series.shape)


(108125,)
(108125,)
(108125,)


MoritzLaurer/deberta-v3-large-zeroshot-v2.0  
https://huggingface.co/MoritzLaurer/deberta-v3-large-zeroshot-v2.0

In [10]:
# Read the train/test row positions stored as CSV files under zero_shot_path.
df_split_indices = pd.read_csv(f"{zero_shot_path}/split_indices_train.csv")
df_split_indices_test = pd.read_csv(f"{zero_shot_path}/split_indices_test.csv")

idx_train = df_split_indices["train_index"].values
idx_test = df_split_indices_test["test_index"].values

# The stored values are used positionally (iloc); each subset is re-indexed from 0.
X_train, y_train = (
    series.iloc[idx_train].reset_index(drop=True) for series in (X, y)
)
X_test, y_test = (
    series.iloc[idx_test].reset_index(drop=True) for series in (X, y)
)

# Hypothesis sentence; the {} placeholder is filled with each candidate label.
hypothesis_template = "This lyric contains {} content"
classes_verbalized = ["explicit", "not explicit"]

# Pipeline device index: 0 is the first CUDA GPU, -1 is the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

zeroshot_classifier = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
    device=device,
)

Device set to use cuda:0


## Prueba con threshold 0.5

In [12]:
# Decision threshold applied to the "explicit" score.
best_threshold = 0.5
print("Probando con: ",best_threshold )

# Classify each test lyric on its own and keep only the score of "explicit".
explicit_scores = []
for lyric in X_test:
    output = zeroshot_classifier(
        lyric,
        classes_verbalized,
        hypothesis_template=hypothesis_template,
        multi_label=False,
    )
    # Labels and scores come back as parallel lists; pair them for the lookup.
    score_by_label = dict(zip(output['labels'], output['scores']))
    score_explicit = score_by_label['explicit']
    explicit_scores.append(score_explicit)

test_scores = np.asarray(explicit_scores)
# 1 when the explicit score reaches the threshold, 0 otherwise.
test_preds = np.where(test_scores >= best_threshold, 1, 0)

# Metrics
f1_test = f1_score(y_true=y_test, y_pred=test_preds)
accuracy_test = accuracy_score(y_true=y_test, y_pred=test_preds)
precision_test = precision_score(y_true=y_test, y_pred=test_preds)
recall_test = recall_score(y_true=y_test, y_pred=test_preds)

# Persist the raw scores next to the test positions they belong to.
results_path = f"{zero_shot_path}/zero_shot_test_results.csv"
df_results = pd.DataFrame({
    "original_index": idx_test,
    "explicit_score": test_scores,
})
df_results.to_csv(results_path, index=False)

print(f"Accuracy en test: {accuracy_test:.8f}")
print(f"Precision en test: {precision_test:.8f}")
print(f"Recall en test: {recall_test:.8f}")
print(f"F1 en test: {f1_test:.8f}")

Probando con:  0.5
Accuracy en test: 0.77553757
Precision en test: 0.51920181
Recall en test: 0.80174419
F1 en test: 0.63025594
