In [19]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import pipeline
import torch
import numpy as np
import os 
from datetime import datetime


In [None]:
# Seed shared by every randomised step so the split is reproducible
RANDOM_STATE = 42

# Input data
path_DATA = "../../../data"
csv_path = path_DATA + "/spotify_dataset_sin_duplicados_4.csv"

# Zero-shot outputs get their own folder so they can be compared with the TF-IDF ones
zero_shot_path = path_DATA + "/zero_outputs"
os.makedirs(zero_shot_path, exist_ok=True)

# Smoke-test switch: when True only the first 50 CSV rows are read,
# otherwise NROWS stays None and the whole file is loaded
TESTING = False
NROWS = 50 if TESTING else None
    


In [11]:
def get_array(path):
    """Load and return the JSON document stored at ``path``.

    Args:
        path: Location of a JSON file.

    Returns:
        The decoded JSON value (this notebook uses it for a list of row
        indices).

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON is UTF-8 by specification; naming the encoding avoids depending on
    # the platform's locale default.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def get_song_and_target(csv_path, faltantes_path = "faltantes_according_token.json", sample_size=None):
    """Read the songs CSV and return the lyrics, the binary target and row ids.

    Args:
        csv_path: Path of the CSV file; it must contain the columns ``text``
            and ``Explicit``.
        faltantes_path: Path of a JSON file holding a list of row indices to
            discard from the CSV.
        sample_size: Forwarded to ``pandas.read_csv`` as ``nrows``; ``None``
            reads the whole file.

    Returns:
        A tuple ``(X, y, original_index)`` of pandas Series:
        ``X`` is the ``text`` column, ``y`` is 1 where ``Explicit`` equals
        "yes" (case-insensitive) and 0 otherwise, and ``original_index`` is
        the frame's index after the optional drop and re-indexing.

    Raises:
        KeyError: If the ``text`` or ``Explicit`` column is missing.
    """
    df = pd.read_csv(csv_path, nrows=sample_size)
    indices_to_remove = get_array(faltantes_path)

    n_rows = df.shape[0]
    # Valid row labels are 0..n_rows-1, so an index equal to n_rows is already
    # out of range (the previous `>` let it through and `drop` then raised).
    # `max` removes the assumption that the list is sorted, and the truthiness
    # check keeps an empty list from raising IndexError.
    if indices_to_remove and max(indices_to_remove) >= n_rows:
        # The list refers to rows that were not loaded (partial read): skip the drop.
        print("You are executing in testing way")
    else:
        df = df.drop(indices_to_remove).reset_index(drop=True)

    # Row ids after the removal step
    df['original_index'] = df.index
    df['Explicit_binary'] = (df['Explicit'].str.lower() == 'yes').astype(int)
    return df['text'], df['Explicit_binary'], df['original_index']

In [13]:
# Load lyrics, binary targets and row ids; NROWS limits how many CSV rows are read
X, y, index = get_song_and_target(
    csv_path,
    "faltantes_according_token.json",
    sample_size=NROWS,
)

# Print the shape of each returned series
for series in (X, y, index):
    print(series.shape)


You are executing in testing way
(50,)
(50,)
(50,)


MoritzLaurer/deberta-v3-large-zeroshot-v2.0  
https://huggingface.co/MoritzLaurer/deberta-v3-large-zeroshot-v2.0

In [20]:
# Stratified 80/20 split; the row ids are split alongside the data so every
# score can be traced back to its source row.
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X,
    y,
    index,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE
)

print("X_train.shape:", X_train.shape)
# Label corrected: the value printed is y_train's shape (it used to be announced as y_test)
print("y_train.shape:", y_train.shape)


# Each candidate label is inserted into the template,
# e.g. "This lyric contains explicit content"
hypothesis_template = "This lyric contains {} content"
classes_verbalized = ["explicit", "not explicit"]

# First GPU when CUDA is available, otherwise -1 (CPU)
device = 0 if torch.cuda.is_available() else -1

zeroshot_classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
    device=device
)

# Score every training lyric with the zero-shot model
train_scores = []
for lyric in X_train:
    output = zeroshot_classifier(
        lyric,
        classes_verbalized,
        hypothesis_template=hypothesis_template,
        multi_label=False
    )
    # Look the 'explicit' label up by name; its position in the output is not assumed
    score_explicit = output['scores'][output['labels'].index('explicit')]
    train_scores.append(score_explicit)

train_scores = np.array(train_scores)

# Save each original row id together with its 'explicit' score
df_results = pd.DataFrame({
    "original_index": idx_train.values,
    "explicit_score": train_scores
})
df_results.to_csv(f"{zero_shot_path}/zero_shot_train_results.csv", index=False)

# Save the row ids of the train split ...
df_split_indices = pd.DataFrame({
    "train_index": idx_train.values,
})

# ... and, in a separate frame, those of the test split
df_split_indices_test = pd.DataFrame({
    "test_index": idx_test.values,
})

df_split_indices.to_csv(f"{zero_shot_path}/split_indices_train.csv", index=False)
df_split_indices_test.to_csv(f"{zero_shot_path}/split_indices_test.csv", index=False)

# Timestamped marker file written once all artefacts above are saved
mensaje = f"Proceso completado exitosamente.\nFecha y hora: {datetime.now()}\n"

with open(f"{zero_shot_path}/split_status.txt", "w", encoding="utf-8") as f:
    f.write(mensaje)

print("Se guardó split_status.txt indicando la finalización.")


X_train.shape: (40,)
y_test.shape: (40,)


Device set to use cuda:0


Se guardó split_status.txt indicando la finalización.


In [6]:
# Sweep candidate cut-offs over the training scores and keep the one with the best F1
thresholds = np.arange(0.1, 0.9, 0.05)

# These defaults survive only if no candidate reaches an F1 above zero
best_threshold, best_f1 = 0.5, 0

for candidate in thresholds:
    candidate_f1 = f1_score(y_train, (train_scores >= candidate).astype(int))
    if not (candidate_f1 > best_f1):
        # Strict comparison: on ties the earliest (lowest) threshold is kept
        continue
    best_threshold, best_f1 = candidate, candidate_f1

print("Mejor threshold:", best_threshold, "con F1:", best_f1)

Mejor threshold: 0.8000000000000002 con F1: 0.6363636363636364


In [8]:
# Run the zero-shot model over every held-out lyric
test_outputs = [
    zeroshot_classifier(
        lyric,
        classes_verbalized,
        hypothesis_template=hypothesis_template,
        multi_label=False
    )
    for lyric in X_test
]

# Keep only the score attached to the 'explicit' label (looked up by name)
test_scores = np.array([
    result['scores'][result['labels'].index('explicit')]
    for result in test_outputs
])

# Binarise with the threshold selected on the training split
test_preds = (test_scores >= best_threshold).astype(int)

# Metrics
accuracy_test = accuracy_score(y_test, test_preds)
precision_test = precision_score(y_test, test_preds)
recall_test = recall_score(y_test, test_preds)
f1_test = f1_score(y_test, test_preds)

print(f"Accuracy en test: {accuracy_test:.8f}")
print(f"Precision en test: {precision_test:.8f}")
print(f"Recall en test: {recall_test:.8f}")
print(f"F1 en test: {f1_test:.8f}")

Accuracy en test: 0.80000000
Precision en test: 0.50000000
Recall en test: 1.00000000
F1 en test: 0.66666667


## Prueba con threshold 0.5

In [9]:
# Baseline for comparison: evaluate the test split at a fixed 0.5 cut-off.
# The baseline lives under its own names so that the tuned `best_threshold`
# and the metrics obtained with it are not overwritten; previously, re-running
# the tuned evaluation after this cell silently used 0.5.
baseline_threshold = 0.5
print("Probando con: ", baseline_threshold)

# The model scores do not depend on the threshold, so the `test_scores`
# already computed for the test split are reused instead of running a second
# inference pass over X_test.
baseline_preds = (test_scores >= baseline_threshold).astype(int)

# Metrics
f1_baseline = f1_score(y_test, baseline_preds)
accuracy_baseline = accuracy_score(y_test, baseline_preds)
precision_baseline = precision_score(y_test, baseline_preds)
recall_baseline = recall_score(y_test, baseline_preds)

print(f"Accuracy en test: {accuracy_baseline:.8f}")
print(f"Precision en test: {precision_baseline:.8f}")
print(f"Recall en test: {recall_baseline:.8f}")
print(f"F1 en test: {f1_baseline:.8f}")

Probando con:  0.5
Accuracy en test: 0.60000000
Precision en test: 0.33333333
Recall en test: 1.00000000
F1 en test: 0.50000000
