In [1]:

import os
import pandas as pd
import time
import numpy as np
import re


In [2]:
# Filesystem locations used throughout the notebook.
# DATA_PATH is the data folder, expressed relative to this notebook.
DATA_PATH = "../../data"

# CSV file that the batch loop reads the song rows from.
path_dataset = os.path.join(
    DATA_PATH,
    "spotify_dataset_sin_duplicados_4.csv",
)


## Extracción de embeddings con Lyrics BERT

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """
    Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Model output whose first element holds the per-token
            embeddings.
        attention_mask: Mask with one entry per token; positions with a zero
            contribute nothing to the average.

    Returns:
        One pooled embedding per sequence (the token axis, dim 1, is reduced).
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding axis so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * weights, 1)
    # Clamp the token count so a fully masked sequence does not divide by zero.
    token_count = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / token_count


# Checkpoint identifier on the Hugging Face Hub.
MODEL_NAME = 'brunokreiner/lyrics-bert'

# Load the tokenizer and the encoder for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Two toy sentences to check the tokenizer -> model -> pooling chain end to end.
sentences = ['This is an example sentence', 'Each sentence is converted']
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    model_output = model(**encoded_input)

# Collapse the token embeddings into one vector per sentence (masked mean).
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("Sentence embeddings:", sentence_embeddings, sep="\n")


  from .autonotebook import tqdm as notebook_tqdm
2025-09-25 00:21:13.395317: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-25 00:21:13.474478: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-25 00:21:18.405937: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variab

Sentence embeddings:
tensor([[ 8.5982e-02, -1.9938e-01,  3.1403e-01, -2.1909e-01, -3.9347e-04,
          2.8418e-01,  1.1639e+00, -2.5347e-01, -5.9031e-01,  2.8978e-01,
         -4.3361e-01,  2.6265e-01,  1.2186e-02, -2.1822e-01,  1.0540e-01,
          1.8403e-01,  5.8598e-01, -1.6739e-01, -2.6611e-01,  3.8995e-01,
         -1.9267e-01,  3.5165e-01,  5.3251e-01,  4.8379e-01,  5.9596e-01,
         -1.0685e+00,  3.0516e-01, -4.1388e-01,  3.8347e-01, -3.3050e-01,
         -3.3174e-01,  4.0777e-01,  1.8344e-01, -2.9582e-01, -1.4029e-01,
         -4.2408e-01,  2.6159e-01, -2.7347e-01, -7.7970e-02, -3.2933e-01,
          3.9029e-01,  8.3547e-02,  1.7815e-01, -4.2287e-01,  2.5291e-02,
          5.4537e-01, -3.9749e-01,  1.0715e-01, -1.1429e+00,  4.7099e-01,
          2.0548e-01,  1.8556e-01, -3.1428e-01,  6.5785e-01,  2.4795e-01,
          5.3135e-01, -1.9034e-01,  3.3224e-01,  4.9253e-01, -1.7496e-02,
          4.9282e-01,  5.9564e-01,  2.2089e-01,  1.9639e-01, -1.7313e-01,
         -2.4567e

  return forward_call(*args, **kwargs)


## Clase que se encarga de la limpieza de los datos

In [4]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

import re

class TextPreprocessor:
    """
    Text preprocessing pipeline for lyrics data.

    The pipeline has two stages: `clean_text` normalizes a raw string
    (lowercase, no URLs, punctuation or digits) and `tokenize_and_process`
    turns the cleaned string into filtered, optionally stemmed tokens.
    `preprocess` chains both and returns a single space-joined string.
    """

    # Compiled once at class creation instead of on every call.
    # `http\S+` already covers `https...`, so no separate alternative is needed.
    _URL_RE = re.compile(r'http\S+|www\S+')
    _NON_WORD_RE = re.compile(r'[^\w\s]')
    _DIGITS_RE = re.compile(r'\d+')
    _WHITESPACE_RE = re.compile(r'\s+')

    # Tokens shorter than this many characters are discarded.
    _MIN_TOKEN_LEN = 3

    def __init__(self, language: str = 'english'):
        """
        Initialize the preprocessor.

        Args:
            language: Language passed to the NLTK stop-word corpus
                (default: english)
        """
        self.language = language
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words(language))

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize text.

        Args:
            text: Input text to clean; missing values (None/NaN) are accepted

        Returns:
            Lowercased text without URLs, punctuation or digits and with
            whitespace collapsed to single spaces; '' for missing/empty input
        """
        if pd.isna(text) or text == '':
            return ''

        text = str(text).lower()

        # Order matters: URLs must go before punctuation is stripped, otherwise
        # "http://a.b" would be broken into ordinary-looking words.
        text = self._URL_RE.sub('', text)
        # Punctuation becomes a space so neighbouring words are not glued together.
        text = self._NON_WORD_RE.sub(' ', text)
        text = self._DIGITS_RE.sub('', text)

        return self._WHITESPACE_RE.sub(' ', text).strip()

    def tokenize_and_process(self, text: str, remove_stopwords: bool = True,
                             apply_stemming: bool = True) -> list:
        """
        Tokenize and process text.

        Args:
            text: Input text
            remove_stopwords: Whether to remove stopwords
            apply_stemming: Whether to apply stemming

        Returns:
            List of processed tokens (empty list for empty input)
        """
        if not text:
            return []

        tokens = word_tokenize(text)

        if remove_stopwords:
            # Compare case-insensitively so stop words are also dropped when
            # this method is called directly on text that was not lowercased.
            tokens = [token for token in tokens
                      if token.lower() not in self.stop_words]

        if apply_stemming:
            tokens = [self.stemmer.stem(token) for token in tokens]

        # Drop very short tokens (applied after stemming, on the final form).
        return [token for token in tokens if len(token) >= self._MIN_TOKEN_LEN]

    def preprocess(self, text: str, remove_stopwords: bool = True,
                   apply_stemming: bool = True) -> str:
        """
        Complete preprocessing pipeline.

        Args:
            text: Input text
            remove_stopwords: Whether to remove stopwords
            apply_stemming: Whether to apply stemming

        Returns:
            Preprocessed text as a single space-joined string
        """
        cleaned_text = self.clean_text(text)
        tokens = self.tokenize_and_process(cleaned_text, remove_stopwords, apply_stemming)
        return ' '.join(tokens)



In [5]:
# ---- Row window processed by the batch loop ----
BATCH_SIZE = 500  # small on purpose: we ran out of RAM with more
INIT = 0
TOTAL_ROWS = 108138  # use a small value (e.g. 1000) for a quick trial run

# ---- Folder that receives one .npy file per batch ----
save_dir = os.path.join(DATA_PATH, "new_embbedings_khipu")
os.makedirs(save_dir, exist_ok=True)

# ---- Candidate column sets whose values can be joined into the embedded text ----
A = ['text', 'song', 'Artist(s)', 'Album', 'Similar Artist 1', 'Genre']
B = [
    'Artist(s)', 'song', 'emotion', 'Genre', 'Album',
    'Similar Artist 1', 'Similar Song 1',
    'Similar Artist 2', 'Similar Song 2',
    'Similar Artist 3', 'Similar Song 3',
    'song_normalized', 'artist_normalized',
]
C = [
    'text', 'Artist(s)', 'song', 'emotion', 'Genre', 'Album',
    'Similar Artist 1', 'Similar Song 1',
    'Similar Artist 2', 'Similar Song 2',
    'Similar Artist 3', 'Similar Song 3',
    'song_normalized', 'artist_normalized',
]
D = [
    'emotion', 'Time signature', 'Artist(s)', 'song', 'Genre', 'Album',
    'Release Date', 'Key',
    'Similar Artist 1', 'Similar Song 1',
    'Similar Artist 2', 'Similar Song 2',
    'Similar Artist 3', 'Similar Song 3',
    'song_normalized', 'artist_normalized',
]
T = ['text']

# Column set actually used for this run (same list object as T).
COL_LB = T

print("Embbedings generated with this cols: ", COL_LB, sep="\n")

# ---- Switches ----
# Read by the batch loop: turns the text preprocessing step (with stemming) on or off.
steaming = True

# Embed the dataset batch by batch; each batch is cleaned first, then encoded
# with Lyrics-BERT, mean-pooled and written to its own .npy file in save_dir.

# Set to True to stop right after previewing the first batch (raw vs. cleaned
# text) without computing or saving any embedding.
PREVIEW_ONLY = False

# Built once: nothing in the preprocessor changes between batches.
preprocessor = TextPreprocessor()

# Stream the CSV in chunks so every row is parsed a single time. Re-reading the
# file with a growing `skiprows` on each batch re-parses all earlier rows.
# skiprows starts at 1 so the header line (row 0) is always kept.
rows_to_read = max(TOTAL_ROWS - INIT, 0)

with pd.read_csv(
    path_dataset,
    skiprows=range(1, INIT + 1),
    nrows=rows_to_read,
    chunksize=BATCH_SIZE,
) as batch_reader:
  for batch_idx, df in enumerate(batch_reader):
    # Row bounds of this batch in dataset coordinates; `end` is exclusive and
    # follows the rows actually read, so file names never overstate a batch.
    start = INIT + batch_idx * BATCH_SIZE
    end = start + len(df)
    print(f"\nProcesando filas {start} a {end-1}...")

    # astype(str) keeps the join working when a selected column is not textual.
    df['combined_text'] = df[COL_LB].fillna('').astype(str).agg(' '.join, axis=1)

    if steaming:
      print("Preprocessing text...")
      df['processed_text'] = df['combined_text'].apply(
          lambda x: preprocessor.preprocess(x, remove_stopwords=True, apply_stemming=steaming)
      )
    else:
      df['processed_text'] = df['combined_text']

    # Show the effect of the cleaning once, on the first batch only, so the
    # output is not flooded on a full run.
    if batch_idx == 0:
      print("Data witouth cleaning")
      print(df['combined_text'])
      print("\nData cleaned")
      print(df['processed_text'])
    if PREVIEW_ONLY:
      break

    df_sentences = df['processed_text'].fillna("").astype(str).tolist()
    print("shape de sentences: ", len(df_sentences))

    # The timer covers tokenization, the forward pass and pooling.
    start_time = time.time()

    encoded_df_input = tokenizer(df_sentences, padding=True, truncation=True, return_tensors='pt')

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
      model_output = model(**encoded_df_input)

    sentence_df_embeddings = mean_pooling(model_output, encoded_df_input['attention_mask'])

    end_time = time.time()
    print(f"\nTiempo total: {end_time - start_time:.2f} segundos")

    print("Iniciando guardado de los vectores: ")
    embeddings_np = sentence_df_embeddings.numpy()

    # File name encodes the inclusive row range of the batch.
    npy_filename = f"embeddings_lyricsbert_{start}_{end-1}.npy"
    np.save(os.path.join(save_dir, npy_filename), embeddings_np)

    print("Sentence embeddings:")
    print(sentence_df_embeddings.shape)
    print(f"Guardado batch {start}-{end-1}")


Embbedings generated with this cols: 
['text']

Procesando filas 0 a 499...
Preprocessing text...
Data witouth cleaning
0      Friends told her she was better off at the bot...
1      Well I heard it, playing soft From a drunken b...
2      [Verse 1: Bill] Yeah You don't got bars that d...
3      [Verse 1] As I walk through the valley where I...
4      [Intro] Everybody shut up! (Woo!) Everyone lis...
                             ...                        
495    [Verse 1: Daron Jones] What is this, numbers i...
496    [P. Diddy] Bad Boy Let's dance (7x) Yo, everyb...
497    [Intro] 112, 112 You know how we do This is th...
498    Oh yeah Yeah Fabolous 112 Peaches and cream Kn...
499    [Verse 1 - Daron (112)] I fell in love with yo...
Name: combined_text, Length: 500, dtype: object

Data cleaned
0      friend told better bottom river bed said tri k...
1      well heard play soft drunken bar jukebox momen...
2      vers bill yeah got bar drop hard hit neighborh...
3      vers walk val

In [6]:
# Fuse the per-batch embedding files into a single array.
# Sub-folder of save_dir that receives the fused result.
save_fussion = os.path.join(save_dir, "LB_fuss")
os.makedirs(save_fussion, exist_ok=True)

# Full path of the fused .npy file written at the end of this cell.
save_dir_df_npy = os.path.join(save_fussion, "lb_khipu_T.npy")
# print(os.listdir(save_dir))

# Raw listing of save_dir: every entry name, including the LB_fuss sub-folder created above.
embbedings_df_npy =os.listdir(save_dir)

def get_start_number(filename):
    """
    Sort key for batch files: the first number embedded in the file name.

    Args:
        filename: File name to inspect.

    Returns:
        The first number of an `embeddings_lyricsbert_<first>_<last>.npy` name
        as an int, or +inf when the name does not contain that pattern (so
        such entries sort last).
    """
    found = re.search(r'embeddings_lyricsbert_(\d+)_\d+\.npy', filename)
    if found is None:
        return float('inf')
    return int(found.group(1))
# Exact shape of a batch file name: embeddings_lyricsbert_<first>_<last>.npy,
# where both numbers are inclusive row indices.
BATCH_FILE_RE = re.compile(r'embeddings_lyricsbert_(\d+)_(\d+)\.npy')

# Keep only genuine batch files, ordered by first row. Filtering on the full
# pattern (not just the ".npy" suffix) stops any other array stored in
# save_dir from being stacked into the result.
embbedings_df_npy = sorted(
    (name for name in embbedings_df_npy if BATCH_FILE_RE.fullmatch(name)),
    key=get_start_number,
)
if not embbedings_df_npy:
  raise FileNotFoundError(f"No batch embedding files found in: {save_dir}")

all_embeddings = []
expected_first_row = None  # first row the next file must start at

for embb in embbedings_df_npy:
  first_row, last_row = map(int, BATCH_FILE_RE.fullmatch(embb).groups())

  # A gap or an overlap between consecutive files would shift every later
  # embedding away from its dataset row, so fail loudly instead.
  if expected_first_row is not None and first_row != expected_first_row:
    raise ValueError(
        f"Batch files are not contiguous: expected a file starting at row "
        f"{expected_first_row}, found {embb}"
    )

  file_path = os.path.join(save_dir, embb)
  embeddings = np.load(file_path)
  all_embeddings.append(embeddings)
  print(f"Leído: {embb} - Shape: {embeddings.shape}")
  print(f"Total: {len(all_embeddings)}")

  expected_first_row = last_row + 1

# Stack the batches row-wise, in dataset order.
final_embeddings = np.vstack(all_embeddings)
print("Shape final:", final_embeddings.shape)

# Save the fused array as .npy
np.save(save_dir_df_npy, final_embeddings)
print(f"Embeddings guardados en: {save_dir_df_npy}")



Leído: embeddings_lyricsbert_0_499.npy - Shape: (500, 300)
Total: 1
Leído: embeddings_lyricsbert_500_999.npy - Shape: (500, 300)
Total: 2
Leído: embeddings_lyricsbert_1000_1499.npy - Shape: (500, 300)
Total: 3
Leído: embeddings_lyricsbert_1500_1999.npy - Shape: (500, 300)
Total: 4
Leído: embeddings_lyricsbert_2000_2499.npy - Shape: (500, 300)
Total: 5
Leído: embeddings_lyricsbert_2500_2999.npy - Shape: (500, 300)
Total: 6
Leído: embeddings_lyricsbert_3000_3499.npy - Shape: (500, 300)
Total: 7
Leído: embeddings_lyricsbert_3500_3999.npy - Shape: (500, 300)
Total: 8
Leído: embeddings_lyricsbert_4000_4499.npy - Shape: (500, 300)
Total: 9
Leído: embeddings_lyricsbert_4500_4999.npy - Shape: (500, 300)
Total: 10
Leído: embeddings_lyricsbert_5000_5499.npy - Shape: (500, 300)
Total: 11
Leído: embeddings_lyricsbert_5500_5999.npy - Shape: (500, 300)
Total: 12
Leído: embeddings_lyricsbert_6000_6499.npy - Shape: (500, 300)
Total: 13
Leído: embeddings_lyricsbert_6500_6999.npy - Shape: (500, 300)
Tot

Leído: embeddings_lyricsbert_47500_47999.npy - Shape: (500, 300)
Total: 96
Leído: embeddings_lyricsbert_48000_48499.npy - Shape: (500, 300)
Total: 97
Leído: embeddings_lyricsbert_48500_48999.npy - Shape: (500, 300)
Total: 98
Leído: embeddings_lyricsbert_49000_49499.npy - Shape: (500, 300)
Total: 99
Leído: embeddings_lyricsbert_49500_49999.npy - Shape: (500, 300)
Total: 100
Leído: embeddings_lyricsbert_50000_50499.npy - Shape: (500, 300)
Total: 101
Leído: embeddings_lyricsbert_50500_50999.npy - Shape: (500, 300)
Total: 102
Leído: embeddings_lyricsbert_51000_51499.npy - Shape: (500, 300)
Total: 103
Leído: embeddings_lyricsbert_51500_51999.npy - Shape: (500, 300)
Total: 104
Leído: embeddings_lyricsbert_52000_52499.npy - Shape: (500, 300)
Total: 105
Leído: embeddings_lyricsbert_52500_52999.npy - Shape: (500, 300)
Total: 106
Leído: embeddings_lyricsbert_53000_53499.npy - Shape: (500, 300)
Total: 107
Leído: embeddings_lyricsbert_53500_53999.npy - Shape: (500, 300)
Total: 108
Leído: embedding

Leído: embeddings_lyricsbert_65000_65499.npy - Shape: (500, 300)
Total: 131
Leído: embeddings_lyricsbert_65500_65999.npy - Shape: (500, 300)
Total: 132
Leído: embeddings_lyricsbert_66000_66499.npy - Shape: (500, 300)
Total: 133
Leído: embeddings_lyricsbert_66500_66999.npy - Shape: (500, 300)
Total: 134
Leído: embeddings_lyricsbert_67000_67499.npy - Shape: (500, 300)
Total: 135
Leído: embeddings_lyricsbert_67500_67999.npy - Shape: (500, 300)
Total: 136
Leído: embeddings_lyricsbert_68000_68499.npy - Shape: (500, 300)
Total: 137
Leído: embeddings_lyricsbert_68500_68999.npy - Shape: (500, 300)
Total: 138
Leído: embeddings_lyricsbert_69000_69499.npy - Shape: (500, 300)
Total: 139
Leído: embeddings_lyricsbert_69500_69999.npy - Shape: (500, 300)
Total: 140
Leído: embeddings_lyricsbert_70000_70499.npy - Shape: (500, 300)
Total: 141
Leído: embeddings_lyricsbert_70500_70999.npy - Shape: (500, 300)
Total: 142
Leído: embeddings_lyricsbert_71000_71499.npy - Shape: (500, 300)
Total: 143
Leído: embed

Leído: embeddings_lyricsbert_102000_102499.npy - Shape: (500, 300)
Total: 205
Leído: embeddings_lyricsbert_102500_102999.npy - Shape: (500, 300)
Total: 206
Leído: embeddings_lyricsbert_103000_103499.npy - Shape: (500, 300)
Total: 207
Leído: embeddings_lyricsbert_103500_103999.npy - Shape: (500, 300)
Total: 208
Leído: embeddings_lyricsbert_104000_104499.npy - Shape: (500, 300)
Total: 209
Leído: embeddings_lyricsbert_104500_104999.npy - Shape: (500, 300)
Total: 210
Leído: embeddings_lyricsbert_105000_105499.npy - Shape: (500, 300)
Total: 211
Leído: embeddings_lyricsbert_105500_105999.npy - Shape: (500, 300)
Total: 212
Leído: embeddings_lyricsbert_106000_106499.npy - Shape: (500, 300)
Total: 213
Leído: embeddings_lyricsbert_106500_106999.npy - Shape: (500, 300)
Total: 214
Leído: embeddings_lyricsbert_107000_107499.npy - Shape: (500, 300)
Total: 215
Leído: embeddings_lyricsbert_107500_107999.npy - Shape: (500, 300)
Total: 216
Leído: embeddings_lyricsbert_108000_108137.npy - Shape: (138, 30

Embeddings guardados en: ../../data/new_embbedings_khipu/LB_fuss/lb_khipu_T.npy


In [7]:
# Delete the per-batch embedding files once they have been fused.

dir_clean = save_dir
print(dir_clean)

# The batch files cannot be recovered after this cell runs, so refuse to
# delete them unless the fused array is already on disk.
if not os.path.isfile(save_dir_df_npy):
    raise FileNotFoundError(
        f"Fused embeddings not found at {save_dir_df_npy}; batch files were left untouched"
    )

# Only regular files named like a batch file are removed. Sub-folders and any
# other file kept in the same directory are left alone.
butches_embb = [
    f for f in os.listdir(dir_clean)
    if os.path.isfile(os.path.join(dir_clean, f))
    and re.fullmatch(r'embeddings_lyricsbert_\d+_\d+\.npy', f)
]

for butch in butches_embb:
    file_path = os.path.join(dir_clean, butch)
    print(file_path)
    os.remove(file_path)


../../data/new_embbedings_khipu
../../data/new_embbedings_khipu/embeddings_lyricsbert_1000_1499.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_500_999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_36500_36999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_45500_45999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_63000_63499.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_26500_26999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_42000_42499.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_63500_63999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_58000_58499.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_86500_86999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_68000_68499.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_60500_60999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_69000_69499.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_83000_8349

../../data/new_embbedings_khipu/embeddings_lyricsbert_48500_48999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_35500_35999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_24500_24999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_11000_11499.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_78000_78499.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_107000_107499.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_97500_97999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_103000_103499.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_71500_71999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_53500_53999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_3000_3499.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_49500_49999.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_6000_6499.npy
../../data/new_embbedings_khipu/embeddings_lyricsbert_19500_19999.npy
../../data/new_embbe