In [49]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [50]:
# Load the dataset produced by the previous preprocessing step.
step2_path = "../data/df_step2.parquet"
df = pd.read_parquet(step2_path)

In [51]:
# Print the full per-column summary (names and dtypes) of the loaded DataFrame.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 497975 entries, 0 to 551442
Data columns (total 139 columns):
 #    Column                          Dtype  
---   ------                          -----  
 0    Lyrics                          object 
 1    Length                          float64
 2    emotion                         object 
 3    Tempo                           float64
 4    Loudness                        float64
 5    Explicit                        float64
 6    Popularity                      float64
 7    Energy                          float64
 8    Danceability                    float64
 9    Positiveness                    float64
 10   Speechiness                     float64
 11   Liveness                        float64
 12   Acousticness                    float64
 13   Instrumentalness                float64
 14   Good for Party                  float64
 15   Good for Work/Study             float64
 16   Good for Relaxation/Meditation  float64
 17   Good for Exer

# Preprocesamiento 2

### Embeddings

In [52]:
# Toggle between a fixed-size random sample (fast iteration) and the full dataset.
useASample = True
# Number of rows to keep when sampling.
SAMPLE_SIZE = 1000
if useASample:
    # Cap at len(df): DataFrame.sample raises ValueError when n exceeds the
    # number of rows (sampling without replacement), e.g. on a smaller input.
    # random_state is fixed so the same rows are drawn on every run.
    df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

In [53]:
# Load the pretrained sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every lyric into an embedding vector, showing a progress bar.
lyrics_texts = df['Lyrics'].tolist()
embeddings = model.encode(lyrics_texts, convert_to_tensor=False, show_progress_bar=True)

# Wrap the embeddings in a DataFrame with one `lyric_emb_<i>` column per dimension.
emb_columns = [f'lyric_emb_{i}' for i in range(embeddings.shape[1])]
embeddings_df = pd.DataFrame(embeddings, columns=emb_columns).reset_index(drop=True)

# Give df a fresh 0..n-1 index so both frames line up row by row in the concat.
df = df.reset_index(drop=True)

# Append the embedding columns to the original features, dropping the raw Lyrics text.
features_without_lyrics = df.drop(columns=['Lyrics'])
df_final = pd.concat([features_without_lyrics, embeddings_df], axis=1)

# Preview the result (kept as the last expression so the notebook renders it).
df_final.head()

Batches: 100%|██████████| 32/32 [00:09<00:00,  3.27it/s]
Batches: 100%|██████████| 32/32 [00:09<00:00,  3.27it/s]


Batches: 100%|██████████| 32/32 [00:09<00:00,  3.27it/s]
Batches: 100%|██████████| 32/32 [00:09<00:00,  3.27it/s]


Unnamed: 0,Length,emotion,Tempo,Loudness,Explicit,Popularity,Energy,Danceability,Positiveness,Speechiness,...,lyric_emb_374,lyric_emb_375,lyric_emb_376,lyric_emb_377,lyric_emb_378,lyric_emb_379,lyric_emb_380,lyric_emb_381,lyric_emb_382,lyric_emb_383
0,0.076535,sadness,0.307692,0.78016,0.0,0.24,0.48,0.537634,0.33,0.0,...,0.008532,0.039354,0.016848,0.090464,-0.022972,-0.034667,0.015312,0.112379,-0.072987,-0.049524
1,0.046818,love,0.905325,0.700036,0.0,0.12,0.43,0.193548,0.15,0.031579,...,0.046552,0.047393,0.028893,0.057109,-0.008307,-0.025344,0.022032,0.074153,-0.026245,-0.100412
2,0.107934,sadness,0.414201,0.804142,0.0,0.35,0.97,0.225806,0.21,0.178947,...,0.02442,-0.035525,0.029651,0.076029,-0.000808,-0.00748,0.034198,-0.021866,-0.049595,-0.031475
3,0.044575,anger,0.289941,0.756904,0.0,0.25,0.3,0.688172,0.14,0.021053,...,0.024856,0.008566,-0.037895,0.086845,-0.065897,-0.048799,0.079852,0.050016,0.028316,-0.048752
4,0.063078,sadness,0.550296,0.782703,0.0,0.5,0.74,0.419355,0.13,0.021053,...,0.024535,0.074215,-0.011271,-0.019083,-0.056985,0.043992,0.033423,0.027449,0.004899,-0.051095


### PCA sobre embeddings

In [54]:
# Fraction of the total variance the retained principal components must explain.
VARIANCE_TO_KEEP = 0.80

# 1. Configure PCA to keep as many components as needed to reach that fraction.
pca = PCA(random_state=42, n_components=VARIANCE_TO_KEEP)

# 2. Fit on the lyric embeddings and project them onto the retained components.
lyrics_embeddings_pca = pca.fit_transform(embeddings_df)

# 3. Report how many components were kept.
print(f"Componentes seleccionados: {pca.n_components_}")


Componentes seleccionados: 109


### Normalizar los embeddings PCA

In [56]:
# Rescale every PCA component to the [0, 1] range (MinMaxScaler's default feature_range).
# NOTE(review): the scaler is fit on all rows before any train/test split --
# confirm this is acceptable for the downstream models.
scaler = MinMaxScaler()
lyrics_embeddings_pca_normalized = scaler.fit_transform(lyrics_embeddings_pca)

# Wrap the array in a DataFrame that reuses df's index, so the concat below pairs
# rows one-to-one even if df's index is not 0..n-1. Without this, concat aligns
# on index labels and would silently introduce NaN rows on a mismatch.
# The constructor also raises if the row counts differ.
lyrics_embeddings_pca_normalized_df = pd.DataFrame(
    lyrics_embeddings_pca_normalized,
    index=df.index,
    columns=[f'lyric_emb_pca_{i}' for i in range(lyrics_embeddings_pca_normalized.shape[1])],
)

# Attach the normalized PCA components to the original features, without the raw Lyrics text.
df_final_pca = pd.concat([df.drop(columns=['Lyrics']), lyrics_embeddings_pca_normalized_df], axis=1)

In [57]:
# Persist the resulting DataFrame as Parquet for the next step.
step3_path = "../data/df_step3.parquet"
df_final_pca.to_parquet(step3_path)

En el próximo paso se comenzará con el entrenamiento de los modelos RF y KNN.