In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer

KeyboardInterrupt: 

In [None]:
# Load the dataset produced by the previous preprocessing step.
df = pd.read_parquet(path="../data/df_step1.parquet")

# Preprocesamiento

### Datos Faltantes

In [None]:
# Count missing values per column and report only the columns that have any.
null_counts = df.isnull().sum()
print("Cantidad de valores nulos por columna (solo las que tienen nulos):", null_counts[null_counts > 0])

# Impute missing 'Time signature' values with the column's mode (most frequent value).
# The result is assigned back to the column instead of calling
# df['Time signature'].fillna(..., inplace=True): that chained form operates on an
# intermediate object, raises a FutureWarning, and will no longer modify df
# starting with pandas 3.0.
mode_time_sig = df['Time signature'].mode()[0]
df['Time signature'] = df['Time signature'].fillna(mode_time_sig)

Cantidad de valores nulos por columna (solo las que tienen nulos): Time signature    8
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Time signature'].fillna(mode_time_sig, inplace=True)


#### Duplicados

In [None]:
# Flag rows that are exact copies (across all columns) of an earlier row,
# count them, then drop them from df keeping the first occurrence.
is_duplicate_row = df.duplicated()
duplicates = is_duplicate_row.sum()
df.drop_duplicates(inplace=True)
print(f"Cantidad de duplicados eliminados: {duplicates}")

Cantidad de duplicados eliminados: 53442


#### Lyrics con menos de 100 caracteres

In [None]:
# Count songs whose lyrics are shorter than the minimum length considered useful.
# The threshold lives in one constant so the comparison and the printed message
# cannot drift apart (the message previously said 10 while the check used 100).
MIN_LYRICS_CHARS = 100
lyrics_length = df['Lyrics'].str.len()
# Rows with missing lyrics yield NaN lengths, which compare as False and are not counted.
short_lyrics_count = (lyrics_length < MIN_LYRICS_CHARS).sum()
print(f"Cantidad de letras con menos de {MIN_LYRICS_CHARS} caracteres: {short_lyrics_count}")

Cantidad de letras con menos de 10 caracteres: 0


### Tratamiento de Multietiqueta

In [None]:
# Split the comma-separated 'Genre' string into a list of labels, then
# normalise each label: trim surrounding whitespace and lower-case it.
raw_genre_lists = df['Genre'].str.split(',')
df['Genre_list'] = raw_genre_lists.apply(
    lambda genres: [g.strip().lower() for g in genres]
)

# Collect every distinct genre label that appears in any row.
unique_genres = {genre for genres in df['Genre_list'] for genre in genres}
print("Géneros únicos:", unique_genres)
print("Cantidad de géneros únicos:", len(unique_genres))


Géneros únicos: {'punk rock', 'folk', 'thrash metal', 'chillwave', 'shoegaze', 'worship', 'dub', 'funk', 'country', 'alternative rock', 'hardcore', 'comedy', 'rock', 'nu metal', 'black metal', 'pop', 'hard rock', 'reggae', 'reggaeton', 'power metal', 'death metal', 'emo rap', 'punk', 'indie', 'blues', 'trip-hop', 'k-pop', 'alt-country', 'latin', 'dancehall', 'progressive metal', 'doom metal', 'techno', 'metal', 'melodic death metal', 'new wave', 'swing', 'house', 'indie rock', 'trance', 'industrial', 'metalcore', 'rnb', 'jazz', 'dubstep', 'hip hop', 'progressive rock', 'pop rock', 'classic rock', 'heavy metal', 'ambient', 'grime', 'drum and bass', 'garage rock', 'screamo', 'lo-fi', 'dance', 'deathcore', 'britpop', 'psychedelic', 'electronic', 'electropop', 'christian', 'experimental', 'j-pop', 'indie pop', 'psychedelic rock', 'post-hardcore', 'electro', 'hip-hop', 'post-punk', 'disco', 'trap', 'soundtrack', 'grunge', 'dream pop', 'pop punk', 'cloud rap', 'gospel', 'math rock', 'soul', 

In [None]:
# Turn the per-row genre lists into one 0/1 indicator column per genre.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['Genre_list'])
# Reuse df's index so the indicator rows line up with the original rows on concat.
genre_dummies = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=df.index)

# Replace the raw genre columns with the indicator columns.
df_without_genre = df.drop(columns=['Genre', 'Genre_list'])
df = pd.concat([df_without_genre, genre_dummies], axis=1)

In [None]:
# Number of songs tagged with each genre, most frequent first.
genre_totals = df[mlb.classes_].sum()
genre_counts = genre_totals.sort_values(ascending=False)
genre_counts

hip hop             239871
rock                 84086
pop                  63288
alternative rock     40259
indie rock           22742
                     ...  
alt-country            602
worship                449
psychedelic            323
chillout               149
reggaeton               34
Length: 88, dtype: int64

### Variables categóricas a numéricas

In [None]:
# One-hot encode the categorical columns; the new columns are named
# 'Key_<value>' and 'TS_<value>' and hold integer 0/1 indicators.
df = pd.get_dummies(
    data=df,
    columns=['Key', 'Time signature'],
    prefix=['Key', 'TS'],
    dtype=int,
)

### Normalización

In [None]:
# Columns to scale: everything except the raw text and the target label.
excluded_cols = ['Lyrics', 'emotion']
num_cols = df.columns.difference(excluded_cols)
num_cols

Index(['Acousticness', 'Danceability', 'Energy', 'Explicit',
       'Good for Driving', 'Good for Exercise', 'Good for Morning Routine',
       'Good for Party', 'Good for Relaxation/Meditation', 'Good for Running',
       ...
       'soul', 'soundtrack', 'swing', 'synthpop', 'techno', 'thrash metal',
       'trance', 'trap', 'trip-hop', 'worship'],
      dtype='object', length=137)

In [None]:
# Fit a min-max scaler on the selected columns and overwrite them with the scaled values.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[num_cols])
df[num_cols] = scaled_values

In [None]:
# Preview the first rows of the fully preprocessed frame.
df.head(n=5)

Unnamed: 0,Lyrics,Length,emotion,Tempo,Loudness,Explicit,Popularity,Energy,Danceability,Positiveness,Speechiness,Liveness,Acousticness,Instrumentalness,Good for Party,Good for Work/Study,Good for Relaxation/Meditation,Good for Exercise,Good for Running,Good for Yoga/Stretching,Good for Driving,Good for Social Gatherings,Good for Morning Routine,acoustic,alt-country,alternative,alternative rock,ambient,black metal,blues,britpop,chillout,chillwave,christian,classic rock,classical,cloud rap,comedy,country,dance,dancehall,death metal,deathcore,disco,doom metal,dream pop,drum and bass,dub,dubstep,electro,electronic,electropop,emo,emo rap,experimental,folk,funk,garage rock,gospel,grime,grunge,hard rock,hardcore,heavy metal,hip hop,hip-hop,house,indie,indie pop,indie rock,industrial,j-pop,jazz,k-pop,latin,lo-fi,math rock,melodic death metal,metal,metalcore,new wave,nu metal,pop,pop punk,pop rock,post-hardcore,post-punk,power metal,progressive metal,progressive rock,psychedelic,psychedelic rock,punk,punk rock,rap,reggae,reggaeton,rnb,rock,screamo,shoegaze,soul,soundtrack,swing,synthpop,techno,thrash metal,trance,trap,trip-hop,worship,Key_A Maj,Key_A min,Key_A# Maj,Key_A# min,Key_B Maj,Key_B min,Key_C Maj,Key_C min,Key_C# Maj,Key_C# min,Key_D Maj,Key_D min,Key_D# Maj,Key_D# min,Key_E Maj,Key_E min,Key_F Maj,Key_F min,Key_F# Maj,Key_F# min,Key_G Maj,Key_G min,Key_G# Maj,Key_G# min,TS_1/4,TS_3/4,TS_4/4,TS_5/4
0,Friends told her she was better off at the bot...,0.062237,sadness,0.43787,0.785065,0.0,0.4,0.83,0.698925,0.87,0.021053,0.151515,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,"Well I heard it, playing soft From a drunken b...",0.066723,sadness,0.508876,0.805051,0.0,0.42,0.85,0.688172,0.87,0.021053,0.313131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,"Oh my god, did I just say that out loud? Shoul...",0.097,joy,0.532544,0.799419,0.0,0.29,0.89,0.698925,0.63,0.063158,0.636364,0.0,0.2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,[Verse 1] Remember when I called you on the te...,0.061396,joy,0.538462,0.811047,0.0,0.24,0.84,0.774194,0.97,0.021053,0.111111,0.12,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,[Verse 1] Calling me like I got something to s...,0.099523,joy,0.544379,0.808321,0.0,0.3,0.71,0.763441,0.7,0.052632,0.090909,0.04,0.01,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Persist the preprocessed DataFrame as Parquet for the next preprocessing step.
df.to_parquet(path="../data/df_step2.parquet")

En la próxima iteración de preprocesamiento se hará el embedding de las letras de las canciones utilizando sentence-transformers.