In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
df = pd.read_parquet("../data/df_step1.parquet")

# Preprocesamiento

### Datos Faltantes

In [3]:
# Nulos
null_counts = df.isnull().sum()
print("Cantidad de valores nulos por columna (solo las que tienen nulos):", null_counts[null_counts > 0])

# Reemplazar valores nulos en 'Time signature' con la moda
mode_time_sig = df['Time signature'].mode()[0]
df['Time signature'].fillna(mode_time_sig, inplace=True)

Cantidad de valores nulos por columna (solo las que tienen nulos): Time signature    8
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Time signature'].fillna(mode_time_sig, inplace=True)


#### Duplicados

In [4]:
# Analizar duplicados
duplicates = df.duplicated().sum()
df.drop_duplicates(inplace=True)
print(f"Cantidad de duplicados eliminados: {duplicates}")

Cantidad de duplicados eliminados: 53442


#### Lyrics con menos de 100 caracteres

In [5]:
# Analizar columna Lyrics con largo de texto menor a 100 caracteres
lyrics_length = df['Lyrics'].str.len()
short_lyrics_count = (lyrics_length < 100).sum()
print(f"Cantidad de letras con menos de 10 caracteres: {short_lyrics_count}")

Cantidad de letras con menos de 10 caracteres: 0


### Tratamiento de Multietiqueta

In [6]:
# Separar por comas y quitar espacios innecesarios
df['Genre_list'] = df['Genre'].str.split(',')

# Remover espacios al principio/final de cada etiqueta
df['Genre_list'] = df['Genre_list'].apply(lambda genres: [g.strip().lower() for g in genres])

# Show unique genres
unique_genres = set()
for genres in df['Genre_list']:
    unique_genres.update(genres)
print("Géneros únicos:", unique_genres)
print("Cantidad de géneros únicos:", len(unique_genres))


Géneros únicos: {'pop rock', 'blues', 'grunge', 'hip-hop', 'acoustic', 'drum and bass', 'funk', 'reggaeton', 'classical', 'nu metal', 'dub', 'rap', 'dancehall', 'indie rock', 'chillout', 'new wave', 'country', 'emo rap', 'lo-fi', 'ambient', 'rnb', 'dubstep', 'thrash metal', 'jazz', 'chillwave', 'hard rock', 'melodic death metal', 'christian', 'rock', 'death metal', 'electro', 'indie', 'post-punk', 'screamo', 'industrial', 'house', 'alternative rock', 'post-hardcore', 'soul', 'folk', 'disco', 'britpop', 'trip-hop', 'trance', 'psychedelic', 'electropop', 'indie pop', 'pop', 'punk', 'heavy metal', 'pop punk', 'gospel', 'hip hop', 'punk rock', 'grime', 'black metal', 'latin', 'psychedelic rock', 'reggae', 'techno', 'swing', 'shoegaze', 'garage rock', 'progressive rock', 'dance', 'synthpop', 'classic rock', 'deathcore', 'experimental', 'metal', 'emo', 'worship', 'k-pop', 'soundtrack', 'math rock', 'cloud rap', 'trap', 'hardcore', 'electronic', 'dream pop', 'j-pop', 'alternative', 'power met

In [7]:
mlb = MultiLabelBinarizer()
genre_dummies = pd.DataFrame(mlb.fit_transform(df['Genre_list']),
                             columns=mlb.classes_,
                             index=df.index)

# Concatenar al dataframe original
df = pd.concat([df.drop(columns=['Genre', 'Genre_list']), genre_dummies], axis=1)

In [8]:
genre_counts = df[mlb.classes_].sum().sort_values(ascending=False)
genre_counts

hip hop             239871
rock                 84086
pop                  63288
alternative rock     40259
indie rock           22742
                     ...  
alt-country            602
worship                449
psychedelic            323
chillout               149
reggaeton               34
Length: 88, dtype: int64

### Variables categóricas a numéricas

In [9]:
# One-Hot Encoding
df = pd.get_dummies(df, columns=['Key', 'Time signature'], prefix=['Key', 'TS'], dtype=int)

### Normalización

In [10]:
# All columns except Lyrics and emotion
num_cols = df.columns.difference(['Lyrics', 'emotion'])
num_cols

Index(['Acousticness', 'Danceability', 'Energy', 'Explicit',
       'Good for Driving', 'Good for Exercise', 'Good for Morning Routine',
       'Good for Party', 'Good for Relaxation/Meditation', 'Good for Running',
       ...
       'soul', 'soundtrack', 'swing', 'synthpop', 'techno', 'thrash metal',
       'trance', 'trap', 'trip-hop', 'worship'],
      dtype='object', length=137)

In [11]:
scaler  = MinMaxScaler()
df[num_cols] = scaler .fit_transform(df[num_cols])

In [12]:
df.head()

Unnamed: 0,Lyrics,Length,emotion,Tempo,Loudness,Explicit,Popularity,Energy,Danceability,Positiveness,...,Key_F# Maj,Key_F# min,Key_G Maj,Key_G min,Key_G# Maj,Key_G# min,TS_1/4,TS_3/4,TS_4/4,TS_5/4
0,Friends told her she was better off at the bot...,0.062237,sadness,0.43787,0.785065,0.0,0.4,0.83,0.698925,0.87,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,"Well I heard it, playing soft From a drunken b...",0.066723,sadness,0.508876,0.805051,0.0,0.42,0.85,0.688172,0.87,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,"Oh my god, did I just say that out loud? Shoul...",0.097,joy,0.532544,0.799419,0.0,0.29,0.89,0.698925,0.63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,[Verse 1] Remember when I called you on the te...,0.061396,joy,0.538462,0.811047,0.0,0.24,0.84,0.774194,0.97,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,[Verse 1] Calling me like I got something to s...,0.099523,joy,0.544379,0.808321,0.0,0.3,0.71,0.763441,0.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
# Guardar el DataFrame modificado en formato eficiente
df.to_parquet("../data/df_step2.parquet")

En la proxima iteración de preprocesamiento se hará el embedding de las letras de las canciones utilizando sentence-transformers.