## Modelado

Vamos a elegir 3 modelos de regresión y estudiar cuál se ajusta mejor a nuestros datos.

In [52]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv('spotify_limpio.csv')

df.head()

Unnamed: 0,Artist,Track,Album,Album_type,Stream,Danceability,Energy,Key,Key_name,Loudness,Speechiness,Acousticness,Liveness,Valence,Tempo,Tempo_category,Duration_ms,Licensed,official_video
0,Cults,Gilded Lily,Offering,album,83194819.0,0.4,0.571,11,B,-6.534,0.0288,0.468,0.519,0.146,61.859,Adagio,212736.0,True,True
1,Los Pericos,Runaway,Pericos & Friends,album,101408935.0,0.676,0.731,9,A,-5.253,0.0368,0.0438,0.624,0.812,150.151,Allegro,233268.0,False,False
2,Richard Marx,Satisfied,Repeat Offender,album,4281404.0,0.572,0.914,8,G# / Ab,-8.436,0.0311,0.0717,0.319,0.843,108.991,Andante,254467.0,False,True
3,The Supremes,Come See About Me,Where Did Our Love Go,album,70357721.0,0.732,0.506,7,G,-11.735,0.0462,0.688,0.146,0.867,126.325,Allegro,163093.0,False,False
4,El Fantasma,Equipo RR,Equipo RR,single,13232046.0,0.755,0.55,7,G,-6.102,0.107,0.618,0.0868,0.926,114.501,Moderato / Allegretto,159976.0,False,False


In [53]:
df = df.drop(columns=[ 'Artist', 'Track', 'Album', 'Album_type', 'Key_name', 'Tempo_category'])

En base a las conclusiones del Baseline model vamos a transformar la columna stream con un logaritmo y a normalizar las columnas Loudness, Tempo y Duration_ms

In [54]:
df["log_stream"] = np.log1p(df["Stream"])

In [55]:
df.head()

Unnamed: 0,Stream,Danceability,Energy,Key,Loudness,Speechiness,Acousticness,Liveness,Valence,Tempo,Duration_ms,Licensed,official_video,log_stream
0,83194819.0,0.4,0.571,11,-6.534,0.0288,0.468,0.519,0.146,61.859,212736.0,True,True,18.236696
1,101408935.0,0.676,0.731,9,-5.253,0.0368,0.0438,0.624,0.812,150.151,233268.0,False,False,18.434672
2,4281404.0,0.572,0.914,8,-8.436,0.0311,0.0717,0.319,0.843,108.991,254467.0,False,True,15.269792
3,70357721.0,0.732,0.506,7,-11.735,0.0462,0.688,0.146,0.867,126.325,163093.0,False,False,18.069103
4,13232046.0,0.755,0.55,7,-6.102,0.107,0.618,0.0868,0.926,114.501,159976.0,False,False,16.398152


In [56]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

cols_to_scale = ["Loudness", "Tempo", "Duration_ms"]

df[cols_to_scale] = scaler.fit_transform(
    df[cols_to_scale]
)

In [57]:
import pandas as pd

X = df.drop(columns=['Stream', 'log_stream'], axis=1)
y = df[ 'log_stream']


In [58]:
from sklearn.model_selection import cross_val_score
import numpy as np


def val_cruz(model, X, y, n_splits=5, random_state=42):
    

    #cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)


    scores_rmse = cross_val_score(
        model,
        X,
        y,
        cv=n_splits,
        scoring="neg_root_mean_squared_error"
    )


    scores_mape = cross_val_score(
            model,
            X,
            y,
            cv=n_splits,
            scoring="neg_mean_absolute_percentage_error"
        )
        
    scores_mae = cross_val_score(
        model,
        X,
        y,
        cv=n_splits,
        scoring="neg_mean_absolute_error"
    )

    scores_r2 = cross_val_score(
        model,
        X,
        y,
        cv=n_splits,
        scoring="r2"
    )

    # Pasar a positivo
    scores_rmse = -scores_rmse
    scores_mae = -scores_mae
    scores_mape = -scores_mape 

    print(" Resultados validación cruzada")
    print("-" * 40)
    print(f"RMSE: {scores_rmse}")
    print(f"MAPE: {scores_mape}")
    print(f"MAE:  {scores_mae}")
    print(f"R²:   {scores_r2}")
    print("-" * 40)
    print(f"RMSE medio: {scores_rmse.mean():.4f} ± {scores_rmse.std():.4f}")
    print(f"MAPE medio:  {scores_mape.mean():.4f} ± {scores_mape.std():.4f}")
    print(f"MAE medio:  {scores_mae.mean():.4f} ± {scores_mae.std():.4f}")
    print(f"R² medio:   {scores_r2.mean():.4f} ± {scores_r2.std():.4f}")

    return scores_rmse.mean(), scores_rmse.std(), scores_mape.mean(), scores_mape.std(), scores_mae.mean(),scores_mae.std(),scores_r2.mean(),scores_r2.std()



#### XGboost

In [59]:
import xgboost as xgb

def xgb_model(X,y,random_state=42):
    
    model = xgb.XGBRegressor(random_state=random_state)

    
    rmse_mean, rmse_std, mape_mean, mape_std, mae_mean, mae_std, r2_mean, r2_std= val_cruz(
        model,
        X,
        y,
        random_state=random_state
    )
    

    
    return model, rmse_mean, rmse_std, mape_mean, mape_std, mae_mean, mae_std, r2_mean, r2_std



In [None]:
_, xgb_rmse_mean, xgb_rmse_std, xgb_mape_mean, xgb_mape_std, xgb_mae_mean, xgb_mae_std, xgb_r2_mean, xgb_r2_std = xgb_model(X,y)

 Resultados validación cruzada
----------------------------------------
RMSE: [1.58376721 1.5739099  1.5692844  1.60681643 1.62429319]
MAPE: [0.07115237 0.07122984 0.07076864 0.07217336 0.07305044]
MAE:  [1.20937241 1.21613853 1.20480825 1.23190945 1.24924235]
R²:   [0.05103595 0.12929282 0.07222589 0.0502546  0.06677911]
----------------------------------------
RMSE medio: 1.5916 ± 0.0208
MAPE medio:  0.0717 ± 0.0008
MAE medio:  1.2223 ± 0.0163
R² medio:   0.0739 ± 0.0290


### Gradient Boosting Regresion

In [61]:
from sklearn.ensemble import GradientBoostingRegressor

In [62]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

def gb_model(X, y, random_state=42):

    
    model = GradientBoostingRegressor( random_state=random_state)

    
    rmse_mean, rmse_std, mape_mean, mape_std, mae_mean, mae_std, r2_mean, r2_std = val_cruz(
        model,
        X,
        y,
        random_state=random_state
    )
    
    return model, rmse_mean, rmse_std, mape_mean, mape_std, mae_mean, mae_std, r2_mean, r2_std


In [63]:
_, gb_rmse_mean, gb_rmse_std, gb_mape_mean, gb_mape_std, gb_mae_mean, gb_mae_std, gb_r2_mean, gb_r2_std= gb_model(X,y)

 Resultados validación cruzada
----------------------------------------
RMSE: [1.54379472 1.56960041 1.54035638 1.56233235 1.58250101]
MAPE: [0.07026924 0.07166744 0.07035492 0.07142026 0.07184427]
MAE:  [1.19383547 1.22133627 1.19828437 1.21986441 1.22751538]
R²:   [0.09833303 0.13405443 0.1061156  0.10211333 0.11418384]
----------------------------------------
RMSE medio: 1.5597 ± 0.0158
MAPE medio:  0.0711 ± 0.0007
MAE medio:  1.2122 ± 0.0135
R² medio:   0.1110 ± 0.0127


## Random forest

In [64]:
from sklearn.ensemble import RandomForestRegressor

def rf_regression(X, y, random_state=42):
    model = RandomForestRegressor( random_state=random_state)

    
    rmse_mean, rmse_std, mape_mean, mape_std, mae_mean, mae_std, r2_mean, r2_std = val_cruz(
        model,
        X,
        y,
        random_state=random_state
    )
    
    return model, rmse_mean, rmse_std, mape_mean, mape_std, mae_mean, mae_std, r2_mean, r2_std


In [65]:
_, rf_rmse_mean, rf_rmse_std, rf_mape_mean, rf_mape_std, rf_mae_mean, rf_mae_std, rf_r2_mean, rf_r2_std= rf_regression(X, y)

 Resultados validación cruzada
----------------------------------------
RMSE: [1.4914332  1.53079852 1.49735339 1.51709713 1.52732659]
MAPE: [0.06615992 0.06814313 0.06678364 0.0672654  0.06763406]
MAE:  [1.12343987 1.16019719 1.13600278 1.14774792 1.15392443]
R²:   [0.15846017 0.17633909 0.15532906 0.1533548  0.17487561]
----------------------------------------
RMSE medio: 1.5128 ± 0.0158
MAPE medio:  0.0672 ± 0.0007
MAE medio:  1.1443 ± 0.0131
R² medio:   0.1637 ± 0.0099


Vamos a crear una tabla comparando los valores

In [68]:
tabla_resultados = pd.DataFrame({
    "Algoritmo": ["XGBoost", "Gradient Boosting", "Random Forest"],

    "RMSE (mean ± std)": [
        f"{xgb_rmse_mean:.3f} ± {xgb_rmse_std:.3f}",
        f"{gb_rmse_mean:.3f} ± {gb_rmse_std:.3f}",
        f"{rf_rmse_mean:.3f} ± {rf_rmse_std:.3f}"
    ],

    "MAPE (mean ± std)": [
        f"{xgb_mape_mean:.3f} ± {xgb_mape_std:.3f}",
        f"{gb_mape_mean:.3f} ± {gb_mape_std:.3f}",
        f"{rf_mape_mean:.3f} ± {rf_mape_std:.3f}"
    ],

    "MAE (mean ± std)": [
        f"{xgb_mae_mean:.3f} ± {xgb_mae_std:.3f}",
        f"{gb_mae_mean:.3f} ± {gb_mae_std:.3f}",
        f"{rf_mae_mean:.3f} ± {rf_mae_std:.3f}"
    ],

    "R² (mean ± std)": [
        f"{xgb_r2_mean:.3f} ± {xgb_r2_std:.3f}",
        f"{gb_r2_mean:.3f} ± {gb_r2_std:.3f}",
        f"{rf_r2_mean:.3f} ± {rf_r2_std:.3f}"
    ]
})

tabla_resultados


Unnamed: 0,Algoritmo,RMSE (mean ± std),MAPE (mean ± std),MAE (mean ± std),R² (mean ± std)
0,XGBoost,1.592 ± 0.021,0.072 ± 0.001,1.222 ± 0.016,0.074 ± 0.029
1,Gradient Boosting,1.560 ± 0.016,0.071 ± 0.001,1.212 ± 0.013,0.111 ± 0.013
2,Random Forest,1.513 ± 0.016,0.067 ± 0.001,1.144 ± 0.013,0.164 ± 0.010


Vemos que hay menor error en Random Forest en las métricas RMSE, MAPE y MAE y la mayor en R**2, lo que significa que Random Forest es el algoritmo más adecuado para nuestro Dataset. Además, no hay mucha desviación típica por lo que nuestros valores son estables.