## Modelado

Vamos a elegir 3 modelos de regresión y estudiar cuál se ajusta mejor a nuestros datos.

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read 'spotify_limpio_train.csv'; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer='spotify_limpio_train.csv')

# Show the first five rows as a quick check of the columns that were read.
df.head(n=5)

Unnamed: 0,Artist,Track,Album,Album_type,Stream,Danceability,Energy,Key,Key_name,Loudness,Speechiness,Acousticness,Liveness,Valence,Tempo,Tempo_category,Duration_ms,Licensed,official_video
0,MGMT,Me and Michael,Little Dark Age,album,51859353.0,0.628,0.841,9,A,-4.583,0.0268,0.0285,0.254,0.706,111.985,Moderato / Allegretto,289853.0,True,True
1,Mc Lipi,Quebradas,Quebradas,single,31545318.0,0.762,0.714,9,A,-2.436,0.219,0.146,0.0415,0.558,112.985,Moderato / Allegretto,250588.0,True,True
2,Justin Quiles,La Esquina del Mall,La Esquina del Mall,single,17950239.0,0.701,0.628,6,F# / Gb,-5.026,0.293,0.243,0.333,0.777,189.893,Presto,207388.0,True,True
3,Arctic Monkeys,Snap Out Of It,AM,album,376045969.0,0.728,0.638,5,F,-6.455,0.0336,0.249,0.116,0.872,130.014,Allegro,193030.0,True,True
4,MEDUZA,Tell Me Why - MEDUZA Remix,Tell Me Why (MEDUZA Remix),single,22769089.0,0.561,0.928,6,F# / Gb,-5.812,0.0371,0.087,0.332,0.182,125.988,Allegro,171429.0,True,True


In [4]:
# Columns holding free text or category labels in the preview above; they are
# removed so that only the remaining columns are passed on to the models.
# Note: running this cell twice raises KeyError, because the columns are
# already gone from `df` the second time.
cols_to_drop = ['Artist', 'Track', 'Album', 'Album_type', 'Key_name', 'Tempo_category']
df = df.drop(columns=cols_to_drop)

Con base en las conclusiones del modelo baseline, vamos a transformar la columna `Stream` con un logaritmo (`log1p`) y a estandarizar las columnas `Loudness`, `Tempo` y `Duration_ms`.

In [5]:
# Add the regression target on a log scale: log1p(x) = log(1 + x), which stays
# finite when x is 0.
df = df.assign(log_stream=np.log1p(df["Stream"]))

In [6]:
# Preview the frame again to confirm the new `log_stream` column is present.
df.head(n=5)

Unnamed: 0,Stream,Danceability,Energy,Key,Loudness,Speechiness,Acousticness,Liveness,Valence,Tempo,Duration_ms,Licensed,official_video,log_stream
0,51859353.0,0.628,0.841,9,-4.583,0.0268,0.0285,0.254,0.706,111.985,289853.0,True,True,17.764046
1,31545318.0,0.762,0.714,9,-2.436,0.219,0.146,0.0415,0.558,112.985,250588.0,True,True,17.266936
2,17950239.0,0.701,0.628,6,-5.026,0.293,0.243,0.333,0.777,189.893,207388.0,True,True,16.703114
3,376045969.0,0.728,0.638,5,-6.455,0.0336,0.249,0.116,0.872,130.014,193030.0,True,True,19.745222
4,22769089.0,0.561,0.928,6,-5.812,0.0371,0.087,0.332,0.182,125.988,171429.0,True,True,16.940914


In [7]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise to zero mean / unit variance.
cols_to_scale = ["Loudness", "Tempo", "Duration_ms"]

# NOTE(review): the scaler is fitted on the whole frame before the
# cross-validation below, so each validation fold contributes to the scaling
# statistics. Presumably harmless for the tree-based models used later, but
# confirm before reusing this frame with scale-sensitive models.
scaler = StandardScaler()
scaler.fit(df[cols_to_scale])
df[cols_to_scale] = scaler.transform(df[cols_to_scale])

In [8]:
import pandas as pd

# Target: the log-transformed stream count.
y = df['log_stream']

# Features: every remaining column except the raw and the log-transformed
# target, so the target cannot leak into X.
X = df.drop(columns=['Stream', 'log_stream'])


In [9]:
from sklearn.model_selection import cross_val_score
import numpy as np


def val_cruz(model, X, y, n_splits=5, random_state=42, shuffle=False):
    """Cross-validate ``model`` on ``(X, y)`` and report RMSE, MAPE, MAE and R².

    The four metrics are computed in a single ``cross_validate`` pass, so the
    model is fitted once per fold (instead of once per fold *per metric*) and
    every metric of a fold is measured on the same fitted model.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.
    X, y : array-like
        Feature matrix and target passed straight to ``cross_validate``.
    n_splits : int, default 5
        Number of folds.
    random_state : int, default 42
        Seed for the fold assignment. Only used when ``shuffle`` is True;
        without shuffling the split is deterministic and the seed has no effect.
    shuffle : bool, default False
        When True, rows are shuffled (seeded by ``random_state``) before being
        split into folds. The default keeps the previous behaviour, where the
        integer ``n_splits`` is handed to scikit-learn as ``cv``.

    Returns
    -------
    tuple of 8 floats
        ``(rmse_mean, rmse_std, mape_mean, mape_std, mae_mean, mae_std,
        r2_mean, r2_std)`` across folds. Error metrics are returned as
        positive numbers.
    """
    # Imported locally so this definition does not rely on import lines that
    # live outside it.
    from sklearn.model_selection import KFold, cross_validate

    if shuffle:
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    else:
        # Plain integer: scikit-learn picks its default splitter, as before.
        cv = n_splits

    scoring = {
        "rmse": "neg_root_mean_squared_error",
        "mape": "neg_mean_absolute_percentage_error",
        "mae": "neg_mean_absolute_error",
        "r2": "r2",
    }

    cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring)

    # The "neg_*" scorers return negated errors (higher is better); flip the
    # sign so the reported errors are positive.
    scores_rmse = -cv_results["test_rmse"]
    scores_mape = -cv_results["test_mape"]
    scores_mae = -cv_results["test_mae"]
    scores_r2 = cv_results["test_r2"]

    print(" Resultados validación cruzada")
    print("-" * 40)
    print(f"RMSE: {scores_rmse}")
    print(f"MAPE: {scores_mape}")
    print(f"MAE:  {scores_mae}")
    print(f"R²:   {scores_r2}")
    print("-" * 40)
    print(f"RMSE medio: {scores_rmse.mean():.4f} ± {scores_rmse.std():.4f}")
    print(f"MAPE medio:  {scores_mape.mean():.4f} ± {scores_mape.std():.4f}")
    print(f"MAE medio:  {scores_mae.mean():.4f} ± {scores_mae.std():.4f}")
    print(f"R² medio:   {scores_r2.mean():.4f} ± {scores_r2.std():.4f}")

    return (
        scores_rmse.mean(), scores_rmse.std(),
        scores_mape.mean(), scores_mape.std(),
        scores_mae.mean(), scores_mae.std(),
        scores_r2.mean(), scores_r2.std(),
    )



### XGBoost

In [10]:
import xgboost as xgb

def xgb_model(X, y, random_state=42):
    """Cross-validate an XGBoost regressor with otherwise default settings.

    Builds ``xgb.XGBRegressor(random_state=random_state)`` and hands it to
    ``val_cruz``, which prints and returns the cross-validation metrics.

    Returns
    -------
    tuple
        ``(model, rmse_mean, rmse_std, mape_mean, mape_std, mae_mean,
        mae_std, r2_mean, r2_std)``.

    NOTE(review): this function never calls ``fit`` on the returned estimator;
    confirm whether it is fitted before using it for prediction.
    """
    regressor = xgb.XGBRegressor(random_state=random_state)
    cv_summary = val_cruz(regressor, X, y, random_state=random_state)
    return (regressor, *cv_summary)



In [11]:
# Run the XGBoost cross-validation; the estimator itself is discarded and only
# the summary metrics are kept for the comparison table.
(
    _,
    xgb_rmse_mean, xgb_rmse_std,
    xgb_mape_mean, xgb_mape_std,
    xgb_mae_mean, xgb_mae_std,
    xgb_r2_mean, xgb_r2_std,
) = xgb_model(X, y)

 Resultados validación cruzada
----------------------------------------
RMSE: [1.55881093 1.55783141 1.57604264 1.59898198 1.56671496]
MAPE: [0.07021883 0.06975711 0.07046245 0.0723097  0.07123017]
MAE:  [1.19527851 1.2003163  1.20203801 1.23148647 1.21596464]
R²:   [0.08391065 0.10133588 0.05515318 0.08165841 0.09867792]
----------------------------------------
RMSE medio: 1.5717 ± 0.0151
MAPE medio:  0.0708 ± 0.0009
MAE medio:  1.2090 ± 0.0132
R² medio:   0.0841 ± 0.0165


### Gradient Boosting Regression

In [12]:
from sklearn.ensemble import GradientBoostingRegressor

In [13]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

def gb_model(X, y, random_state=42):
    """Cross-validate a scikit-learn gradient boosting regressor with default settings.

    Builds ``GradientBoostingRegressor(random_state=random_state)`` and hands
    it to ``val_cruz``, which prints and returns the cross-validation metrics.

    Returns
    -------
    tuple
        ``(model, rmse_mean, rmse_std, mape_mean, mape_std, mae_mean,
        mae_std, r2_mean, r2_std)``.

    NOTE(review): this function never calls ``fit`` on the returned estimator;
    confirm whether it is fitted before using it for prediction.
    """
    regressor = GradientBoostingRegressor(random_state=random_state)
    cv_summary = val_cruz(regressor, X, y, random_state=random_state)
    return (regressor, *cv_summary)


In [14]:
# Run the gradient boosting cross-validation; the estimator itself is
# discarded and only the summary metrics are kept for the comparison table.
(
    _,
    gb_rmse_mean, gb_rmse_std,
    gb_mape_mean, gb_mape_std,
    gb_mae_mean, gb_mae_std,
    gb_r2_mean, gb_r2_std,
) = gb_model(X, y)

 Resultados validación cruzada
----------------------------------------
RMSE: [1.53691219 1.53899759 1.53170755 1.57329385 1.54787468]
MAPE: [0.06995013 0.06985232 0.06926573 0.07208321 0.07096129]
MAE:  [1.19287987 1.20164257 1.18187742 1.22767995 1.20977036]
R²:   [0.10946896 0.12293381 0.10756379 0.11092826 0.120225  ]
----------------------------------------
RMSE medio: 1.5458 ± 0.0147
MAPE medio:  0.0704 ± 0.0010
MAE medio:  1.2028 ± 0.0155
R² medio:   0.1142 ± 0.0062


### Random Forest

In [15]:
from sklearn.ensemble import RandomForestRegressor

def rf_regression(X, y, random_state=42):
    """Cross-validate a random forest regressor with otherwise default settings.

    Builds ``RandomForestRegressor(random_state=random_state)`` and hands it
    to ``val_cruz``, which prints and returns the cross-validation metrics.

    Returns
    -------
    tuple
        ``(model, rmse_mean, rmse_std, mape_mean, mape_std, mae_mean,
        mae_std, r2_mean, r2_std)``.

    NOTE(review): this function never calls ``fit`` on the returned estimator;
    confirm whether it is fitted before using it for prediction.
    """
    regressor = RandomForestRegressor(random_state=random_state)
    cv_summary = val_cruz(regressor, X, y, random_state=random_state)
    return (regressor, *cv_summary)


In [16]:
# Run the random forest cross-validation; the estimator itself is discarded
# and only the summary metrics are kept for the comparison table.
(
    _,
    rf_rmse_mean, rf_rmse_std,
    rf_mape_mean, rf_mape_std,
    rf_mae_mean, rf_mae_std,
    rf_r2_mean, rf_r2_std,
) = rf_regression(X, y)

 Resultados validación cruzada
----------------------------------------
RMSE: [1.48696377 1.49855724 1.49232406 1.51386006 1.48840559]
MAPE: [0.06604188 0.06640489 0.0657942  0.06750525 0.06659482]
MAE:  [1.12533629 1.14012918 1.12135608 1.14807161 1.13390717]
R²:   [0.16641149 0.16842167 0.15286669 0.17683181 0.186528  ]
----------------------------------------
RMSE medio: 1.4960 ± 0.0098
MAPE medio:  0.0665 ± 0.0006
MAE medio:  1.1338 ± 0.0097
R² medio:   0.1702 ± 0.0112


Vamos a crear una tabla que compare las métricas de validación cruzada de los tres modelos.

In [17]:
def _fmt_mean_std(mean, std):
    """Render a mean/std pair as 'mean ± std' with three decimals."""
    return f"{mean:.3f} ± {std:.3f}"


# One row per algorithm; the cells follow the column order given below.
tabla_resultados = pd.DataFrame(
    [
        [
            "XGBoost",
            _fmt_mean_std(xgb_rmse_mean, xgb_rmse_std),
            _fmt_mean_std(xgb_mape_mean, xgb_mape_std),
            _fmt_mean_std(xgb_mae_mean, xgb_mae_std),
            _fmt_mean_std(xgb_r2_mean, xgb_r2_std),
        ],
        [
            "Gradient Boosting",
            _fmt_mean_std(gb_rmse_mean, gb_rmse_std),
            _fmt_mean_std(gb_mape_mean, gb_mape_std),
            _fmt_mean_std(gb_mae_mean, gb_mae_std),
            _fmt_mean_std(gb_r2_mean, gb_r2_std),
        ],
        [
            "Random Forest",
            _fmt_mean_std(rf_rmse_mean, rf_rmse_std),
            _fmt_mean_std(rf_mape_mean, rf_mape_std),
            _fmt_mean_std(rf_mae_mean, rf_mae_std),
            _fmt_mean_std(rf_r2_mean, rf_r2_std),
        ],
    ],
    columns=[
        "Algoritmo",
        "RMSE (mean ± std)",
        "MAPE (mean ± std)",
        "MAE (mean ± std)",
        "R² (mean ± std)",
    ],
)

tabla_resultados


Unnamed: 0,Algoritmo,RMSE (mean ± std),MAPE (mean ± std),MAE (mean ± std),R² (mean ± std)
0,XGBoost,1.572 ± 0.015,0.071 ± 0.001,1.209 ± 0.013,0.084 ± 0.016
1,Gradient Boosting,1.546 ± 0.015,0.070 ± 0.001,1.203 ± 0.016,0.114 ± 0.006
2,Random Forest,1.496 ± 0.010,0.066 ± 0.001,1.134 ± 0.010,0.170 ± 0.011


Vemos que Random Forest obtiene el menor error en las métricas RMSE, MAPE y MAE y el mayor R², por lo que es el algoritmo más adecuado de los tres para nuestro dataset, aunque su R² (≈ 0,17) sigue siendo bajo en términos absolutos. Además, la desviación típica entre particiones es pequeña, por lo que los resultados son estables.