## Modelado

Vamos a elegir 3 modelos de regresión y estudiar cuál se ajusta mejor a nuestros datos.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the training CSV into the working DataFrame used by the rest of the notebook.
df = pd.read_csv('data/spotify_limpio_train.csv')

# Preview the first rows to check that the file loaded with the expected columns.
df.head()

Unnamed: 0,Artist,Track,Album,Album_type,Stream,Danceability,Energy,Key,Key_name,Loudness,Speechiness,Acousticness,Liveness,Valence,Tempo,Tempo_category,Duration_ms,Licensed,official_video
0,KAYTRANADA,LITE SPOTS,99.9%,album,29240851.0,0.884,0.549,1,C# / Db,-11.683,0.471,0.0346,0.112,0.394,120.461,Moderato / Allegretto,230920.0,True,True
1,Rita Ora,Follow Me,Follow Me,single,82813284.0,0.673,0.729,6,F# / Gb,-4.879,0.0496,0.083,0.1,0.675,122.023,Allegro,169672.0,True,True
2,Big Sean,Bounce Back,I Decided.,album,666145000.0,0.78,0.575,1,C# / Db,-5.628,0.139,0.106,0.129,0.273,81.502,Andante,222360.0,True,True
3,Dave Matthews Band,#41,Crash,album,34528391.0,0.577,0.726,4,E,-8.011,0.0299,0.00199,0.158,0.764,107.416,Andante,399800.0,True,True
4,Ray Dalton,Call It Love - Klingande Remix,Call It Love (Klingande Remix),single,1153344.0,0.671,0.687,10,A# / Bb,-8.669,0.0502,0.0111,0.504,0.623,121.992,Allegro,149508.0,True,True


In [2]:
# Drop the text-valued columns (artist/track/album identifiers and the
# Album_type, Key_name and Tempo_category labels). In the preview above the
# remaining columns are numeric or boolean.
# NOTE(review): this rebinds `df`, so re-running the cell without reloading the
# CSV raises KeyError because the columns are already gone.
df = df.drop(columns=[ 'Artist', 'Track', 'Album', 'Album_type', 'Key_name', 'Tempo_category'])

Con base en las conclusiones del feature engineering, vamos a transformar la columna `Stream` con un logaritmo (`log1p`, es decir, log(1 + x)).

In [3]:
# np.log1p computes log(1 + x), so a stream count of 0 maps to 0 instead of -inf.
df["log_stream"] = np.log1p(df["Stream"])

In [4]:
# Preview the frame to confirm the text columns are gone and log_stream was added.
df.head()

Unnamed: 0,Stream,Danceability,Energy,Key,Loudness,Speechiness,Acousticness,Liveness,Valence,Tempo,Duration_ms,Licensed,official_video,log_stream
0,29240851.0,0.884,0.549,1,-11.683,0.471,0.0346,0.112,0.394,120.461,230920.0,True,True,17.191077
1,82813284.0,0.673,0.729,6,-4.879,0.0496,0.083,0.1,0.675,122.023,169672.0,True,True,18.232099
2,666145000.0,0.78,0.575,1,-5.628,0.139,0.106,0.129,0.273,81.502,222360.0,True,True,20.317018
3,34528391.0,0.577,0.726,4,-8.011,0.0299,0.00199,0.158,0.764,107.416,399800.0,True,True,17.357293
4,1153344.0,0.671,0.687,10,-8.669,0.0502,0.0111,0.504,0.623,121.992,149508.0,True,True,13.958177


In [5]:
import pandas as pd

# Target: the log1p-transformed stream count.
y = df['log_stream']
# Features: every remaining column except the raw target and its log transform
# (keeping either one in X would leak the target into the features).
X = df.drop(columns=['Stream', 'log_stream'])


In [6]:
from sklearn.model_selection import cross_val_score
import numpy as np


def val_cruz(model, X, y, n_splits=5, random_state=42):
    """Cross-validate a regressor and report RMSE, MAPE, MAE and R².

    All four metrics are computed in a single ``cross_validate`` pass, so the
    model is fitted once per fold (instead of once per fold *per metric*) and
    every metric is measured on exactly the same fitted estimators.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor, handed to
        ``cross_validate``.
    X : array-like
        Feature matrix.
    y : array-like
        Regression target.
    n_splits : int, default 5
        Number of folds; passed straight through as ``cv``. Per the
        scikit-learn docs an integer ``cv`` means consecutive, unshuffled
        K-fold splits for a regressor.
    random_state : int, default 42
        Currently unused. It is kept in the signature because callers pass it,
        but the folds are not shuffled so there is nothing to seed here.
        NOTE(review): if shuffled folds are wanted, build
        ``KFold(n_splits, shuffle=True, random_state=random_state)`` and pass
        it as ``cv`` -- that changes the reported numbers.

    Returns
    -------
    tuple
        ``(rmse_mean, rmse_std, mape_mean, mape_std, mae_mean, mae_std,
        r2_mean, r2_std)`` computed over the folds. Error metrics are returned
        as positive values.
    """
    # Local import: keeps this function usable even if the notebook's import
    # cell only brought in ``cross_val_score``.
    from sklearn.model_selection import cross_validate

    # Short aliases -> scikit-learn scorer names. Results come back under
    # "test_<alias>".
    scoring = {
        "rmse": "neg_root_mean_squared_error",
        "mape": "neg_mean_absolute_percentage_error",
        "mae": "neg_mean_absolute_error",
        "r2": "r2",
    }

    cv_results = cross_validate(model, X, y, cv=n_splits, scoring=scoring)

    # scikit-learn reports error metrics negated (higher is better); flip the
    # sign so the printed and returned errors are positive.
    scores_rmse = -cv_results["test_rmse"]
    scores_mape = -cv_results["test_mape"]
    scores_mae = -cv_results["test_mae"]
    scores_r2 = cv_results["test_r2"]

    print(" Resultados validación cruzada")
    print("-" * 40)
    print(f"RMSE: {scores_rmse}")
    print(f"MAPE: {scores_mape}")
    print(f"MAE:  {scores_mae}")
    print(f"R²:   {scores_r2}")
    print("-" * 40)
    print(f"RMSE medio: {scores_rmse.mean():.4f} ± {scores_rmse.std():.4f}")
    print(f"MAPE medio:  {scores_mape.mean():.4f} ± {scores_mape.std():.4f}")
    print(f"MAE medio:  {scores_mae.mean():.4f} ± {scores_mae.std():.4f}")
    print(f"R² medio:   {scores_r2.mean():.4f} ± {scores_r2.std():.4f}")

    return (
        scores_rmse.mean(), scores_rmse.std(),
        scores_mape.mean(), scores_mape.std(),
        scores_mae.mean(), scores_mae.std(),
        scores_r2.mean(), scores_r2.std(),
    )



### XGBoost

In [7]:
import xgboost as xgb

def xgb_model(X, y, random_state=42):
    """Cross-validate an XGBoost regressor built with default hyperparameters.

    Creates ``xgb.XGBRegressor`` with only ``random_state`` set, evaluates it
    through ``val_cruz`` (which also prints the per-fold report) and returns
    the estimator followed by the values ``val_cruz`` returns:
    ``(model, rmse_mean, rmse_std, mape_mean, mape_std, mae_mean, mae_std,
    r2_mean, r2_std)``.

    The returned estimator is the instance created here; this function never
    calls ``fit`` on it directly.
    """
    regressor = xgb.XGBRegressor(random_state=random_state)
    cv_summary = val_cruz(regressor, X, y, random_state=random_state)
    return (regressor, *cv_summary)



In [8]:
# Cross-validate XGBoost. The returned estimator is discarded (`_`); only the
# mean/std of each metric is kept under the `xgb_` prefix.
_, xgb_rmse_mean, xgb_rmse_std, xgb_mape_mean, xgb_mape_std, xgb_mae_mean, xgb_mae_std, xgb_r2_mean, xgb_r2_std = xgb_model(X,y)

 Resultados validación cruzada
----------------------------------------
RMSE: [1.5794513  1.59280892 1.54401715 1.55158945 1.57440551]
MAPE: [0.07112502 0.07173187 0.06965711 0.06958145 0.07001354]
MAE:  [1.21920649 1.21812299 1.19750145 1.18317914 1.19549893]
R²:   [0.08888849 0.05011301 0.09770443 0.0795479  0.07454496]
----------------------------------------
RMSE medio: 1.5685 ± 0.0181
MAPE medio:  0.0704 ± 0.0009
MAE medio:  1.2027 ± 0.0139
R² medio:   0.0782 ± 0.0161


### Gradient Boosting Regression

In [9]:
from sklearn.ensemble import GradientBoostingRegressor

In [10]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

def gb_model(X, y, random_state=42):
    """Cross-validate a scikit-learn gradient boosting regressor with defaults.

    Creates ``GradientBoostingRegressor`` with only ``random_state`` set,
    evaluates it through ``val_cruz`` (which also prints the per-fold report)
    and returns the estimator followed by the values ``val_cruz`` returns:
    ``(model, rmse_mean, rmse_std, mape_mean, mape_std, mae_mean, mae_std,
    r2_mean, r2_std)``.

    The returned estimator is the instance created here; this function never
    calls ``fit`` on it directly.
    """
    regressor = GradientBoostingRegressor(random_state=random_state)
    cv_summary = val_cruz(regressor, X, y, random_state=random_state)
    return (regressor, *cv_summary)


In [11]:
# Cross-validate Gradient Boosting. The returned estimator is discarded (`_`);
# only the mean/std of each metric is kept under the `gb_` prefix.
_, gb_rmse_mean, gb_rmse_std, gb_mape_mean, gb_mape_std, gb_mae_mean, gb_mae_std, gb_r2_mean, gb_r2_std= gb_model(X,y)

 Resultados validación cruzada
----------------------------------------
RMSE: [1.54637269 1.55944625 1.53430175 1.52886103 1.54074269]
MAPE: [0.07082065 0.07151713 0.06983657 0.06922024 0.07015656]
MAE:  [1.21351459 1.21364535 1.20205802 1.17662663 1.19952245]
R²:   [0.12665188 0.08948857 0.10902371 0.10631683 0.11369672]
----------------------------------------
RMSE medio: 1.5419 ± 0.0106
MAPE medio:  0.0703 ± 0.0008
MAE medio:  1.2011 ± 0.0135
R² medio:   0.1090 ± 0.0120


### Random Forest

In [12]:
from sklearn.ensemble import RandomForestRegressor

def rf_regression(X, y, random_state=42):
    """Cross-validate a scikit-learn random forest regressor with defaults.

    Creates ``RandomForestRegressor`` with only ``random_state`` set,
    evaluates it through ``val_cruz`` (which also prints the per-fold report)
    and returns the estimator followed by the values ``val_cruz`` returns:
    ``(model, rmse_mean, rmse_std, mape_mean, mape_std, mae_mean, mae_std,
    r2_mean, r2_std)``.

    The returned estimator is the instance created here; this function never
    calls ``fit`` on it directly.
    """
    regressor = RandomForestRegressor(random_state=random_state)
    cv_summary = val_cruz(regressor, X, y, random_state=random_state)
    return (regressor, *cv_summary)


In [13]:
# Cross-validate Random Forest. The returned estimator is discarded (`_`);
# only the mean/std of each metric is kept under the `rf_` prefix.
_, rf_rmse_mean, rf_rmse_std, rf_mape_mean, rf_mape_std, rf_mae_mean, rf_mae_std, rf_r2_mean, rf_r2_std= rf_regression(X, y)

 Resultados validación cruzada
----------------------------------------
RMSE: [1.4932487  1.50899241 1.49472118 1.46997601 1.48684814]
MAPE: [0.0663092  0.0673939  0.06605287 0.06471902 0.06591798]
MAE:  [1.13513109 1.14353177 1.13579512 1.09799088 1.12607675]
R²:   [0.18562705 0.14745229 0.15440002 0.1738326  0.17461732]
----------------------------------------
RMSE medio: 1.4908 ± 0.0127
MAPE medio:  0.0661 ± 0.0009
MAE medio:  1.1277 ± 0.0159
R² medio:   0.1672 ± 0.0141


Vamos a crear una tabla que compare las métricas de validación cruzada de los tres modelos.

In [14]:
# Comparison table: one row per algorithm and one "mean ± std" text column per
# cross-validation metric. Each (mean, std) list follows the row order
# XGBoost, Gradient Boosting, Random Forest.
tabla_resultados = pd.DataFrame({
    "Algoritmo": ["XGBoost", "Gradient Boosting", "Random Forest"],
    **{
        f"{metrica} (mean ± std)": [
            f"{media:.3f} ± {desviacion:.3f}" for media, desviacion in pares
        ]
        for metrica, pares in [
            ("RMSE", [
                (xgb_rmse_mean, xgb_rmse_std),
                (gb_rmse_mean, gb_rmse_std),
                (rf_rmse_mean, rf_rmse_std),
            ]),
            ("MAPE", [
                (xgb_mape_mean, xgb_mape_std),
                (gb_mape_mean, gb_mape_std),
                (rf_mape_mean, rf_mape_std),
            ]),
            ("MAE", [
                (xgb_mae_mean, xgb_mae_std),
                (gb_mae_mean, gb_mae_std),
                (rf_mae_mean, rf_mae_std),
            ]),
            ("R²", [
                (xgb_r2_mean, xgb_r2_std),
                (gb_r2_mean, gb_r2_std),
                (rf_r2_mean, rf_r2_std),
            ]),
        ]
    },
})

tabla_resultados


Unnamed: 0,Algoritmo,RMSE (mean ± std),MAPE (mean ± std),MAE (mean ± std),R² (mean ± std)
0,XGBoost,1.568 ± 0.018,0.070 ± 0.001,1.203 ± 0.014,0.078 ± 0.016
1,Gradient Boosting,1.542 ± 0.011,0.070 ± 0.001,1.201 ± 0.014,0.109 ± 0.012
2,Random Forest,1.491 ± 0.013,0.066 ± 0.001,1.128 ± 0.016,0.167 ± 0.014


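Random Forest obtiene el menor error en las métricas RMSE, MAPE y MAE y el mayor R², por lo que es el algoritmo que mejor se ajusta a nuestro dataset entre los tres comparados. Además, la desviación típica entre particiones es pequeña, por lo que los resultados son estables. Aun así, conviene tener en cuenta que incluso el mejor R² (≈ 0,17) es bajo: las variables disponibles explican solo una pequeña parte de la variabilidad de `log_stream`, y todas las métricas están calculadas sobre la escala logarítmica, no sobre el número de reproducciones original.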