## Modelado

Vamos a elegir 3 modelos de regresión y estudiar cuál se ajusta mejor a nuestros datos.

In [3]:
import pandas as pd
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

df = pd.read_csv('spotify_limpio.csv')

df.head()

Unnamed: 0,Artist,Track,Album,Album_type,Stream,Danceability,Energy,Key,Loudness,Speechiness,Acousticness,Liveness,Valence,Tempo,Tempo_category,Duration_ms,Licensed,official_video
0,Cults,Gilded Lily,Offering,album,83194819.0,0.4,0.571,B,-6.534,0.0288,0.468,0.519,0.146,61.859,Adagio,212736.0,True,True
1,Los Pericos,Runaway,Pericos & Friends,album,101408935.0,0.676,0.731,A,-5.253,0.0368,0.0438,0.624,0.812,150.151,Allegro,233268.0,False,False
2,Richard Marx,Satisfied,Repeat Offender,album,4281404.0,0.572,0.914,G# / Ab,-8.436,0.0311,0.0717,0.319,0.843,108.991,Andante,254467.0,False,True
3,The Supremes,Come See About Me,Where Did Our Love Go,album,70357721.0,0.732,0.506,G,-11.735,0.0462,0.688,0.146,0.867,126.325,Allegro,163093.0,False,False
4,El Fantasma,Equipo RR,Equipo RR,single,13232046.0,0.755,0.55,G,-6.102,0.107,0.618,0.0868,0.926,114.501,Moderato / Allegretto,159976.0,False,False


In [4]:
df = df.drop(columns=[ 'Artist', 'Track', 'Album', 'Album_type','Key', 'Tempo_category'])

In [5]:
import pandas as pd

X = df.drop(columns='Stream')
y = df['Stream']


In [6]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np

def val_cruz(model, X, y, n_splits=5, random_state=42):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = cross_val_score(
        model, X, y,
        scoring='neg_root_mean_squared_error',
        cv=kf
    )

    print(f"RMSE medio (CV): {scores.mean():.4f}")
    print(f"Desviación estándar RMSE (CV): {scores.std():.4f}")
    return -scores.mean(), scores.std()



#### XGboost

In [18]:
import xgboost as xgb

def xgb_model(
    X,
    y,
    n_splits=5,
    random_state=42,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    reg_alpha=0.1, 
    reg_lambda=1, 
    eval_metric="logloss"
):
    
    model = xgb.XGBRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state,
        reg_alpha = reg_alpha,
        reg_lambda=reg_lambda,
        eval_metric=eval_metric
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)

    # Predictions and evaluation
    y_pred = model.predict(X_test)
    
    
    mean_rmse, std_rmse = val_cruz(
        model,
        X,
        y,
        n_splits=n_splits,
        random_state=random_state
    )
    

    
    return mean_rmse, std_rmse, model



In [19]:
xgb_model(X,y)

RMSE medio (CV): -237479021.1131
Desviación estándar RMSE (CV): 5739022.9103


(np.float64(237479021.1130794),
 np.float64(5739022.910323551),
 XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=300, n_jobs=None,
              num_parallel_tree=None, ...))

### Gradient Boosting Regresion

In [9]:
from sklearn.ensemble import GradientBoostingRegressor

In [14]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

def gradient_boosting_regression_model(
    X,
    y,
    n_splits=5,
    random_state=42,
):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = GradientBoostingRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    
    mean_rmse, std_rmse = val_cruz(
        model,
        X,
        y,
        n_splits=n_splits,
        random_state=random_state
    )
    
    return mean_rmse, std_rmse, model


In [15]:
gradient_boosting_regression_model(X,y)

RMSE medio (CV): -238250429.8340
Desviación estándar RMSE (CV): 5571960.5409


(np.float64(238250429.8339708),
 np.float64(5571960.540932735),
 GradientBoostingRegressor(max_depth=6, random_state=42))

## Random forest

In [20]:
from sklearn.ensemble import RandomForestRegressor

def random_forest_regression_model(
    X,
    y,
    n_splits=5,
    random_state=42,
    n_estimators=300,
    max_depth=None,
):
    
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model.fit(X_train, y_train)
    
    mean_rmse, std_rmse = val_cruz(
        model,
        X,
        y,
        n_splits=n_splits,
        random_state=random_state
    )
    
    return mean_rmse, std_rmse, model


In [None]:
random_forest_regression_model(X, y)
