# Preliminares

Copiado de notebook de ETL

In [4]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [5]:
# params
target_col = 'fraction_listened'
test_size = 0.3
# fixed seed: without it every run produces a different train/test partition,
# so test metrics cannot be compared between runs or between models
random_state = 42

# Load and clean the raw data in a single chain (no in-place mutation), so that
# re-running this cell always starts from the CSV and yields the same frame.
df = (
    pd.read_csv('../podcast_dataset.csv')
    .drop_duplicates()
    .assign(
        # relative target: fraction of the episode that was listened to
        fraction_listened=lambda d: d['Listening_Time_minutes'] / d['Episode_Length_minutes'],
        # episode number is the last space-separated token of the title
        Episode_Number=lambda d: d['Episode_Title'].apply(lambda title: title.split(" ")[-1]).astype(int),
        # flag episodes without a guest before the NaNs are imputed
        no_guest=lambda d: d['Guest_Popularity_percentage'].isna(),
    )
    # impute missing guest popularity with a -1 sentinel
    .fillna({'Guest_Popularity_percentage': -1})
    # drop rows that still have NaNs in any other column, then reset the index
    .dropna()
    .reset_index(drop=True)
    # ordinal encoding of the sentiment
    .assign(
        sentiment_encoded=lambda d: d['Episode_Sentiment']
        .map({'Neutral': 0, 'Negative': -1, 'Positive': 1})
        .astype(int)
    )
    # drop the columns that are now redundant
    .drop(columns=['Episode_Title', 'Episode_Sentiment', 'Listening_Time_minutes'])
)

# Convention: the model target is the relative one (fraction listened)
X = df.drop(columns=[target_col])
y = df[[target_col]]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=float(test_size), random_state=random_state
)


In [6]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Number,no_guest,sentiment_encoded
30012,Market Masters,7.48,Business,26.42,Friday,Morning,87.46,0,23,False,0
15723,Sports Weekly,13.43,Sports,92.66,Tuesday,Evening,94.68,3,18,False,0
22062,Melody Mix,47.18,Music,99.54,Tuesday,Night,48.52,0,40,False,1
17759,True Crime Stories,51.98,True Crime,84.87,Monday,Night,48.25,2,16,False,-1
18317,Fitness First,14.27,Health,93.42,Thursday,Evening,-1.0,3,56,True,1


In [7]:
# Summarize the test features: column dtypes and non-null counts
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12113 entries, 26734 to 19454
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Podcast_Name                 12113 non-null  object 
 1   Episode_Length_minutes       12113 non-null  float64
 2   Genre                        12113 non-null  object 
 3   Host_Popularity_percentage   12113 non-null  float64
 4   Publication_Day              12113 non-null  object 
 5   Publication_Time             12113 non-null  object 
 6   Guest_Popularity_percentage  12113 non-null  float64
 7   Number_of_Ads                12113 non-null  int64  
 8   Episode_Number               12113 non-null  int64  
 9   no_guest                     12113 non-null  bool   
 10  sentiment_encoded            12113 non-null  int64  
dtypes: bool(1), float64(3), int64(3), object(4)
memory usage: 1.0+ MB


# HP search

See references

* [MLFlow + Optuna](https://mlflow.org/docs/latest/ml/traditional-ml/tutorials/hyperparameter-tuning/notebooks/hyperparameter-tuning-with-child-runs)

## Evaluación de un set de HPs

In [24]:
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import TargetEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import root_mean_squared_error, mean_absolute_error

In [12]:
# Split the feature columns by dtype: object columns are treated as categorical,
# every other column as numeric.
cat_cols = [col for col, dtype in X_train.dtypes.items() if dtype == 'object']
numeric_cols = [col for col, dtype in X_train.dtypes.items() if dtype != 'object']

cat_cols, numeric_cols

(['Podcast_Name', 'Genre', 'Publication_Day', 'Publication_Time'],
 ['Episode_Length_minutes',
  'Host_Popularity_percentage',
  'Guest_Popularity_percentage',
  'Number_of_Ads',
  'Episode_Number',
  'no_guest',
  'sentiment_encoded'])

In [14]:
# Shared 5-fold splitter; shuffling with a fixed seed so the folds are the same on every call
SPLITTER = KFold(n_splits=5, random_state=42, shuffle=True)

In [32]:
# Scoring functions evaluated on every CV fold, keyed by the label used to report them
METRICS = dict(
    RMSE=root_mean_squared_error,
    MAE=mean_absolute_error,
)

# Key of METRICS whose mean across folds the hyperparameter search optimizes
target_metric = 'RMSE'

In [30]:
# utils

def make_rf_pip(rf_params, cat_encoder):
    """
    Build a simple preprocessing + random forest pipeline where
    * Numerical columns (`numeric_cols`) are passed through
    * Categorical columns (`cat_cols`) are encoded using the passed `cat_encoder` encoder
    * A RandomForestRegressor is used as final estimator using `rf_params` parameters

    The forest defaults to `n_jobs=-1` (max usage of cores) and `random_state=42`
    (reproducible fits); both can be overridden through `rf_params`. The column
    transformer is also run with `n_jobs=-1`.

    Parameters
    ----------
    rf_params : dict
        Keyword arguments forwarded to `RandomForestRegressor`.
    cat_encoder : estimator
        Transformer applied to the categorical columns.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline (column transformer followed by the forest).
    """
    # Defaults go first so that any key present in rf_params wins; passing them as
    # explicit keywords next to **rf_params would raise TypeError on a duplicate key.
    forest_kwargs = {'n_jobs': -1, 'random_state': 42, **rf_params}

    preprocessor = make_column_transformer(
        ('passthrough', numeric_cols),
        (cat_encoder, cat_cols),
        sparse_threshold=0,  # always produce a dense matrix
        n_jobs=-1
    )
    return make_pipeline(preprocessor, RandomForestRegressor(**forest_kwargs))

def cv_model(rf_params, cat_encoder, X, y):
    """
    Cross-validate one hyperparameter configuration with the shared `SPLITTER`.

    For each fold a fresh pipeline from `make_rf_pip` is fitted on the relative
    target `y`; predictions and held-out targets are then multiplied by
    `Episode_Length_minutes` so that every metric in `METRICS` is computed on the
    absolute scale.

    Parameters
    ----------
    rf_params : dict
        Random forest parameters forwarded to `make_rf_pip`.
    cat_encoder : estimator
        Categorical encoder forwarded to `make_rf_pip`.
    X : pandas.DataFrame
        Features; must contain the `Episode_Length_minutes` column.
    y : pandas.Series
        Relative target, positionally aligned with `X` (the notebook passes a Series).

    Returns
    -------
    dict
        Maps every key of `METRICS` to an array holding one score per fold.
    """
    fold_scores = {name: [] for name in METRICS}

    for fit_idx, val_idx in SPLITTER.split(X):
        # features of the fitting and validation parts of this fold
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        val_lengths = X_val['Episode_Length_minutes']

        # fit on the relative target
        model = make_rf_pip(rf_params, cat_encoder)
        model.fit(X_fit, y.iloc[fit_idx])

        # bring ground truth and predictions back to the absolute scale
        actual_abs = y.iloc[val_idx] * val_lengths
        predicted_abs = model.predict(X_val) * val_lengths

        # score this fold with every metric
        for name, score_fn in METRICS.items():
            fold_scores[name].append(score_fn(actual_abs, predicted_abs))

    return {name: np.asarray(scores, dtype=float) for name, scores in fold_scores.items()}

Mostramos un ejemplo de validación cruzada para un conjunto de hiperparámetros de prueba.

In [31]:
# Example run: cross-validate a single hand-picked configuration on the training split
cv_model(
    rf_params={'n_estimators':50, 'max_depth':4, 'min_samples_split':3},
    cat_encoder=TargetEncoder(target_type='continuous', random_state=42),
    X=X_train,
    y=y_train[target_col]
)

{'RMSE': array([11.25387199, 11.31476545, 11.19360097, 10.95370621, 11.08710962]),
 'MAE': array([8.87480125, 8.86869128, 8.83628072, 8.61340798, 8.73323277])}

## Optimización de HPs

In [10]:
# Override Optuna's default logging level so that only errors are reported
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Logging callback that reports only when a trial has usurped the state of
# 'best conditions', i.e. when the study's best value has changed.
def champion_callback(study, frozen_trial):
    """
    Logging callback that reports when a new trial improves upon the existing
    best trial value.

    The last reported best value is kept in the study's user attributes under the
    key "winner". Nothing is printed when the best value did not change.

    Parameters
    ----------
    study : optuna.study.Study
        Study being optimized; read for `best_value` and `user_attrs`.
    frozen_trial : optuna.trial.FrozenTrial
        Trial that just finished; its `number` and `value` are reported.
    """
    try:
        best_value = study.best_value
    except ValueError:
        # Optuna raises ValueError while no trial has completed yet
        # (e.g. the first trials failed or were pruned): nothing to report.
        return

    winner = study.user_attrs.get("winner", None)

    # Compare against None explicitly: a best value of 0.0 is legitimate and must
    # not be treated as "no value".
    if best_value is None or winner == best_value:
        return

    study.set_user_attr("winner", best_value)

    if winner is None:
        print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")
        return

    # Relative change with respect to the new best value; guard the division so a
    # best value of exactly 0 does not raise, and use the magnitude so the
    # percentage is never negative.
    if best_value != 0:
        improvement_percent = (abs(winner - best_value) / abs(best_value)) * 100
    else:
        improvement_percent = float("inf")

    print(
        f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
        f"{improvement_percent: .4f}% improvement"
    )

In [None]:
def objective(trial):
    """
    Optuna objective: sample random forest hyperparameters from `trial`,
    cross-validate them on the training split with `cv_model`, and return the
    mean across folds of the metric named by `target_metric`.
    """
    # hyperparameter search space
    rf_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
    }

    # cross-validate the sampled configuration
    fold_scores = cv_model(
        rf_params=rf_params,
        cat_encoder=TargetEncoder(target_type='continuous', random_state=42),
        X=X_train,
        y=y_train[target_col],
    )

    # TODO: log params and metrics to MLflow
    # mlflow.log_params(rf_params)
    # for name, values in fold_scores.items():
    #    mlflow.log_metric(name.lower(), values.mean().item())

    # value optimized by the study
    return fold_scores[target_metric].mean()


Mostramos una corrida de ejemplo de la optimización.

In [35]:
# Seed the sampler (TPE is Optuna's default for single-objective studies) so the
# sequence of suggested hyperparameters is reproducible across runs.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

study.optimize(objective, n_trials=5, callbacks=[champion_callback])

Initial trial 0 achieved value: 11.103759426679199


In [38]:
# Hyperparameters of the best trial found by the study
challenger_params = study.best_params

challenger_params

{'n_estimators': 52, 'max_depth': 6, 'min_samples_split': 5}

# Entrenamiento usando mejores HP

In [42]:
# Fit the challenger pipeline on the whole training split with the tuned hyperparameters
pipe = make_rf_pip(
    rf_params=challenger_params,
    cat_encoder=TargetEncoder(target_type='continuous', random_state=42)
).fit(X_train, y_train[target_col])

# Convert relative targets and predictions to the absolute scale
# by multiplying them by the episode length
y_test_abs = y_test[target_col] * X_test['Episode_Length_minutes']
y_pred_abs = pipe.predict(X_test) * X_test['Episode_Length_minutes']

# Report every metric on the held-out test split
for metric_name, metric_fn in METRICS.items():
    print(metric_name, round(metric_fn(y_test_abs, y_pred_abs), 3))

RMSE 11.046
MAE 8.713


`pipe` es el *challenger*: si sus predicciones sobre **test** son mejores que las del modelo anterior (el *champion*), lo destrona y pasa a ser el nuevo modelo vigente.