In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import lightgbm as lgb
from tqdm.auto import tqdm

In [2]:
DATA_ROOT   = Path('./data')
WORKING_DIR = Path('./')
STUDY_PATH = WORKING_DIR / 'studies'

In [3]:
n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=False)

X = pd.read_parquet(DATA_ROOT / 'working_dataset.parquet')
y = pd.read_csv(DATA_ROOT / 'train_labels.csv')['imdb_score']

In [4]:
class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)
    
    def predict_proba(self, X):
        try:
            y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        except AttributeError:
            y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

In [5]:
def train(params):
    fitted_models = []
    cv_scores = []


    for idx_train, idx_valid in tqdm(cv.split(X, y), total=n_splits):
        X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        model = lgb.LGBMRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            callbacks=[lgb.log_evaluation(200), lgb.early_stopping(60)]
            # callbacks=[lgb.early_stopping(50, verbose=False)]
        )
        fitted_models.append(model)
    
        y_pred = model.predict(X_valid)

        mse_score = mean_squared_error(y_valid, y_pred)
        cv_scores.append(mse_score)
    
    model = VotingModel(fitted_models)

    return model, cv_scores

In [6]:
import joblib

# Create or load the study
def create_or_load_study(study_name, storage=None):
    try:
        study = joblib.load(f"{STUDY_PATH}/{study_name}.pkl")
        print(f"Loaded study '{study_name}' from file.")
    except FileNotFoundError:
        study = optuna.create_study(study_name=study_name, storage=storage, direction='maximize')
        print(f"Created new study '{study_name}'.")
    return study

# Function to save the study
def save_study(study, study_name):
    joblib.dump(study, f"{STUDY_PATH}/{study_name}.pkl")
    print(f"Study '{study_name}' saved to file.")

In [None]:
import optuna

def objective(trial):
    params = {
            'metric': 'rmse', 
            'random_state': 48,
            'n_estimators': 20000,
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
            'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
            'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
            'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
            'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
            'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
            'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100),
            'verbose': -1
        }

    model, cv_scores = train(params)

    #print(f'======= Mean of scores: {np.mean(cv_scores):.3f}±{2*np.std(cv_scores):.3f} =======')
    return np.mean(cv_scores)


study_name = 'optuna'
study = create_or_load_study(study_name)

def print_callback(study, trial):
    print(f"Current value: {trial.value}, Current params: {trial.params}")
    #print(f"Best value: {study.best_value}, Best params: {trial.best_trial.params}")

    study_df = study.trials_dataframe()
    save_study(study, study_name)
    study_df.to_csv('optuna.csv', index=False)


study.optimize(objective, n_trials=40, callbacks=[print_callback], n_jobs=1, show_progress_bar=True)

[I 2024-04-05 20:55:16,582] A new study created in memory with name: optuna


Created new study 'optuna'.


  0%|          | 0/40 [00:00<?, ?it/s]

  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),


  0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.412973
[400]	valid_0's rmse: 0.383718
[600]	valid_0's rmse: 0.379671
[800]	valid_0's rmse: 0.377533
[1000]	valid_0's rmse: 0.37687
Early stopping, best iteration is:
[1005]	valid_0's rmse: 0.376752
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.411374
[400]	valid_0's rmse: 0.387888
[600]	valid_0's rmse: 0.385843
Early stopping, best iteration is:
[546]	valid_0's rmse: 0.385762
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.445588
[400]	valid_0's rmse: 0.416044
[600]	valid_0's rmse: 0.409679
[800]	valid_0's rmse: 0.407315
[1000]	valid_0's rmse: 0.40531
[1200]	valid_0's rmse: 0.404378
Early stopping, best iteration is:
[1140]	valid_0's rmse: 0.404039
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.402237
[400]	valid_0's rmse: 0.379346
[600]	valid_0's rmse: 0.374881
Early stopping, best iterati

  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),


  0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.397854
[400]	valid_0's rmse: 0.37756
Early stopping, best iteration is:
[531]	valid_0's rmse: 0.375274
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.379437
[400]	valid_0's rmse: 0.372666
Early stopping, best iteration is:
[356]	valid_0's rmse: 0.372496
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.421237
[400]	valid_0's rmse: 0.396987
[600]	valid_0's rmse: 0.392567
[800]	valid_0's rmse: 0.389846
[1000]	valid_0's rmse: 0.388481
[1200]	valid_0's rmse: 0.387694
[1400]	valid_0's rmse: 0.386291
Early stopping, best iteration is:
[1403]	valid_0's rmse: 0.386281
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.386912
[400]	valid_0's rmse: 0.369499
Early stopping, best iteration is:
[451]	valid_0's rmse: 0.36805
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.37

  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),


  0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.480938
[400]	valid_0's rmse: 0.457258
[600]	valid_0's rmse: 0.44853
[800]	valid_0's rmse: 0.44495
[1000]	valid_0's rmse: 0.443844
[1200]	valid_0's rmse: 0.442426
[1400]	valid_0's rmse: 0.440486
Early stopping, best iteration is:
[1498]	valid_0's rmse: 0.439643
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.448285
[400]	valid_0's rmse: 0.43066
[600]	valid_0's rmse: 0.428146
[800]	valid_0's rmse: 0.426854
[1000]	valid_0's rmse: 0.425508
[1200]	valid_0's rmse: 0.424785
Early stopping, best iteration is:
[1151]	valid_0's rmse: 0.424712
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.502857
[400]	valid_0's rmse: 0.474711
[600]	valid_0's rmse: 0.469955
[800]	valid_0's rmse: 0.467726
[1000]	valid_0's rmse: 0.465995
[1200]	valid_0's rmse: 0.465688
[1400]	valid_0's rmse: 0.46478
Early stopping, best iteration is:
[1465]	valid_0's rmse: 0

  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),


  0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.470597
[400]	valid_0's rmse: 0.450809
[600]	valid_0's rmse: 0.443471
[800]	valid_0's rmse: 0.440211
[1000]	valid_0's rmse: 0.437548
[1200]	valid_0's rmse: 0.435114
[1400]	valid_0's rmse: 0.433776
[1600]	valid_0's rmse: 0.432867
[1800]	valid_0's rmse: 0.432384
Early stopping, best iteration is:
[1759]	valid_0's rmse: 0.432134
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.430612
[400]	valid_0's rmse: 0.420302
[600]	valid_0's rmse: 0.419489
Early stopping, best iteration is:
[586]	valid_0's rmse: 0.419313
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.473372
[400]	valid_0's rmse: 0.456057
[600]	valid_0's rmse: 0.452313
[800]	valid_0's rmse: 0.449707
[1000]	valid_0's rmse: 0.447385
[1200]	valid_0's rmse: 0.446163
Early stopping, best iteration is:
[1245]	valid_0's rmse: 0.445648
Training until validation scores don't improve for 6

  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),


  0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.456218
[400]	valid_0's rmse: 0.429179
[600]	valid_0's rmse: 0.422692
[800]	valid_0's rmse: 0.419202
[1000]	valid_0's rmse: 0.417955
[1200]	valid_0's rmse: 0.416488
[1400]	valid_0's rmse: 0.415011
Early stopping, best iteration is:
[1405]	valid_0's rmse: 0.414822
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.443573
[400]	valid_0's rmse: 0.415242
[600]	valid_0's rmse: 0.411181
Early stopping, best iteration is:
[637]	valid_0's rmse: 0.410828
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.474485
[400]	valid_0's rmse: 0.4404
[600]	valid_0's rmse: 0.433192
[800]	valid_0's rmse: 0.429443
[1000]	valid_0's rmse: 0.427663
[1200]	valid_0's rmse: 0.426276
[1400]	valid_0's rmse: 0.425999
Early stopping, best iteration is:
[1373]	valid_0's rmse: 0.425711
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.4

  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),


  0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.469946
[400]	valid_0's rmse: 0.449291
[600]	valid_0's rmse: 0.442998
[800]	valid_0's rmse: 0.439146
[1000]	valid_0's rmse: 0.437508
[1200]	valid_0's rmse: 0.435623
[1400]	valid_0's rmse: 0.434506
[1600]	valid_0's rmse: 0.433242
[1800]	valid_0's rmse: 0.432141
[2000]	valid_0's rmse: 0.431279
Early stopping, best iteration is:
[2031]	valid_0's rmse: 0.431162
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.453932
[400]	valid_0's rmse: 0.434012
[600]	valid_0's rmse: 0.429318
[800]	valid_0's rmse: 0.426729
[1000]	valid_0's rmse: 0.425541
Early stopping, best iteration is:
[1072]	valid_0's rmse: 0.425171
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.495552
[400]	valid_0's rmse: 0.472153
[600]	valid_0's rmse: 0.466904
[800]	valid_0's rmse: 0.464745
[1000]	valid_0's rmse: 0.462774
[1200]	valid_0's rmse: 0.461387
[1400]	valid_0's rmse: 

  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),


  0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.499189
[400]	valid_0's rmse: 0.424674
[600]	valid_0's rmse: 0.408244
[800]	valid_0's rmse: 0.402268
[1000]	valid_0's rmse: 0.399629
[1200]	valid_0's rmse: 0.398332
[1400]	valid_0's rmse: 0.397217
[1600]	valid_0's rmse: 0.395715
[1800]	valid_0's rmse: 0.395278
Early stopping, best iteration is:
[1761]	valid_0's rmse: 0.395188
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.4719
[400]	valid_0's rmse: 0.394072
[600]	valid_0's rmse: 0.385059
Early stopping, best iteration is:
[649]	valid_0's rmse: 0.384579
Training until validation scores don't improve for 60 rounds
[200]	valid_0's rmse: 0.530673
[400]	valid_0's rmse: 0.452309
[600]	valid_0's rmse: 0.432603
[800]	valid_0's rmse: 0.425043
[1000]	valid_0's rmse: 0.42053
[1200]	valid_0's rmse: 0.418256
[1400]	valid_0's rmse: 0.416873
[1600]	valid_0's rmse: 0.41582
Early stopping, best iteration is:
[1619]	valid_0's rmse: 0

In [None]:
params = {
        'metric': 'rmse', 
        'random_state': 48,
        'n_estimators': 20000,
        #'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        #'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        #'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        #'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        #'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        #'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        #'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        #'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        #'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100),
        'verbose': -1

    }
    
model, cv_scores = train(params)

In [None]:
f'{np.mean(cv_scores): .4f} ± {2*np.std(cv_scores): .4f}' # before 0.8577 ± 0.0048

## Feature Importance

In [None]:
import shap

shap_acc = None
X_acc = None

for i, (idx_train, idx_valid) in tqdm(enumerate(cv.split(X, y)), total=n_splits):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    
    explainer = shap.TreeExplainer(model.estimators[i])
    shap_values = explainer.shap_values(X_train)
    
    if shap_acc is not None:
        shap_acc = np.concatenate([shap_acc, shap_values])
    else:
        shap_acc = shap_values
    
    if X_acc is not None:
        X_acc = pd.concat([X_acc, X_train])
    else:
        X_acc = X_train

In [None]:
X.shape

In [None]:
shap.summary_plot(shap_acc, X_acc)