Authors:
    <br>Alejandro Alvarez (axa)
    <br>Brenda Palma (bpalmagu)

# <center>ML-Jokes: Content-Based Filtering</center>

In [1]:
# Path to ml-jokes folder
import os
if os.getcwd().split('/')[-2] == 'ml-jokes': os.chdir('..')
print(f'Current directory: {os.getcwd()}')
assert set(['data', 'mljokes', 'environment.yml', 'nbs']) <= set(os.listdir()), \
    'Wrong path; go to ./heinz-95729-project/api/ml-jokes'

Current directory: /home/alejandroxag/my_files/heinz-95729-project/api/ml-jokes


In [2]:
# imports
import optuna
import pickle
import numpy as np
import pandas as pd
from mljokes.topics import get_lda_topics
from sklearn.metrics import mean_absolute_error
from mljokes.data import read_jokes, read_ratings
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score

# Regression model inputs

In [72]:
def get_model_inputs(val_size, test_size, sample_size, n_topics, random_state):
    jokes_df = read_jokes()
    ratings_df = read_ratings()

    # Topics
    topics = get_lda_topics(text=jokes_df.clean_text, 
                                        ids=jokes_df.joke_id,
                                        n_topics=n_topics, 
                                        random_state=random_state)
    topics.rename(columns={'id': 'joke_id'}, inplace=True)

    # User-joke information
    user_joke_info = ratings_df.copy()
    user_joke_info.insert(4, 'unseen_joke', (user_joke_info.rating == 99.0).astype(int))

    test_examples = user_joke_info.index[user_joke_info.unseen_joke == 0]
    test_examples = np.random.choice(test_examples,
                                    size=round(test_size * len(test_examples)),
                                    replace=False)
    user_joke_info.insert(5, 
                          'test_example', 
                          user_joke_info.index.isin(test_examples).astype(int))

    val_examples = user_joke_info.index[(user_joke_info.unseen_joke + \
                                        user_joke_info.test_example) == 0]
    val_examples = np.random.choice(val_examples,
                                    size=round(val_size * len(val_examples)),
                                    replace=False)
    user_joke_info.insert(6, 
                          'val_example', 
                          user_joke_info.index.isin(val_examples).astype(int))

    user_joke_info.insert(7, 
                          'training_example', 
                          ((user_joke_info.unseen_joke + \
                            user_joke_info.test_example + \
                            user_joke_info.val_example) == 0).astype(int))

    user_joke_info = user_joke_info.merge(topics, how='left', on='joke_id')
    user_joke_info.rename(columns=dict(zip(user_joke_info.columns[8:],
                                          ['topic_percentage_' + c  \
                                            for c in np.array(user_joke_info.columns[8:], 
                                                              dtype=str)])), 
                          inplace=True)
    user_joke_info.reset_index(drop=True, inplace=True)

    user_joke_info = user_joke_info.merge(jokes_df.loc[:, ['joke_id', 'len']], how='left', on='joke_id')

    user_joke_info = user_joke_info.loc[:, list(user_joke_info.columns[:3]) + \
                                       ['len'] + \
                                       list(user_joke_info.columns[3:-1])]

    if sample_size < 1.0:
        user_joke_info = user_joke_info.iloc[np.random.randint(0, 
                                                               len(user_joke_info), 
                                                               size=round(sample_size * \
                                                                          len(user_joke_info))), :]

    user_joke_info_training = user_joke_info[(user_joke_info.unseen_joke + \
                                          user_joke_info.test_example + \
                                          user_joke_info.val_example) == 0]
    
    user_joke_info_training.drop(columns=['test_user', 
                                          'unseen_joke', 
                                          'test_example', 
                                          'val_example', 
                                          'training_example'], 
                                inplace=True)

    user_profiles = user_joke_info_training.iloc[:, :2]
    user_profiles.loc[:, list(range(len(user_joke_info_training.columns[4:])))] = \
        (user_joke_info_training.rating / 10).values.reshape((-1, 1)) * \
        user_joke_info_training.iloc[:, 4:].values
    user_profiles = user_profiles.groupby(['user_id'])[user_profiles.columns[2:]].agg('mean').reset_index()
    user_profiles.rename(columns=dict(zip(user_profiles.columns[1:],
                                          ['topic_user_rating_' + c \
                                          for c in np.array(user_profiles.columns[1:], dtype=str)])), 
                        inplace=True)
    user_joke_info_training = user_joke_info_training.merge(user_profiles, how='left', on='user_id')

    user_joke_info = user_joke_info.merge(user_joke_info_training \
                                            .iloc[:, [0] + list(-np.arange(n_topics, 0, -1))] \
                                            .drop_duplicates(),
                                        how='left', on='user_id')

    user_joke_info = user_joke_info.loc[~np.logical_and(user_joke_info.topic_user_rating_0.isna(), 
                                                    user_joke_info.val_example + \
                                                    user_joke_info.test_example + \
                                                    user_joke_info.unseen_joke == 1), :]
    
    user_joke_info.sort_values(by=['user_id', 'joke_id'], inplace=True)
    user_joke_info.reset_index(drop=True, inplace=True)
    features = ['len'] + list(user_joke_info.columns[9:])
    
    train_idxs = user_joke_info.index[user_joke_info.training_example + user_joke_info.val_example == 1]
    test_idxs = user_joke_info.index[user_joke_info.test_example == 1]

    folds = user_joke_info.loc[train_idxs, ['training_example', 'val_example']].reset_index(drop=True)
    folds = [(folds.index[folds.training_example == 1], folds.index[folds.val_example == 1])]

    y_train = user_joke_info.rating[train_idxs].values
    X_train = user_joke_info.loc[train_idxs, features].values
    y_test = user_joke_info.rating[test_idxs].values
    X_test = user_joke_info.loc[test_idxs, features].values

    return {'user_joke_info': user_joke_info,
            'features': features,
            'X_train': X_train,
            'y_train': y_train,
            'X_test': X_test,
            'y_test': y_test,
            'folds': folds,
            'jokes_df': jokes_df,
            'topics': topics,}

# Regression model (HGB)

### Optuna hypertuning

In [69]:
def tune(objective, n_trials, file_name, save_path='./results'):
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    params = study.best_params
    best_score = study.best_value
    print(f"Best score: {best_score}\n")
    print(f"Optimized parameters: {params}\n")

    with open(f'{save_path}/{file_name}.pkl', 'wb') as f:
        pickle.dump(params, f)

    return params

def hgb_objective(trial):
    _learning_rate = trial.suggest_loguniform('learning_rate', low=1e-3, high=0.25)
    _max_depth = trial.suggest_int('max_depth', low=5, high=20, log=True)
    _l2_regularization = trial.suggest_loguniform('l2_regularization', low=1e-6, high=1)
    _min_samples_leaf = trial.suggest_int('min_samples_leaf', low=100, high=1_000)
    _tol = trial.suggest_loguniform('tol', low=1e-2, high=1e-1)
    _n_iter_no_change = trial.suggest_int('n_iter_no_change', low=10, high=100)
    _random_state = trial.suggest_int('random_state', 0, 1_000_000_000)

    fixed_params = {
                    'max_iter': 100_000,
                    'loss': 'absolute_error',
                    'max_leaf_nodes': None,
                    'early_stopping': True,
                    'scoring': 'loss',
                    'validation_fraction': 0.2,
                    # 'tol': 5e-2,
                    # 'n_iter_no_change': 50,
                    'verbose': 4,
                  }

    hgb = HistGradientBoostingRegressor(**fixed_params)
    
    hgb.set_params(**{
                      'learning_rate': _learning_rate,
                      'max_depth': _max_depth,
                      'l2_regularization': _l2_regularization,
                      'min_samples_leaf': _min_samples_leaf,
                      'tol': _tol,
                      'n_iter_no_change': _n_iter_no_change,
                      'random_state': _random_state
                      })

    print(f'\n\nCurrent model:\n\n{hgb.get_params()}\n')

    X = model_inputs['X_train']
    y = model_inputs['y_train']
    train_val_folds = model_inputs['folds']
    scores = cross_val_score(hgb, X, y, 
                             cv=train_val_folds, 
                             scoring="neg_mean_absolute_error")

    return scores.mean()

In [46]:
test_size = 0.15
val_size = 0.15
sample_size = 0.25
n_topics = 10
random_state = 0

model_inputs = get_model_inputs(val_size, test_size, sample_size, n_topics, random_state)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [56]:
hgb_params = tune(hgb_objective, n_trials=20, file_name='test_optuna2')

[32m[I 2021-11-27 13:44:57,225][0m A new study created in memory with name: no-name-58c37c4f-8496-41d6-b2b6-44b6f8629dfc[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 2.8204642290576796e-05, 'learning_rate': 0.044102916305382434, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 6, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 173, 'monotonic_cst': None, 'n_iter_no_change': 62, 'random_state': 830205047, 'scoring': 'loss', 'tol': 0.08059856466264227, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.858 s
Binning 0.025 GB of validation data: 0.126 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 64 leaves, max depth = 6, train loss: 4.32955, val loss: 4.33520, in 0.174s
[2/100000] 1 tree, 64 leaves, max depth = 6, train loss: 4.25298, val loss: 4.25839, in 0.160s
[3/100000] 1 tree, 64 leaves, max depth = 6, train loss: 4.18105, val loss: 4.18639, in 0.395s
[4/100000] 1 tree, 64 leaves, max depth = 6, train loss: 4.11311, val loss: 4.11852, in 0.221s
[5/100000] 1 tree, 64 leaves,

[32m[I 2021-11-27 13:45:29,962][0m Trial 0 finished with value: -3.8876998387783153 and parameters: {'learning_rate': 0.044102916305382434, 'max_depth': 6, 'l2_regularization': 2.8204642290576796e-05, 'min_samples_leaf': 173, 'tol': 0.08059856466264227, 'n_iter_no_change': 62, 'random_state': 830205047}. Best is trial 0 with value: -3.8876998387783153.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 0.02168029753677356, 'learning_rate': 0.001415445349883366, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 12, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 160, 'monotonic_cst': None, 'n_iter_no_change': 79, 'random_state': 812124917, 'scoring': 'loss', 'tol': 0.08906732523947954, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.887 s
Binning 0.025 GB of validation data: 0.092 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 1352 leaves, max depth = 12, train loss: 4.40778, val loss: 4.41311, in 0.826s
[2/100000] 1 tree, 1371 leaves, max depth = 12, train loss: 4.40401, val loss: 4.40939, in 0.812s
[3/100000] 1 tree, 1364 leaves, max depth = 12, train loss: 4.40024, val loss: 4.40567, in 0.788s
[4/100000] 1 tree, 1368 leaves, max depth = 12, train loss: 4.39648, val loss: 4.40196, in 0.785s
[5/100000] 1 tree, 

[32m[I 2021-11-27 13:54:14,732][0m Trial 1 finished with value: -3.9744934105310463 and parameters: {'learning_rate': 0.001415445349883366, 'max_depth': 12, 'l2_regularization': 0.02168029753677356, 'min_samples_leaf': 160, 'tol': 0.08906732523947954, 'n_iter_no_change': 79, 'random_state': 812124917}. Best is trial 0 with value: -3.8876998387783153.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 0.003675731545372673, 'learning_rate': 0.06461169725805752, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 5, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 838, 'monotonic_cst': None, 'n_iter_no_change': 51, 'random_state': 818719848, 'scoring': 'loss', 'tol': 0.014377252348738231, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.804 s
Binning 0.025 GB of validation data: 0.081 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.30497, val loss: 4.30725, in 0.135s
[2/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.20774, val loss: 4.21085, in 0.120s
[3/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.11732, val loss: 4.12109, in 0.117s
[4/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.03548, val loss: 4.04013, in 0.117s
[5/100000] 1 tree, 32 leaves, m

[32m[I 2021-11-27 13:55:07,656][0m Trial 2 finished with value: -4.015267084832223 and parameters: {'learning_rate': 0.06461169725805752, 'max_depth': 5, 'l2_regularization': 0.003675731545372673, 'min_samples_leaf': 838, 'tol': 0.014377252348738231, 'n_iter_no_change': 51, 'random_state': 818719848}. Best is trial 0 with value: -3.8876998387783153.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 5.188815353200851e-06, 'learning_rate': 0.004488484279995024, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 19, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 138, 'monotonic_cst': None, 'n_iter_no_change': 12, 'random_state': 660896967, 'scoring': 'loss', 'tol': 0.02971884839526376, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.944 s
Binning 0.025 GB of validation data: 0.084 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 3003 leaves, max depth = 19, train loss: 4.40141, val loss: 4.39452, in 1.205s
[2/100000] 1 tree, 3046 leaves, max depth = 19, train loss: 4.38882, val loss: 4.38228, in 1.201s
[3/100000] 1 tree, 3045 leaves, max depth = 19, train loss: 4.37631, val loss: 4.37013, in 1.225s
[4/100000] 1 tree, 2983 leaves, max depth = 19, train loss: 4.36382, val loss: 4.35801, in 1.325s
[5/100000] 1 tree

[32m[I 2021-11-27 14:01:19,223][0m Trial 3 finished with value: -3.957693746504444 and parameters: {'learning_rate': 0.004488484279995024, 'max_depth': 19, 'l2_regularization': 5.188815353200851e-06, 'min_samples_leaf': 138, 'tol': 0.02971884839526376, 'n_iter_no_change': 12, 'random_state': 660896967}. Best is trial 0 with value: -3.8876998387783153.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 3.4250244414691977e-06, 'learning_rate': 0.00839347327820516, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 7, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 652, 'monotonic_cst': None, 'n_iter_no_change': 61, 'random_state': 98391219, 'scoring': 'loss', 'tol': 0.07404686918012796, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.810 s
Binning 0.025 GB of validation data: 0.098 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 114 leaves, max depth = 7, train loss: 4.39739, val loss: 4.38737, in 0.197s
[2/100000] 1 tree, 114 leaves, max depth = 7, train loss: 4.38044, val loss: 4.37052, in 0.202s
[3/100000] 1 tree, 115 leaves, max depth = 7, train loss: 4.36362, val loss: 4.35381, in 0.154s
[4/100000] 1 tree, 115 leaves, max depth = 7, train loss: 4.34704, val loss: 4.33729, in 0.204s
[5/100000] 1 tree, 116 leav

[32m[I 2021-11-27 14:02:20,120][0m Trial 4 finished with value: -3.8671193948794778 and parameters: {'learning_rate': 0.00839347327820516, 'max_depth': 7, 'l2_regularization': 3.4250244414691977e-06, 'min_samples_leaf': 652, 'tol': 0.07404686918012796, 'n_iter_no_change': 61, 'random_state': 98391219}. Best is trial 4 with value: -3.8671193948794778.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 0.18562461162944977, 'learning_rate': 0.0012316727243264671, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 20, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 770, 'monotonic_cst': None, 'n_iter_no_change': 76, 'random_state': 612102684, 'scoring': 'loss', 'tol': 0.026790991216243388, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.802 s
Binning 0.025 GB of validation data: 0.084 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 577 leaves, max depth = 20, train loss: 4.40885, val loss: 4.41249, in 0.513s
[2/100000] 1 tree, 585 leaves, max depth = 20, train loss: 4.40584, val loss: 4.40950, in 0.469s
[3/100000] 1 tree, 573 leaves, max depth = 20, train loss: 4.40282, val loss: 4.40651, in 0.462s
[4/100000] 1 tree, 573 leaves, max depth = 20, train loss: 4.39981, val loss: 4.40353, in 0.468s
[5/100000] 1 tree, 57

[32m[I 2021-11-27 14:14:29,617][0m Trial 5 finished with value: -3.9131123741768463 and parameters: {'learning_rate': 0.0012316727243264671, 'max_depth': 20, 'l2_regularization': 0.18562461162944977, 'min_samples_leaf': 770, 'tol': 0.026790991216243388, 'n_iter_no_change': 76, 'random_state': 612102684}. Best is trial 4 with value: -3.8671193948794778.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 0.00048202884166468533, 'learning_rate': 0.1390202811380725, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 19, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 296, 'monotonic_cst': None, 'n_iter_no_change': 38, 'random_state': 613240236, 'scoring': 'loss', 'tol': 0.029417783554341034, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 1.092 s
Binning 0.025 GB of validation data: 0.163 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 1462 leaves, max depth = 19, train loss: 4.04975, val loss: 4.05837, in 0.874s
[2/100000] 1 tree, 1490 leaves, max depth = 19, train loss: 3.75151, val loss: 3.76738, in 0.892s
[3/100000] 1 tree, 1496 leaves, max depth = 19, train loss: 3.50737, val loss: 3.53003, in 0.809s
[4/100000] 1 tree, 1506 leaves, max depth = 19, train loss: 3.30626, val loss: 3.33545, in 0.833s
[5/100000] 1 tree

[32m[I 2021-11-27 14:15:24,340][0m Trial 6 finished with value: -4.072599119845924 and parameters: {'learning_rate': 0.1390202811380725, 'max_depth': 19, 'l2_regularization': 0.00048202884166468533, 'min_samples_leaf': 296, 'tol': 0.029417783554341034, 'n_iter_no_change': 38, 'random_state': 613240236}. Best is trial 4 with value: -3.8671193948794778.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 0.00011106602542718992, 'learning_rate': 0.11260609344570907, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 12, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 524, 'monotonic_cst': None, 'n_iter_no_change': 57, 'random_state': 519810070, 'scoring': 'loss', 'tol': 0.01286689612618834, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.840 s
Binning 0.025 GB of validation data: 0.084 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 640 leaves, max depth = 12, train loss: 4.13949, val loss: 4.12694, in 0.511s
[2/100000] 1 tree, 653 leaves, max depth = 12, train loss: 3.90208, val loss: 3.89268, in 0.475s
[3/100000] 1 tree, 641 leaves, max depth = 12, train loss: 3.69782, val loss: 3.69108, in 0.477s
[4/100000] 1 tree, 656 leaves, max depth = 12, train loss: 3.52777, val loss: 3.52378, in 0.490s
[5/100000] 1 tree, 67

[32m[I 2021-11-27 14:16:19,266][0m Trial 7 finished with value: -4.09674109449148 and parameters: {'learning_rate': 0.11260609344570907, 'max_depth': 12, 'l2_regularization': 0.00011106602542718992, 'min_samples_leaf': 524, 'tol': 0.01286689612618834, 'n_iter_no_change': 57, 'random_state': 519810070}. Best is trial 4 with value: -3.8671193948794778.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 0.0003637037583843697, 'learning_rate': 0.032624991512661135, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 12, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 856, 'monotonic_cst': None, 'n_iter_no_change': 34, 'random_state': 138255994, 'scoring': 'loss', 'tol': 0.02048992647488654, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.793 s
Binning 0.025 GB of validation data: 0.082 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 415 leaves, max depth = 12, train loss: 4.33506, val loss: 4.33180, in 0.388s
[2/100000] 1 tree, 412 leaves, max depth = 12, train loss: 4.26030, val loss: 4.25774, in 0.349s
[3/100000] 1 tree, 426 leaves, max depth = 12, train loss: 4.18864, val loss: 4.18702, in 0.395s
[4/100000] 1 tree, 432 leaves, max depth = 12, train loss: 4.12011, val loss: 4.11936, in 0.393s
[5/100000] 1 tree, 42

[32m[I 2021-11-27 14:17:22,260][0m Trial 8 finished with value: -3.9899319333226746 and parameters: {'learning_rate': 0.032624991512661135, 'max_depth': 12, 'l2_regularization': 0.0003637037583843697, 'min_samples_leaf': 856, 'tol': 0.02048992647488654, 'n_iter_no_change': 34, 'random_state': 138255994}. Best is trial 4 with value: -3.8671193948794778.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 0.0028293832772098315, 'learning_rate': 0.0015404686190992243, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 20, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 671, 'monotonic_cst': None, 'n_iter_no_change': 32, 'random_state': 140178333, 'scoring': 'loss', 'tol': 0.0241957713771533, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.950 s
Binning 0.025 GB of validation data: 0.087 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 677 leaves, max depth = 20, train loss: 4.40647, val loss: 4.41796, in 0.542s
[2/100000] 1 tree, 676 leaves, max depth = 20, train loss: 4.40264, val loss: 4.41417, in 0.512s
[3/100000] 1 tree, 679 leaves, max depth = 20, train loss: 4.39883, val loss: 4.41039, in 0.547s
[4/100000] 1 tree, 680 leaves, max depth = 20, train loss: 4.39502, val loss: 4.40661, in 0.514s
[5/100000] 1 tree, 67

[32m[I 2021-11-27 14:25:27,310][0m Trial 9 finished with value: -3.9199530174428006 and parameters: {'learning_rate': 0.0015404686190992243, 'max_depth': 20, 'l2_regularization': 0.0028293832772098315, 'min_samples_leaf': 671, 'tol': 0.0241957713771533, 'n_iter_no_change': 32, 'random_state': 140178333}. Best is trial 4 with value: -3.8671193948794778.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 3.105406144245114e-06, 'learning_rate': 0.008597652632011232, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 7, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 480, 'monotonic_cst': None, 'n_iter_no_change': 88, 'random_state': 296799738, 'scoring': 'loss', 'tol': 0.05022401732166865, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.878 s
Binning 0.025 GB of validation data: 0.089 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 116 leaves, max depth = 7, train loss: 4.39460, val loss: 4.39683, in 0.226s
[2/100000] 1 tree, 116 leaves, max depth = 7, train loss: 4.37723, val loss: 4.37949, in 0.191s
[3/100000] 1 tree, 117 leaves, max depth = 7, train loss: 4.36006, val loss: 4.36233, in 0.199s
[4/100000] 1 tree, 116 leaves, max depth = 7, train loss: 4.34303, val loss: 4.34535, in 0.185s
[5/100000] 1 tree, 117 lea

[32m[I 2021-11-27 14:26:51,152][0m Trial 10 finished with value: -3.878704763540726 and parameters: {'learning_rate': 0.008597652632011232, 'max_depth': 7, 'l2_regularization': 3.105406144245114e-06, 'min_samples_leaf': 480, 'tol': 0.05022401732166865, 'n_iter_no_change': 88, 'random_state': 296799738}. Best is trial 4 with value: -3.8671193948794778.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 1.7268131603042725e-06, 'learning_rate': 0.009504394709649432, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 7, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 493, 'monotonic_cst': None, 'n_iter_no_change': 96, 'random_state': 307496158, 'scoring': 'loss', 'tol': 0.054300052004819944, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 1.105 s
Binning 0.025 GB of validation data: 0.418 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 118 leaves, max depth = 7, train loss: 4.39294, val loss: 4.39354, in 0.397s
[2/100000] 1 tree, 118 leaves, max depth = 7, train loss: 4.37362, val loss: 4.37445, in 0.710s
[3/100000] 1 tree, 118 leaves, max depth = 7, train loss: 4.35459, val loss: 4.35563, in 0.731s
[4/100000] 1 tree, 117 leaves, max depth = 7, train loss: 4.33577, val loss: 4.33697, in 0.367s
[5/100000] 1 tree, 118 l

[32m[I 2021-11-27 14:28:14,280][0m Trial 11 finished with value: -3.8827030913941902 and parameters: {'learning_rate': 0.009504394709649432, 'max_depth': 7, 'l2_regularization': 1.7268131603042725e-06, 'min_samples_leaf': 493, 'tol': 0.054300052004819944, 'n_iter_no_change': 96, 'random_state': 307496158}. Best is trial 4 with value: -3.8671193948794778.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 1.3025875335465571e-05, 'learning_rate': 0.010800991214926146, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 8, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 388, 'monotonic_cst': None, 'n_iter_no_change': 100, 'random_state': 302761117, 'scoring': 'loss', 'tol': 0.05087030034684442, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.804 s
Binning 0.025 GB of validation data: 0.084 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 215 leaves, max depth = 8, train loss: 4.38878, val loss: 4.38882, in 0.260s
[2/100000] 1 tree, 210 leaves, max depth = 8, train loss: 4.36527, val loss: 4.36548, in 0.240s
[3/100000] 1 tree, 214 leaves, max depth = 8, train loss: 4.34210, val loss: 4.34243, in 0.239s
[4/100000] 1 tree, 210 leaves, max depth = 8, train loss: 4.31936, val loss: 4.31986, in 0.237s
[5/100000] 1 tree, 216 l

[32m[I 2021-11-27 14:29:47,955][0m Trial 12 finished with value: -3.9167847236667743 and parameters: {'learning_rate': 0.010800991214926146, 'max_depth': 8, 'l2_regularization': 1.3025875335465571e-05, 'min_samples_leaf': 388, 'tol': 0.05087030034684442, 'n_iter_no_change': 100, 'random_state': 302761117}. Best is trial 4 with value: -3.8671193948794778.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 1.29149619049349e-06, 'learning_rate': 0.006082226749174292, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 9, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 658, 'monotonic_cst': None, 'n_iter_no_change': 77, 'random_state': 321548448, 'scoring': 'loss', 'tol': 0.04837029172030643, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.791 s
Binning 0.025 GB of validation data: 0.081 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 283 leaves, max depth = 9, train loss: 4.40117, val loss: 4.38838, in 0.318s
[2/100000] 1 tree, 278 leaves, max depth = 9, train loss: 4.38718, val loss: 4.37447, in 0.304s
[3/100000] 1 tree, 274 leaves, max depth = 9, train loss: 4.37330, val loss: 4.36066, in 0.288s
[4/100000] 1 tree, 279 leaves, max depth = 9, train loss: 4.35956, val loss: 4.34698, in 0.282s
[5/100000] 1 tree, 281 leav

[32m[I 2021-11-27 14:32:08,205][0m Trial 13 finished with value: -3.9069148064183383 and parameters: {'learning_rate': 0.006082226749174292, 'max_depth': 9, 'l2_regularization': 1.29149619049349e-06, 'min_samples_leaf': 658, 'tol': 0.04837029172030643, 'n_iter_no_change': 77, 'random_state': 321548448}. Best is trial 4 with value: -3.8671193948794778.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 4.541527947195171e-05, 'learning_rate': 0.003261379715833991, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 5, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 983, 'monotonic_cst': None, 'n_iter_no_change': 89, 'random_state': 17480306, 'scoring': 'loss', 'tol': 0.06472450411779258, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.989 s
Binning 0.025 GB of validation data: 0.082 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.40829, val loss: 4.40254, in 0.155s
[2/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.40280, val loss: 4.39708, in 0.129s
[3/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.39735, val loss: 4.39166, in 0.132s
[4/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.39193, val loss: 4.38626, in 0.087s
[5/100000] 1 tree, 32 leaves, m

[32m[I 2021-11-27 14:33:33,390][0m Trial 14 finished with value: -3.834341543981332 and parameters: {'learning_rate': 0.003261379715833991, 'max_depth': 5, 'l2_regularization': 4.541527947195171e-05, 'min_samples_leaf': 983, 'tol': 0.06472450411779258, 'n_iter_no_change': 89, 'random_state': 17480306}. Best is trial 14 with value: -3.834341543981332.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 5.68142270306428e-05, 'learning_rate': 0.003275873023681681, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 5, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 899, 'monotonic_cst': None, 'n_iter_no_change': 67, 'random_state': 31039700, 'scoring': 'loss', 'tol': 0.07037281655929718, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.791 s
Binning 0.025 GB of validation data: 0.091 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.40839, val loss: 4.40150, in 0.126s
[2/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.40283, val loss: 4.39594, in 0.096s
[3/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.39729, val loss: 4.39040, in 0.151s
[4/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.39177, val loss: 4.38488, in 0.124s
[5/100000] 1 tree, 32 leaves, ma

[32m[I 2021-11-27 14:34:36,624][0m Trial 15 finished with value: -3.8532371684174476 and parameters: {'learning_rate': 0.003275873023681681, 'max_depth': 5, 'l2_regularization': 5.68142270306428e-05, 'min_samples_leaf': 899, 'tol': 0.07037281655929718, 'n_iter_no_change': 67, 'random_state': 31039700}. Best is trial 14 with value: -3.834341543981332.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 6.541543606516337e-05, 'learning_rate': 0.0028710861658722024, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 5, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 978, 'monotonic_cst': None, 'n_iter_no_change': 71, 'random_state': 39084583, 'scoring': 'loss', 'tol': 0.03877036102680547, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.803 s
Binning 0.025 GB of validation data: 0.082 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.40841, val loss: 4.40494, in 0.129s
[2/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.40354, val loss: 4.40009, in 0.127s
[3/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.39870, val loss: 4.39526, in 0.133s
[4/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.39387, val loss: 4.39045, in 0.118s
[5/100000] 1 tree, 32 leaves, 

[32m[I 2021-11-27 14:36:19,106][0m Trial 16 finished with value: -3.8325659976867135 and parameters: {'learning_rate': 0.0028710861658722024, 'max_depth': 5, 'l2_regularization': 6.541543606516337e-05, 'min_samples_leaf': 978, 'tol': 0.03877036102680547, 'n_iter_no_change': 71, 'random_state': 39084583}. Best is trial 16 with value: -3.8325659976867135.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 0.00012025582475315536, 'learning_rate': 0.0025898207008850553, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 5, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 993, 'monotonic_cst': None, 'n_iter_no_change': 85, 'random_state': 9945409, 'scoring': 'loss', 'tol': 0.045059397191548924, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.949 s
Binning 0.025 GB of validation data: 0.116 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.40814, val loss: 4.40842, in 0.156s
[2/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.40377, val loss: 4.40409, in 0.143s
[3/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.39941, val loss: 4.39976, in 0.149s
[4/100000] 1 tree, 32 leaves, max depth = 5, train loss: 4.39505, val loss: 4.39544, in 0.127s
[5/100000] 1 tree, 32 leaves,

[32m[I 2021-11-27 14:38:06,938][0m Trial 17 finished with value: -3.837452624841087 and parameters: {'learning_rate': 0.0025898207008850553, 'max_depth': 5, 'l2_regularization': 0.00012025582475315536, 'min_samples_leaf': 993, 'tol': 0.045059397191548924, 'n_iter_no_change': 85, 'random_state': 9945409}. Best is trial 16 with value: -3.8325659976867135.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 0.0020307363384580524, 'learning_rate': 0.002177150573098593, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 6, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 969, 'monotonic_cst': None, 'n_iter_no_change': 68, 'random_state': 192636529, 'scoring': 'loss', 'tol': 0.034981763833664935, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.833 s
Binning 0.025 GB of validation data: 0.084 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 60 leaves, max depth = 6, train loss: 4.40800, val loss: 4.41074, in 0.173s
[2/100000] 1 tree, 60 leaves, max depth = 6, train loss: 4.40394, val loss: 4.40669, in 0.149s
[3/100000] 1 tree, 60 leaves, max depth = 6, train loss: 4.39989, val loss: 4.40265, in 0.127s
[4/100000] 1 tree, 60 leaves, max depth = 6, train loss: 4.39585, val loss: 4.39862, in 0.151s
[5/100000] 1 tree, 60 leaves,

[32m[I 2021-11-27 14:40:29,670][0m Trial 18 finished with value: -3.8515661205516736 and parameters: {'learning_rate': 0.002177150573098593, 'max_depth': 6, 'l2_regularization': 0.0020307363384580524, 'min_samples_leaf': 969, 'tol': 0.034981763833664935, 'n_iter_no_change': 68, 'random_state': 192636529}. Best is trial 16 with value: -3.8325659976867135.[0m




Current model:

{'categorical_features': None, 'early_stopping': True, 'l2_regularization': 0.018764072014136098, 'learning_rate': 0.02418763183472459, 'loss': 'absolute_error', 'max_bins': 255, 'max_depth': 6, 'max_iter': 100000, 'max_leaf_nodes': None, 'min_samples_leaf': 764, 'monotonic_cst': None, 'n_iter_no_change': 47, 'random_state': 421717916, 'scoring': 'loss', 'tol': 0.03514932481541225, 'validation_fraction': 0.2, 'verbose': 4, 'warm_start': False}

Binning 0.100 GB of training data: 0.846 s
Binning 0.025 GB of validation data: 0.080 s
Fitting gradient boosted rounds:
[1/100000] 1 tree, 61 leaves, max depth = 6, train loss: 4.36613, val loss: 4.37465, in 0.362s
[2/100000] 1 tree, 62 leaves, max depth = 6, train loss: 4.32281, val loss: 4.33125, in 0.148s
[3/100000] 1 tree, 63 leaves, max depth = 6, train loss: 4.28046, val loss: 4.28884, in 0.152s
[4/100000] 1 tree, 64 leaves, max depth = 6, train loss: 4.23983, val loss: 4.24816, in 0.151s
[5/100000] 1 tree, 62 leaves, ma

[32m[I 2021-11-27 14:41:07,768][0m Trial 19 finished with value: -3.874665679911936 and parameters: {'learning_rate': 0.02418763183472459, 'max_depth': 6, 'l2_regularization': 0.018764072014136098, 'min_samples_leaf': 764, 'tol': 0.03514932481541225, 'n_iter_no_change': 47, 'random_state': 421717916}. Best is trial 16 with value: -3.8325659976867135.[0m


Best score: -3.8325659976867135

Optimized parameters: {'learning_rate': 0.0028710861658722024, 'max_depth': 5, 'l2_regularization': 6.541543606516337e-05, 'min_samples_leaf': 978, 'tol': 0.03877036102680547, 'n_iter_no_change': 71, 'random_state': 39084583}



### Model fitting

In [73]:
test_size = 0.15
val_size = 0.15
sample_size = 1.00
n_topics = 10
random_state = 0

model_inputs = get_model_inputs(val_size, test_size, sample_size, n_topics, random_state)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [74]:
fixed_params = {
                    'max_iter': 855,
                    'loss': 'absolute_error',
                    'max_leaf_nodes': None,
                    'early_stopping': False,
                    'verbose': 4,
                  }

print({**fixed_params,**hgb_params})

# Train
hgb = HistGradientBoostingRegressor(**{**fixed_params, **hgb_params})
hgb.fit(model_inputs['X_train'], model_inputs['y_train'])

# Test
y_true = model_inputs['y_test']
y_pred = hgb.predict(model_inputs['X_test'])
print(f'MAE: {mean_absolute_error(y_true, y_pred)}')

{'max_iter': 855, 'loss': 'absolute_error', 'max_leaf_nodes': None, 'early_stopping': False, 'verbose': 4, 'learning_rate': 0.0028710861658722024, 'max_depth': 5, 'l2_regularization': 6.541543606516337e-05, 'min_samples_leaf': 978, 'tol': 0.03877036102680547, 'n_iter_no_change': 71, 'random_state': 39084583}
Binning 0.591 GB of training data: 2.413 s
Fitting gradient boosted rounds:
[1/855] 1 tree, 32 leaves, max depth = 5, in 1.940s
[2/855] 1 tree, 32 leaves, max depth = 5, in 1.205s
[3/855] 1 tree, 32 leaves, max depth = 5, in 2.332s
[4/855] 1 tree, 32 leaves, max depth = 5, in 1.205s
[5/855] 1 tree, 32 leaves, max depth = 5, in 0.969s
[6/855] 1 tree, 32 leaves, max depth = 5, in 0.508s
[7/855] 1 tree, 32 leaves, max depth = 5, in 0.532s
[8/855] 1 tree, 32 leaves, max depth = 5, in 0.532s
[9/855] 1 tree, 32 leaves, max depth = 5, in 0.532s
[10/855] 1 tree, 32 leaves, max depth = 5, in 0.527s
[11/855] 1 tree, 32 leaves, max depth = 5, in 0.498s
[12/855] 1 tree, 32 leaves, max depth = 

# Recommendations to the users

### Unseen jokes per user

In [81]:
model_inputs['user_joke_info'][['user_id', 'joke_id'] + model_inputs['features']].columns

KeyboardInterrupt: 

In [None]:
predictions_df = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
predictions_df.insert(2, 'error', np.abs(predictions_df.y_true - predictions_df.y_pred))
display(predictions_df)