In [5]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import json
from sklearn.model_selection import train_test_split, cross_val_score

import lightgbm as lgb

import gc

In [6]:
df = pd.read_csv("pruned_df.csv")

Xs = df.drop("totalrevenue", 1)
ys = df["totalrevenue"]

In [7]:
Xs.head()

Unnamed: 0,total_count_interstitial_impression,total_count_banner_impression,total_count_rewarded_impression,total_count_level_event,mean_revenue_interstitial_impression,last_time_interstitial_impression,mean_revenue_rewarded_impression,last_time_banner_impression
0,1,6,0,7,0.03,2.0,0.0,13.0
1,1,2,0,2,0.03,0.0,0.0,0.0
2,30,118,2,76,0.027517,4076.0,0.04,4078.0
3,25,82,1,60,0.02292,5136.0,0.04,5137.0
4,5,16,0,18,0.03,2035.0,0.0,2368.0


In [8]:
from hyperopt import hp, tpe, Trials, STATUS_OK
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample
import gc
import warnings
import catboost as cb
warnings.filterwarnings('ignore')

N_FOLDS=5
CB_MAX_DEPTH = 4 #maximum tree depth in CatBoost
OBJECTIVE_CB_REG = 'MAPE' #CatBoost regression metric
OBJECTIVE_CB_CLASS = 'Logloss' #CatBoost classification metric

def quick_hyperopt(data, labels, num_evals=100):
        
    #clear memory 
    gc.collect()

    integer_params = ['depth',
                      #'one_hot_max_size', #for categorical data
                      'min_data_in_leaf',
                      #'max_bin',
                     "iterations"]

    def objective(space_params):

        #cast integer params from float to int
        for param in integer_params:
            space_params[param] = int(space_params[param])

        #extract nested conditional parameters
        if space_params['bootstrap_type']['bootstrap_type'] == 'Bayesian':
            bagging_temp = space_params['bootstrap_type'].get('bagging_temperature')
            space_params['bagging_temperature'] = bagging_temp

        if space_params['grow_policy']['grow_policy'] == 'LossGuide':
            max_leaves = space_params['grow_policy'].get('max_leaves')
            space_params['max_leaves'] = int(max_leaves)

        space_params['bootstrap_type'] = space_params['bootstrap_type']['bootstrap_type']
        space_params['grow_policy'] = space_params['grow_policy']['grow_policy']

        #random_strength cannot be < 0
        space_params['random_strength'] = max(space_params['random_strength'], 0)
        #fold_len_multiplier cannot be < 1
        space_params['fold_len_multiplier'] = max(space_params['fold_len_multiplier'], 1)

        #for classification set stratified=True
        cv_results = cb.cv(train, space_params, fold_count=N_FOLDS, 
                         early_stopping_rounds=25, stratified=False, partition_random_seed=42)

        best_loss = cv_results['test-MAPE-mean'].iloc[-1] #'test-RMSE-mean' for RMSE
        #for classification, comment out the line above and uncomment the line below:
        #best_loss = 1 - cv_results['test-AUC-mean'].iloc[-1]
        #if necessary, replace 'test-Logloss-mean' with 'test-[your-preferred-metric]-mean'

        return{'loss':best_loss, 'status': STATUS_OK}

    train = cb.Pool(data, labels.astype('float32'),
                    #cat_features=[16,17]
                   )

    #integer and string parameters, used with hp.choice()
    bootstrap_type = [#{'bootstrap_type':'Poisson'}, 
                       {'bootstrap_type':'Bayesian',
                        'bagging_temperature' : hp.loguniform('bagging_temperature', np.log(1), np.log(50))},
                      {'bootstrap_type':'Bernoulli'}] 
    LEB = ['No', 'AnyImprovement'] #remove 'Armijo' if not using GPU
    #score_function = ['Correlation', 'L2', 'NewtonCorrelation', 'NewtonL2']
    grow_policy = [#{'grow_policy':'SymmetricTree'},
                   #{'grow_policy':'Depthwise'},
                   {'grow_policy':'Lossguide',
                    'max_leaves': hp.quniform('max_leaves', 2, 32, 1)}]
    eval_metric_list_reg = ["MAPE"] #['MAE', 'RMSE', 'Poisson']
    eval_metric_list_class = ['AUC']
    #for classification change line below to 'eval_metric_list = eval_metric_list_class'
    eval_metric_list = eval_metric_list_reg

    space ={'depth': hp.quniform('depth', 2, CB_MAX_DEPTH, 1),
            #'max_bin' : hp.quniform('max_bin', 1, 32, 1), #if using CPU just set this to 254
            'l2_leaf_reg' : hp.uniform('l2_leaf_reg', 0, 7),
            'min_data_in_leaf' : hp.quniform('min_data_in_leaf', 1, 50, 1),
            'random_strength' : hp.loguniform('random_strength', np.log(0.005), np.log(5)),
            #'one_hot_max_size' : hp.quniform('one_hot_max_size', 2, 16, 1), #uncomment if using categorical features
            'bootstrap_type' : hp.choice('bootstrap_type', bootstrap_type),
            'learning_rate' : hp.uniform('learning_rate', 0.05, 0.25),
            'eval_metric' : hp.choice('eval_metric', eval_metric_list),
            'objective' : OBJECTIVE_CB_REG,
            #'score_function' : hp.choice('score_function', score_function), #crashes kernel - reason unknown
            'leaf_estimation_backtracking' : hp.choice('leaf_estimation_backtracking', LEB),
            'grow_policy': hp.choice('grow_policy', grow_policy),
            'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),# CPU only
            'fold_len_multiplier' : hp.loguniform('fold_len_multiplier', np.log(1.01), np.log(2.5)),
            'task_type' : 'CPU',
            'verbose' : 0,
            #"scale_pos_weight":hp.uniform('scale_pos_weight', 0.05, 100),
            "iterations": hp.choice('iterations', [250]),
            "early_stopping_rounds":25
            
        }

    #optional: run CatBoost without GPU
    #uncomment line below
    #space['task_type'] = 'CPU'

    trials = Trials()
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=num_evals, 
                trials=trials)

    #unpack nested dicts first
    best['bootstrap_type'] = bootstrap_type[best['bootstrap_type']]['bootstrap_type']
    best['grow_policy'] = grow_policy[best['grow_policy']]['grow_policy']
    best['eval_metric'] = eval_metric_list[best['eval_metric']]

    #best['score_function'] = score_function[best['score_function']] 
    #best['leaf_estimation_method'] = LEM[best['leaf_estimation_method']] #CPU only
    best['leaf_estimation_backtracking'] = LEB[best['leaf_estimation_backtracking']]        

    #cast floats of integer params to int
    for param in integer_params:
        best[param] = int(best[param])
        
    if 'max_leaves' in best:
        best['max_leaves'] = int(best['max_leaves'])

    print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')

    return(best)

opt_parameters = quick_hyperopt(Xs, ys, num_evals=50)

opt_parameters


Stopped by overfitting detector  (25 iterations wait)                             

Stopped by overfitting detector  (25 iterations wait)                             

Stopped by overfitting detector  (25 iterations wait)                             

Stopped by overfitting detector  (25 iterations wait)                             

Stopped by overfitting detector  (25 iterations wait)                             

Stopped by overfitting detector  (25 iterations wait)                             

Stopped by overfitting detector  (25 iterations wait)                             

100%|██████████| 50/50 [21:03<00:00, 25.26s/trial, best loss: 0.07214163617571881]
{bootstrap_type: Bernoulli
colsample_bylevel: 0.93
depth: 4
eval_metric: MAPE
fold_len_multiplier: 1.5386864113227443
grow_policy: Lossguide
iterations: 0
l2_leaf_reg: 0.9334630071731768
leaf_estimation_backtracking: AnyImprovement
learning_rate: 0.1004331980493914
max_leaves: 5
min_data_in_leaf: 11
random_strength: 0.013688770

{'bootstrap_type': 'Bernoulli',
 'colsample_bylevel': 0.93,
 'depth': 4,
 'eval_metric': 'MAPE',
 'fold_len_multiplier': 1.5386864113227443,
 'grow_policy': 'Lossguide',
 'iterations': 0,
 'l2_leaf_reg': 0.9334630071731768,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'learning_rate': 0.1004331980493914,
 'max_leaves': 5,
 'min_data_in_leaf': 11,
 'random_strength': 0.013688770329930855}