In [1]:
import GeoDS

from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

dataset = datasets.load_diabetes(as_frame=True)

X = dataset.data
y = dataset.target

y = y.to_frame()

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np
from GeoDS.supervised import pipelineator
from GeoDS.supervised import tuning
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
n_classes = len(np.unique(y))
cat_indices = X_train.columns.get_indexer(X_train.select_dtypes('object').columns)
dp = pipelineator.DefaultSupervisedPipeline(categorical_features_indices=cat_indices, objective='binary', lgbm_num_classes=n_classes)


In [50]:
# First define the function!
# See that trial and p must be the inputs of the function! We will see the reason soon!

pipes = [dp.lgbm_pipeline, dp.random_forest_pipeline]

lambda_objective = [lambda trial, p=p: tuning.objective(trial, 
                                                  pipeline=p,
                                                  X_train=X_train, 
                                                  y_train=y_train, 
                                                  cross_validator=5,
                                                  groups=None,
                                                  scoring_metric='f1_macro') for p in pipes]

In [51]:
lambda_objective

[<function __main__.<listcomp>.<lambda>(trial, p=Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('numerical_transfo',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fe6b4065850>),
                                                 ('categorical_transfo',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fe6b40657d0>)])),
                ('imb', SMOTEENN(random_state=42)),
                ('clf', LGBMClassifier(objective='binary', random_state=42))]))>,
 <function __main__.<

In [45]:
[x(trial) for x in lambda_objective]

AttributeError: 'int' object has no attribute 'suggest_categorical'



In [3]:
# First define the function!
# See that trial and p must be the inputs of the function! We will see the reason soon!

lambda_objective = lambda trial, p: tuning.objective(trial, 
                                                  pipeline=p,
                                                  X_train=X_train, 
                                                  y_train=y_train, 
                                                  cross_validator=5,
                                                  groups=None,
                                                  scoring_metric='f1_macro')

In [4]:
# Mistake! The reason of the bug!

pipes = [dp.lgbm_pipeline, dp.random_forest_pipeline]

for p in pipes: print(lambda_objective) # See that the output is a lambda function.
# However, objective_function below takes a pipeline object, not a lambda function.


# Extra Info: This error persists if we use list comprehensions like below
# Because it actually repeats the same mistake in a cool way :D


# [lambda_objective for p in pipes] ---> Mistake

<function <lambda> at 0x7fe6a51c34d0>
<function <lambda> at 0x7fe6a51c34d0>


In [48]:
lambda_objective[0]

<function __main__.<listcomp>.<lambda>(trial, p=Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('numerical_transfo',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fe6b4065850>),
                                                 ('categorical_transfo',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fe6b40657d0>)])),
                ('imb', SMOTEENN(random_state=42)),
                ('clf', LGBMClassifier(objective='binary', random_state=42))]))>

In [59]:
# What's the solution? Simple!
# We should use the inputs (trial and p)

# I believe the solution is below!
# Why are we getting an error then? The error is not about lambda but is about tuning.tune itself.
# However, I didn't write the tuning.tune function, so I don't know what it does. So if you can fix the function,
# I believe everything will be alright.

trial = 5

for p in pipes:
    my_pipe, my_study = tuning.tune(n_trials=trial, 
                                objective_function=lambda_objective[0], 
                                output_folder='./', 
                                study_name='test', 
                                random_state=42)


[32m[I 2022-09-21 03:36:05,666][0m A new study created in memory with name: test[0m
[32m[I 2022-09-21 03:36:05,851][0m Trial 0 finished with value: 0.9619485208278313 and parameters: {'clf__bosting_type': 'dart', 'clf__num_leaves': 124, 'clf__min_data': 73, 'clf__max_depth': 5, 'clf__reg_alpha': 0.05808361216819946, 'clf__reg_lambda': 0.8661761457749352, 'clf__learning_rate': 0.19956529392808392, 'drop_rate': 0.004619347374377372, 'skip_drop': 1.4610865886287176e-08}. Best is trial 0 with value: 0.9619485208278313.[0m
[32m[I 2022-09-21 03:36:06,034][0m Trial 1 finished with value: 0.9619485208278313 and parameters: {'clf__bosting_type': 'gbdt', 'clf__num_leaves': 44, 'clf__min_data': 77, 'clf__max_depth': 6, 'clf__reg_alpha': 0.5247564316322378, 'clf__reg_lambda': 0.43194501864211576, 'clf__learning_rate': 0.09776854331372624}. Best is trial 0 with value: 0.9619485208278313.[0m
[32m[I 2022-09-21 03:36:06,220][0m Trial 2 finished with value: 0.9619485208278313 and parameters:

Trained models saved in ./models/ 


[32m[I 2022-09-21 03:36:06,979][0m Trial 1 finished with value: 0.9619485208278313 and parameters: {'clf__bosting_type': 'gbdt', 'clf__num_leaves': 44, 'clf__min_data': 77, 'clf__max_depth': 6, 'clf__reg_alpha': 0.5247564316322378, 'clf__reg_lambda': 0.43194501864211576, 'clf__learning_rate': 0.09776854331372624}. Best is trial 0 with value: 0.9619485208278313.[0m
[32m[I 2022-09-21 03:36:07,163][0m Trial 2 finished with value: 0.9619485208278313 and parameters: {'clf__bosting_type': 'gbdt', 'clf__num_leaves': 79, 'clf__min_data': 118, 'clf__max_depth': 11, 'clf__reg_alpha': 0.19967378215835974, 'clf__reg_lambda': 0.5142344384136116, 'clf__learning_rate': 0.19560708142748476}. Best is trial 0 with value: 0.9619485208278313.[0m
[32m[I 2022-09-21 03:36:07,353][0m Trial 3 finished with value: 0.9619485208278313 and parameters: {'clf__bosting_type': 'dart', 'clf__num_leaves': 22, 'clf__min_data': 193, 'clf__max_depth': 12, 'clf__reg_alpha': 0.8083973481164611, 'clf__reg_lambda': 0.3

Trained models saved in ./models/ 


In [None]:
# Make sure the following line is working to fix the above error!

lambda_objective(trial=5, p=dp.lgbm_pipeline)

In [None]:
# You are not convinced! or you are skeptical!
# Then let's get back to basics!

square = lambda x:x*x

In [None]:
# Let's check what square is!

square 

# It's a lambda function!

In [None]:
# However, square(x) is the right way of using the function!

[square(x) for x in range(15)]

# If you don't like list compherensions, for loop version below
#for x in range(15):
#    print(square(x))

In [None]:
# Let's do list compherension for lambda_objective and see if we can use it to fix the error!
# The answer is NO in this cell!

pipes = [dp.lgbm_pipeline, dp.random_forest_pipeline]

lambda_objective = [lambda trial, p: tuning.objective(trial, 
                                                  pipeline=p,
                                                  X_train=X_train, 
                                                  y_train=y_train, 
                                                  cross_validator=5,
                                                  groups=None,
                                                  scoring_metric='f1_macro') for p in pipes]

lambda_objective

# lambda_objective is a list of lambda functions with size 2.
# In this way, we cannot give it as an input to the objective_function.

In [None]:
lambda_objective[0]

In [6]:
lambda_objective[1]

TypeError: 'function' object is not subscriptable

### How can we get the pipeline objects using list comprehensions?

In [28]:
# Be aware that I use p=p in the list compeherension to get the pipeline objects!
# p=p basically tells the lambda function to use the elements of the for loop 
# This is critical 
# This is due to the point at which p is being bound. 
# The lambda functions all point at the variable p rather than the current value of it, 
# so when we update p in the next iteration, this update is seen across all our functions.

pipes = [dp.lgbm_pipeline, dp.random_forest_pipeline]

lambda_objective = [lambda trial, p=p: tuning.objective(trial, 
                                                  pipeline=p,
                                                  X_train=X_train, 
                                                  y_train=y_train, 
                                                  cross_validator=5,
                                                  groups=None,
                                                  scoring_metric='f1_macro') for p in pipes]

lambda_objective

# lambda_objective is a list of lambda functions with size 2.
# In this way, we cannot give it as an input to the objective_function.

[<function __main__.<listcomp>.<lambda>(trial, p=Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('numerical_transfo',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fe6b4065850>),
                                                 ('categorical_transfo',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fe6b40657d0>)])),
                ('imb', SMOTEENN(random_state=42)),
                ('clf', LGBMClassifier(objective='binary', random_state=42))]))>,
 <function __main__.<

In [8]:
lambda_objective[0]

<function __main__.<listcomp>.<lambda>(trial, p=Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('numerical_transfo',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fe6b4065850>),
                                                 ('categorical_transfo',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fe6b40657d0>)])),
                ('imb', SMOTEENN(random_state=42)),
                ('clf', LGBMClassifier(objective='binary', random_state=42))]))>

In [9]:
lambda_objective[1]

<function __main__.<listcomp>.<lambda>(trial, p=Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('numerical_transfo',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fe6b4065850>),
                                                 ('categorical_transfo',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fe6b40657d0>)])),
                ('imb', SMOTEENN(random_state=42)),
                ('clf', RandomForestClassifier(random_state=42))]))>

In [31]:
# However, the type of the elements of lambda_objective is still lambda functions!

for objective in lambda_objective:
    print(objective)

<function <listcomp>.<lambda> at 0x7fe69eb8a200>
<function <listcomp>.<lambda> at 0x7fe69eb8a3b0>


In [32]:
import optuna

def objective(trial):
    x = trial.suggest_float('x', -10, 10)
    return (x - 2) ** 2

study = optuna.create_study()
study.optimize(objective, n_trials=100)

study.best_params  # E.g. {'x': 2.002108042}

[32m[I 2022-09-21 03:19:30,611][0m A new study created in memory with name: no-name-b8b1e6ac-86bc-4d0e-8a37-49a5e12e93fc[0m
[32m[I 2022-09-21 03:19:30,614][0m Trial 0 finished with value: 4.288047839773288 and parameters: {'x': 4.070760208177974}. Best is trial 0 with value: 4.288047839773288.[0m
[32m[I 2022-09-21 03:19:30,616][0m Trial 1 finished with value: 54.26830521337402 and parameters: {'x': 9.366702465375809}. Best is trial 0 with value: 4.288047839773288.[0m
[32m[I 2022-09-21 03:19:30,618][0m Trial 2 finished with value: 33.47166359270158 and parameters: {'x': 7.7854700407746975}. Best is trial 0 with value: 4.288047839773288.[0m
[32m[I 2022-09-21 03:19:30,620][0m Trial 3 finished with value: 1.7473198187228347 and parameters: {'x': 0.6781377459346096}. Best is trial 3 with value: 1.7473198187228347.[0m
[32m[I 2022-09-21 03:19:30,621][0m Trial 4 finished with value: 0.11935255432583033 and parameters: {'x': 2.345474390260451}. Best is trial 4 with value: 0.119

[32m[I 2022-09-21 03:19:30,895][0m Trial 45 finished with value: 0.0068198445955180735 and parameters: {'x': 1.9174176496125348}. Best is trial 19 with value: 0.0009931061735331127.[0m
[32m[I 2022-09-21 03:19:30,902][0m Trial 46 finished with value: 15.50671845920266 and parameters: {'x': -1.937857089738359}. Best is trial 19 with value: 0.0009931061735331127.[0m
[32m[I 2022-09-21 03:19:30,909][0m Trial 47 finished with value: 2.540177887500765 and parameters: {'x': 0.4062064476536602}. Best is trial 19 with value: 0.0009931061735331127.[0m
[32m[I 2022-09-21 03:19:30,916][0m Trial 48 finished with value: 5.064108820328304 and parameters: {'x': 4.250357487229152}. Best is trial 19 with value: 0.0009931061735331127.[0m
[32m[I 2022-09-21 03:19:30,923][0m Trial 49 finished with value: 5.697789302359646 and parameters: {'x': -0.38700425268989536}. Best is trial 19 with value: 0.0009931061735331127.[0m
[32m[I 2022-09-21 03:19:30,930][0m Trial 50 finished with value: 0.004453

[32m[I 2022-09-21 03:19:31,241][0m Trial 90 finished with value: 3.2675330128089173 and parameters: {'x': 3.807631879783303}. Best is trial 66 with value: 8.641370804090746e-06.[0m
[32m[I 2022-09-21 03:19:31,249][0m Trial 91 finished with value: 0.0027252974460081265 and parameters: {'x': 2.0522043814828614}. Best is trial 66 with value: 8.641370804090746e-06.[0m
[32m[I 2022-09-21 03:19:31,258][0m Trial 92 finished with value: 1.5427824975632811 and parameters: {'x': 3.2420879588673586}. Best is trial 66 with value: 8.641370804090746e-06.[0m
[32m[I 2022-09-21 03:19:31,266][0m Trial 93 finished with value: 0.2611751880884298 and parameters: {'x': 2.511053018862456}. Best is trial 66 with value: 8.641370804090746e-06.[0m
[32m[I 2022-09-21 03:19:31,275][0m Trial 94 finished with value: 0.25918133016037476 and parameters: {'x': 1.4909014533900389}. Best is trial 66 with value: 8.641370804090746e-06.[0m
[32m[I 2022-09-21 03:19:31,283][0m Trial 95 finished with value: 0.0227

{'x': 1.9970603791393973}

In [36]:
study.trials[0]

FrozenTrial(number=0, values=[4.288047839773288], datetime_start=datetime.datetime(2022, 9, 21, 3, 19, 30, 614076), datetime_complete=datetime.datetime(2022, 9, 21, 3, 19, 30, 614500), params={'x': 4.070760208177974}, distributions={'x': UniformDistribution(high=10.0, low=-10.0)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=0, state=TrialState.COMPLETE, value=None)

In [37]:
# Since the type of the elements of lambda_objective is still lambda functions,
# The trials will still end with the same value again!
# Because as I said at the beginning of this notebook, we need to give the input parameters to fix this bug.

for objective in lambda_objective:
    my_pipe, my_study = tuning.tune(n_trials=trial, 
                                objective_function=objective, 
                                output_folder='./', 
                                study_name='test', 
                                random_state=42)

[32m[I 2022-09-21 03:23:59,017][0m A new study created in memory with name: test[0m
[32m[I 2022-09-21 03:24:01,520][0m Trial 0 finished with value: 0.9619485208278313 and parameters: {'clf__bosting_type': 'dart', 'clf__num_leaves': 124, 'clf__min_data': 73, 'clf__max_depth': 5, 'clf__reg_alpha': 0.05808361216819946, 'clf__reg_lambda': 0.8661761457749352, 'clf__learning_rate': 0.19956529392808392, 'drop_rate': 0.004619347374377372, 'skip_drop': 1.4610865886287176e-08}. Best is trial 0 with value: 0.9619485208278313.[0m
[32m[I 2022-09-21 03:24:01,706][0m Trial 1 finished with value: 0.9619485208278313 and parameters: {'clf__bosting_type': 'gbdt', 'clf__num_leaves': 44, 'clf__min_data': 77, 'clf__max_depth': 6, 'clf__reg_alpha': 0.5247564316322378, 'clf__reg_lambda': 0.43194501864211576, 'clf__learning_rate': 0.09776854331372624}. Best is trial 0 with value: 0.9619485208278313.[0m
[32m[I 2022-09-21 03:24:01,889][0m Trial 2 finished with value: 0.9619485208278313 and parameters:

Trained models saved in ./models/ 


[32m[I 2022-09-21 03:24:02,889][0m Trial 0 finished with value: 0.9434793996425593 and parameters: {'n_estimators': 250, 'max_features': 'auto', 'max_depth': 32, 'min_samples_split': 3, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 0 with value: 0.9434793996425593.[0m
[32m[I 2022-09-21 03:24:03,499][0m Trial 1 finished with value: 0.9434793996425593 and parameters: {'n_estimators': 341, 'max_features': 'auto', 'max_depth': 49, 'min_samples_split': 9, 'min_samples_leaf': 3, 'bootstrap': False}. Best is trial 0 with value: 0.9434793996425593.[0m
[32m[I 2022-09-21 03:24:04,109][0m Trial 2 finished with value: 0.9434793996425593 and parameters: {'n_estimators': 222, 'max_features': 'auto', 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 0 with value: 0.9434793996425593.[0m
[32m[I 2022-09-21 03:24:04,723][0m Trial 3 finished with value: 0.9434793996425593 and parameters: {'n_estimators': 282, 'max_features': 'auto', '

Trained models saved in ./models/ 


In [18]:
# Somehow we should make this work to get the pipeline rather than the lambda function!

objective(trial=5, p=dp.random_forest_pipeline)

AttributeError: 'int' object has no attribute 'suggest_int'

In [35]:
# Since the type of the elements of lambda_objective is still lambda functions,
# The trials will still end with the same value again!
# Because as I said at the beginning of this notebook, we need to give the input parameters to fix this bug.

for objective in lambda_objective:
    my_pipe, my_study = tuning.tune(n_trials=trial, 
                                objective_function=objective(study.trials[0]), 
                                output_folder='./', 
                                study_name='test', 
                                random_state=42)

ValueError: The value of the parameter 'clf__bosting_type' is not found. Please set it at the construction of the FrozenTrial object.