# All

## Import/read

In [1]:
import os
import pickle
import time
import warnings
from copy import deepcopy
from datetime import datetime
from functools import partial
from itertools import repeat
from types import SimpleNamespace
from typing import Callable
import copy

import bayes_opt as bayes
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bayes_opt import (
    BayesianOptimization,
    SequentialDomainReductionTransformer,
    UtilityFunction,
)
from bayes_opt.event import Events
from bayes_opt.logger import JSONLogger
from bayes_opt.util import load_logs
from library.classes import Feature, ManageDataSplit, ManageFeatures, feature_objects
from library.classes.trait import DataFrame
from library.feature_func import (
    add_categorical_feature,
    add_feature_targets_groupby_stats,
    add_numerical_feature,
    f_rolling_mean,
    f_shifted,
    time_arrow,
    f_microbusiness_pct_change,
    f_microbusiness_density_diff,
)
from library.optimize_this import optimize_this
from library.utils import build_callbacks, read_df, smape, states, states_abb, write_df
from meteostat import Monthly, Point, Stations
from sklearn.cluster import DBSCAN
from sklearn.neighbors import LocalOutlierFactor
from traitlets import (
    Any,
    Bool,
    Callable,
    Dict,
    Float,
    HasTraits,
    Int,
    List,
    TraitError,
    TraitType,
    Tuple,
    Unicode,
    default,
    validate,
)

pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 500)

In [2]:
# data_path = "../data/"
boundaries_sub_data_path = "other/boundaries"
# bayesian_run_path = "../data/bayesian_runs/"

In [3]:
# --- Competition tables ------------------------------------------------------
df_test = read_df("test.csv")
df_train = read_df("train.csv")
df_submission = read_df("sample_submission.csv")

# --- Auxiliary table, joined onto the training rows through row_id -----------
df_adjusted_microbusiness_density = read_df(
    "df_adjusted_microbusiness_density.csv", "kaggle_census"
)

# Inputs that are currently switched off (kept for reference):
# df_census = read_df("census_starter.csv")
# df_population = read_df('df_population.csv', 'other')
# df_census_population = read_df('df_census_population.csv', 'kaggle_census')
# df_train_census = read_df('df_train_census.csv', 'kaggle_census')
# df_boundaries = read_df("us-county-boundaries.csv", boundaries_sub_data_path, delimiter=";")

# The modelling target is the `active` column, exposed under the name
# `microbusiness_density`; the density column that came with the data stays
# available as `original_microbusiness_density`.
# Alternative target (disabled): rename "adjusted_microbusiness_density" to
# "microbusiness_density" instead of "active".
target_renames = {
    "microbusiness_density": "original_microbusiness_density",
    "active": "microbusiness_density",
}
df_train = df_train.merge(
    df_adjusted_microbusiness_density, how="left", on="row_id"
).rename(columns=target_renames)

# The target is modelled on the log1p scale.
df_train["microbusiness_density"] = df_train["microbusiness_density"].apply(np.log1p)

df_location = read_df("cfips_location.csv", "usa-counties-coordinates")

In [4]:
# Guard against silent changes in the inputs: exactly 24 training rows are
# expected to contain at least one missing value.
rows_with_nan = df_train.isna().any(axis=1)
t = df_train[rows_with_nan]  # name kept: later cells display `t`
if t.shape[0] != 24:
    raise Exception("Nan counts used to be 24... something changed")
t.shape

(24, 27)

### Weather save/load

In [5]:
def save_weather_data(path_weather, boundaries=None, date_source=None, altitude=70):
    """Fetch monthly weather for every location and write it to one CSV file.

    Parameters
    ----------
    path_weather : str
        Destination of the CSV file.
    boundaries : pandas.DataFrame, optional
        One row per location with the columns ``NAME``, ``NAMELSAD``,
        ``INTPTLAT`` and ``INTPTLON``. Defaults to the notebook-level
        ``df_boundaries``, which must have been loaded beforehand (its
        ``read_df`` call is currently commented out in the loading cell).
    date_source : pandas.DataFrame, optional
        Frame whose ``first_day_of_month`` column defines the requested period
        (its minimum and maximum). Defaults to the notebook-level ``df_train``.
    altitude : int or float, optional
        Third argument handed to ``Point`` for every location. Defaults to 70,
        the value that used to be hard-coded.

    Raises
    ------
    ValueError
        If no location returned any weather rows, so there is nothing to save.
    """
    # Only fall back to the notebook globals when the caller passes nothing, so
    # the function can also be used (and tested) without hidden kernel state.
    if boundaries is None:
        boundaries = df_boundaries
    if date_source is None:
        date_source = df_train

    # The requested period is identical for all locations: compute it once.
    # NOTE(review): the values are forwarded to Monthly as-is; confirm that
    # `first_day_of_month` has a type Monthly accepts.
    min_date = date_source["first_day_of_month"].min()
    max_date = date_source["first_day_of_month"].max()

    locations = boundaries[["NAME", "NAMELSAD", "INTPTLAT", "INTPTLON"]]

    data_list = []
    # Count positions explicitly: the frame's index labels are not guaranteed
    # to be 0..n-1, which would make the progress output erratic.
    for position, (_, row) in enumerate(locations.iterrows()):
        p = Point(row["INTPTLAT"], row["INTPTLON"], altitude)
        data = Monthly(p, min_date, max_date).fetch()

        # Locations without any returned rows are skipped.
        if data.shape[0] > 0:
            # NOTE(review): NAME is stored as "state" and NAMELSAD as
            # "county"; confirm that NAME really holds the state name.
            data["state"] = row["NAME"]
            data["county"] = row["NAMELSAD"]
            data_list.append(data)

        if position % 100 == 0:
            print(position)

    if not data_list:
        # pd.concat([]) would fail with an unrelated-looking message.
        raise ValueError(
            "No weather data was returned for any location; nothing to save"
        )

    weather_data = pd.concat(data_list)
    weather_data.to_csv(path_weather)

In [6]:
path_weather = "../data/other/weather/weather.csv"
# save_weather_data(path_weather)

In [7]:
df_weather = pd.read_csv(path_weather)

## Train

### Feature setup

In [8]:
manage_data_split = ManageDataSplit(df_train)

# Offset added to the upper end of some `params_bounds` entries below;
# with 0 those bounds collapse to a single value.
gamma = 0

# Generic (low, high) range for parameters.
params_tuple = (0, 10)

# Presets for `enabled_bounds`: the (low, high) range the optimizer may sample
# for a feature's "enabled" value.
# NOTE(review): how a sampled value turns into on/off is decided inside
# ManageFeatures / optimize_this and is not visible here.
enabled_tuple_always = (1, 1)
enabled_tuple_almost = (0.4, 1)
enabled_tuple_h = (0, 0.75)
enabled_tuple = (0, 0.55)

# Start from an empty registry; otherwise it might still contain features
# from an earlier run that we do not want.
feature_objects.clear()

# One rolling-mean feature per (target, group) specification. A specification
# may carry its own `params_bounds`; otherwise four (1, 20) ranges are used.
rolling_mean_specs = [
    {
        "target_col": "microbusiness_density",
        "groupby_col": "cfips",
        "params_bounds": list(repeat((1, 12), 4)),
    },
    # {"target_col": "microbusiness_density", "groupby_col": "county"},
    # {"target_col": "microbusiness_density", "groupby_col": "state"},
]

for cols in rolling_mean_specs:
    target_col, groupby_col = cols["target_col"], cols["groupby_col"]
    params_bounds = cols.get("params_bounds", list(repeat((1, 20), 4)))
    f_col = f"{groupby_col}_{target_col}_rolling_mean"
    feature_target_rolling_mean = Feature(
        f_col=f_col,
        f=f_rolling_mean,
        df=df_train,
        target_col=target_col,
        groupby_col=groupby_col,
        enabled_bounds=[enabled_tuple] * 4,
        params_bounds=params_bounds,
    )


# Shift features on the modelling target: slot 0 is pinned to (1, 1 + gamma)
# and always enabled, the remaining three slots range over (2, 10).
target_col = "microbusiness_density"
params_bounds = [(1, 1 + gamma), (2, 10), (2, 10), (2, 10)]
enabled_bounds = [enabled_tuple_always, enabled_tuple, enabled_tuple, enabled_tuple]
feature_target_shift_mean = Feature(
    f"{target_col}_shift",
    f_shifted,
    df_train,
    target_col=target_col,
    enabled_bounds=enabled_bounds,
    params_bounds=params_bounds,
)

# Same for the original density column; here slot 1 is additionally pinned to
# (2, 2 + gamma) and uses the "almost always" enable range.
# NOTE(review): this rebinds `feature_target_shift_mean`; the first object is
# presumably still reachable through `feature_objects` - confirm.
target_col = "original_microbusiness_density"
params_bounds = [(1, 1 + gamma), (2, 2 + gamma), (2, 10), (2, 10)]
enabled_bounds = [
    enabled_tuple_always,
    enabled_tuple_almost,
    enabled_tuple,
    enabled_tuple,
]
feature_target_shift_mean = Feature(
    f"{target_col}_shift",
    f_shifted,
    df_train,
    target_col=target_col,
    enabled_bounds=enabled_bounds,
    params_bounds=params_bounds,
)

# Categorical columns paired with the enable range each one should use.
categorical_specs = [
    ("county", enabled_tuple_almost),
    ("state", enabled_tuple),
    ("cfips", enabled_tuple_almost),
]
for idx, (col, _tuple) in enumerate(categorical_specs):
    _feature = Feature(col, add_categorical_feature, df_train, enabled_bounds=[_tuple])

# Numerical columns registered as features, in registration order.
numerical_cols = [
    "median_hh_inc",
    "pct_bb",
    "pct_college",
    "pct_foreign_born",
    "pct_it_workers",
    "target_census_over_18_population_x1000",
    "target_census_population_x1000",
    "lng",
    "lat",
    "rot_15_x",
    "rot_15_y",
    "rot_30_x",
    "rot_30_y",
    "rot_45_x",
    "rot_45_y",
]
# Columns that use the "almost always" enable range instead of the default.
almost_always_cols = {
    "target_census_over_18_population_x1000",
    "target_census_population_x1000",
}
for idx, col in enumerate(numerical_cols):
    _tuple = enabled_tuple_almost if col in almost_always_cols else enabled_tuple
    _feature = Feature(col, add_numerical_feature, df_train, enabled_bounds=[_tuple])

# Feature built by `time_arrow`, with the default enable range.
feature_time_arrow = Feature(
    f_col="time_arrow",
    f=time_arrow,
    df=df_train,
    enabled_bounds=[enabled_tuple],
)

# for groupby_col in [
#     "cfips",
#     "state",
#     "county",
# ]:
#     for col in [
#         "median_hh_inc",
#         "pct_bb",
#         "pct_college",
#         "pct_foreign_born",
#         "pct_it_workers",
#     ]:
#         for agg_function in ["median", "mean", "std"]:
#             f_col = "{}_{}_target_{}".format(groupby_col, col, agg_function)
#             _feature = Feature(
#                 f_col,
#                 add_feature_targets_groupby_stats,
#                 df_train,
#                 groupby_col=groupby_col,
#                 col=col,
#                 agg_function=agg_function,
#                 enabled_bounds=[enabled_tuple],
#                 train_idx=manage_data_split._train_idx,
#             )

# Change-based features; both use the default enable range.
feature_microbusiness_density_pct_change = Feature(
    f_col="microbusiness_density_pct_change",
    f=f_microbusiness_pct_change,
    df=df_train,
    enabled_bounds=[enabled_tuple],
)

feature_microbusiness_density_diff = Feature(
    f_col="microbusiness_density_diff",
    f=f_microbusiness_density_diff,
    df=df_train,
    enabled_bounds=[enabled_tuple],
)

# Search space for the LightGBM parameters: name -> (lower, upper).
# NOTE(review): several ranges are continuous although LightGBM expects
# integers (e.g. num_leaves, bagging_freq); presumably optimize_this casts
# them - confirm there.
model_pbounds = dict(
    num_leaves=(3, 200),
    num_iterations=(2000, 2000),  # lower == upper: effectively fixed
    learning_rate=(0.01, 2.5),
    bagging_fraction=(0.0001, 1),
    feature_fraction=(0.0001, 1),
    lambda_l1=(0, 500),
    lambda_l2=(0, 500),
    bagging_freq=(0, 500),
    min_data_in_leaf=(10, 3000),
    min_sum_hessian_in_leaf=(0, 500),
    max_depth=(-10, 150),  # the range deliberately includes negative values
    path_smooth=(0, 500),
)

In [9]:
manage_data_split._train_idx.shape

(87780,)

In [10]:
manage_data_split._val_idx.shape

(25080,)

In [11]:
# manage_features = ManageFeatures(feature_objects)
# manage_features.set_model_pbounds(model_pbounds)

# pbounds = manage_features.get_pbounds()


# feature = feature_target_shift_mean
# df_mapped = manage_features._make_mapped(d)
# df_mapped_feature = df_mapped[df_mapped["f_col"] == feature.f_col]
# r = feature.f(
#     df=feature.df.copy(),
#     df_mapped_feature=df_mapped_feature,
#     f_col=feature.f_col,
#     **feature._kwargs
# )

#### Start Bayesian optimization

In [12]:
manage_features = ManageFeatures(feature_objects)
manage_features.set_model_pbounds(model_pbounds)

# Bounds produced by the feature manager, plus two extra search dimensions
# for the lower/upper quantile.
# NOTE(review): how the quantiles are applied is decided inside optimize_this.
pbounds = dict(manage_features.get_pbounds())
pbounds.update(lower_quantile=(0, 0.01), upper_quantile=(0.98, 1))

# acquisition_function = UtilityFunction(kind="ucb")
# acquisition_function = UtilityFunction(kind="poi")
# acquisition_function = UtilityFunction(kind="ucb", kappa=0.1)
# acquisition_function = UtilityFunction(kind="ucb", kappa=1)
# bounds_transformer = SequentialDomainReductionTransformer(minimum_window=0.5)

objective = "mae"

# Arguments that stay the same for every evaluation; the optimizer supplies
# only the sampled parameters on top of these.
fixed_arguments = {
    "objective": objective,
    "pbounds": pbounds,
    "manage_data_split": manage_data_split,
    "manage_features": manage_features,
    "df_train": df_train,
    "build_callbacks": build_callbacks,
    "target_shift": 0,
}
optimize_this_partial = partial(optimize_this, **fixed_arguments)

# verbose = 1 prints only when a maximum is observed, verbose = 0 is silent.
# Options tried earlier and currently off: n_restarts_optimizer=50,
# bounds_transformer=bounds_transformer.
optimizer = BayesianOptimization(
    f=optimize_this_partial,
    pbounds=pbounds,
    verbose=0,
    random_state=42,
)

# optimizer.set_gp_params(alpha=1e-2, n_restarts_optimizer=10)

# load_logs(optimizer, logs=['../data/bayesian_optimizer/2023-03-05_14-19-15_logs.json'])

# optimize_res = copy.deepcopy(optimizer.res)

# df_optimizer_params = pd.DataFrame([x["params"] for x in optimize_res])
# df_optimizer_target = pd.DataFrame(
#     [x["target"] for x in optimize_res], columns=["target"]
# )

# df_optimizer = pd.concat([df_optimizer_target, df_optimizer_params], axis=1)

# optimizer = BayesianOptimization(
#     f=optimize_this_partial,
#     pbounds=pbounds,
#     verbose=0,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
#     random_state=6,
#     # bounds_transformer=bounds_transformer
# )

# Timestamp that makes the log file of this run unique; `dt` is reused by a
# later cell to show the resulting path.
dt = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"

# Record every optimization step in a JSON log file.
logger = JSONLogger(path=f"../data/bayesian_optimizer/{dt}_logs.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

# load_logs(optimizer, logs=["../data/bayesian_optimizer/2023-03-05_23-16-17_logs.json"])

# for idx, row in df_optimizer.sort_values("target").tail(15).iterrows():
#     optimizer.probe(
#         params=optimize_res[idx]["params"]
#     )

# optimizer.set_gp_params(alpha=1, n_restarts_optimizer=10)
# 20 initial points followed by 2000 optimization iterations, using the
# optimizer's default acquisition function
# (disabled alternative: acquisition_function=acquisition_function).
search_budget = {"init_points": 20, "n_iter": 2000}
optimizer.maximize(**search_budget)

print(optimizer.max["target"])

mae np.exp train(79632, 8) test(15675, 8) -18.9411 eval(25080, 8) -19.1352
mae np.exp train(83240, 6) test(15670, 6) -6.924 eval(25073, 6) -6.8279
mae np.exp train(64909, 10) test(15675, 10) -10.5961 eval(25080, 10) -10.6712
mae np.exp train(74492, 6) test(15675, 6) -9.283 eval(25080, 6) -9.8926
mae np.exp train(58191, 15) test(15675, 15) -18.7226 eval(25080, 15) -20.6989
mae np.exp train(80115, 9) test(15670, 9) -12.7758 eval(25073, 9) -13.3693
mae np.exp train(80755, 8) test(15670, 8) -75.2847 eval(25073, 8) -89.3045
mae np.exp train(81105, 8) test(15675, 8) -8.7238 eval(25080, 8) -6.6706
mae np.exp train(55971, 12) test(15675, 12) -109.8022 eval(25080, 12) -593.3956
mae np.exp train(52079, 8) test(15675, 8) -10.5591 eval(25080, 8) -10.5564
mae np.exp train(80963, 8) test(15675, 8) -33.812 eval(25080, 8) -38.595
mae np.exp train(83642, 9) test(15675, 9) -5.6798 eval(25080, 9) -5.8365
mae np.exp train(83728, 8) test(15675, 8) -11.6677 eval(25080, 8) -12.435
mae np.exp train(80542, 9) 

In [35]:
# Continue the existing search (no new initial points) with an upper
# confidence bound acquisition function, kappa = 0.5.
acquisition_function = UtilityFunction(kind="ucb", kappa=0.5)
continuation_budget = {"init_points": 0, "n_iter": 2000}
optimizer.maximize(acquisition_function=acquisition_function, **continuation_budget)

mae -75.89701859123127 -76.47537013717546
train (25080, 13) eval (25080, 13) test (15675, 13)
mae -80.33973469214504 -80.56631616342395
train (25080, 12) eval (25080, 12) test (15675, 12)
mae -5.110388920027523 -5.096425411362867
train (25080, 9) eval (25080, 9) test (15675, 9)
mae -25.838593148698973 -26.255594015680174
train (25080, 7) eval (25080, 7) test (15675, 7)
mae -8.5435102128912 -8.44279239370594
train (25080, 4) eval (25080, 4) test (15675, 4)
mae -8.514238249328105 -8.480635471445982
train (25073, 12) eval (25073, 12) test (15670, 12)
mae -43.30871784984235 -43.05880109662354
train (25080, 10) eval (25080, 10) test (15675, 10)
mae -144.1429513469267 -4.312611648681245
train (25080, 6) eval (25080, 6) test (15675, 6)
mae -84.7575129654299 -84.98978472075035
train (25080, 14) eval (25080, 14) test (15675, 14)
mae -6.377700244260449 -6.393772682706325
train (25080, 9) eval (25080, 9) test (15675, 9)
mae -15.420186729198699 -15.40403720142901
train (25080, 7) eval (25080, 7) t

KeyboardInterrupt: 

In [13]:
"../data/bayesian_optimizer/{}_logs.json".format(dt)

'../data/bayesian_optimizer/2023-03-08_02-29-47_logs.json'

In [13]:
# One row per evaluated point: the achieved target followed by the sampled
# parameters.
optimizer_runs = optimizer.res
df_optimizer_params = pd.DataFrame([run["params"] for run in optimizer_runs])
df_optimizer_target = pd.DataFrame(
    {"target": [run["target"] for run in optimizer_runs]}
)

df_optimizer = pd.concat([df_optimizer_target, df_optimizer_params], axis=1)

In [14]:
# enabled_cols = [x for x in df_optimizer.columns if 'enabled_' in x]
# df_optimizer[enabled_cols].tail(20)

In [15]:
df_optimizer.sort_values('target', ascending=False).head(15)

Unnamed: 0,target,bagging_fraction,bagging_freq,enabled_cfips_,enabled_cfips_microbusiness_density_rolling_mean_0,enabled_cfips_microbusiness_density_rolling_mean_1,enabled_cfips_microbusiness_density_rolling_mean_2,enabled_cfips_microbusiness_density_rolling_mean_3,enabled_county_,enabled_lat_,enabled_lng_,enabled_median_hh_inc_,enabled_microbusiness_density_diff_,enabled_microbusiness_density_pct_change_,enabled_microbusiness_density_shift_0,enabled_microbusiness_density_shift_1,enabled_microbusiness_density_shift_2,enabled_microbusiness_density_shift_3,enabled_original_microbusiness_density_shift_0,enabled_original_microbusiness_density_shift_1,enabled_original_microbusiness_density_shift_2,enabled_original_microbusiness_density_shift_3,enabled_pct_bb_,enabled_pct_college_,enabled_pct_foreign_born_,enabled_pct_it_workers_,enabled_rot_15_x_,enabled_rot_15_y_,enabled_rot_30_x_,enabled_rot_30_y_,enabled_rot_45_x_,enabled_rot_45_y_,enabled_state_,enabled_target_census_over_18_population_x1000_,enabled_target_census_population_x1000_,enabled_time_arrow_,feature_fraction,lambda_l1,lambda_l2,learning_rate,lower_quantile,max_depth,min_data_in_leaf,min_sum_hessian_in_leaf,num_iterations,num_leaves,params_cfips_microbusiness_density_rolling_mean_0,params_cfips_microbusiness_density_rolling_mean_1,params_cfips_microbusiness_density_rolling_mean_2,params_cfips_microbusiness_density_rolling_mean_3,params_microbusiness_density_shift_0,params_microbusiness_density_shift_1,params_microbusiness_density_shift_2,params_microbusiness_density_shift_3,params_original_microbusiness_density_shift_0,params_original_microbusiness_density_shift_1,params_original_microbusiness_density_shift_2,params_original_microbusiness_density_shift_3,path_smooth,upper_quantile
439,-0.036237,0.882815,450.198362,0.79572,0.529293,0.019884,0.14598,0.206118,0.808912,0.291202,0.120747,0.328032,0.193926,0.232675,1.0,0.468819,0.402029,0.067431,1.0,0.746386,0.103699,0.489015,0.417343,0.291748,0.12742,0.309854,0.244378,0.163359,0.004255,0.467289,0.133641,0.180508,0.023712,0.610632,0.644818,0.031358,0.969019,108.968303,134.83286,1.712576,0.009613,76.585341,1326.325342,323.01546,2000.0,20.296026,1.620234,3.557352,1.704538,6.47938,1.0,4.969252,7.65547,2.285911,1.0,2.0,3.232797,8.49711,452.198852,0.999302
64,-0.03694,0.834705,234.442983,0.442988,0.405021,0.358016,0.077524,0.410925,0.756191,0.049557,0.511296,0.197433,0.497688,0.506822,1.0,0.320625,0.343188,0.262035,1.0,0.936076,0.176021,0.108296,0.499658,0.1581,0.541858,0.0585,0.530516,0.023818,0.489869,0.31427,0.419319,0.520253,0.256259,0.667763,0.627799,0.513982,0.739087,97.647669,424.05693,0.556111,0.000453,93.796948,654.563838,483.238515,2000.0,67.735334,1.600089,8.387131,4.241464,3.162764,1.0,4.116762,4.997322,9.924016,1.0,2.0,8.06896,2.303557,295.688732,0.996772
756,-0.039043,0.973046,213.081618,0.891956,0.546802,0.156231,0.547378,0.059538,0.873181,0.152099,0.033508,0.20869,0.199923,0.396123,1.0,0.076301,0.028221,0.441624,1.0,0.607956,0.003731,0.038816,0.243959,0.284043,0.038342,0.505452,0.33676,0.031231,0.076665,0.34694,0.378916,0.397857,0.38684,0.731973,0.915329,0.020934,0.621647,42.524265,277.59063,1.080909,0.001784,20.619595,879.004232,433.440648,2000.0,167.496949,9.400186,6.808991,1.295828,3.490004,1.0,3.711313,3.889233,4.531573,1.0,2.0,7.962882,7.93305,391.411732,0.983178
1064,-0.039473,0.810348,378.957633,0.586587,0.148818,0.312188,0.497603,0.517811,0.705311,0.343366,0.479607,0.055277,0.500069,0.40837,1.0,0.105192,0.488155,0.338509,1.0,0.923349,0.185202,0.100411,0.10488,0.48219,0.261041,0.42648,0.407317,0.00642,0.092989,0.338476,0.539963,0.264152,0.226271,0.79339,0.575494,0.392459,0.476613,180.722545,49.421819,0.011903,0.002838,13.965846,1056.379885,103.438206,2000.0,88.806857,3.861155,8.021494,1.04704,1.488755,1.0,5.624923,4.171323,4.835619,1.0,2.0,2.907654,8.13261,107.484023,0.997523
363,-0.04196,0.980787,472.450989,0.450301,0.008,0.271682,0.518999,0.368802,0.408235,0.079007,0.215462,0.211603,0.273656,0.384955,1.0,0.50533,0.455839,0.419501,1.0,0.796426,0.186542,0.261848,0.219883,0.373122,0.533641,0.136647,0.167823,0.073945,0.344412,0.424761,0.529831,0.431764,0.534444,0.864762,0.978508,0.08818,0.682004,17.075982,121.66454,1.170692,0.002763,70.460739,429.543495,422.239797,2000.0,166.773483,3.912122,5.511245,5.014849,6.866023,1.0,5.931244,7.822793,2.597802,1.0,2.0,6.492968,8.278917,215.548618,0.983667
232,-0.041976,0.746352,51.491337,0.637995,0.327184,0.302564,0.531826,0.394414,0.647066,0.293325,0.12296,0.328632,0.261236,0.428946,1.0,0.269857,0.384704,0.475832,1.0,0.428246,0.136043,0.421242,0.068464,0.253784,0.212912,0.107234,0.464577,0.212352,0.423801,0.531821,0.055384,0.16015,0.303631,0.788177,0.675019,0.140314,0.999952,208.054024,347.772484,0.028705,0.007655,1.794119,231.782686,32.390526,2000.0,78.231976,3.324344,7.492266,3.127598,2.455173,1.0,6.065963,9.70992,9.101259,1.0,2.0,7.32226,4.491572,235.956967,0.988124
729,-0.043604,0.582655,137.688131,0.77232,0.337084,0.192653,0.099673,0.003279,0.930199,0.040993,0.168161,0.035077,0.122276,0.503005,1.0,0.308957,0.347486,0.185695,1.0,0.524649,0.361055,0.312566,0.19733,0.127739,0.212992,0.449021,0.547641,0.065436,0.28591,0.029116,0.515635,0.311927,0.154953,0.965613,0.576344,0.018178,0.952424,414.485889,340.614529,1.993864,0.006055,116.522378,763.729713,37.128672,2000.0,71.48939,9.92217,4.952574,8.371917,2.487921,1.0,7.785486,7.032972,2.942463,1.0,2.0,5.913684,2.550249,366.619781,0.992109
220,-0.043627,0.978151,158.855559,0.930154,0.546528,0.46957,0.46481,0.267742,0.950335,0.376901,0.546344,0.025064,0.006841,0.201476,1.0,0.125579,0.428246,0.31988,1.0,0.561084,0.060553,0.112762,0.471676,0.475132,0.51256,0.53292,0.154246,0.273851,0.492606,0.218295,0.342852,0.494612,0.281833,0.580523,0.790126,0.020406,0.633106,371.363028,229.262183,0.771784,0.002568,21.271507,1157.447114,55.501718,2000.0,107.356916,1.129854,8.871831,7.571994,6.925989,1.0,3.550417,9.509783,7.250359,1.0,2.0,6.602937,9.250231,262.56999,0.994904
324,-0.044698,0.978585,299.282373,0.701418,0.038689,0.282951,0.126934,0.527541,0.763787,0.188018,0.068014,0.27338,0.165951,0.4546,1.0,0.13199,0.496386,0.176807,1.0,0.774734,0.528437,0.048363,0.163647,0.225498,0.154988,0.492707,0.065224,0.119895,0.205706,0.5472,0.164725,0.3348,0.123427,0.949451,0.505798,0.179243,0.501126,267.818325,170.329946,0.946312,0.000867,68.396149,48.983105,124.924729,2000.0,178.469176,4.261537,2.147131,7.526172,2.228887,1.0,4.943117,8.521101,6.209608,1.0,2.0,4.439866,2.807386,476.883943,0.988569
176,-0.045396,0.772191,85.963134,0.444898,0.345937,0.175044,0.430154,0.506307,0.875685,0.330793,0.429203,0.029159,0.032369,0.121311,1.0,0.101594,0.070952,0.385816,1.0,0.954209,0.226404,0.221103,0.399593,0.24504,0.238515,0.147885,0.018116,0.205556,0.335764,0.052465,0.434698,0.471271,0.11888,0.636275,0.692315,0.105696,0.582267,218.078654,164.000955,0.543287,0.000188,-2.258863,178.538561,314.563973,2000.0,168.898764,10.065733,4.693921,5.357342,9.429564,1.0,8.794863,9.344924,6.601624,1.0,2.0,3.124765,3.30919,242.680177,0.987716


In [None]:
# Overview of the optimizer log files on disk and their size in MiB.
path = "../data/bayesian_optimizer/"
log_filenames = [name for name in os.listdir(path) if "_logs.json" in name]
log_sizes = [os.path.getsize(os.path.join(path, name)) for name in log_filenames]

df_bay_logs = pd.DataFrame(
    list(zip(log_filenames, log_sizes)), columns=["filename", "size_mb"]
)
# bytes -> MiB, two decimals
df_bay_logs["size_mb"] = (df_bay_logs["size_mb"] / 1024 / 1024).round(2)

In [None]:
df_bay_logs

In [None]:
params = optimizer.max['params']

# keys_enabled = [x for x in params.keys() if 'enabled_' in x]
# for k in keys_enabled:
#     params[k] = 0
    
# keys_params = [x for x in params.keys() if 'params_' in x]
# for k in keys_params:
#     params[k] = 0

# params['enabled_microbusiness_density_shift_0'] = 1
# params['params_microbusiness_density_shift_0'] = 1

# t = lgb.LGBMRegressor()
# t_params = t.get_params()
# for k in t_params.keys():
#     if k in params.keys():
#         print(k)
#         params[k] = t_params[k]
# # params = {**params, **t.get_params()}
# params['bagging_fraction'] = 1
# params['bagging_freq'] = 0
# params['lambda_l1'] = 0
# params['feature_fraction'] = 0

In [None]:
params = optimizer.max['params']
gbm, lgb_train, lgb_eval, lgb_test, model_params, callbacks, df_features, df_target = optimize_this_partial(
    return_booster=True, **params
)

In [None]:
gbm.best_score

In [None]:
# Predictions on the held-out test split, stored next to the true labels.
# NOTE(review): this rebinds `df_test`, which so far held the contents of
# test.csv; cells below expect the frame built here.
pred = gbm.predict(lgb_test.data)
df_test = lgb_test.data.assign(label=lgb_test.label, pred=pred)

In [None]:
# Attach the county-level columns needed by the cells below to the predictions.
# `target_census_population_x1000` is included because a later cell divides the
# label by it; without it that cell fails with a KeyError. It is added last so
# the positions of the previously merged columns do not change.
# NOTE(review): the join is on the index, so `df_test` is assumed to be
# indexed by row_id - confirm.
census_columns = [
    "target_census_over_18_population_x1000",
    "original_microbusiness_density",
    "target_census_population_x1000",
]
t2 = pd.merge(
    df_test[["label", "pred"]],
    df_train.set_index("row_id")[census_columns],
    "left",
    left_index=True,
    right_index=True,
)
t2.head()

In [None]:
# Turn the prediction into a density: value per adult inhabitant, in percent.
# NOTE(review): the training target was log1p-transformed; confirm that `pred`
# is already back on the original scale before this division.
over_18_population = t2["target_census_over_18_population_x1000"] * 1000
t2["microbusiness_density_pred"] = t2["pred"] / over_18_population * 100

In [None]:
smape(t2['microbusiness_density_pred'], t2['original_microbusiness_density'])

In [None]:
(t2['label'] / (t2['target_census_population_x1000'] * 1000))

In [None]:
t

In [None]:
df_train.columns

In [None]:
df_train.set_index('row_id')['target_census_population_x1000']

In [None]:
pred

In [None]:
# Names of the parameters LGBMRegressor exposes. A fresh estimator is used
# here because `t` is the DataFrame of NaN rows created in the data check
# cell, which has no get_params() and would raise AttributeError.
set(lgb.LGBMRegressor().get_params())

In [None]:
gbm.best_score

In [None]:
dataset = lgb_test
pred = gbm.predict(dataset.data)
smape(pred, dataset.label)

In [None]:
t4 = df_optimizer.sort_values('target', ascending=False).iloc[0]

In [None]:
df_features.columns