# All

## Import/read

In [None]:
import os
import pickle
import time
import warnings
from copy import deepcopy
from datetime import datetime
from functools import partial
from itertools import repeat
from types import SimpleNamespace
from typing import Callable
import copy

import bayes_opt as bayes
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bayes_opt import (
    BayesianOptimization,
    SequentialDomainReductionTransformer,
    UtilityFunction,
)
from bayes_opt.event import Events
from bayes_opt.logger import JSONLogger
from bayes_opt.util import load_logs
from library.classes import Feature, ManageDataSplit, ManageFeatures, feature_objects
from library.classes.trait import DataFrame
from library.feature_func import (
    add_categorical_feature,
    add_feature_targets_groupby_stats,
    add_numerical_feature,
    f_rolling_mean,
    f_shifted,
    time_arrow,
    f_microbusiness_pct_change,
    f_microbusiness_density_diff,
)
from library.optimize_this import optimize_this
from library.utils import build_callbacks, read_df, smape, states, states_abb, write_df
from meteostat import Monthly, Point, Stations
from sklearn.cluster import DBSCAN
from sklearn.neighbors import LocalOutlierFactor
from traitlets import (
    Any,
    Bool,
    Callable,
    Dict,
    Float,
    HasTraits,
    Int,
    List,
    TraitError,
    TraitType,
    Tuple,
    Unicode,
    default,
    validate,
)

# Notebook display limits: at most 50 rows and 500 columns per rendered frame.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 500

In [None]:
# data_path = "../data/"
# Sub-directory argument for the (currently commented-out) read_df call that
# loads "us-county-boundaries.csv".
boundaries_sub_data_path = "other/boundaries"
# bayesian_run_path = "../data/bayesian_runs/"

In [None]:
# Load the raw tables through the project's read_df helper.
# presumably the optional second argument is a sub-directory of the data
# folder — TODO confirm against library.utils.read_df.
# df_census = read_df("census_starter.csv")
df_test = read_df("test.csv")
df_train = read_df("train.csv")
df_submission = read_df("sample_submission.csv")
# df_population = read_df('df_population.csv', 'other')
# df_census_population = read_df('df_census_population.csv', 'kaggle_census')
# df_train_census = read_df('df_train_census.csv', 'kaggle_census')
df_adjusted_microbusiness_density = read_df(
    "df_adjusted_microbusiness_density.csv", "kaggle_census"
)

# df_boundaries = read_df("us-county-boundaries.csv", boundaries_sub_data_path, delimiter=";")

# Left-join the adjusted-density table onto the training rows by row_id.
df_train = df_train.merge(
    df_adjusted_microbusiness_density, how="left", on="row_id"
)

# Alternative target, kept for reference: use the adjusted density instead.
# df_train = df_train.rename(
#     columns={
#         "microbusiness_density": "original_microbusiness_density",
#         "adjusted_microbusiness_density": "microbusiness_density",
#     }
# )

# Keep the published density under a new name and let the "active" column
# take over the "microbusiness_density" name used as target further down.
target_renames = {
    "microbusiness_density": "original_microbusiness_density",
    "active": "microbusiness_density",
}
df_train = df_train.rename(columns=target_renames)

# Store the renamed target on a log1p scale.
df_train["microbusiness_density"] = df_train["microbusiness_density"].apply(np.log1p)

df_location = read_df("cfips_location.csv", "usa-counties-coordinates")

In [None]:
# Guard against silent changes in the input data: exactly 24 training rows
# are expected to contain at least one NaN.
nan_row_mask = df_train.isna().any(axis=1)
t = df_train[nan_row_mask]
if t.shape[0] != 24:
    raise Exception("Nan counts used to be 24... something changed")
# Display the shape of the NaN-containing rows.
t.shape

### Weather save/load

In [None]:
def save_weather_data(path_weather):
    """Fetch monthly weather for every boundary row and write it to CSV.

    Reads two module-level frames: ``df_boundaries`` (columns ``NAME``,
    ``NAMELSAD``, ``INTPTLAT``, ``INTPTLON``) and ``df_train`` (column
    ``first_day_of_month``, whose min/max define the requested period).
    NOTE(review): the ``df_boundaries`` load is commented out in this
    notebook; it has to be defined before this function is called.

    Args:
        path_weather: Destination handed to ``DataFrame.to_csv``.

    Raises:
        ValueError: If no location returned any weather rows, so there is
            nothing to concatenate or save.
    """
    # The requested period is identical for every location, so compute it
    # once instead of storing it per row.
    start_date = df_train["first_day_of_month"].min()
    end_date = df_train["first_day_of_month"].max()

    locations = df_boundaries[["NAME", "NAMELSAD", "INTPTLAT", "INTPTLON"]]

    data_list = []
    for idx, row in locations.iterrows():
        # Third argument is the fixed value 70 used for every point
        # (altitude in meteostat's Point signature — TODO confirm unit).
        p = Point(row["INTPTLAT"], row["INTPTLON"], 70)
        data = Monthly(p, start_date, end_date).fetch()

        # Locations without any data are skipped entirely.
        if data.shape[0] > 0:
            # NOTE(review): NAME is stored as "state" and NAMELSAD as
            # "county" — confirm that this matches the boundaries schema.
            data["state"] = row["NAME"]
            data["county"] = row["NAMELSAD"]
            data_list.append(data)

        # Lightweight progress indicator.
        if idx % 100 == 0:
            print(idx)

    if not data_list:
        # pd.concat([]) would raise a ValueError with a generic message;
        # raise the same exception type with an actionable explanation.
        raise ValueError(
            "No weather data was returned for any location; nothing to save"
        )

    weather_data = pd.concat(data_list)
    weather_data.to_csv(path_weather)

In [None]:
# Location of the cached weather CSV. The download call below is commented
# out, so the file at this path is expected to exist already.
path_weather = "../data/other/weather/weather.csv"
# save_weather_data(path_weather)

In [None]:
# Load the cached weather table from path_weather.
df_weather = pd.read_csv(path_weather)

## Train

### Feature setup

In [None]:
# Split helper built from the full training frame; its _train_idx and
# _val_idx attributes are inspected in later cells.
manage_data_split = ManageDataSplit(df_train)

# (low, high) pairs used as Feature(enabled_bounds=...) entries below.
# presumably the optimizer samples a value in this range to decide whether a
# feature is switched on — TODO confirm against library.classes.Feature.
enabled_tuple = (0, 0.55)
enabled_tuple_h = (0, 0.75)
enabled_tuple_almost = (0.4, 1)
enabled_tuple_always = (1, 1)  # degenerate range: both ends are 1
params_tuple = (0, 10)

# Added to the upper end of the pinned shift ranges below; with 0 those
# ranges collapse to a single value.
gamma = 0

# Better to clear it. Otherwise it might contain features we do not want
feature_objects.clear()

# One rolling-mean Feature per (target column, group-by column) spec.
# A spec may carry its own "params_bounds"; otherwise four (1, 20) ranges
# are used.
rolling_mean_specs = [
    {
        "target_col": "microbusiness_density",
        "groupby_col": "cfips",
        "params_bounds": list(repeat((1, 12), 4)),
    },
    # {"target_col": "microbusiness_density", "groupby_col": "county"},
    # {"target_col": "microbusiness_density", "groupby_col": "state"},
]

for cols in rolling_mean_specs:
    target_col = cols["target_col"]
    groupby_col = cols["groupby_col"]
    params_bounds = cols.get("params_bounds", list(repeat((1, 20), 4)))
    f_col = "{}_{}_rolling_mean".format(groupby_col, target_col)

    # Four bound slots: every slot shares the default enabled range.
    feature_target_rolling_mean = Feature(
        f_col,
        f_rolling_mean,
        df_train,
        target_col=target_col,
        groupby_col=groupby_col,
        enabled_bounds=list(repeat(enabled_tuple, 4)),
        params_bounds=params_bounds,
    )


# Shift feature on the "microbusiness_density" column with four bound slots.
# Slot 0 is pinned: range (1, 1 + gamma) and always enabled. The other three
# slots use the range (2, 10) with the default enabled range.
# presumably each slot's parameter is a lag — TODO confirm in f_shifted.
target_col = "microbusiness_density"
params_bounds = [(1, 1 + gamma)] + [(2, 10)] * 3
enabled_bounds = [enabled_tuple_always] + [enabled_tuple] * 3
feature_target_shift_mean = Feature(
    f_col="{}_shift".format(target_col),
    f=f_shifted,
    df=df_train,
    target_col=target_col,
    enabled_bounds=enabled_bounds,
    params_bounds=params_bounds,
)

# Shift feature on the "original_microbusiness_density" column.
# Slot 0: range (1, 1 + gamma), always enabled.
# Slot 1: range (2, 2 + gamma), enabled range (0.4, 1).
# Slots 2-3: range (2, 10) with the default enabled range.
# NOTE(review): this rebinds feature_target_shift_mean, so the name no longer
# refers to the Feature created for "microbusiness_density".
target_col = "original_microbusiness_density"
params_bounds = [(1, 1 + gamma), (2, 2 + gamma), (2, 10), (2, 10)]
enabled_bounds = [
    enabled_tuple_always,
    enabled_tuple_almost,
    enabled_tuple,
    enabled_tuple,
]
feature_target_shift_mean = Feature(
    f_col="{}_shift".format(target_col),
    f=f_shifted,
    df=df_train,
    target_col=target_col,
    enabled_bounds=enabled_bounds,
    params_bounds=params_bounds,
)

# Categorical features, created in this order. Every entry is an explicit
# (column, enabled range) pair so the loop needs no runtime type checks.
categorical_feature_specs = [
    ("county", enabled_tuple_almost),
    ("state", enabled_tuple),
    ("cfips", enabled_tuple_almost),
]

for col, _tuple in categorical_feature_specs:
    _feature = Feature(col, add_categorical_feature, df_train, enabled_bounds=[_tuple])

# Numerical features, created in exactly this order.
numerical_feature_cols = [
    "median_hh_inc",
    "pct_bb",
    "pct_college",
    "pct_foreign_born",
    "pct_it_workers",
    "target_census_over_18_population_x1000",
    "target_census_population_x1000",
    "lng",
    "lat",
    "rot_15_x",
    "rot_15_y",
    "rot_30_x",
    "rot_30_y",
    "rot_45_x",
    "rot_45_y",
]

# Columns that use a different enabled range than the default enabled_tuple.
numerical_enabled_overrides = {
    "target_census_over_18_population_x1000": enabled_tuple_almost,
    "target_census_population_x1000": enabled_tuple_almost,
}

for col in numerical_feature_cols:
    _tuple = numerical_enabled_overrides.get(col, enabled_tuple)
    _feature = Feature(col, add_numerical_feature, df_train, enabled_bounds=[_tuple])

# Feature named "time_arrow", built from the time_arrow helper on df_train
# with a single bound slot using the default enabled range.
feature_time_arrow = Feature(
    "time_arrow", time_arrow, df_train, enabled_bounds=[enabled_tuple]
)

# for groupby_col in [
#     "cfips",
#     "state",
#     "county",
# ]:
#     for col in [
#         "median_hh_inc",
#         "pct_bb",
#         "pct_college",
#         "pct_foreign_born",
#         "pct_it_workers",
#     ]:
#         for agg_function in ["median", "mean", "std"]:
#             f_col = "{}_{}_target_{}".format(groupby_col, col, agg_function)
#             _feature = Feature(
#                 f_col,
#                 add_feature_targets_groupby_stats,
#                 df_train,
#                 groupby_col=groupby_col,
#                 col=col,
#                 agg_function=agg_function,
#                 enabled_bounds=[enabled_tuple],
#                 train_idx=manage_data_split._train_idx,
#             )

# Feature named "microbusiness_density_pct_change", built from
# f_microbusiness_pct_change on df_train (default enabled range).
feature_microbusiness_density_pct_change = Feature(
    "microbusiness_density_pct_change",
    f_microbusiness_pct_change,
    df_train,
    enabled_bounds=[enabled_tuple],
)

# Feature named "microbusiness_density_diff", built from
# f_microbusiness_density_diff on df_train (default enabled range).
feature_microbusiness_density_diff = Feature(
    "microbusiness_density_diff",
    f_microbusiness_density_diff,
    df_train,
    enabled_bounds=[enabled_tuple],
)

# Search region for the model hyper-parameters, as (low, high) per parameter.
# The keys match LightGBM parameter names; how each sampled value is cast
# before training is decided elsewhere — TODO confirm in optimize_this.
model_pbounds = dict(
    num_leaves=(3, 200),
    # Both ends are 2000, so this value cannot vary during the search.
    num_iterations=(2000, 2000),
    learning_rate=(0.01, 2.5),
    bagging_fraction=(0.0001, 1),
    feature_fraction=(0.0001, 1),
    lambda_l1=(0, 500),
    lambda_l2=(0, 500),
    bagging_freq=(0, 500),
    min_data_in_leaf=(10, 3000),
    min_sum_hessian_in_leaf=(0, 500),
    # NOTE(review): the range includes negative depths — confirm how they
    # are interpreted downstream.
    max_depth=(-10, 150),
    path_smooth=(0, 500),
)

In [None]:
# Inspect the shape of the training-split index held by manage_data_split.
manage_data_split._train_idx.shape

In [None]:
# Inspect the shape of the validation-split index held by manage_data_split.
manage_data_split._val_idx.shape

In [None]:
# manage_features = ManageFeatures(feature_objects)
# manage_features.set_model_pbounds(model_pbounds)

# pbounds = manage_features.get_pbounds()


# feature = feature_target_shift_mean
# df_mapped = manage_features._make_mapped(d)
# df_mapped_feature = df_mapped[df_mapped["f_col"] == feature.f_col]
# r = feature.f(
#     df=feature.df.copy(),
#     df_mapped_feature=df_mapped_feature,
#     f_col=feature.f_col,
#     **feature._kwargs
# )

#### Start bayesian optimization

In [None]:
# Build the search space: feature bounds collected by ManageFeatures plus the
# model hyper-parameter bounds registered through set_model_pbounds.
manage_features = ManageFeatures(feature_objects)
manage_features.set_model_pbounds(model_pbounds)

pbounds = manage_features.get_pbounds()

# Two extra search dimensions on top of the feature/model bounds.
pbounds = {**pbounds, "lower_quantile": (0, 0.01), "upper_quantile": (0.98, 1)}

# acquisition_function = UtilityFunction(kind="ucb")
# acquisition_function = UtilityFunction(kind="poi")
# acquisition_function = UtilityFunction(kind="ucb", kappa=0.1)
# acquisition_function = UtilityFunction(kind="ucb", kappa=1)
# bounds_transformer = SequentialDomainReductionTransformer(minimum_window=0.5)

# Bind everything except the sampled parameters, so the optimizer can call
# the objective with the search-space values as keyword arguments.
objective = "mae"
optimize_this_partial = partial(
    optimize_this,
    objective=objective,
    pbounds=pbounds,
    manage_data_split=manage_data_split,
    manage_features=manage_features,
    df_train=df_train,
    build_callbacks=build_callbacks,
    target_shift=0,
)

optimizer = BayesianOptimization(
    f=optimize_this_partial,
    pbounds=pbounds,
    verbose=0,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
    random_state=42,
    # n_restarts_optimizer=50,
    # bounds_transformer=bounds_transformer
)

# optimizer.set_gp_params(alpha=1e-2, n_restarts_optimizer=10)

# load_logs(optimizer, logs=['../data/bayesian_optimizer/2023-03-05_14-19-15_logs.json'])

# optimize_res = copy.deepcopy(optimizer.res)

# df_optimizer_params = pd.DataFrame([x["params"] for x in optimize_res])
# df_optimizer_target = pd.DataFrame(
#     [x["target"] for x in optimize_res], columns=["target"]
# )

# df_optimizer = pd.concat([df_optimizer_target, df_optimizer_params], axis=1)

# optimizer = BayesianOptimization(
#     f=optimize_this_partial,
#     pbounds=pbounds,
#     verbose=0,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
#     random_state=6,
#     # bounds_transformer=bounds_transformer
# )

# Timestamp that makes the log file name unique per run.
dt = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# The logger is only given a file path. Create the folder up front so the
# run cannot fail on a missing directory when the log file is first opened.
os.makedirs("../data/bayesian_optimizer/", exist_ok=True)

logger = JSONLogger(path="../data/bayesian_optimizer/{}_logs.json".format(dt))
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

# load_logs(optimizer, logs=["../data/bayesian_optimizer/2023-03-05_23-16-17_logs.json"])

# for idx, row in df_optimizer.sort_values("target").tail(15).iterrows():
#     optimizer.probe(
#         params=optimize_res[idx]["params"]
#     )

# optimizer.set_gp_params(alpha=1, n_restarts_optimizer=10)
# 20 random starting points followed by 2000 guided iterations.
optimizer.maximize(
    init_points=20,
    n_iter=2000,
    # acquisition_function=acquisition_function
)

print(optimizer.max["target"])

In [None]:
# Continue the same optimizer for 2000 more iterations with no new random
# points, this time passing an explicit "ucb" utility with kappa=0.5.
acquisition_function = UtilityFunction(kind="ucb", kappa=0.5)
optimizer.maximize(
    init_points=0, n_iter=2000, 
    acquisition_function=acquisition_function
)

In [None]:
"../data/bayesian_optimizer/{}_logs.json".format(dt)

In [None]:
# Flatten the optimizer history into one table: the objective value in a
# "target" column, followed by one column per sampled parameter.
optimizer_history = optimizer.res
df_optimizer_params = pd.DataFrame(
    [step["params"] for step in optimizer_history]
)
df_optimizer_target = pd.DataFrame(
    [step["target"] for step in optimizer_history], columns=["target"]
)

df_optimizer = pd.concat([df_optimizer_target, df_optimizer_params], axis=1)

In [None]:
# enabled_cols = [x for x in df_optimizer.columns if 'enabled_' in x]
# df_optimizer[enabled_cols].tail(20)

In [None]:
# Show the 15 optimizer steps with the highest target value.
df_optimizer.sort_values('target', ascending=False).head(15)

In [None]:
# Collect every optimizer log file in the folder with its size on disk.
path = "../data/bayesian_optimizer/"

log_rows = []
for log_filename in os.listdir(path):
    # Only files whose name contains the log suffix are of interest.
    if "_logs.json" not in log_filename:
        continue
    size_in_bytes = os.path.getsize(os.path.join(path, log_filename))
    log_rows.append((log_filename, size_in_bytes))

df_bay_logs = pd.DataFrame(log_rows, columns=["filename", "size_mb"])
# Convert bytes to megabytes, rounded to two decimals.
size_in_mb = df_bay_logs["size_mb"] / 1024 / 1024
df_bay_logs["size_mb"] = size_in_mb.round(2)

In [None]:
# Display the table of optimizer log files and their sizes.
df_bay_logs

In [None]:
# Parameters of the best optimizer step so far. The commented lines below
# are manual overrides kept for experimentation.
params = optimizer.max['params']

# keys_enabled = [x for x in params.keys() if 'enabled_' in x]
# for k in keys_enabled:
#     params[k] = 0
    
# keys_params = [x for x in params.keys() if 'params_' in x]
# for k in keys_params:
#     params[k] = 0

# params['enabled_microbusiness_density_shift_0'] = 1
# params['params_microbusiness_density_shift_0'] = 1

# t = lgb.LGBMRegressor()
# t_params = t.get_params()
# for k in t_params.keys():
#     if k in params.keys():
#         print(k)
#         params[k] = t_params[k]
# # params = {**params, **t.get_params()}
# params['bagging_fraction'] = 1
# params['bagging_freq'] = 0
# params['lambda_l1'] = 0
# params['feature_fraction'] = 0

In [None]:
# Re-run the objective with the best parameters and return_booster=True,
# which makes it return the trained model and its datasets (eight values).
params = optimizer.max["params"]
(
    gbm,
    lgb_train,
    lgb_eval,
    lgb_test,
    model_params,
    callbacks,
    df_features,
    df_target,
) = optimize_this_partial(return_booster=True, **params)

In [None]:
# Display the best_score attribute of the trained booster.
gbm.best_score

In [None]:
# Predict on a copy of the hold-out features, then attach the true label and
# the prediction as extra columns. The prediction is made before those
# columns exist, so the model only sees the features.
# NOTE(review): this rebinds df_test, which earlier held the test.csv frame.
holdout_features = lgb_test.data.copy()
pred = gbm.predict(holdout_features)
df_test = holdout_features.assign(label=lgb_test.label, pred=pred)

In [None]:
# Attach the over-18 population and the published density to the hold-out
# labels/predictions, matching rows on the index.
# assumes df_test is indexed by row_id — TODO confirm.
census_cols = [
    "target_census_over_18_population_x1000",
    "original_microbusiness_density",
]
holdout_scores = df_test[["label", "pred"]]
census_by_row_id = df_train.set_index("row_id")[census_cols]

t2 = pd.merge(
    holdout_scores,
    census_by_row_id,
    how="left",
    left_index=True,
    right_index=True,
)
t2.head()

In [None]:
# Convert the prediction into a percentage of the over-18 population
# (the population column is stored in thousands, hence * 1000).
# NOTE(review): the training target was stored as log1p earlier in this
# notebook; if `pred` is still on that scale it needs np.expm1 before this
# conversion — confirm what optimize_this returns.
t2['microbusiness_density_pred'] = (t2['pred'] / (t2['target_census_over_18_population_x1000'] * 1000)) * 100

In [None]:
# Score the converted prediction against the published density with the
# project's smape helper.
smape(t2['microbusiness_density_pred'], t2['original_microbusiness_density'])

In [None]:
# Ratio of the label to the total population (stored in thousands).
# NOTE(review): t2 is built with only the over-18 population column, so
# 'target_census_population_x1000' is not present and this lookup raises
# KeyError when the cells run in file order — confirm which column is meant.
(t2['label'] / (t2['target_census_population_x1000'] * 1000))

In [None]:
# Display whatever is currently bound to `t`.
t

In [None]:
# List the columns available in the training frame.
df_train.columns

In [None]:
# Total-population column of the training frame, indexed by row_id.
df_train.set_index('row_id')['target_census_population_x1000']

In [None]:
# Display the most recent prediction array.
pred

In [None]:
# Names of the parameters exposed by a default LightGBM regressor.
# The regressor is built inline because `t` is bound to a DataFrame earlier
# in this notebook and therefore has no get_params method.
set(lgb.LGBMRegressor().get_params())

In [None]:
# Display the best_score attribute of the trained booster.
gbm.best_score

In [None]:
# Score the booster on the hold-out dataset, on the scale of its labels:
# predict from the dataset's raw features and compare with its labels.
dataset = lgb_test
pred = gbm.predict(dataset.data)
smape(pred, dataset.label)

In [None]:
# Row of the optimizer history with the highest target value.
t4 = df_optimizer.sort_values('target', ascending=False).iloc[0]

In [None]:
# List the feature columns returned alongside the trained booster.
df_features.columns