# All

## Import/read

In [1]:
import os
import pickle
import time
import warnings
from copy import deepcopy
from datetime import datetime
from types import SimpleNamespace
from typing import Callable

import bayes_opt as bayes
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from meteostat import Monthly, Point, Stations
from sklearn.cluster import DBSCAN
from sklearn.neighbors import LocalOutlierFactor
from traitlets import (
    Any,
    Bool,
    Callable,
    Dict,
    Float,
    HasTraits,
    Int,
    List,
    TraitError,
    TraitType,
    Tuple,
    Unicode,
    default,
    validate,
)

pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 500)

In [2]:
# Root folder that every CSV read by `r` is resolved against.
data_path = "../data/"
# Sub-folder (relative to data_path) passed to `r` for the county boundary file.
boundaries_sub_data_path = "other/boundaries"
# Folder under which save_bayesian_results() writes its pickles.
bayesian_run_path = "../data/bayesian_runs/"


def r(filename, sub_folder="kaggle", delimiter=","):
    """Read ``<data_path>/<sub_folder>/<filename>`` as a CSV into a DataFrame.

    ``delimiter`` is forwarded to ``pandas.read_csv`` unchanged.
    """
    csv_path = os.path.join(data_path, sub_folder, filename)
    return pd.read_csv(csv_path, delimiter=delimiter)

In [3]:
# Competition tables, read from the default "kaggle" sub-folder.
df_census = r("census_starter.csv")
df_test = r("test.csv")
df_train = r("train.csv")
df_submission = r("sample_submission.csv")

# County boundary table; this file is semicolon-delimited.
df_boundaries = r("us-county-boundaries.csv", boundaries_sub_data_path, delimiter=";")

### Add census, year, fix dates etc

In [4]:
def fix_df_train(df_train):
    """Return a copy of ``df_train`` enriched with date parts and census columns.

    The copy gets a parsed ``first_day_of_month``, integer ``year`` and
    ``month`` columns, and the census variables from the module-level
    ``df_census`` joined on (cfips, year). The census table is wide, with
    column names ending in ``_<year>``; it is reshaped to one row per
    (cfips, year) before the join.
    """
    out = df_train.copy()

    # Parse the date once and derive the year used as join key.
    out["first_day_of_month"] = pd.to_datetime(out["first_day_of_month"])
    out["year"] = out["first_day_of_month"].dt.year.astype(int)

    # Every census column except the county key is a "<name>_<year>" measure.
    measure_cols = list(df_census.columns)
    measure_cols.remove("cfips")

    long_census = df_census.melt(id_vars="cfips", value_vars=measure_cols)
    name_parts = long_census["variable"]
    long_census["year"] = name_parts.str.split("_").str[-1].astype(int)
    long_census["variable_name"] = name_parts.str.rsplit("_", expand=False, n=1).str[0]

    census_by_year = pd.pivot_table(
        long_census,
        values="value",
        index=["cfips", "year"],
        columns="variable_name",
    ).reset_index()

    # Census data is lagging 2 years
    census_by_year["year"] = census_by_year["year"] + 2

    out = pd.merge(
        out,
        census_by_year,
        how="left",
        left_on=["cfips", "year"],
        right_on=["cfips", "year"],
    )

    out["month"] = out["first_day_of_month"].dt.month

    return out

In [5]:
# Replace the raw training table with the enriched version.
df_train = fix_df_train(df_train)

# Sanity check: count rows that contain at least one NaN in any column.
# The expected count (22) is an empirical value from earlier runs of this notebook.
t = df_train[df_train.isna().any(axis=1)]
if t.shape[0] != 22:
    raise Exception("Nan counts used to be 22... something changed")

### Weather save/load

In [6]:
def save_weather_data(path_weather):
    """Download monthly weather for every boundary row and write it to CSV.

    For each row of the module-level ``df_boundaries`` a meteostat ``Point``
    is built from its INTPTLAT/INTPTLON (with the constant 70 as third
    argument) and the ``Monthly`` series covering the date range of
    ``df_train`` is fetched. Non-empty results are tagged with the row's
    NAME (as ``state``) and NAMELSAD (as ``county``), concatenated and
    saved to ``path_weather``.
    """
    first_month = df_train["first_day_of_month"].min()
    last_month = df_train["first_day_of_month"].max()
    locations = df_boundaries[["NAME", "NAMELSAD", "INTPTLAT", "INTPTLON"]]

    fetched = []
    for idx, row in locations.iterrows():
        location = Point(row["INTPTLAT"], row["INTPTLON"], 70)
        monthly = Monthly(location, first_month, last_month).fetch()

        if monthly.shape[0] > 0:
            monthly["state"] = row["NAME"]
            monthly["county"] = row["NAMELSAD"]
            fetched.append(monthly)

        # Progress indicator for the long-running download loop.
        if idx % 100 == 0:
            print(idx)

    pd.concat(fetched).to_csv(path_weather)

In [7]:
# Location of the cached weather CSV written by save_weather_data().
path_weather = "../data/other/weather/weather.csv"
# Uncomment to (re)download the weather data; the cached file is used otherwise.
# save_weather_data(path_weather)

In [8]:
# Load the cached weather table from disk.
df_weather = pd.read_csv(path_weather)

### Baseline model

#### Feature functions

In [9]:
def add_feature_targets_history(t, feature_type, cols_f: dict, **kwargs):
    """Add per-county history features of ``microbusiness_density``.

    Parameters
    ----------
    t : pd.DataFrame
        Must contain ``cfips``, ``first_day_of_month`` and
        ``microbusiness_density``. Its index must be unique, because the new
        columns are aligned back on it.
    feature_type : str
        ``'target_rolling_mean'`` adds rolling means, ``'target_shift'`` adds
        lagged values.
    cols_f : dict
        Maps each new column name to its window size (rolling mean) or its
        shift (lag). Values are cast to ``int``.
    **kwargs
        Ignored; accepted so callers can forward a larger argument dict.

    Returns
    -------
    pd.DataFrame
        A copy of ``t`` with one extra column per entry of ``cols_f``.

    Raises
    ------
    ValueError
        If ``feature_type`` is not one of the supported kinds.
    """

    def _rolling_mean(frame: pd.DataFrame, f_col: str, window) -> pd.Series:
        rolled = (
            frame.sort_values(["cfips", "first_day_of_month"])
            .groupby(["cfips"])["microbusiness_density"]
            .rolling(int(window))
            .mean()
        )
        # groupby().rolling() prepends the group key to the index; drop it so
        # the result aligns with the rows of ``frame`` again.
        return rolled.reset_index(level=0, drop=True).rename(f_col)

    def _shifted(frame: pd.DataFrame, f_col: str, shift) -> pd.Series:
        return (
            frame.sort_values(["cfips", "first_day_of_month"])
            .groupby(["cfips"])["microbusiness_density"]
            .shift(int(shift))
            .rename(f_col)
        )

    builders = {
        "target_rolling_mean": _rolling_mean,
        "target_shift": _shifted,
    }
    if feature_type not in builders:
        raise ValueError(f"Kind ´{feature_type}´ is not supported")
    build = builders[feature_type]

    # Work on a copy so the caller's frame is left untouched.
    t = t.copy()
    for f_col, val in cols_f.items():
        # Assignment aligns on the index, so the sort done inside the builder
        # does not reorder the rows of ``t``.
        t[f_col] = build(t, f_col, val)

    return t

In [10]:
def add_feature_targets_groupby_stats(
    df: pd.DataFrame,
    added_feature_cols: list,
    cols_groupby_target: list = ["cfips", "county", "state"],
    **kwargs
):
    """Join mean/std/median of the target, grouped by each given column.

    For every column in ``cols_groupby_target`` the statistics of
    ``microbusiness_density`` are computed per group and left-joined back as
    ``target_<column>_<stat>``. Every column of the statistics table,
    including the grouping column itself, is appended to
    ``added_feature_cols`` (mutated in place). Progress is printed for
    debugging. Returns the enriched frame; the input frame is not modified.
    """
    name_pattern = "target_{}_{}"
    stat_names = ["mean", "std", "median"]

    for key in cols_groupby_target:
        stats = df.groupby(key)["microbusiness_density"].agg(stat_names)
        stats.columns = [name_pattern.format(key, stat) for stat in stats.columns]
        stats = stats.reset_index()
        # Keep the join key's dtype identical on both sides of the merge.
        stats[key] = stats[key].astype(df[key].dtype)

        df = df.merge(stats, how="left", left_on=key, right_on=key)

        for new_col in stats.columns:
            print(new_col)
            added_feature_cols.append(new_col)

        print('--------------------------')
        print('t0.columns', stats.columns)
        print('df.columns', df.columns)

    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<')
    print('return add_feature_targets_groupby_stats df.columns', df.columns)

    return df

In [11]:
# Number of training rows per county code (cfips).
df_train["cfips"].value_counts()

1001     39
39133    39
39089    39
39091    39
39093    39
         ..
21113    39
21115    39
21117    39
21119    39
56045    39
Name: cfips, Length: 3135, dtype: int64

In [12]:
def correlated_states(df_train):
    """
    Build cross-state features from pairs of states whose monthly mean
    `microbusiness_density` series are strongly correlated.

    Returns a tuple ``(dfs, cols_state_relation)``:
    ``dfs`` has one row per (state, first_day_of_month) and, for every pair
    with |corr| > 0.9, a column holding the partner state's mean density of
    the previous month; ``cols_state_relation`` lists those column names.
    """
    cols_state_relation = []
    dfs = pd.DataFrame([])
    new_col_raw = "mean"

    # Mean target per state and month.
    t0 = (
        df_train.groupby(["state", "first_day_of_month"])["microbusiness_density"]
        .mean()
        .rename(new_col_raw)
        .reset_index()
    )

    # State-by-state correlation matrix of the monthly mean series.
    t1 = t0.pivot_table(new_col_raw, "first_day_of_month", "state").sort_index().corr()
    # plt.imshow(t1.values, cmap="hot", interpolation="nearest")
    # plt.show()

    # Long format: one row per (state, other_state) with their correlation,
    # without the diagonal.
    t5 = t1.rename_axis(["other_state"], axis=1).stack().rename("corr").reset_index()
    t5 = t5[t5["state"] != t5["other_state"]]

    # Clean pairs of same correlations: order the two names within each row so
    # (A, B) and (B, A) become identical rows, then drop the duplicate.
    t5 = t5.sort_values("corr").reset_index(drop=True)
    cols = ["state", "other_state"]
    t5[cols] = pd.DataFrame(np.sort(t5[cols].values, axis=1), columns=cols)
    t5 = t5.drop_duplicates()

    # Cluster the correlation values.
    # NOTE(review): the resulting "cluster" column is not read again below.
    clustering = DBSCAN(eps=0.01, min_samples=2).fit(t5["corr"].values.reshape(-1, 1))
    t5["cluster"] = clustering.labels_

    # Keep only strongly (positively or negatively) correlated pairs.
    corr_states = t5[abs(t5["corr"]) > 0.9]

    # Move every date one month forward so that the mean stored under month m
    # is the mean observed in month m-1.
    t2 = df_train.copy()
    t2["first_day_of_month"] = t2["first_day_of_month"] + pd.DateOffset(months=1)
    t0 = (
        t2.groupby(["state", "first_day_of_month"])["microbusiness_density"]
        .mean()
        .rename(new_col_raw)
        .reset_index()
    )

    # Iterate over the pairs; each row is (state, other_state, corr, cluster).
    for pair in corr_states.values:
        group = pair[:2]
        corr = pair[2]

        switched = t0[t0["state"].isin(group)].copy()

        state_0 = group[0]
        state_1 = group[1]

        s = "{}_{}_rolling_microbusiness_density"
        col_state_0 = s.format(state_0, state_1)
        col_state_1 = s.format(state_1, state_0)

        # Wide table: one column per state of the pair, renamed to the
        # pair-specific feature names.
        sw = pd.pivot_table(
            switched, new_col_raw, "first_day_of_month", "state"
        ).rename(columns={state_0: col_state_0, state_1: col_state_1})

        def boo(df, sw, state, col_state, corr):
            """
            Take the series stored under `col_state` and label its rows with
            `state`. Called below with the partner state, so each state
            receives the other state's series. `df` and `corr` are unused.
            """
            corr_col = "corr_" + col_state
            # sw[corr_col] = corr

            df_state_t = sw[[col_state]].reset_index()
            df_state_t["state"] = state

            # df = pd.merge(
            #     df,
            #     df_state_t,
            #     "left",
            # left_on=["state", "first_day_of_month"],
            # right_on=["state", "first_day_of_month"],
            # )

            return (df_state_t, corr_col)

        # Note the crossing: state_0's column is labelled with state_1 and
        # vice versa.
        df_state_t_0, corr_col_0 = boo(df_train, sw, state_1, col_state_0, corr)
        df_state_t_1, corr_col_1 = boo(df_train, sw, state_0, col_state_1, corr)

        # Accumulate all pair frames into one wide frame keyed by
        # (state, first_day_of_month).
        if dfs.shape[0] == 0:
            dfs = pd.merge(
                df_state_t_0,
                df_state_t_1,
                "outer",
                left_on=["state", "first_day_of_month"],
                right_on=["state", "first_day_of_month"],
            )
        else:
            dfs = pd.merge(
                dfs,
                df_state_t_0,
                "outer",
                left_on=["state", "first_day_of_month"],
                right_on=["state", "first_day_of_month"],
            )
            dfs = pd.merge(
                dfs,
                df_state_t_1,
                "outer",
                left_on=["state", "first_day_of_month"],
                right_on=["state", "first_day_of_month"],
            )
        # cols_state_relation.extend([corr_col_0, corr_col_1])
        cols_state_relation.extend([col_state_0, col_state_1])

    return (dfs, cols_state_relation)

In [13]:
def state_cluster(df_train):
    """Cluster states by the mean and std of their ``microbusiness_density``.

    Runs DBSCAN (eps=0.5, min_samples=2) on the two per-state statistics and
    returns a frame with the columns ``state`` and ``cluster``.
    """
    per_state = df_train.groupby("state")["microbusiness_density"].agg(["mean", "std"])

    model = DBSCAN(eps=0.5, min_samples=2)
    model.fit(per_state.values)
    per_state["cluster"] = model.labels_

    per_state = per_state.reset_index()
    return per_state[["state", "cluster"]]

In [14]:
def time_arrow(df: pd.DataFrame, added_feature_cols):
    """Add a ``time_arrow`` column: the date min-max scaled over ``df``.

    The earliest ``first_day_of_month`` maps to 0 and the latest to 1.
    ``df`` is modified in place and also returned; the new column name is
    appended to ``added_feature_cols``.
    """
    f_col = "time_arrow"

    # Integer timestamp reduced by a factor of 1e9 (float floor division).
    seconds_since = df["first_day_of_month"].astype("int64") // 1e9

    lowest = np.min(seconds_since)
    highest = np.max(seconds_since)
    df[f_col] = (seconds_since - lowest) / (highest - lowest)

    added_feature_cols.append(f_col)

    return df

In [15]:
# Display the cluster label assigned to each state.
state_cluster(df_train)

Unnamed: 0,state,cluster
0,Alabama,0
1,Alaska,1
2,Arizona,2
3,Arkansas,0
4,California,3
...,...,...
46,Virginia,1
47,Washington,2
48,West Virginia,0
49,Wisconsin,0


##### Maybe pile

In [16]:
def ups_downs(df_train, added_feature_cols=None):
    """Share of month-over-month increases seen so far, per county.

    For every ``cfips`` the rows are ordered by ``first_day_of_month`` and
    each month is marked as "up" (density did not decrease versus the
    previous month) or "down". The feature is the expanding fraction of "up"
    months up to and including the current row; it is NaN until at least
    three changes have been observed.

    Parameters
    ----------
    df_train : pd.DataFrame
        Needs ``row_id``, ``cfips``, ``first_day_of_month`` and
        ``microbusiness_density``. The index must be unique because results
        are aligned back on it.
    added_feature_cols : list, optional
        If given, the name of the new feature column is appended to it.

    Returns
    -------
    pd.DataFrame
        Columns ``row_id`` and ``microbusiness_shift_bool_over_pct``, in the
        row order of ``df_train``.
    """
    col = "microbusiness_shift_bool_over_pct"
    col_to_group = "cfips"

    t = df_train.copy()

    # All history computations run on a frame ordered by county and date;
    # results are matched back to ``t`` through the (unchanged) index, never
    # by position.
    ordered = t.sort_values([col_to_group, "first_day_of_month"])
    density = ordered["microbusiness_density"]
    diff = density - ordered.groupby(col_to_group)["microbusiness_density"].shift()

    # 1.0 = up (or flat), 0.0 = down, NaN = no previous month available.
    went_up = pd.Series(np.nan, index=ordered.index, dtype=float)
    went_up[diff >= 0] = 1.0
    went_up[diff < 0] = 0.0

    group_keys = ordered[col_to_group]
    ups_so_far = (
        went_up.groupby(group_keys).expanding().sum().reset_index(level=0, drop=True)
    )
    seen_so_far = (
        went_up.groupby(group_keys).expanding().count().reset_index(level=0, drop=True)
    )

    pct = ups_so_far / seen_so_far
    # Too few observations make the ratio meaningless.
    pct[seen_so_far < 3] = np.nan

    t[col] = pct

    if added_feature_cols is not None:
        added_feature_cols.append(col)

    return t[["row_id", col]]

#### Misc functions

In [17]:
def split_dates(df_train):
    """Split the distinct ``first_day_of_month`` values chronologically.

    Returns ``(dates_train, dates_val)``: the first 70% of the sorted unique
    dates (rounded down) and the remaining ones.
    """
    unique_dates = df_train["first_day_of_month"].unique()
    ordered = np.sort(unique_dates)
    cutoff = int(ordered.shape[0] * 0.70)

    return (ordered[:cutoff], ordered[cutoff:])


def remove_outliers(train, outlier_multiplier):
    """Drop rows whose density is not below a per-county limit.

    The limit for a county is its mean ``microbusiness_density`` times
    ``outlier_multiplier``; only rows strictly below the limit are kept.
    Intended to be run on the training split only. Returns a copy.
    """
    limits = (
        train.groupby("cfips")["microbusiness_density"]
        .mean()
        .mul(outlier_multiplier)
        .rename("max_microbusiness_density")
        .reset_index()
    )

    # reset_index() exposes the original row labels in an "index" column so
    # the surviving rows can be looked up in ``train`` afterwards.
    merged = train.reset_index().merge(
        limits, how="left", left_on="cfips", right_on="cfips"
    )
    below_limit = merged["microbusiness_density"] < merged["max_microbusiness_density"]
    kept_labels = merged.loc[below_limit, "index"]

    return train.loc[kept_labels].copy()


def smape(pred, eval_data):
    """Symmetric mean absolute percentage error, usable as a LightGBM ``feval``.

    Parameters
    ----------
    pred : scalar or array-like
        Predicted value(s). Python and numpy scalars are both treated as a
        single case.
    eval_data : object with a ``label`` attribute, scalar or array-like
        True value(s). When the object exposes ``label`` (as the dataset
        LightGBM hands to a custom metric does) that attribute is used.

    Returns
    -------
    tuple
        ``("smape", value, False)``; the last item tells LightGBM that lower
        is better. ``value`` is on a 0-200 scale.

    Terms where prediction and truth are both zero contribute 0 rather than
    NaN.
    """
    actual = eval_data.label if hasattr(eval_data, "label") else eval_data
    actual = np.atleast_1d(np.asarray(actual, dtype=float))
    forecast = np.atleast_1d(np.asarray(pred, dtype=float))

    numerator = 2 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)
    # Leave the pre-filled zeros wherever the denominator is zero (0/0 case).
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )

    # A scalar prediction is a single case; otherwise average over the labels.
    n_cases = 1 if np.ndim(pred) == 0 else len(actual)
    value = 100 / n_cases * np.sum(ratio)
    return "smape", value, False


def remove_empty_folders(path_abs):
    """Delete every empty directory at or below ``path_abs``.

    Directories are visited deepest-first, so a folder that only contained
    empty folders is removed as well.
    """
    for folder, _, _ in reversed(list(os.walk(path_abs))):
        if not os.listdir(folder):
            os.rmdir(folder)


def save_bayesian_results(loss_fn: str, results: list[dict], best: dict):
    """Pickle the outcome of an optimisation run into a timestamped folder.

    The folder is ``<bayesian_run_path>/<loss_fn>/<timestamp>_<score>`` where
    ``score`` is ``abs(round(best["target"], 4))``. ``results`` is written to
    ``result.pkl`` and ``best`` to ``best.pkl``. Afterwards any empty folder
    below ``bayesian_run_path`` is removed.
    """
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    run_name = f"{stamp}_{abs(round(best['target'], 4))}"
    run_dir = os.path.join(bayesian_run_path, loss_fn, run_name)

    if not os.path.exists(run_dir):
        os.makedirs(run_dir)

    for file_name, payload in (("result.pkl", results), ("best.pkl", best)):
        with open(os.path.join(run_dir, file_name), "wb") as handle:
            pickle.dump(payload, handle)

    # Remove empty folders that can trash the place
    remove_empty_folders(bayesian_run_path)


def read_pkl(path):
    """Load and return the object pickled in the file at ``path``."""
    with open(path, mode="rb") as handle:
        payload = pickle.load(handle)
    return payload

#### Prepare data

In [None]:
class bcolors:
    """Escape-sequence constants used to colour or style printed text.

    Wrap a message as ``f"{bcolors.WARNING}...{bcolors.ENDC}"``; ENDC is the
    sequence the notebook uses to terminate a styled section.
    """

    HEADER = "\033[95m"
    OKBLUE = "\033[94m"
    OKCYAN = "\033[96m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"
    # Used after a styled message to end the styling.
    ENDC = "\033[0m"
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

In [None]:
class PrepareData:
    """Build LightGBM train/eval datasets from the training frame, per shift.

    For every requested shift the target is moved so that a row predicts the
    density ``shift`` rows away within its county, features are engineered,
    features disabled through ``args`` are removed from the feature lists,
    and the data is split chronologically into train and eval sets.

    A lot of debug output is printed along the way.
    """

    def __init__(self, df_train):
        """Keep a private copy of ``df_train`` and reset all bookkeeping."""
        self.df_train = df_train.copy()  # Original data

        # The target names are lists so they can be used directly for
        # multi-column selection and assignment.
        self._col_target: list = ["microbusiness_density"]
        self._col_target_shifted: list = ["shifted_microbusiness_density"]
        self._col_cfips: str = "cfips"
        self._col_first_day_of_month = "first_day_of_month"

        self.dates_train: np.array = None
        self.dates_val: np.array = None

        self._lgb_train: lgb.Dataset = None
        self._lgb_eval: lgb.Dataset = None
        # Categorical columns and model input columns of the current shift.
        self._cat_f: list = None
        self._features: list = None

    def prepare_data_for_model(self, args):
        """Yield one prepared dataset bundle per entry of ``args["shifts"]``.

        ``args`` is the flat dict of model/feature settings; it must contain
        ``shifts``, ``drop_na``, ``outlier_multiplier`` and one value for
        every name in the toggle list below.

        Yields tuples ``(lgb_train, lgb_eval, df, cat_f, added_feature_cols,
        shift)``.
        """
        # Names looked up in ``args``; a value below 0.5 disables the column.
        cols_drop_or_not = [
            # Existing features
            "state",
            "pct_college",
            "shift_in_months",
            "pct_it_workers",
            "pct_bb",
            "pct_foreign_born",
            "median_hh_inc",
            # "month",
            "cfips",
            "county",
            # New features
            "time_arrow",
            ## Stats
            "target_cfips_std",
            "target_cfips_median",
            "target_county_median",
            "target_county_mean",
            "target_state_mean",
            "target_cfips_mean",
            "target_county_std",
            "target_state_median",
            "target_state_std",
            ## Roll
            "target_mean_rolling_1_activated",
            "target_mean_rolling_2_activated",
            "target_mean_rolling_3_activated",
            "target_mean_rolling_4_activated",
            ## Shift
            "target_shift_1_activated",
            "target_shift_2_activated",
            "target_shift_3_activated",
            "target_shift_4_activated",
        ]

        for shift in args["shifts"]:
            added_feature_cols = []

            # Do not modify df_train
            df = self.df_train.copy()

            # Define Train/Eval split
            self.dates_train, self.dates_val = split_dates(df)

            # Shift target features so we can predict 1, 2, 3... months ahead
            df = self._shift_target(df, shift, added_feature_cols)

            # Feature engineering. _create_data returns three values; keeping
            # the feature list on the instance is what _split_data relies on.
            df, self._features, self._cat_f = self._create_data(
                df, added_feature_cols, args
            )
            print('-------------->>>>>>>>>>>>>><<<<<<<<<<<<<-----------')
            print('After FEATURE ENG df.columns', df.columns)

            # Remove disabled columns from the feature lists (df is untouched).
            self._drop_col_maybe(df, cols_drop_or_not, added_feature_cols, args)

            # Split train/eval by date
            (self._lgb_train, self._lgb_eval, df) = self._split_data(df, **args)

            res = (
                self._lgb_train,
                self._lgb_eval,
                df,
                self._cat_f,
                added_feature_cols,
                shift,
            )

            yield res

    def _shift_target(self, df, shift, added_feature_cols):
        """Add the shifted target column and a constant ``shift_in_months``.

        Within each county (ordered by date) the target is shifted by
        ``shift`` rows; rows left without a shifted target are dropped.
        ``shift_in_months`` is appended to ``added_feature_cols``.
        """
        f_col = "shift_in_months"

        df[self._col_target_shifted] = (
            df.sort_values([self._col_cfips, self._col_first_day_of_month])
            .groupby(self._col_cfips)[self._col_target]
            .shift(shift)
        )

        df = df.dropna(subset=self._col_target_shifted).copy()

        df[f_col] = shift

        added_feature_cols.append(f_col)

        return df

    def _create_data(self, df, added_feature_cols, args):
        """Engineer features and assemble the column lists.

        Returns ``(df, features, cat_f)`` where ``features`` is the five
        census columns followed by a snapshot of ``added_feature_cols`` and
        ``cat_f`` names the columns converted to ``category`` dtype.
        """
        # Feature engineering
        df = self._feature_engineering(df, added_feature_cols, args)

        # Mape needs values over 1. When not using mape, irrelevant
        target_multiplier = 1
        df[self._col_target_shifted] = df[self._col_target_shifted] * target_multiplier

        # Handle categorical features
        print(df.info())
        cat_f = ["state", "county", "cfips"]
        for c in cat_f:
            print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            print('Changing ´{}´ to catgeory'.format(c))
            df[c] = df[c].astype("category")
            print(df[c].cat.codes)
        print(df.info())

        features = (
            [
                "median_hh_inc",
                "pct_bb",
                "pct_college",
                "pct_foreign_born",
                "pct_it_workers",
            ]
            + added_feature_cols
        )

        return (df, features, cat_f)

    def _feature_engineering(self, df, added_feature_cols, args):
        """Run the feature builders in order and return the enriched frame.

        Each builder appends the columns it creates to ``added_feature_cols``.
        """
        # Time arrow
        df = time_arrow(df, added_feature_cols)

        # Add previous target info
        # NOTE(review): add_feature_targets_history is declared as
        # (t, feature_type, cols_f, **kwargs); this call passes the feature
        # list in the feature_type position and relies on ``args`` to supply
        # cols_f. Confirm the intended call before running.
        df = add_feature_targets_history(df, added_feature_cols, **args)

        print('------------------------------------')
        print('before add_feature_targets_groupby_stats', added_feature_cols)
        # Add std, mean, median for cfips, county, state
        df = add_feature_targets_groupby_stats(
            df, added_feature_cols, **args
        )
        print('after add_feature_targets_groupby_stats df.columns', df.columns)
        print('after add_feature_targets_groupby_stats added_feature_cols', added_feature_cols)

        return df

    def _split_data(self, df, drop_na, outlier_multiplier, **kwargs):
        """Split ``df`` by date and wrap both parts as LightGBM datasets.

        ``drop_na`` above 0.5 drops every row containing a NaN first.
        Outliers are removed from the training part only. Returns
        ``(lgb_train, lgb_eval, df)``.
        """
        if drop_na > 0.5:
            df = df.dropna()

        # Split train/val split according to dates
        dates_train, dates_val = split_dates(df)
        train = df[df["first_day_of_month"].isin(dates_train)]
        val = df[df["first_day_of_month"].isin(dates_val)]

        # Remove outliers
        train = remove_outliers(train, outlier_multiplier)

        print('self._features', self._features)

        x_train = train[self._features]
        y_train = train[self._col_target_shifted]

        x_val = val[self._features]
        y_val = val[self._col_target_shifted]

        print('??????????????????????????')
        print('self._cat_f', self._cat_f)
        # free_raw_data=False keeps the raw frames reachable via ``.data``.
        lgb_train = lgb.Dataset(
            x_train, y_train,
            categorical_feature=self._cat_f,
            free_raw_data=False
        )
        lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train, free_raw_data=False)

        return (lgb_train, lgb_eval, df)

    def _drop_col_maybe(self, df, cols_drop_or_not: list, added_feature_cols: list, args: dict):
        """Remove disabled columns from the feature bookkeeping lists.

        A name from ``cols_drop_or_not`` is disabled when ``args[name]`` is
        below 0.5; a trailing ``_activated`` is stripped to get the column
        name. The column is removed from ``added_feature_cols``,
        ``self._features`` and ``self._cat_f`` wherever it is present; lists
        that do not contain it are left alone. ``df`` itself is not changed
        and nothing is returned.

        Raises ``KeyError`` when ``args`` has no entry for a listed name.
        """
        cols_to_drop = []
        cols_bay_val = []
        all_pairs = []
        for col in cols_drop_or_not:
            all_pairs.append([col, args[col]])
            if args[col] < 0.5:
                drop_col = col.replace('_activated', '')
                cols_to_drop.append(drop_col)
                cols_bay_val.append(args[col])

                # Base features such as "state" never appear in
                # added_feature_cols, so membership is checked per list.
                for names in (added_feature_cols, self._features, self._cat_f):
                    if names is not None and drop_col in names:
                        names.remove(drop_col)

        print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
        print("df.columns", df.columns)
        print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<")
        print("cols_to_drop", cols_to_drop)
        print('--------------')
        print("cols_bay_val", cols_bay_val)
        print('--------------')
        print("all_pairs", all_pairs)
        print('--------------')

In [None]:
class LightGBMBayesian:
    """Train LightGBM models per shift and tune them with Bayesian optimisation.

    ``train_model`` is the objective handed to ``BayesianOptimization``; it
    returns the negated mean SMAPE over all shifts so that maximising it
    minimises the error. Artefacts of the most recent training of each shift
    are kept in ``_debug_different_shifts``.
    """

    def __init__(self, PrepareData, df_train):
        # ``PrepareData`` is the data-preparation class (not an instance); it
        # is instantiated here with the training frame.
        self.prepare_data: PrepareData = PrepareData(df_train)

        # shift -> dict of datasets, predictions and the trained booster.
        self._debug_different_shifts: dict = {}

        # Shows if there are columns which are not optimized by bayesian.
        self._warn_bayes_not_optimize_columns = True

    def optimize(self, bounds, init_points=1, n_iter=1, xi=1e-4):
        """Run the optimiser over ``bounds`` and print the best parameters.

        ``bounds`` maps parameter names to (low, high) tuples. The optimiser
        is stored on ``self.optimizer``. An expected-improvement utility
        function with exploration parameter ``xi`` is passed to ``maximize``.
        """
        # Initialize the Bayesian Optimization
        self.optimizer = BayesianOptimization(self.train_model, bounds, random_state=0)

        # af = bayes.UtilityFunction('poi', kappa=100)
        acquisition_function = bayes.UtilityFunction(kind="ei", xi=xi)

        # Run the optimization
        self.optimizer.maximize(
            # init_points=100, n_iter=100
            init_points=init_points,
            n_iter=n_iter,
            acquisition_function=acquisition_function,
        )

        # Get the best hyperparameters
        best_params = self.optimizer.max["params"]
        print("Best hyperparameters: ", best_params)

    def train_model(
        self,
        shifts=[-1, -2, -3],
        early_stopping=True,
        log_evaluation=True,
        num_threads=6,
        # verbose=2,
        **kwargs,
    ):
        """Train one model per shift and return the negated mean SMAPE.

        ``kwargs`` must contain the LightGBM hyperparameters read below
        (``num_leaves``, ``learning_rate``, ``subsample``,
        ``colsample_bytree``, ``reg_alpha``, ``reg_lambda``,
        ``num_iterations``) plus whatever the data preparation looks up.
        """
        # print("kwargs", kwargs)

        # Snapshot of the call arguments. This must stay the first statement
        # so that no other local ends up in the snapshot. The nested kwargs
        # are flattened into it and the bookkeeping entries removed.
        saved_args = locals()
        saved_args.update(saved_args["kwargs"])
        del saved_args["self"], saved_args["kwargs"]
        scores = []

        # print('train_model - saved_args', saved_args)

        for (
            lgb_train,
            lgb_eval,
            df,
            cat_f,
            features,
            shift,
        ) in self.prepare_data.prepare_data_for_model(saved_args):
            self._check_bayes_optimize_columns(saved_args, features)

            # "metric": "None" leaves evaluation to the custom smape feval.
            params = {
                "boosting_type": "gbdt",
                "objective": "mae",
                "metric": "None",
                "first_metric_only": True,
                # "num_iterations": int(num_iterations),
                "num_leaves": int(kwargs["num_leaves"]),
                "learning_rate": kwargs["learning_rate"],
                "subsample": kwargs["subsample"],
                "colsample_bytree": kwargs["colsample_bytree"],
                "reg_alpha": kwargs["reg_alpha"],
                "reg_lambda": kwargs["reg_lambda"],
                # "verbose": verbose,
                "num_threads": num_threads,
            }

            callbacks = []
            # Stop earlier if no changes
            if early_stopping:
                callbacks.append(lgb.early_stopping(100))

            # Log every X-th line
            if log_evaluation:
                callbacks.append(lgb.log_evaluation(100))

            # Can be used to supress warnings
            # with warnings.catch_warnings():
            #     warnings.filterwarnings("ignore", category=UserWarning)

            gbm = lgb.train(
                params,
                lgb_train,
                num_boost_round=int(kwargs["num_iterations"]),
                callbacks=callbacks,
                valid_sets=[lgb_eval],
                feval=smape,
            )

            scores.append(gbm.best_score["valid_0"]["smape"])

            pred_eval = gbm.predict(lgb_eval.data)

            # Keep everything needed to inspect this shift afterwards.
            res = {
                "lgb_train_data": lgb_train.data,
                "lgb_train_label": lgb_train.label,
                "lgb_eval_data": lgb_eval.data,
                "lgb_eval_label": lgb_eval.label,
                "df": df.copy(),
                "cat_f": cat_f,
                "features": features,
                "pred_eval": pred_eval,
                "gbm": deepcopy(gbm),
            }

            self._debug_different_shifts[shift] = res

        # Negated because the optimiser maximises its objective.
        return -np.array(scores).mean()

    def per_case_error(self, shift):
        """Return the eval rows of ``shift`` with target, prediction and SMAPE.

        Uses the artefacts stored by the last ``train_model`` call and adds
        ``row_id``, ``cfips`` and ``first_day_of_month`` from the stored
        frame.
        """
        t0 = self._debug_different_shifts[shift]
        data_eval = t0["lgb_eval_data"].copy()
        data_eval["target"] = t0["lgb_eval_label"]
        data_eval["pred"] = t0["pred_eval"]
        # NOTE(review): confirm smape() accepts the scalar types produced by
        # a row-wise apply (its single-case branch tests for built-in
        # int/float).
        data_eval["smape"] = data_eval.apply(
            lambda x: smape(x["pred"], x["target"])[1], axis=1
        )

        cols_to_copy = ["row_id", "cfips", "first_day_of_month"]
        data_eval[cols_to_copy] = t0["df"][cols_to_copy]

        return data_eval

    def _check_bayes_optimize_columns(self, saved_args, features):
        """Warn once about feature columns that have no entry in ``saved_args``."""
        if self._warn_bayes_not_optimize_columns:
            columns_no_bayes = list(set(features) - saved_args.keys())

            if len(columns_no_bayes) > 0:
                print(">>>>>>>>>>>>>>>>>")
                print(
                    f"{bcolors.WARNING}Following columns are not optimized by bayesian:{bcolors.ENDC}"
                )
                # print('Following columns are not optimized by bayesian:')
                print(columns_no_bayes)
                print(">>>>>>>>>>>>>>>>>")

            # Only report on the first call.
            self._warn_bayes_not_optimize_columns = False

##### Run

In [None]:
# col_mapping = {
#     "state",
#     "pct_college",
#     "shift_in_months",
#     "pct_it_workers",
#     "pct_bb",
#     "pct_foreign_born",
#     "median_hh_inc",
#     # "month",
#     "cfips",
#     "county",
#     # New features
#     "time_arrow",
#     ## Stats
#     "target_cfips_std",
#     "target_cfips_median",
#     "target_county_median",
#     "target_county_mean",
#     "target_state_mean",
#     "target_cfips_mean",
#     "target_county_std",
#     "target_state_median",
#     "target_state_std",
#     ## Roll
#     "target_mean_rolling_1_activated",
#     "target_mean_rolling_2_activated",
#     "target_mean_rolling_3_activated",
#     "target_mean_rolling_4_activated",
#     ## Shift 
#     "target_shift_1_activated",
#     "target_shift_2_activated",
#     "target_shift_3_activated",
#     "target_shift_4_activated",
# }

In [None]:
# %%capture
# Range used for on/off switches; consumers treat values below 0.5 as "off".
binary_bounds = (0, 1)

# Allowed range for rolling-window sizes.
min_window = 1
max_window = 10

# Allowed range for target shifts.
min_shift = 1
max_shift = 10

# Search space handed to LightGBMBayesian.optimize(): name -> (low, high).
# NOTE(review): PrepareData.prepare_data_for_model looks up names ending in
# "_activated" (e.g. "target_shift_1_activated"); no such keys are defined
# here, only "enabled_*" ones. Confirm which naming is intended.
bounds = {
    # Model hyperparameters
    "num_leaves": (5, 100),
    "num_iterations": (50, 1000),
    "learning_rate": (0.01, 0.5),
    "subsample": (0.1, 1),
    "colsample_bytree": (0.1, 1),
    "reg_alpha": (0, 10),
    "reg_lambda": (0, 10),
    # Disable/Enable existing features
    "state": binary_bounds,
    "pct_college": binary_bounds,
    "shift_in_months": binary_bounds,
    "pct_it_workers": binary_bounds,
    "pct_bb": binary_bounds,
    "pct_foreign_born": binary_bounds,
    "median_hh_inc": binary_bounds,
    "month": binary_bounds,
    "cfips": binary_bounds,
    "county": binary_bounds,
    # Drop all rows having NaN
    "drop_na": binary_bounds,
    # New features
    ## Simulates time from 0(oldest) to 1 newest
    "time_arrow": binary_bounds,
    ## Rolling window bounds
    "target_mean_rolling_1": (min_window, max_window),
    "target_mean_rolling_2": (min_window, max_window),
    "target_mean_rolling_3": (min_window, max_window),
    "target_mean_rolling_4": (min_window, max_window),
    ## Shift bounds
    "target_shift_1": (min_shift, max_shift),
    "target_shift_2": (min_shift, max_shift),
    "target_shift_3": (min_shift, max_shift),
    "target_shift_4": (min_shift, max_shift),
    ## Stats
    "target_cfips_std": binary_bounds,
    "target_cfips_median": binary_bounds,
    "target_county_median": binary_bounds,
    "target_county_mean": binary_bounds,
    "target_state_mean": binary_bounds,
    "target_cfips_mean": binary_bounds,
    "target_county_std": binary_bounds,
    "target_state_median": binary_bounds,
    "target_state_std": binary_bounds,
    ## Smaller value less outliers
    "outlier_multiplier": (0.1, 100),
    # Disable/Enable features
    ## Pre-existing features
    "enabled_state": binary_bounds,
    "enabled_pct_college": binary_bounds,
    "enabled_shift_in_months": binary_bounds,
    "enabled_pct_it_workers": binary_bounds,
    "enabled_pct_bb": binary_bounds,
    "enabled_pct_foreign_born": binary_bounds,
    "enabled_median_hh_inc": binary_bounds,
    "enabled_month": binary_bounds,
    "enabled_cfips": binary_bounds,
    "enabled_county": binary_bounds,
    ## Stats
    "enabled_target_cfips_std": binary_bounds,
    "enabled_target_cfips_median": binary_bounds,
    "enabled_target_county_median": binary_bounds,
    "enabled_target_county_mean": binary_bounds,
    "enabled_target_state_mean": binary_bounds,
    "enabled_target_cfips_mean": binary_bounds,
    "enabled_target_county_std": binary_bounds,
    "enabled_target_state_median": binary_bounds,
    "enabled_target_state_std": binary_bounds,
    ## Roll
    "enabled_target_mean_rolling_1": binary_bounds,
    "enabled_target_mean_rolling_2": binary_bounds,
    "enabled_target_mean_rolling_3": binary_bounds,
    "enabled_target_mean_rolling_4": binary_bounds,
    ## Shift
    "enabled_target_shift_1": binary_bounds,
    "enabled_target_shift_2": binary_bounds,
    "enabled_target_shift_3": binary_bounds,
    "enabled_target_shift_4": binary_bounds,
}

# lightgbm_bayesian = LightGBMBayesian(PrepareData, df_train)
# lightgbm_bayesian.optimize(bounds, init_points=1, n_iter=1, xi=1e-2)

# save_bayesian_results(
#     "smape", lightgbm_bayesian.optimizer.res, lightgbm_bayesian.optimizer.max
# )

In [None]:
# save_bayesian_results('smape', lightgbm_bayesian.optimizer.res, lightgbm_bayesian.optimizer.max)

In [None]:
# Load two pickles from one saved Bayesian-optimisation run folder (path is
# hard-coded): `best.pkl` into `t` and `result.pkl` into `t_res`.
# Presumably these are the optimizer's `max` and `res` objects written by
# save_bayesian_results -- verify against that helper.
t = read_pkl("../data/bayesian_runs/smape/2023-02-17_05-12-23_2.7925/best.pkl")
t_res = read_pkl("../data/bayesian_runs/smape/2023-02-17_05-12-23_2.7925/result.pkl")
t

In [None]:
# Pull the "target" value out of every record in t_res, flip its sign and plot
# the sequence on a logarithmic y-axis.
# NOTE(review): the sign flip suggests the optimizer maximised a negated
# error -- confirm against the objective used for the run.
t0 = np.array([x["target"] for x in t_res])
df_t0 = pd.DataFrame(t0 * -1, columns=["bayes_target"])
ax = df_t0.plot()
ax.set_yscale("log")

In [None]:
# lightgbm_bayesian.optimizer.res

In [None]:
# Re-train a single model with the parameter set stored under t["params"] and
# keep the model's debug dictionary around for the inspection cells below.
lightgbm_bayesian = LightGBMBayesian(PrepareData, df_train)
# NOTE: `start` is recorded but never read in this cell.
start = time.time()
k = lightgbm_bayesian.train_model(**t["params"])
debug_data = lightgbm_bayesian._debug_different_shifts
# Entry stored under key -1 of the debug data; later cells read its
# "df", "gbm", "lgb_eval_data" and "lgb_eval_label" items.
s1 = debug_data[-1]
df = s1["df"]
# debug_data[1][4]
# The value returned by train_model is reported as the SMAPE score.
print("SMAPE ==", k)

In [None]:
s1.keys()

In [None]:
t["params"]

In [None]:
lgb.plot_importance(s1["gbm"], max_num_features=30)

In [None]:
lgb.Dataset(s1["lgb_eval_data"], s1["lgb_eval_label"])

In [None]:
data_eval = lightgbm_bayesian.per_case_error(-1)

In [None]:
data_eval[["target", "pred", "smape"]].sort_values("smape").reset_index(
    drop=True
).plot()

In [None]:
s1.keys()

In [None]:
s1["lgb_eval_data"].loc[25:25]

In [None]:
s1["df"].loc[25:25]

In [None]:
data_eval.sort_values("smape").tail(50)["cfips"].value_counts()

In [None]:
data_eval[data_eval["cfips"] == 56033].set_index("first_day_of_month")[
    ["target", "pred", "smape"]
].plot()

In [None]:
data_eval[data_eval["cfips"] == 56033]

In [None]:
# params = {
#     "colsample_bytree": 0.915899949299061,
#     "drop_na": 0.7740473326986388,
#     "learning_rate": 0.17324112449403456,
#     "median_hh_inc": 0.08110138998799676,
#     "num_iterations": 436.87911284311696,
#     "num_leaves": 27.06224350623956,
#     "pct_bb": 0.13248763475798297,
#     "pct_college": 0.05342718178682526,
#     "pct_foreign_born": 0.7255943642105788,
#     "pct_it_workers": 0.011427458625031028,
#     "reg_alpha": 7.705807485027762,
#     "reg_lambda": 1.4694664540037505,
#     "shift_in_months": 0.07952208258675575,
#     "state": 0.08960303423860538,
#     "subsample": 0.704843026618523,
#     "target_mean_rolling_1": 2.4536720985284477,
#     "target_mean_rolling_2": 4.205394666800984,
#     "target_mean_rolling_3": 5.573687913239169,
#     "target_mean_rolling_4": 8.605511738287937,
#     "target_mean_rolling_5": 0.7270442627113283,
#     "target_mean_rolling_7": 0.27032790523871464,
#     "target_shift_1": 1.314827992911276,
#     "target_shift_2": 0.5537432042119794,
#     "target_shift_3": 3.015986344809425,
#     "target_shift_4": 2.6211814923967824,
# }
# lightgbm_bayesian = LightGBMBayesian(PrepareData, df_train)
# k = lightgbm_bayesian.train_model(**params, return_data=False)
# debug_data = lightgbm_bayesian._debug_different_shifts
# # debug_data[1][4]

In [None]:
# lgb.plot_importance(debug_data[-1]["gbm"], max_num_features=20)

In [None]:
# lgb.create_tree_digraph(debug_data[-1]["gbm"])

In [None]:
k

In [None]:
# Pool the evaluation labels and the predictions of every entry in the model's
# debug dictionary so that a single overall score can be computed.
debug_data = lightgbm_bayesian._debug_different_shifts
all_pred = []
all_eval = []

# NOTE: `values` stays bound to the last entry after the loop; a later cell
# reads it again.
for values in debug_data.values():
    all_eval.extend(list(values["lgb_eval"].label))
    all_pred.extend(list(values["pred_eval"]))

all_pred = np.array(all_pred)

# Wrap the pooled labels in an object exposing `.label`, mirroring the
# attribute read from the "lgb_eval" entries above -- presumably so it can be
# passed to smape() in place of such an entry; verify against smape().
myobject = SimpleNamespace()  # myobject = {}
myobject.label = np.array(all_eval)
all_eval = myobject

In [None]:
smape(all_pred, all_eval)

In [None]:
all_eval.label

In [None]:
all_pred

In [None]:
plt.plot(all_eval.label)

In [None]:
plt.plot(all_pred)

In [None]:
smape(values["pred_eval"], values["lgb_eval"])

In [None]:
smape(all_pred, all_eval)

## Results

In [None]:
lgb.plot_importance(gbm, max_num_features=20)

In [None]:
# Build a per-row table of targets, predictions and absolute errors for the
# rows present in y_train.
t = df_train.copy()
idx = y_train.index

# Columns that exist in x_train but not in df_train; the intersection call
# re-selects them from x_train.columns (presumably to keep x_train's column
# order -- Index.difference may sort).
cols = x_train.columns.difference(df_train.columns)
cols = x_train.columns.intersection(cols)
t[cols] = x_train[cols].copy()

# target_const is subtracted from both the labels and the predictions.
# NOTE(review): looks like an offset added to the target before training --
# confirm where target_const is defined.
t.loc[idx, "target"] = (y_train - target_const).values
t.loc[idx, "pred"] = gbm.predict(x_train) - target_const
# Keep only the rows that were part of y_train.
t = t.loc[idx]
t["diff"] = abs(t["target"] - t["pred"])


def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred``, in percent.

    Both arguments are converted to NumPy arrays. The error of each element
    is ``|y_true - y_pred| / |y_true|``; the mean of those ratios is scaled
    by 100. No special handling is applied where ``y_true`` is zero.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    ratios = np.abs((actual - predicted) / actual)
    return np.mean(ratios) * 100


# Ratio of the "diff" column to the "pred" column; note the denominator is the
# prediction, not the target.
t["relative"] = t["diff"] / t["pred"]

# Analyze

### Column mapping

##### Progress

In [19]:
# # %%capture
# binary_bounds = (0, 1)

# min_window = 1
# max_window = 10

# min_shift = 1
# max_shift = 10

# bounds = {
#     # Model hyperparameters
#     "num_leaves": (5, 100),
#     "num_iterations": (50, 1000),
#     "learning_rate": (0.01, 0.5),
#     "subsample": (0.1, 1),
#     "colsample_bytree": (0.1, 1),
#     "reg_alpha": (0, 10),
#     "reg_lambda": (0, 10),
#     # Disable/Enable existing features
#     "state": binary_bounds,
#     "pct_college": binary_bounds,
#     "shift_in_months": binary_bounds,
#     "pct_it_workers": binary_bounds,
#     "pct_bb": binary_bounds,
#     "pct_foreign_born": binary_bounds,
#     "median_hh_inc": binary_bounds,
#     "month": binary_bounds,
#     "cfips": binary_bounds,
#     "county": binary_bounds,
#     # Drop all rows having NaN
#     "drop_na": binary_bounds,
#     # New features
#     ## Simulates time from 0(oldest) to 1 newest
#     "time_arrow": binary_bounds,
#     ## Rolling window bounds
#     "target_mean_rolling_1": (min_window, max_window),
#     "target_mean_rolling_2": (min_window, max_window),
#     "target_mean_rolling_3": (min_window, max_window),
#     "target_mean_rolling_4": (min_window, max_window),
#     ## Shift bounds
#     "target_shift_1": (min_shift, max_shift),
#     "target_shift_2": (min_shift, max_shift),
#     "target_shift_3": (min_shift, max_shift),
#     "target_shift_4": (min_shift, max_shift),
#     ## Stats
#     "target_cfips_std": binary_bounds,
#     "target_cfips_median": binary_bounds,
#     "target_county_median": binary_bounds,
#     "target_county_mean": binary_bounds,
#     "target_state_mean": binary_bounds,
#     "target_cfips_mean": binary_bounds,
#     "target_county_std": binary_bounds,
#     "target_state_median": binary_bounds,
#     "target_state_std": binary_bounds,
#     ## Smaller value less outliers
#     "outlier_multiplier": (0.1, 100),
#     # Disable/Enable features
#     ## Pre-existing features
#     "enabled_state": binary_bounds,
#     "enabled_pct_college": binary_bounds,
#     "enabled_shift_in_months": binary_bounds,
#     "enabled_pct_it_workers": binary_bounds,
#     "enabled_pct_bb": binary_bounds,
#     "enabled_pct_foreign_born": binary_bounds,
#     "enabled_median_hh_inc": binary_bounds,
#     "enabled_month": binary_bounds,
#     "enabled_cfips": binary_bounds,
#     "enabled_county": binary_bounds,
#     ## Stats
#     "enabled_target_cfips_std": binary_bounds,
#     "enabled_target_cfips_median": binary_bounds,
#     "enabled_target_county_median": binary_bounds,
#     "enabled_target_county_mean": binary_bounds,
#     "enabled_target_state_mean": binary_bounds,
#     "enabled_target_cfips_mean": binary_bounds,
#     "enabled_target_county_std": binary_bounds,
#     "enabled_target_state_median": binary_bounds,
#     "enabled_target_state_std": binary_bounds,
#     ## Roll
#     "enabled_target_mean_rolling_1": binary_bounds,
#     "enabled_target_mean_rolling_2": binary_bounds,
#     "enabled_target_mean_rolling_3": binary_bounds,
#     "enabled_target_mean_rolling_4": binary_bounds,
#     ## Shift 
#     "enabled_target_shift_1": binary_bounds,
#     "enabled_target_shift_2": binary_bounds,
#     "enabled_target_shift_3": binary_bounds,
#     "enabled_target_shift_4": binary_bounds,
# }

In [20]:
def cell_vars(offset=0):
    """Return the ``Feature`` objects assigned in a notebook cell.

    The source of the cell is read back through IPython's ``%history`` magic,
    every name on the left of an ``=`` sign is looked up in this module's
    globals, and only values whose type is ``__main__.Feature`` are kept.

    Args:
        offset: How many executions back to look; ``0`` is the cell that is
            currently running.

    Returns:
        dict mapping variable name to ``Feature`` object.

    Raises:
        ValueError: If the inspected cell defines no ``Feature`` object.
    """
    import io
    from contextlib import redirect_stdout

    def filter_feature_class_only(result):
        # The type is compared by its printed name rather than with
        # isinstance(): after the cell defining ``Feature`` is re-run in a
        # notebook, older instances belong to a stale class object that an
        # isinstance() check against the new class would reject.
        return {
            key: val
            for key, val in result.items()
            if str(type(val)) == "<class '__main__.Feature'>"
        }

    ipy = get_ipython()
    out = io.StringIO()

    # ``InteractiveShell.magic`` is deprecated; ``run_line_magic`` is its
    # documented replacement and takes the magic name and its argument string
    # separately.
    with redirect_stdout(out):
        ipy.run_line_magic("history", str(ipy.execution_count - offset))

    # Strip all spaces, then keep the text before the first "=" of every line
    # that contains one. This also picks up keyword arguments, which are
    # discarded below because they are not global names.
    lines = out.getvalue().replace(" ", "").split("\n")
    candidates = [line.split("=")[0] for line in lines if "=" in line]

    g = globals()
    result = {k: g[k] for k in candidates if k in g}

    feature_class_only = filter_feature_class_only(result)

    if len(feature_class_only) == 0:
        raise ValueError('None found. Maybe you changed name AGAIN?')

    return feature_class_only

In [21]:
class DataFrame(TraitType):
    """A trait whose value must be a ``pd.DataFrame`` (or a subclass of it)."""

    info_text = "pd.DataFrame"

    def validate(self, obj, value):
        """Return ``value`` unchanged if it is a DataFrame, else raise ``TraitError``."""
        # isinstance() instead of an exact type comparison so that DataFrame
        # subclasses are accepted as well.
        if isinstance(value, pd.DataFrame):
            return value
        self.error(obj, value)



class Feature(HasTraits):
    """One engineered feature plus the bounds an optimizer may explore for it.

    A feature has one or more "enabled" bounds and, optionally, the same
    number of "params" bounds. On construction these lists are expanded into
    named keys (``enabled_<name>_<idx>`` / ``params_<name>_<idx>``), a mapping
    from each enabled key to its params key, and a summary DataFrame with one
    row per key.

    Args:
        name: Feature name; becomes part of every generated key.
        f: Callable associated with the feature. It is only stored here.
        df: Stored on the instance as-is; not used by this class.
        enabled_bounds: List of ``(min, max)`` tuples. Defaults to ``[(1, 1)]``.
        params_bounds: List of ``(min, max)`` tuples. When given, it must have
            the same length as ``enabled_bounds``.

    Raises:
        TraitError: If ``params_bounds`` and ``enabled_bounds`` differ in
            length, or if an enabled bound lies outside ``[0, 1]`` or has
            ``min > max``.
    """

    # During init

    name = Unicode()
    f = Callable()

    bound = Tuple(Float(), Float())

    enabled_bounds = List(bound, minlen=1)
    params_bounds = List(bound)

    # After init

    _enabled_dict = Dict(key_trait=Unicode(), value_trait=bound)
    _params_dict = Dict(key_trait=Unicode(), value_trait=bound)

    # NOTE(review): this trait is declared but never assigned; the relations
    # are kept in the plain attribute ``_relations`` (see ``_set_relations``).
    relations = Dict(key_trait=Unicode(), value_trait=Unicode())

    _df_enabled_params = DataFrame()

    def __init__(self, name, f, df, enabled_bounds=None, params_bounds=None):
        super().__init__()
        self.name = name
        self.f = f
        self.df = df
        # enabled_bounds must be assigned before params_bounds: the
        # params_bounds validator compares the two lengths.
        if enabled_bounds:
            self.enabled_bounds = enabled_bounds
        if params_bounds:
            self.params_bounds = params_bounds

        self._make_enable_and_params_dict()
        self._make_df_enable_params()

    def get_bounds_for_optimizer(self):
        """Return enabled and params bounds merged into one ``{key: (min, max)}`` dict."""
        return {**self._enabled_dict, **self._params_dict}

    def get_relations(self):
        """Return the ``{enabled_key: params_key}`` mapping."""
        return self._relations

    def get_enabled_dict(self):
        """Return ``{enabled_key: (min, max)}``."""
        return self._enabled_dict

    def get_params_dict(self):
        """Return ``{params_key: (min, max)}``."""
        return self._params_dict

    def get_df_enable_params(self):
        """Return the per-key summary DataFrame (min, max, relation, name)."""
        return self._df_enabled_params

    def _set_relations(self, d):
        self._relations = d

    def _make_df_enable_params(self):
        """Build ``self._df_enabled_params``: one row per key with its bounds,
        the key it is related to, and the feature name."""
        d = self.get_relations()
        relation_original = d
        # Flip the mapping so params keys can be looked up as well.
        relation_flipped = {v: k for k, v in d.items()}

        # Get relations
        df_relation = pd.DataFrame.from_dict(
            {**relation_original, **relation_flipped}, orient="index", columns=["relation"]
        )

        # Bounds
        df_bounds = pd.DataFrame.from_dict(
            {
                **self._enabled_dict,
                **self._params_dict,
            },
            orient="index",
            columns=["min", "max"],
        )

        # Join; a left join keeps only keys that actually have bounds.
        df = pd.merge(df_bounds, df_relation, 'left', left_index=True, right_index=True)

        # Add name
        df['name'] = self.name

        self._df_enabled_params = df

    def _make_enable_and_params_dict(self):
        """
        Create 2 variables:
        1) self._enabled_dict
        2) self._params_dict

        Also fills ``self._enabled_keys`` / ``self._params_keys`` and stores
        the enabled -> params key relations.
        """
        self._enabled_keys = []
        self._params_keys = []
        for kind in ["enabled", "params"]:
            bounds = getattr(self, "{}_bounds".format(kind))

            d = {}
            key_name = "{kind}_{name}_{idx}"
            for idx, bound in enumerate(bounds):
                if len(bounds) < 2:
                    # A single bound gets no numeric suffix. A params key is
                    # recorded here for either kind, so a lone enabled bound
                    # still has a relation target; any surplus entries are
                    # dropped by the zip() below.
                    idx = ""
                    self._params_keys.append(
                        key_name.format(kind="params", name=self.name, idx=idx)
                    )
                key = key_name.format(kind=kind, name=self.name, idx=idx)
                d[key] = bound

                getattr(self, "_{}_keys".format(kind)).append(key)

            setattr(self, "_{}_dict".format(kind), d)
        # Set relations for all
        d = dict(zip(self._enabled_keys, self._params_keys))
        self._set_relations(d)

    @default("enabled_bounds")
    def _default_value(self):
        return [(1, 1)]

    @validate("params_bounds")
    def _valid_params_bounds(self, proposal):
        params_bounds = proposal["value"]

        if params_bounds is not None:
            len_params_bounds = len(params_bounds)
            len_enabled_bounds = len(self.enabled_bounds)
            if len_params_bounds != len_enabled_bounds:
                raise TraitError(
                    "If defined, ´params_bounds´ ({}) should match in length with ´enabled_bounds´ ({})".format(
                        len_params_bounds, len_enabled_bounds
                    )
                )

        return proposal["value"]

    # Registered on ``_enabled_dict``, the trait that is actually assigned.
    # With the previous name "enabled_dict" (no such trait) this check never ran.
    @validate("_enabled_dict")
    def _valid_enabled_dict(self, proposal):
        for key, value in proposal["value"].items():
            bound_min = value[0]
            bound_max = value[1]

            if (bound_min < 0 or bound_min > 1) or (bound_max < 0 or bound_max > 1):
                raise TraitError(
                    "For key ´{}´ bound values have to be between 0 and 1 (both included). Values given: ´{}´".format(
                        key, value
                    )
                )

            if bound_min > bound_max:
                raise TraitError(
                    "Bound min cannot be bigger than max. Values given: ´{}´".format(
                        value
                    )
                )

        return proposal["value"]

        
#     def _add_self_to_all_paramss(self, all_paramss):
#         all_paramss.update({self.name: self})

In [22]:
class ManageFeatures(HasTraits):
    """Combine several ``Feature`` objects into one table of optimizer bounds.

    Args:
        feature_objects: dict mapping a name to a ``Feature`` object.

    Raises:
        AssertionError: If two features produce the same key.
    """

    _feature_objects = Dict(key_trait=Unicode())
    _df_mapping = DataFrame()
    _df_enabled_params = DataFrame()

    def __init__(self, feature_objects):
        super().__init__()
        self._feature_objects = feature_objects

        self._set_features_df_enabled_params()

    def get_features_df_enabled_params(self):
        """Return the concatenated per-key table of all features."""
        return self._df_enabled_params

    def get_pbounds(self):
        """Return ``{key: (min, max)}`` for every key of every feature.

        Note: a ``pbounds`` column is added to the stored table in place, so
        it also appears in the mapping built by ``set_mapping``.
        """
        df = self.get_features_df_enabled_params()
        df['pbounds'] = list(zip(df['min'], df['max']))

        return df['pbounds'].to_dict()

    def get_mapping(self):
        """Return a copy of the table built by the last ``set_mapping`` call."""
        return self._df_mapping.copy()

    def set_mapping_return(self, *args, **kwargs):
        """Call ``set_mapping`` and return the resulting table."""
        self.set_mapping(*args, **kwargs)
        return self.get_mapping()

    def _set_features_df_enabled_params(self, f="get_df_enable_params"):
        """Call method ``f`` on every feature and store the concatenated result."""
        frames = [getattr(feature, f)() for feature in self._feature_objects.values()]

        self._df_enabled_params = self._validate_df_duplicate_index_and_concat(frames)

    def set_mapping(self, pbounds: dict):
        """Attach one value per key (e.g. the optimizer's chosen point) to the
        bounds table, as column ``params``.

        Raises:
            AssertionError: If the joined table contains NaN, e.g. when
                ``pbounds`` and the table do not cover the same keys.
        """
        df_enabled_params = self.get_features_df_enabled_params()
        df_pbounds = pd.DataFrame.from_dict(pbounds, orient='index', columns=['params'])

        # Outer join so that a key missing on either side surfaces as NaN.
        df = pd.merge(df_enabled_params, df_pbounds, 'outer', left_index=True, right_index=True)

        assert df.shape == df.dropna().shape, 'There should be no NaN values'

        self._df_mapping = df

    def _validate_df_duplicate_index_and_concat(self, l):
        """Concatenate the frames in ``l`` and fail on repeated index labels.

        The previous check compared row counts before and after ``pd.concat``.
        Concatenation never drops rows, so that check could not fail; the
        index is now tested for duplicates directly.
        """
        df = pd.concat(l)

        duplicated = df.index[df.index.duplicated()].unique().tolist()
        assert (
            not duplicated
        ), "Duplicate index between different ´Feature objects´: {}".format(duplicated)

        return df

In [23]:
from itertools import repeat

# (min, max) range used for every "enabled" bound below
enabled_tuple = (0, 1)

# (min, max) range used for every "params" bound below
params_tuple = (0,10)
# Four enabled bounds and four params bounds under the name "target_rolling_mean"
feature_target_rolling_mean = Feature(
    "target_rolling_mean",
    add_feature_targets_history,
    df_train,
    enabled_bounds=list(repeat(enabled_tuple, 4)),
    params_bounds=list(repeat(params_tuple, 4)),
)

# Same layout under the name "target_shift"; it is given the same callable
# as the feature above.
feature_target_shift_mean = Feature(
    "target_shift",
    add_feature_targets_history,
    df_train,
    enabled_bounds=list(repeat(enabled_tuple, 4)),
    params_bounds=list(repeat(params_tuple, 4)),
)

# This has to be in the same cell as features
# (cell_vars() reads back the source of the running cell to find the names
# assigned in it).
all_features = cell_vars()



In [24]:
manage_features = ManageFeatures(all_features)
# manage_features._get_all_relations()





__main__.ManageFeatures

In [29]:
def black_box_function(**kwargs):
    """Placeholder objective used to exercise the optimizer wiring.

    Prints the keyword arguments it receives and returns the same constant
    score on every call, so the optimizer has nothing real to maximize.
    """
    print(kwargs)

    constant_score = -1
    return constant_score


# Bounded region of parameter space, taken from the registered features
pbounds = manage_features.get_pbounds()

optimizer = BayesianOptimization(
    f=black_box_function,
    pbounds=pbounds,
    verbose=2,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
    random_state=1,
)

# Smallest possible run: one random initial point and one optimisation step.
optimizer.maximize(
    init_points=1,
    n_iter=1,
)

# Literal set of explored values; it is not referenced again in this cell.
# NOTE(review): looks like a pasted copy of what the optimizer printed for
# one probe -- confirm whether it is still needed.
bayes_exploring_values = {
    "enabled_target_rolling_mean_0": 0.417022004702574,
    "enabled_target_rolling_mean_1": 0.7203244934421581,
    "enabled_target_rolling_mean_2": 0.00011437481734488664,
    "enabled_target_rolling_mean_3": 0.30233257263183977,
    "enabled_target_shift_0": 0.14675589081711304,
    "enabled_target_shift_1": 0.0923385947687978,
    "enabled_target_shift_2": 0.1862602113776709,
    "enabled_target_shift_3": 0.34556072704304774,
    "params_target_rolling_mean_0": 3.9676747423066994,
    "params_target_rolling_mean_1": 5.3881673400335695,
    "params_target_rolling_mean_2": 4.191945144032948,
    "params_target_rolling_mean_3": 6.852195003967595,
    "params_target_shift_0": 2.0445224973151745,
    "params_target_shift_1": 8.781174363909454,
    "params_target_shift_2": 0.27387593197926163,
    "params_target_shift_3": 6.704675101784022,
}


# Join the best point found so far onto the bounds table and display it.
manage_features.set_mapping_return(optimizer.max['params'])

|   iter    |  target   | enable... | enable... | enable... | enable... | enable... | enable... | enable... | enable... | params... | params... | params... | params... | params... | params... | params... | params... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
{'enabled_target_rolling_mean_0': 0.417022004702574, 'enabled_target_rolling_mean_1': 0.7203244934421581, 'enabled_target_rolling_mean_2': 0.00011437481734488664, 'enabled_target_rolling_mean_3': 0.30233257263183977, 'enabled_target_shift_0': 0.14675589081711304, 'enabled_target_shift_1': 0.0923385947687978, 'enabled_target_shift_2': 0.1862602113776709, 'enabled_target_shift_3': 0.34556072704304774, 'params_target_rolling_mean_0': 3.9676747423066994, 'params_target_rolling_mean_1': 5.3881673400335695, 'params_target_rolling_mean_2': 4.191945144032948, 'param

Unnamed: 0,min,max,relation,name,pbounds,params
enabled_target_rolling_mean_0,0.0,1.0,params_target_rolling_mean_0,target_rolling_mean,"(0.0, 1.0)",0.417022
enabled_target_rolling_mean_1,0.0,1.0,params_target_rolling_mean_1,target_rolling_mean,"(0.0, 1.0)",0.720324
enabled_target_rolling_mean_2,0.0,1.0,params_target_rolling_mean_2,target_rolling_mean,"(0.0, 1.0)",0.000114
enabled_target_rolling_mean_3,0.0,1.0,params_target_rolling_mean_3,target_rolling_mean,"(0.0, 1.0)",0.302333
enabled_target_shift_0,0.0,1.0,params_target_shift_0,target_shift,"(0.0, 1.0)",0.146756
enabled_target_shift_1,0.0,1.0,params_target_shift_1,target_shift,"(0.0, 1.0)",0.092339
enabled_target_shift_2,0.0,1.0,params_target_shift_2,target_shift,"(0.0, 1.0)",0.18626
enabled_target_shift_3,0.0,1.0,params_target_shift_3,target_shift,"(0.0, 1.0)",0.345561
params_target_rolling_mean_0,0.0,10.0,enabled_target_rolling_mean_0,target_rolling_mean,"(0.0, 10.0)",3.967675
params_target_rolling_mean_1,0.0,10.0,enabled_target_rolling_mean_1,target_rolling_mean,"(0.0, 10.0)",5.388167


In [28]:
optimizer.max['params']

{'enabled_target_rolling_mean_0': 0.417022004702574,
 'enabled_target_rolling_mean_1': 0.7203244934421581,
 'enabled_target_rolling_mean_2': 0.00011437481734488664,
 'enabled_target_rolling_mean_3': 0.30233257263183977,
 'enabled_target_shift_0': 0.14675589081711304,
 'enabled_target_shift_1': 0.0923385947687978,
 'enabled_target_shift_2': 0.1862602113776709,
 'enabled_target_shift_3': 0.34556072704304774,
 'params_target_rolling_mean_0': 3.9676747423066994,
 'params_target_rolling_mean_1': 5.3881673400335695,
 'params_target_rolling_mean_2': 4.191945144032948,
 'params_target_rolling_mean_3': 6.852195003967595,
 'params_target_shift_0': 2.0445224973151745,
 'params_target_shift_1': 8.781174363909454,
 'params_target_shift_2': 0.27387593197926163,
 'params_target_shift_3': 6.704675101784022}

### Per state target data

In [None]:
train_dates, eval_dates = split_dates(df_train)

In [None]:
train_dates

In [None]:
# Split the rows of df_train by month: rows whose month is in train_dates go
# to train_data, rows whose month is in eval_dates go to eval_data.
train_data = df_train[df_train["first_day_of_month"].isin(train_dates)]
eval_data = df_train[df_train["first_day_of_month"].isin(eval_dates)]

In [None]:
def add_feature_targets_groupby_stats(
    df,
    col_group,
    new_col_template="{}_target_{}",
    agg_functions=None,
):
    """Add per-group statistics of ``microbusiness_density`` as new columns.

    Args:
        df: DataFrame containing ``col_group`` and ``microbusiness_density``.
        col_group: Column to group by.
        new_col_template: Format string for the new column names; it receives
            the group column name and the aggregation name, in that order.
        agg_functions: List of aggregations passed to ``.agg``. Defaults to
            ``["mean", "std", "median"]``.

    Returns:
        A new DataFrame: ``df`` left-joined with one column per aggregation.
    """
    # None instead of a list literal avoids a default shared between calls.
    if agg_functions is None:
        agg_functions = ["mean", "std", "median"]

    stats = df.groupby(col_group)["microbusiness_density"].agg(agg_functions)
    # Use the parameter here; the previous code read a global named ``new_col``
    # and silently ignored ``new_col_template``.
    stats.columns = [new_col_template.format(col_group, x) for x in stats.columns]

    df = pd.merge(df, stats, "left", left_on=col_group, right_index=True)

    return df

In [None]:
add_feature_targets_groupby_stats(train_data, "state").sort_values(
    "state_target_std"
).reset_index(drop=True)["state_target_std"].plot()

In [None]:
# Plot the sorted microbusiness_density values of the training rows and of the
# evaluation rows on the same axes, both with a logarithmic y-axis.
ax = (
    train_data.sort_values("microbusiness_density")
    .reset_index(drop=True)["microbusiness_density"]
    .plot(logy=True)
)
eval_data.sort_values("microbusiness_density").reset_index(drop=True)[
    "microbusiness_density"
].plot(logy=True, ax=ax)

In [None]:
# Per-group statistics of microbusiness_density on the training rows, with
# columns named "<col>_target_<aggregation>".
col = "cfips"
new_col = "{}_target_{}"
# Group by `col` so the group key and the column names cannot drift apart.
# NOTE(review): `agg_functions` is not defined in this cell; it must already
# exist as a notebook global -- confirm.
t0 = train_data.groupby(col)["microbusiness_density"].agg(agg_functions)

t0.columns = [new_col.format(col, x) for x in t0.columns]

In [None]:
t0.sort_values("cfips_target_std")

In [None]:
t0.sort_values("cfips_target_mean").reset_index(drop=True).plot()

In [None]:
train_data

In [None]:
df_train[df_train["state"] == "Alabama"]["cfips"].value_counts()

### Split dates

In [None]:
def split_dates(df):
    """
    Split the distinct months of ``df`` into a training and a validation part.

    The unique ``first_day_of_month`` values are sorted ascending; the first
    70% (rounded down) are returned as training dates and the remainder as
    validation dates.

    Returns a tuple ``(dates_train, dates_val)``.
    """
    unique_dates = df["first_day_of_month"].unique()
    ordered = np.sort(unique_dates)

    cutoff = int(ordered.shape[0] * 0.70)

    return (ordered[:cutoff], ordered[cutoff:])

### Long/Lat

In [None]:
import plotly.express as px

# Two-stop colour scale (orange at 0, red at 1)
color_scale = [(0, "orange"), (1, "red")]

# One marker per row of df_boundaries, placed at its INTPTLAT / INTPTLON
# coordinates.
# NOTE(review): no `color=` column is passed, so the colour scale presumably
# has no visible effect -- confirm.
fig = px.scatter_mapbox(
    df_boundaries,
    lat="INTPTLAT",
    lon="INTPTLON",
    color_continuous_scale=color_scale,
    zoom=8,
    height=800,
    width=800,
)

# Use the OpenStreetMap tile style and remove all figure margins.
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

### Analyze

#### State corr

In [None]:
# Mean and std of microbusiness_density per (state, month)
t0 = (
    df_train.groupby(["state", "first_day_of_month"])["microbusiness_density"]
    .agg(["mean", "std"])
    .reset_index()
)
# Pivot the means into a month x state table and correlate the state columns,
# giving a state x state correlation matrix.
t1 = t0.pivot_table("mean", "first_day_of_month", "state").sort_index().corr()
# Show the correlation matrix as a heat-map.
plt.imshow(t1.values, cmap="hot", interpolation="nearest")
plt.show()

In [None]:
t1

In [None]:
# Flatten the correlation matrix t1 into long form (state, other_state, corr)
# and drop the rows where a state is paired with itself.
t5 = t1.rename_axis(["other_state"], axis=1).stack().rename("corr").reset_index()
t5 = t5[t5["state"] != t5["other_state"]]

# Clean pairs of same correlations: each pair appears twice, as (a, b) and
# (b, a). Sort the two names within every row so both become identical rows,
# then drop the duplicate. The index is reset first so the positional frame
# built from the sorted values lines up with t5.
t5 = t5.sort_values("corr").reset_index(drop=True)
cols = ["state", "other_state"]
t5[cols] = pd.DataFrame(np.sort(t5[cols].values, axis=1), columns=cols)
t5 = t5.drop_duplicates()

from sklearn.cluster import DBSCAN

# Cluster the correlation values themselves (one-dimensional input).
clustering = DBSCAN(eps=0.01, min_samples=2).fit(t5["corr"].values.reshape(-1, 1))

# Label every pair with its cluster and summarise the correlation per cluster.
t5["cluster"] = clustering.labels_
t5.groupby(["cluster"])["corr"].agg(["mean", "std"])

In [None]:
t5.shape

In [None]:
corr_states = t5[abs(t5["corr"]) > 0.8]

In [None]:
# Monthly mean microbusiness_density for Louisiana
ax = (
    df_train[df_train["state"] == "Louisiana"]
    .sort_values("first_day_of_month")
    .groupby("first_day_of_month")["microbusiness_density"]
    .mean()
    .plot()
)

# ax = df_train[df_train["state"] == "Mississippi"].sort_values("first_day_of_month").groupby(
#     "first_day_of_month"
# )["microbusiness_density"].mean().plot(ax=ax)

# Same series for Mississippi, raised by a constant 0.55 before plotting
# (presumably to overlay the two curves for visual comparison).
t9 = (
    df_train[df_train["state"] == "Mississippi"]
    .sort_values("first_day_of_month")
    .groupby("first_day_of_month")["microbusiness_density"]
    .mean()
    + 0.55
)

t9.plot(ax=ax)

plt.legend(["Louisiana", "Mississippi"]);

In [None]:
corr_states = t5[abs(t5["corr"]) > 0.8]

#### State corr shifted

In [None]:
# Mean and std of microbusiness_density per (state, month)
t0 = (
    df_train.groupby(["state", "first_day_of_month"])["microbusiness_density"]
    .agg(["mean", "std"])
    .reset_index()
)

# State whose series is shifted before the correlation is computed
state = "Hawaii"

# Month x state table of means; the chosen state's column is moved 6 rows
# earlier so that its later values line up with the other states' earlier
# months.
t7 = t0.pivot_table("mean", "first_day_of_month", "state")
a = t7[state].shift(-6).copy()
# NOTE(review): shift() without `freq` keeps the index, so this reassignment
# looks redundant -- confirm.
a.index = t7.index
t7[state] = a

# State x state correlation matrix with the shifted column in place
t1 = t7.sort_index().corr()

plt.imshow(t1.values, cmap="hot", interpolation="nearest")
plt.show()

In [None]:
t1

In [None]:
# Flatten the correlation matrix t1 into long form (state, other_state, corr)
# and drop the rows where a state is paired with itself.
t5 = t1.rename_axis(["other_state"], axis=1).stack().rename("corr").reset_index()
t5 = t5[t5["state"] != t5["other_state"]]

# Clean pairs of same correlations: each pair appears twice, as (a, b) and
# (b, a). Sort the two names within every row so both become identical rows,
# then drop the duplicate. The index is reset first so the positional frame
# built from the sorted values lines up with t5.
t5 = t5.sort_values("corr").reset_index(drop=True)
cols = ["state", "other_state"]
t5[cols] = pd.DataFrame(np.sort(t5[cols].values, axis=1), columns=cols)
t5 = t5.drop_duplicates()

from sklearn.cluster import DBSCAN

# Cluster the correlation values themselves (one-dimensional input).
clustering = DBSCAN(eps=0.01, min_samples=2).fit(t5["corr"].values.reshape(-1, 1))

# Label every pair with its cluster and summarise the correlation per cluster,
# including the number of pairs in each cluster.
t5["cluster"] = clustering.labels_
t5.groupby(["cluster"])["corr"].agg(["mean", "std", "count"])

In [None]:
t5[t5["state"] == state].sort_values("corr")

In [None]:
# ax = df_train[df_train["state"] == "Alabama"].sort_values("first_day_of_month").groupby(
#     "first_day_of_month"
# )["microbusiness_density"].mean().plot()

# Rows of the state selected earlier in the notebook (`state`).
k = df_train[df_train["state"] == state].copy()
# Toggle for experiments: un-comment the offset to move this state's dates.
k["first_day_of_month"] = k["first_day_of_month"]  # - pd.DateOffset(months=6)

# Monthly mean microbusiness_density of the selected state
ax = (
    k.sort_values("first_day_of_month")
    .groupby("first_day_of_month")["microbusiness_density"]
    .mean()
    .plot()
)

# Monthly mean of the state it is compared against, drawn on the same axes
other = "Massachusetts"
t9 = (
    df_train[df_train["state"] == other]
    .sort_values("first_day_of_month")
    .groupby("first_day_of_month")["microbusiness_density"]
    .mean()
)

t9.plot(ax=ax)

plt.legend([state, other])

# Axes.grid takes a boolean as its first argument. The strings "on"/"off" are
# both truthy, so the previous ax.grid("off", ...) turned the major grid on
# instead of off.
ax.grid(True, which="minor", axis="x")
ax.grid(False, which="major", axis="x")

#### Count up/down

In [None]:
def ups_downs(df_train):
    """Running share of months in which a county's density did not fall.

    For every ``cfips`` the rows are ordered by ``first_day_of_month`` and
    each month is flagged 1 if ``microbusiness_density`` is greater than or
    equal to the previous month's value, 0 if it is lower, and NaN when there
    is no previous value. The result is the expanding mean of those flags per
    county, reported only once at least 3 flags are available.

    Args:
        df_train: DataFrame with columns ``row_id``, ``cfips``,
            ``first_day_of_month`` and ``microbusiness_density``. Its index
            must be unique; rows may be in any order.

    Returns:
        DataFrame with columns ``row_id`` and
        ``microbusiness_shift_bool_over_pct``, in the row order of
        ``df_train``.
    """
    col_to_group = "cfips"
    flag_col = "microbusiness_shift_bool_over"
    pct_col = "microbusiness_shift_bool_over_pct"

    # Sort first so the expanding windows below run chronologically inside
    # every county regardless of the caller's row order.
    t = df_train.sort_values([col_to_group, "first_day_of_month"]).copy()

    # Month-over-month change within each county (NaN for its first month).
    diff = t.groupby(col_to_group)["microbusiness_density"].diff()
    t["microbusiness_shift_diff"] = diff

    # 1.0 = flat or up, 0.0 = down, NaN = no previous month to compare with.
    t[flag_col] = (diff >= 0).astype(float).where(diff.notna())

    # The grouped expanding result is indexed by (cfips, original index).
    # Dropping the group level lets pandas align on the original index. The
    # previous positional assignment via ``.values`` mixed up rows whenever
    # the input was not already grouped and sorted by cfips.
    expanding = t.groupby(col_to_group)[flag_col].expanding()
    ups = expanding.sum().droplevel(0)
    seen = expanding.count().droplevel(0)

    t["microbusiness_shift_bool_over_sum"] = ups
    t["microbusiness_shift_bool_over_count"] = seen

    # Report the share only once at least 3 comparisons exist.
    share = (
        t["microbusiness_shift_bool_over_sum"]
        / t["microbusiness_shift_bool_over_count"]
    )
    t[pct_col] = share.where(t["microbusiness_shift_bool_over_count"] >= 3)

    # Hand the rows back in the caller's original order.
    return t.loc[df_train.index, ["row_id", pct_col]]

In [None]:
df_ups_downs = ups_downs(df_train)

In [None]:
df_ups_downs

In [None]:
df_train

In [None]:
df_train.groupby("first_day_of_month")["active"].sum().plot()