In [3]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.model_selection import (
    cross_validate,
    KFold,
)
from sklearn.metrics import r2_score, make_scorer
from sklearn.preprocessing import (
    StandardScaler,
)

from bayes_opt import BayesianOptimization
from xgboost import XGBRegressor 

from sklearn.base import BaseEstimator
import numpy as np

In [4]:
train = pd.read_csv("../data/train.csv", index_col="SEQN")
test_data = pd.read_csv("../data/test.csv", index_col="SEQN")

train_x, train_y = train.drop("y", axis=1), train["y"]

In [5]:
class RowSum(BaseEstimator):

    def __init__(self) -> None:
        super().__init__()

    def fit(self, *args, **kwargs):
        pass

    def transform(self, X, y=None, **kwargs):
        if isinstance(X, pd.DataFrame):
            return X.values.sum(axis=1).reshape(-1, 1)
        else:
            return X.sum(axis=1).reshape(-1, 1)

    def fit_transform(self, X, y=None, **kwargs):
        return self.transform(X, y, **kwargs)

    def get_feature_names_out(self, X, y=None, **kwargs):
        return ["rowSums"]


sumRows = Pipeline(
    [
        ("sums", RowSum()),
        ("scale", StandardScaler()),
    ]
)

colTrans = ColumnTransformer(
    [
        # ("one_hot", OneHotEncoder(sparse_output=False, drop="first"), ["district"]),
        ("collapse", sumRows, [f"SRP_{i}" for i in range(1, 51)]),
    ],
    remainder="passthrough",
)

preprocessor = Pipeline(
    [
        ("setup", colTrans),
        # ("polys", PolynomialFeatures(interaction_only=True)),
        # ("splines", SplineTransformer(knots="quantile")),
    ]
)


In [6]:
def optim(
    learning_rate=0.1,
    max_iter=10,
    l2_regularization=1,
    max_bins=50,
    max_depth=4,
    max_leaf_nodes=5,
    return_model=False,
    gamma=0,
    subsample=1,
    alpha=0
):

    scorer = make_scorer(r2_score)
    max_depth = int(max_depth)
    max_bins = int(max_bins)
    max_leaf_nodes = int(max_leaf_nodes)
    max_iter = int(max_iter)

    model = XGBRegressor(
        learning_rate=learning_rate,
        gamma=gamma,
        reg_lambda=l2_regularization,
        reg_alpha=alpha,
        max_bin=int(max_bins),
        max_leaves=max_leaf_nodes,
        max_depth=max_depth,
        subsample=subsample,
        enable_categorical=True,
    )

    model_pipeline = Pipeline([("preprocess", preprocessor), ("model", model)])

    # TODO: add cross manual cross validation here within district
    if return_model:
        return (
            cross_validate(
                model_pipeline,
                X=train_x,
                y=train_y,
                scoring=scorer,
                cv=KFold(shuffle=True),
            ),
            model_pipeline,
        )

    else:
        return cross_validate(
            model_pipeline,
            X=train_x,
            y=train_y,
            scoring=scorer,
            cv=KFold(shuffle=True, n_splits=5),
        )["test_score"].mean()


In [7]:
optimizer = BayesianOptimization(
    optim,
    pbounds={
        "learning_rate": [0.00001, 1],
        "l2_regularization": [1, 4000],
        "max_bins": [5, 500],
        "max_leaf_nodes": [0, 50],
        "max_depth": [1, 60],
        "gamma": [0, 1],
        "subsample": [0.5, 1],
        "alpha":[0, 100]
    },
)
optimizer.maximize(init_points=25, n_iter=100)

|   iter    |  target   |   alpha   |   gamma   | l2_reg... | learni... | max_bins  | max_depth | max_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.7577   [0m | [0m95.5     [0m | [0m0.303    [0m | [0m447.0    [0m | [0m0.9439   [0m | [0m253.6    [0m | [0m8.894    [0m | [0m7.53     [0m | [0m0.7055   [0m |
| [95m2        [0m | [95m0.7771   [0m | [95m70.92    [0m | [95m0.3538   [0m | [95m1.54e+03 [0m | [95m0.3013   [0m | [95m191.2    [0m | [95m29.55    [0m | [95m45.08    [0m | [95m0.7079   [0m |
| [0m3        [0m | [0m0.7575   [0m | [0m16.48    [0m | [0m0.4823   [0m | [0m3.226e+03[0m | [0m0.3309   [0m | [0m218.0    [0m | [0m36.04    [0m | [0m27.39    [0m | [0m0.5081   [0m |
| [95m4        [0m | [95m0.8176   [0m | [95m21.4     [0m | [95m0.4348   [0m | [95m2.316e+03[0m | [95m0.3796   [0m | [95m330.2  

In [8]:
optimal_params = optimizer.max["params"]
scorer = make_scorer(r2_score)
param_names_default = list(optimal_params.keys())
optimal_params['subsample'] = 1
optimal_params['reg_lambda'] = 1
optimal_params['learning_rate'] = 1
print(optimal_params)
print(param_names_default)
param_names = ['reg_alpha', 'gamma', 'reg_lambda', 'learning_rate', 'max_bin', 'max_depth', 'max_leaves', 'subsample']
for i, param in enumerate(param_names_default):
    if param_names[i] != param:
        optimal_params[param_names[i]] = optimal_params[param]
        optimal_params.pop(param)
    if param in ["max_bins", "max_depth", "max_leaf_nodes"]:
        optimal_params.update({param_names[i]: int(optimal_params[param_names[i]])})
print(optimal_params)     

{'alpha': 3.526286580106397, 'gamma': 0.9953286702754582, 'l2_regularization': 9.967994896132868, 'learning_rate': 1, 'max_bins': 481.4607979204697, 'max_depth': 12.129757860114669, 'max_leaf_nodes': 40.65500334154933, 'subsample': 1, 'reg_lambda': 1}
['alpha', 'gamma', 'l2_regularization', 'learning_rate', 'max_bins', 'max_depth', 'max_leaf_nodes', 'subsample']
{'gamma': 0.9953286702754582, 'learning_rate': 1, 'max_depth': 12, 'subsample': 1, 'reg_lambda': 9.967994896132868, 'reg_alpha': 3.526286580106397, 'max_bin': 481, 'max_leaves': 40}


In [9]:
test_model = XGBRegressor(
    **optimal_params
)

model_pipeline = Pipeline([("preprocess", preprocessor), ("model", test_model)])
cv_results = cross_validate(
    model_pipeline, X=train_x, y=train_y, scoring=scorer, cv=KFold(shuffle=True)
)

model_pipeline.fit(train_x, train_y)
cv_results

{'fit_time': array([0.04588699, 0.01771617, 0.02087784, 0.01741004, 0.02111411]),
 'score_time': array([0.00276995, 0.00166488, 0.00169921, 0.00154185, 0.0017879 ]),
 'test_score': array([0.82997554, 0.83429767, 0.85546975, 0.83841479, 0.83670797])}

In [44]:
# looks good, so spit out the results
test_data["y"] = model_pipeline.predict(test_data)
test_data["y"].to_csv("../results/cv_results.csv")