In [1]:
from pprint import pprint

import numpy as np
import pandas as pd
from datasets import id_map, load_dataset, split_data
from linear_models import wrap
from sklearn.feature_selection import mutual_info_regression
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import (
    ARDRegression,
    BayesianRidge,
    Lasso,
    LassoCV,
    LinearRegression,
    Ridge,
    RidgeCV,
)
from tree_models import *
import contextlib
import os

from linear_models import MISO

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer, RobustScaler, StandardScaler
from tqdm.auto import tqdm
from tree_models import HCV
from wrappers import BFS, RFS, RobustRegressor


class Silencer:
    def __init__(self, verbose: bool = False):
        self.verbose = verbose
        self.null_file = None
        self.stdout_redirector = None
        self.stderr_redirector = None

    def __enter__(self):
        if not self.verbose:
            self.null_file = open(os.devnull, "w")
            self.stdout_redirector = contextlib.redirect_stdout(self.null_file)
            self.stderr_redirector = contextlib.redirect_stderr(self.null_file)
            self.stdout_redirector.__enter__()
            self.stderr_redirector.__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if not self.verbose:
            self.stderr_redirector.__exit__(exc_type, exc_val, exc_tb)
            self.stdout_redirector.__exit__(exc_type, exc_val, exc_tb)
            self.null_file.close()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pprint(id_map, sort_dicts=False)


{'student_performance': 320,
 'concrete': 165,
 'computer_hardware': 29,
 'kidney_disease': 857,
 'fertility': 244,
 'algerian_forest_fires': 547,
 'airfoil_self_noise': 291,
 'istanbul_stock_exchange': 247}


In [3]:
# players = {
#     # Classical methods
#     # "LinearRegression": lambda x, _: wrap(LinearRegression()),
#     "Ridge(1/(2n))": lambda x, _: wrap(Ridge(1 / (2 * len(x)))),
#     "Isotonic(Ridge)": lambda x, _: wrap(Ridge(1 / (2 * len(x))), isotonic=True),
#     # "RidgeCV(cv=10)": lambda x, _: wrap(
#     #     RidgeCV(alphas=10 ** np.linspace(-4, 2, 20), cv=10)
#     # ),
#     # "LassoCV(cv=10)": lambda x, _: wrap(LassoCV(cv=10)),
#     "ARDRegression": lambda x, _: wrap(ARDRegression()),
#     "Isotonic(ARDRegression)": lambda x, _: wrap(ARDRegression(), isotonic=True),
#     # Modifications
#     # "Ridge": lambda x, _: wrap(Ridge(1 / (2 * len(x))), feats="id", outliers="id"),
#     # "Ridge+BFS": lambda x, _: wrap(Ridge(1 / (2 * len(x))), feats="bfs", outliers="id"),
#     # # "Ridge+RFS": lambda x, _: wrap(Ridge(1 / (2 * len(x))), feats="rfs", outliers="id"),
#     # "Ridge+Robust": lambda x, _: wrap(
#     #     Ridge(1 / (2 * len(x))), feats="id", outliers="robust"
#     # ),
#     # "Ridge+BFS+Robust": lambda x, _: wrap(
#     #     Ridge(1 / (2 * len(x))), feats="bfs", outliers="robust"
#     # ),
#     # "ARDRegression+Robust": lambda x, _: wrap(
#     #     ARDRegression(), feats="id", outliers="robust"
#     # ),
#     # "Ridge+RFS+Robust": lambda x, _: wrap(
#     #     Ridge(1 / (2 * len(x))), feats="rfs", outliers="robust"
#     # ),
#     # "BFS(add_singles=True)": lambda x, _: BFS(
#     #     wrap(Ridge(1 / (2 * len(x)))), add_singles=True
#     # ),
#     "BFS(add_singles=False)": lambda x, _: BFS(
#         wrap(Lasso(1 / (2 * len(x)))), add_singles=False
#     ),
#     "Isotonic(BFS(add_singles=False))": lambda x, _: wrap(
#         BFS(Lasso(1 / (2 * len(x))), add_singles=False),
#         isotonic=True,
#     ),
#     # "BFS(add_singles=True, final=lasso_cv)": lambda x, _: BFS(
#     #     wrap(Ridge(1 / (2 * len(x)))), add_singles=True, final="lasso_cv"
#     # ),
#     "BFS(add_singles=False, final=lasso_cv)": lambda x, _: BFS(
#         wrap(Lasso(1 / (2 * len(x)))), add_singles=False, final="lasso_cv"
#     ),
#     # "BFS(add_singles=True, final=lasso)": lambda x, _: BFS(
#     #     wrap(Ridge(1 / (2 * len(x)))), add_singles=True, final="lasso"
#     # ),
#     "BFS(add_singles=False, final=lasso)": lambda x, _: BFS(
#         wrap(Lasso(1 / (2 * len(x)))), add_singles=False, final="lasso"
#     ),

players = {
    "RandomForestRegressor": lambda x, _: wrap(
        RandomForestRegressor(max_depth=4), scale="id"
    ),
    "LGBMRegressor": lambda x, _: HCV(LGBMRegressor()),
    "XGBRegressor": lambda x, _: HCV(XGBRegressor()),
    "MISO": lambda x, _: MISO(),
}

model_names = list(players.keys())

# X, y = load_dataset("concrete")
X, y = load_dataset("student_performance")

results = []
n_runs = 20
for run in tqdm(range(n_runs)):
    X_train, y_train, X_test, y_test = split_data(
        X, y, test_size=0.5, bad_features=False
    )
    models = [players[name](X_train, y_train) for name in model_names]

    for i, (name, model) in enumerate(zip(model_names, models)):
        with Silencer():
            model.fit(X_train, y_train)
            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)
        train_mse = np.square(train_pred - y_train).mean()
        test_mse = np.square(test_pred - y_test).mean()
        results.append(
            {"Model": name, "Train MSE": train_mse, "Test MSE": test_mse, "Run": run}
        )

df_all = pd.DataFrame(results)
df = (
    df_all.groupby("Model")
    .agg(
        Train_MSE_mean=("Train MSE", "mean"),
        Train_MSE_std=("Train MSE", "std"),
        Test_MSE_mean=("Test MSE", "mean"),
        Test_MSE_std=("Test MSE", "std"),
    )
    .reset_index()
)

df["Train MSE"] = df.apply(
    lambda x: f"{x['Train_MSE_mean']:.4f} ± {x['Train_MSE_std']:.4f}", axis=1
)
df["Test MSE"] = df.apply(
    lambda x: f"{x['Test_MSE_mean']:.4f} ± {x['Test_MSE_std']:.4f}", axis=1
)
df = df[["Model", "Train MSE", "Test MSE"]]
print(df.sort_values(by="Test MSE"))


  0%|          | 0/20 [00:00<?, ?it/s]

[32m2025-03-15 18:03:30.974[0m | [1mINFO    [0m | [36mtree_models[0m:[36mfit[0m:[36m153[0m - [1mBest parameters #5: {'max_depth': np.int64(3), 'learning_rate': np.float64(0.1)}[0m
[32m2025-03-15 18:03:30.984[0m | [1mINFO    [0m | [36mtree_models[0m:[36mfit[0m:[36m153[0m - [1mBest parameters #6: {'max_depth': np.int64(2), 'learning_rate': np.float64(0.2)}[0m
[32m2025-03-15 18:03:30.991[0m | [1mINFO    [0m | [36mtree_models[0m:[36mfit[0m:[36m153[0m - [1mBest parameters #7: {'max_depth': np.int64(3), 'learning_rate': np.float64(0.2)}[0m
[32m2025-03-15 18:03:31.219[0m | [1mINFO    [0m | [36mtree_models[0m:[36mfit[0m:[36m153[0m - [1mBest parameters #0: {'max_depth': np.int64(2), 'learning_rate': np.float64(0.02)}[0m
[32m2025-03-15 18:03:31.226[0m | [1mINFO    [0m | [36mtree_models[0m:[36mfit[0m:[36m153[0m - [1mBest parameters #9: {'max_depth': np.int64(3), 'learning_rate': np.float64(0.02)}[0m
[32m2025-03-15 18:03:31.234[0m | [1

                   Model        Train MSE         Test MSE
3           XGBRegressor  0.5471 ± 0.1107  0.7207 ± 0.0457
2  RandomForestRegressor  0.5397 ± 0.0220  0.7240 ± 0.0413
1                   MISO  0.5809 ± 0.0445  0.7329 ± 0.0439
0          LGBMRegressor  0.3161 ± 0.0235  0.7493 ± 0.0404





In [5]:
import time

from arena import Silencer

from lightgbm import LGBMRegressor

from sklearn.linear_model import LinearRegression
from wrappers import BFS

# Benchmark fitting time for BFS with different final estimators
X_bench, y_bench = X_train.copy(), y_train.copy()
n_runs = 5

# BFS with ard
with_single_times = []
without_singles_times = []
for _ in range(n_runs):
    with Silencer():
        start_time = time.time()
        with_singles = HCV(XGBRegressor(learning_rate=0.05, max_depth=3))
        with_singles.fit(X_bench, y_bench)
        with_single_times.append(time.time() - start_time)
        start_time = time.time()
        without_singles = HCV(LGBMRegressor(learning_rate=0.05, max_depth=3))
        without_singles.fit(X_bench, y_bench)
        without_singles_times.append(time.time() - start_time)
with_single_time_mean = np.mean(with_single_times)
with_single_time_std = np.std(with_single_times)
without_singles_time_mean = np.mean(without_singles_times)
without_singles_time_std = np.std(without_singles_times)


print(
    f"BFS(add_singles=False) fitting time: {without_singles_time_mean:.4f} ± {without_singles_time_std:.4f} seconds"
)
print(
    f"BFS(add_singles=True) fitting time: {with_single_time_mean:.4f} ± {with_single_time_std:.4f} seconds"
)


[32m2025-03-15 17:25:25.238[0m | [1mINFO    [0m | [36mtree_models[0m:[36mfit[0m:[36m149[0m - [1mBest parameters #29: {'max_depth': np.int64(2), 'learning_rate': np.float64(0.02)}[0m
[32m2025-03-15 17:25:25.254[0m | [1mINFO    [0m | [36mtree_models[0m:[36mfit[0m:[36m149[0m - [1mBest parameters #24: {'max_depth': np.int64(4), 'learning_rate': np.float64(0.02)}[0m
[32m2025-03-15 17:25:25.285[0m | [1mINFO    [0m | [36mtree_models[0m:[36mfit[0m:[36m149[0m - [1mBest parameters #30: {'max_depth': np.int64(3), 'learning_rate': np.float64(0.02)}[0m
[32m2025-03-15 17:25:26.293[0m | [1mINFO    [0m | [36mtree_models[0m:[36mfit[0m:[36m149[0m - [1mBest parameters #27: {'max_depth': np.int64(2), 'learning_rate': np.float64(0.05)}[0m
[32m2025-03-15 17:25:26.341[0m | [1mINFO    [0m | [36mtree_models[0m:[36mfit[0m:[36m149[0m - [1mBest parameters #28: {'max_depth': np.int64(4), 'learning_rate': np.float64(0.02)}[0m
[32m2025-03-15 17:25:26.428[

BFS(add_singles=False) fitting time: 1.1760 ± 0.1131 seconds
BFS(add_singles=True) fitting time: 1.1468 ± 0.1587 seconds


In [5]:
np.set_printoptions(suppress=True, precision=6, floatmode="fixed")
for i in range(2, len(models)):
    name = model_names[i]
    meta_coef = models[i].meta_estimator_[1].coef_
    intercept = models[i].meta_estimator_[1].intercept_
    print(name, meta_coef / np.sum(meta_coef), np.sum(meta_coef), intercept)


AttributeError: 'HCV' object has no attribute 'meta_estimator_'

In [6]:
A = models[4]
B = models[6]


In [13]:
print(A[1].coef_)


[ 0.6939171   0.5198963   0.31041625 -0.20310733  0.11226537  0.00358856
  0.          0.42565832]


In [16]:
B.feature_indices_


[array([0]),
 array([0, 4]),
 array([0, 4, 7]),
 array([0, 4, 7, 3]),
 array([0, 4, 7, 3, 6]),
 array([0, 4, 7, 3, 6, 5]),
 array([0, 4, 7, 3, 6, 5, 1]),
 array([0, 4, 7, 3, 6, 5, 1, 2])]

In [24]:
# A[1].coef_
np.set_printoptions(suppress=True, precision=6, floatmode="fixed")

meta_coef = B.meta_estimator_[1].coef_
print(meta_coef / np.sum(meta_coef))
total_coef = np.zeros(X_train.shape[1])
for i, meta_w in enumerate(meta_coef):
    feats = B.feature_indices_[i]
    weights = B.estimators_[i][1].coef_
    total_coef[feats] += weights * meta_w
total_coef


[0.000000 0.009541 0.000000 0.000000 0.023721 0.129458 0.319179 0.518100]


array([ 0.476652,  0.316338,  0.155053, -0.219728,  0.126291, -0.042684,
       -0.090598,  0.349651])

In [7]:
def EDA(name):
    X, y = load_dataset(name)
    assert isinstance(X, np.ndarray) and isinstance(y, np.ndarray)
    n_samples, n_features = X.shape
    assert y.shape == (n_samples,)
    print(f"{n_samples} samples, {n_features} features")
    k = 10
    subset = np.random.choice(n_samples, k)
    for i in range(n_features):
        n_unique = len(np.unique(X[:, i]))
        mn, mx = np.min(X[:, i]), np.max(X[:, i])
        mean = np.mean(X[:, i])
        std = np.std(X[:, i])
        C = 8
        sample = [int(C * x) / C for x in sorted(X[subset, i].tolist())]
        correlation = np.corrcoef(X[:, i], y)[0, 1]
        mutual_info = mutual_info_regression(X[:, i].reshape(-1, 1), y)[0]
        print(
            f"#{i}: {mn:.2f}..({mean:.2f} ± {std:.2f})..{mx:.2f}, {n_unique} unique, {sample} | C={correlation:.2f}, MI={mutual_info:.2f}"
        )


In [None]:
EDA("istanbul_stock_exchange")


536 samples, 8 features
#0: 0.00..(267.50 ± 154.73)..535.00, 536 unique, [6.0, 75.0, 108.0, 199.0, 366.0, 433.0, 444.0, 506.0, 506.0, 525.0] | C=-0.03, MI=0.05
#1: -0.05..(0.00 ± 0.01)..0.07, 519 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.44, MI=0.09
#2: -0.05..(0.00 ± 0.01)..0.06, 527 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.60, MI=0.21
#3: -0.05..(0.00 ± 0.01)..0.05, 523 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.62, MI=0.26
#4: -0.05..(0.00 ± 0.01)..0.06, 503 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.26, MI=0.00
#5: -0.05..(0.00 ± 0.02)..0.06, 508 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.43, MI=0.06
#6: -0.05..(0.00 ± 0.01)..0.07, 532 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.66, MI=0.24
#7: -0.04..(0.00 ± 0.01)..0.05, 536 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.60, MI=0.23
