# Bayesian Optimization over standard imputers

In [1]:
import sys
import warnings
import pandas as pd
import numpy as np

from IPython.display import HTML, display
import tabulate

warnings.simplefilter("ignore")


from hyperimpute.utils.distributions import enable_reproducible_results

enable_reproducible_results()

# Load imputers

In [2]:
from hyperimpute.plugins.imputers import Imputers, ImputerPlugin
from hyperimpute.plugins.utils.metrics import RMSE
from hyperimpute.plugins.utils.simulate import simulate_nan
from hyperimpute.utils.optimizer import EarlyStoppingExceeded, create_study

imputers = Imputers()

imputers.list()

['gain',
 'mean',
 'mice',
 'nop',
 'sklearn_missforest',
 'sinkhorn',
 'hyperimpute',
 'ice',
 'most_frequent',
 'miwae',
 'miracle',
 'missforest',
 'EM',
 'sklearn_ice',
 'softimpute',
 'median']

In [6]:
imputers_seed = [
    "hyperimpute"
]

subsample = 500

# Helpers

In [7]:
from sklearn.preprocessing import MinMaxScaler


def ampute(x, mechanism, p_miss):
    x_simulated = simulate_nan(np.asarray(x), p_miss, mechanism)

    mask = x_simulated["mask"]
    x_miss = x_simulated["X_incomp"]

    return pd.DataFrame(x), pd.DataFrame(x_miss), pd.DataFrame(mask)


def scale_data(X):
    X = np.asarray(X)
    preproc = MinMaxScaler()

    return np.asarray(preproc.fit_transform(X))


def simulate_scenarios(X):
    X = scale_data(X)

    datasets = {}

    mechanisms = ["MAR", "MNAR", "MCAR"]
    percentages = [0.2, 0.3]

    for ampute_mechanism in mechanisms:
        for p_miss in percentages:
            if ampute_mechanism not in datasets:
                datasets[ampute_mechanism] = {}

            datasets[ampute_mechanism][p_miss] = ampute(X, ampute_mechanism, p_miss)

    return datasets

# BO core

In [8]:
from typing import Any
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import time


def evaluate_plugin(
    name: str,
    plugin: ImputerPlugin,
    X: np.ndarray,
    X_miss: np.ndarray,
    mask: np.ndarray,
    prev_best_score: float,
):

    study, pruner = create_study(
        study_name=f"{name}_imputer_evaluation_{plugin.name()}",
        direction="minimize",
        load_if_exists=False,
        patience=5,
    )

    def evaluate_args(**kwargs: Any) -> float:
        imputer = plugin(**kwargs)

        imputed = imputer.fit_transform(X_miss.copy())
        return RMSE(imputed.values, X.values, mask.values)

    baseline_score = evaluate_args(**{})

    if baseline_score < prev_best_score:
        return baseline_score, {}

    pruner.report_score(baseline_score)
    if prev_best_score < 100:
        pruner.report_score(prev_best_score)

    def objective(trial: optuna.Trial) -> float:
        args = plugin.sample_hyperparameters(trial)
        pruner.check_trial(trial)

        score = evaluate_args(**args)

        pruner.report_score(score)

        return score

    try:
        study.optimize(objective, n_trials=50, timeout=60 * 3)
    except EarlyStoppingExceeded:
        pass
        # print(f"Early stopping triggered for imputer {plugin.name()}")

    try:
        if baseline_score > study.best_value:
            return baseline_score, {}

        return study.best_value, study.best_trial.params
    except BaseException:
        return baseline_score, {}


def benchmark(
    name: str,
    X: np.ndarray,
    X_miss: np.ndarray,
    mask: np.ndarray,
):
    scores = {}
    start = time.time()

    best_score = 999
    for plugin in imputers_seed:

        plugin_t = imputers.get_type(plugin)
        try:
            score, params = evaluate_plugin(name, plugin_t, X, X_miss, mask, best_score)
            if score < best_score:
                best_score = score
        except BaseException as e:
            print("      >>>  Plugin failed", plugin, e)
            raise e

        scores[plugin] = (score, params)

    print(f" iteration for {name} took {time.time() - start} seconds")
    print(" iteration scores", scores)
    return scores

# Dataset: UCI Airfoil Self-Noise Data Set

https://archive.ics.uci.edu/ml/datasets/airfoil+self-noise


In [5]:
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat",
    header=None,
    sep="\\t",
)

df

Unnamed: 0,0,1,2,3,4,5
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461
...,...,...,...,...,...,...
1498,2500,15.6,0.1016,39.6,0.052849,110.264
1499,3150,15.6,0.1016,39.6,0.052849,109.254
1500,4000,15.6,0.1016,39.6,0.052849,106.604
1501,5000,15.6,0.1016,39.6,0.052849,106.224


In [6]:
frac = subsample / len(df)
X = df.sample(frac=frac)

imputation_scenarios = simulate_scenarios(X)

results = []
candidates = {}
for scenario in ["MAR", "MCAR", "MNAR"]:
    print("Evaluating ", scenario)
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    bo_results = benchmark("airfoil", x, x_miss, mask)

    best_candidate = ""
    best_score = 99999
    best_params = {}
    for plugin in bo_results:
        score, params = bo_results[plugin]
        if score < best_score:
            best_score = score
            best_candidate = plugin
            best_params = params

    results.append([scenario, best_candidate, best_score])
    candidates[scenario] = (best_candidate, best_params)
results

headers = ["Scenario", "BO selected estimator", "BO score"]

display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

Evaluating  MAR
Instructions for updating:
non-resource variables are not supported in the long term
 iteration for airfoil took 160.48045778274536 seconds
 iteration scores {'mean': (0.26094919218266216, {}), 'miracle': (4.126181568792021, {}), 'miwae': (0.4622753526674968, {}), 'gain': (0.3106851282580339, {'batch_size': 128, 'n_epochs': 500, 'hint_rate': 0.841538251929108, 'loss_alpha': 60}), 'softimpute': (0.25925798056757154, {}), 'sinkhorn': (0.2587600591841952, {}), 'sklearn_ice': (0.24114893131219062, {}), 'most_frequent': (0.3340366136173224, {}), 'median': (0.3120505455297883, {}), 'EM': (0.23939913968929888, {}), 'sklearn_missforest': (0.22908226584045221, {})}
Evaluating  MCAR
 iteration for airfoil took 211.55259728431702 seconds
 iteration scores {'mean': (0.2778840904476426, {}), 'miracle': (4.256982018651303, {}), 'miwae': (0.594530247167858, {}), 'gain': (0.27940811410920935, {'batch_size': 128, 'n_epochs': 100, 'hint_rate': 0.9027690574492997, 'loss_alpha': 80}), 'sof

Scenario,BO selected estimator,BO score
MAR,sklearn_missforest,0.229082
MCAR,sklearn_missforest,0.225608
MNAR,sklearn_missforest,0.234593


In [7]:
# Full dataset evaluation
X = df
imputation_scenarios = simulate_scenarios(X)

results = []
for scenario in candidates:
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    plugin, plugin_params = candidates[scenario]

    model = imputers.get(plugin, **plugin_params)

    imputed = model.fit_transform(x_miss.copy())

    loss = RMSE(imputed.values, x.values, mask.values)

    results.append([scenario, plugin, loss])

headers = ["Scenario", "BO-selected model", "RMSE on full dataset"]


display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

Scenario,BO-selected model,RMSE on full dataset
MAR,sklearn_missforest,0.241603
MCAR,sklearn_missforest,0.23614
MNAR,sklearn_missforest,0.233142


In [8]:
# Raw methods evaluation
ref_models = [
    "mean",
    "sklearn_missforest",
    "gain",
    "EM",
    "sklearn_ice",
    "softimpute",
    "sinkhorn",
    "miracle",
    "miwae",
]
results = []
for scenario in candidates:
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    local_res = [
        scenario,
    ]
    for plugin in ref_models:

        model = imputers.get(plugin)

        imputed = model.fit_transform(x_miss.copy())

        loss = RMSE(imputed.values, x.values, mask.values)

        local_res.append(loss)

    results.append(local_res)

headers = ["Scenario"] + ref_models


display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

Scenario,mean,sklearn_missforest,gain,EM,sklearn_ice,softimpute,sinkhorn,miracle,miwae
MAR,0.298775,0.241603,0.285472,0.247402,0.247665,0.305636,0.254818,0.67605,0.317701
MCAR,0.276679,0.23614,0.297078,0.234258,0.250891,0.348509,0.252304,0.251228,0.300607
MNAR,0.268875,0.233142,0.268371,0.229353,0.246091,0.344266,0.251986,0.248494,0.289012


# Dataset: Blood Transfusion Service Center Data Set


https://archive.ics.uci.edu/ml/datasets/Blood+Transfusion+Service+Center

In [9]:
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data",
    sep=",",
)

df

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [10]:
frac = min(subsample / len(df), 1)
X = df.sample(frac=frac)

imputation_scenarios = simulate_scenarios(X)

results = []
candidates = {}
for scenario in ["MAR", "MCAR", "MNAR"]:
    print("Evaluating ", scenario)
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    bo_results = benchmark("blood", x, x_miss, mask)

    best_candidate = ""
    best_score = 99999
    best_params = {}
    for plugin in bo_results:
        score, params = bo_results[plugin]
        if score < best_score:
            best_score = score
            best_candidate = plugin
            best_params = params

    results.append([scenario, best_candidate, best_score])
    candidates[scenario] = (best_candidate, best_params)
results

headers = ["Scenario", "BO selected estimator", "BO score"]

display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

Evaluating  MAR
 iteration for blood took 418.8616394996643 seconds
 iteration scores {'mean': (0.2747499204269218, {}), 'miracle': (4.210500431319634, {}), 'miwae': (0.3991661497868706, {}), 'gain': (0.2603722770579354, {}), 'softimpute': (0.34147858848988677, {'max_rank': 5, 'shrink_lambda': 3.596235627330393}), 'sinkhorn': (0.2677451083413792, {}), 'sklearn_ice': (0.24432313907088243, {}), 'most_frequent': (0.3511554617704031, {}), 'median': (0.29233156383921244, {}), 'EM': (288269618754367.2, {'maxit': 200, 'convergence_threshold': 1e-07}), 'sklearn_missforest': (0.23303082994302254, {})}
Evaluating  MCAR
 iteration for blood took 523.7935466766357 seconds
 iteration scores {'mean': (0.24396219018337315, {}), 'miracle': (4.3434085335592325, {}), 'miwae': (0.41843287718953986, {}), 'gain': (0.2422840969682272, {}), 'softimpute': (0.2878895944034214, {'max_rank': 5, 'shrink_lambda': 1.6446862190340739}), 'sinkhorn': (0.26388939181825755, {}), 'sklearn_ice': (0.22614506227648778, {}),

Scenario,BO selected estimator,BO score
MAR,sklearn_missforest,0.233031
MCAR,EM,0.221477
MNAR,sklearn_missforest,0.228789


In [11]:
# Full dataset evaluation
X = df
imputation_scenarios = simulate_scenarios(X)

results = []
for scenario in candidates:
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    plugin, plugin_params = candidates[scenario]

    model = imputers.get(plugin, **plugin_params)

    imputed = model.fit_transform(x_miss.copy())

    loss = RMSE(imputed.values, x.values, mask.values)

    results.append([scenario, plugin, loss])

headers = ["Scenario", "BO-selected model", "RMSE on full dataset"]


display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

Scenario,BO-selected model,RMSE on full dataset
MAR,sklearn_missforest,0.263908
MCAR,EM,2.25217
MNAR,sklearn_missforest,0.22864


In [12]:
# Other methods evaluation

ref_models = [
    "mean",
        "sklearn_missforest",
        "gain",
        "EM",
        "sklearn_ice",
        "softimpute",
        "sinkhorn",
        "miracle",
        "miwae",
]
results = []
for scenario in candidates:
    x, x_miss, mask = imputation_scenarios[scenario][0.3]

    local_res = [
        scenario,
    ]
    for plugin in ref_models:

        model = imputers.get(plugin)

        imputed = model.fit_transform(x_miss.copy())

        loss = RMSE(imputed.values, x.values, mask.values)

        local_res.append(loss)

    results.append(local_res)

headers = ["Scenario"] + ref_models


display(HTML(tabulate.tabulate(results, headers=headers, tablefmt="html")))

Scenario,mean,sklearn_missforest,gain,EM,sklearn_ice,softimpute,sinkhorn,miracle,miwae
MAR,0.285786,0.263908,0.304159,2.14469e+50,0.248418,0.292023,0.315553,4.13889,0.339364
MCAR,0.240563,0.218943,0.240414,2.25217,0.206561,0.25573,0.262153,4.30976,0.291763
MNAR,0.247079,0.22864,0.236591,0.231322,0.231361,0.282093,0.254457,4.30674,0.293589
