In [1]:
import copy
import os
import time
import random
import sys
import warnings

import numpy as np
import pandas as pd
from typing import Any


from hyperimpute.plugins.imputers import Imputers
from hyperimpute.utils.distributions import enable_reproducible_results
from hyperimpute.utils.benchmarks import compare_models
import hyperimpute.logger as log


from IPython.display import HTML, display
import tabulate

import json

# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings("ignore")
# NOTE(review): presumably fixes the random seeds used by hyperimpute - confirm in the library docs.
enable_reproducible_results()

# Plugin loader; `imputers.get(...)` is used below to build the evaluated model.
imputers = Imputers()
# Send hyperimpute log records of level INFO and above to stderr.
log.add(sink=sys.stderr, level="INFO")

In [2]:
from pathlib import Path

# Experiment identifier; save_results() writes its JSON files into a directory with this name.
experiment = "experiments_01_hyperimpute_with_naive_search"


def get_imputer():
    """Build the imputer under evaluation.

    Looks up the ``"hyperimpute"`` plugin through the notebook-level
    ``imputers`` loader and requests the ``"simple"`` optimizer.
    """
    plugin_name = "hyperimpute"
    return imputers.get(plugin_name, optimizer="simple")


def save_results(fname, results):
    """Serialize ``results`` as JSON into ``<experiment>/<fname>``.

    The experiment directory (named by the notebook-level ``experiment``
    variable) is created on demand, parents included.
    """
    target_dir = Path(experiment)
    target_dir.mkdir(parents=True, exist_ok=True)

    with (target_dir / fname).open("w") as handle:
        json.dump(results, handle)


def evaluate_dataset_repeated(
    name,
    X_raw,
    y,
    ref_methods=None,
    scenarios=None,
    miss_pct=None,
    n_iter=10,
):
    """Benchmark the evaluated imputer against reference methods and save the result.

    Runs ``compare_models`` with the imputer returned by ``get_imputer()`` and
    writes the returned results to ``<experiment>/<name>`` via ``save_results``.

    Args:
        name: Dataset identifier; also used as the output file name.
        X_raw: Feature table handed to ``compare_models`` as ``X_raw``.
        y: Accepted for call-site symmetry but not used by this function.
        ref_methods: Names of the reference imputers. Defaults to mean,
            sklearn_ice, sklearn_missforest, softimpute, gain, miwae, sinkhorn.
        scenarios: Missingness scenarios. Defaults to MAR, MCAR and MNAR.
        miss_pct: Missingness ratios. Defaults to 0.1, 0.3, 0.5 and 0.7.
        n_iter: Value forwarded to ``compare_models`` as ``n_iter``.

    Returns:
        None. The results are only persisted to disk.
    """
    # Defaults are materialized per call: list defaults in the signature would be
    # shared between calls and could be mutated by the callee.
    if ref_methods is None:
        ref_methods = [
            "mean",
            "sklearn_ice",
            "sklearn_missforest",
            "softimpute",
            "gain",
            "miwae",
            "sinkhorn",
        ]
    if scenarios is None:
        scenarios = ["MAR", "MCAR", "MNAR"]
    if miss_pct is None:
        miss_pct = [0.1, 0.3, 0.5, 0.7]

    results = compare_models(
        name=name,
        evaluated_model=get_imputer(),
        X_raw=X_raw,
        # Pass copies so caller-owned sequences (lists or tuples) are never mutated.
        ref_methods=list(ref_methods),
        scenarios=list(scenarios),
        miss_pct=list(miss_pct),
        n_iter=n_iter,
    )

    save_results(name, results)

## Sanity check in debug mode

In [3]:
# Small smoke test: one scenario, one missingness ratio, two baselines, three iterations.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat",
    header=None,
    # A literal tab: the previous "\\t" was a two-character regex that forced
    # pandas onto its slower Python parsing engine.
    sep="\t",
)

# Column 5 is split off as the target; the remaining columns are the features.
y = df[5]
X_raw = df.drop(columns=[5])

evaluate_dataset_repeated(
    "airfoil_debug",
    X_raw,
    y,
    scenarios=["MAR"],
    ref_methods=["mean", "ice"],
    n_iter=3,
    miss_pct=[0.3],
)

[2024-03-11T07:08:05.864552+0800][21032][INFO] Iteration imputation: select_model_by_column: True, select_model_by_iteration: True
[2024-03-11T07:08:06.079721+0800][21032][INFO]   > HyperImpute using inner optimization
[2024-03-11T07:08:06.081720+0800][21032][INFO]   > Imputation iter 0
[2024-03-11T07:08:08.676197+0800][21032][INFO]      >>> Column 1 <-- score 0.9996566481387175 <-- Model xgboost_regressor
[2024-03-11T07:08:09.761332+0800][21032][INFO]      >>> Column 0 <-- score 0.14275775132160595 <-- Model random_forest_regressor
[2024-03-11T07:08:11.225756+0800][21032][INFO]      >>> Column 3 <-- score 0.8816665681674609 <-- Model xgboost_regressor
[2024-03-11T07:08:11.438641+0800][21032][INFO]   > Imputation iter 1
[2024-03-11T07:08:12.176796+0800][21032][INFO]      >>> Column 0 <-- score 0.15578369037350231 <-- Model random_forest_regressor
[2024-03-11T07:08:13.493914+0800][21032][INFO]      >>> Column 1 <-- score 0.9996752680283194 <-- Model xgboost_regressor
[2024-03-11T07:08:1

RMSE score


Unnamed: 0,Scenario,"miss_pct [0, 1]",Evaluated: hyperimpute,mean,ice
0,MAR,0.3,0.1469 +/- 0.023,0.3238 +/- 0.03,0.2867 +/- 0.021




Wasserstein score


Unnamed: 0,Scenario,"miss_pct [0, 1]",Evaluated: hyperimpute,mean,ice
0,MAR,0.3,0.0362 +/- 0.0043,0.2482 +/- 0.0344,0.1389 +/- 0.0119


# Datasets

In [4]:
import hyperimpute.logger as log

# NOTE(review): presumably detaches the stderr sink added in the first cell so the
# full benchmark runs below stay quiet - confirm against hyperimpute.logger.
log.remove()

| Dataset     | Length | Features |
|-------------|--------|----------|
| airfoil     | 1503   | 6        |
| blood       | 748    | 5        |
| bc          | 569    | 30       |
| california  | 20640  | 8        |
| climate     | 540    | 21       |
| compression | 1030   | 9        |
| slump       | 103    | 11       |
| sonar       | 208    | 61       |
| diabetes    | 442    | 10       |
| wine_red    | 1599   | 12       |
| wine_white  | 4898   | 12       |
| yeast       | 1484   | 10       |
| iris        | 150    | 4        |
| libras      | 360    | 91       |
| parkinsons  | 195    | 24       |
| yacht       | 308    | 7        |
| ionosphere  | 351    | 35       |
| letter      | 20000  | 17       |
| spam        | 4600   | 58       |
| credit      | 690    | 16       |

## Dataset: UCI Airfoil Self-Noise Data Set

https://archive.ics.uci.edu/ml/datasets/airfoil+self-noise


In [5]:
# Header-less, tab-separated file; columns are therefore labelled 0..5.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat",
    header=None,
    # A literal tab: the previous "\\t" was a two-character regex that forced
    # pandas onto its slower Python parsing engine.
    sep="\t",
)

df

Unnamed: 0,0,1,2,3,4,5
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461
...,...,...,...,...,...,...
1498,2500,15.6,0.1016,39.6,0.052849,110.264
1499,3150,15.6,0.1016,39.6,0.052849,109.254
1500,4000,15.6,0.1016,39.6,0.052849,106.604
1501,5000,15.6,0.1016,39.6,0.052849,106.224


In [6]:
# The final column is split off as the target; everything else is the feature table.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

evaluate_dataset_repeated("airfoil", X_raw, y)

## Dataset: Breast Cancer Wisconsin (Diagnostic)

https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

In [None]:
from sklearn.datasets import load_breast_cancer

# as_frame=True + return_X_y=True: features and target are returned as a pair of pandas objects.
X_raw, y = load_breast_cancer(as_frame=True, return_X_y=True)

X_raw

In [None]:
evaluate_dataset_repeated("bc", X_raw, y)

## Concrete Compressive Strength Data Set
https://archive.ics.uci.edu/ml/datasets/concrete+compressive+strength

In [None]:
# Legacy .xls workbook. NOTE(review): the commented-out `#!pip install xlrd` in the
# Plots section suggests the xlrd package is needed to read it - confirm.
df = pd.read_excel(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls"
)

df

In [None]:
# The final column is split off as the target; everything else is the feature table.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

evaluate_dataset_repeated("compression", X_raw, y)

## Wine-Red dataset

In [None]:
# Wine Quality Data Set (red wines); the file is semicolon-delimited with a header row.

df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
    sep=";",
)

df

In [None]:
last_col = df.columns[-1]

# Re-index the distinct target values to consecutive integers 0..k-1 (in sorted order).
y = df[last_col]
mapped_labels = sorted(y.unique())
mapping = {label: idx for idx, label in enumerate(mapped_labels)}
y = y.map(mapping)

X_raw = df.drop(columns=[last_col])

evaluate_dataset_repeated("wine_red", X_raw, y)

## Wine-White dataset

In [None]:
# Wine Quality Data Set (white wines); the file is semicolon-delimited with a header row.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
    sep=";",
)

df

In [None]:
last_col = df.columns[-1]

# Re-index the distinct target values to consecutive integers 0..k-1 (in sorted order).
y = df[last_col]
mapped_labels = sorted(y.unique())
mapping = {label: idx for idx, label in enumerate(mapped_labels)}
y = y.map(mapping)

X_raw = df.drop(columns=[last_col])

evaluate_dataset_repeated("wine_white", X_raw, y)

## Yeast Data Set


In [None]:
from sklearn.preprocessing import LabelEncoder

# Whitespace-delimited, header-less file. The separator is a raw string because
# "\s" inside a normal string literal is an invalid escape sequence.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data",
    sep=r"\s+",
    header=None,
)

# Column 0 is discarded. NOTE(review): presumably a per-row identifier - confirm.
df = df.drop(columns=[0])

# Column 9 is label-encoded to integer codes.
for col in [9]:
    df[col] = LabelEncoder().fit_transform(df[col])

df

In [None]:
# The final column is split off as the target; everything else is the feature table.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

evaluate_dataset_repeated("yeast", X_raw, y)

## Diabetes


In [None]:
from sklearn.datasets import load_diabetes

# Features are bound to `X` here (not `X_raw` as in the other dataset cells);
# the evaluation cell below passes `X` accordingly.
X, y = load_diabetes(as_frame=True, return_X_y=True)

X

In [None]:
evaluate_dataset_repeated("diabetes", X, y)

## Iris


In [None]:
from sklearn.datasets import load_iris

# Features are bound to `X` here (not `X_raw` as in the other dataset cells);
# the evaluation cell below passes `X` accordingly.
X, y = load_iris(as_frame=True, return_X_y=True)

X

In [None]:
evaluate_dataset_repeated("iris", X, y)

## Ionosphere

In [None]:
# Header-less CSV; LabelEncoder comes from an import in an earlier cell.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data",
    sep=",",
    header=None,
)

# Replace every object-typed column by integer codes; other columns are untouched.
for col in df.columns:
    if df[col].dtype != "object":
        continue
    df[col] = LabelEncoder().fit_transform(df[col])

# The final column is split off as the target.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

X_raw

In [None]:
evaluate_dataset_repeated("ionosphere", X_raw, y)

## Libras

In [None]:
# Header-less CSV, so the columns are labelled by position.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/libras/movement_libras.data",
    sep=",",
    header=None,
)

# The final column is split off as the target.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

X_raw

In [None]:
evaluate_dataset_repeated("libras", X_raw, y)

## Parkinsons

In [None]:
from sklearn.preprocessing import LabelEncoder

# CSV with a header row.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data",
    sep=",",
)

# Replace every object-typed column by integer codes; other columns are untouched.
for col in df.columns:
    if df[col].dtype != "object":
        continue
    df[col] = LabelEncoder().fit_transform(df[col])

# The "name" column is removed after encoding.
df = df.drop(columns=["name"])

# NOTE(review): the final column is treated as the target. In the UCI file the
# label column appears to be "status", which may not be the last column - confirm.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

X_raw

In [None]:
evaluate_dataset_repeated("parkinsons", X_raw, y)

## Spam

In [None]:
from sklearn.preprocessing import LabelEncoder

# spambase.data has no header row. Without header=None the first record was
# consumed as column names (one sample lost), unlike the other header-less UCI
# loads in this notebook (ionosphere, libras, letter, credit).
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data",
    header=None,
)

# Replace every object-typed column by integer codes; other columns are untouched.
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = LabelEncoder().fit_transform(df[col])

# The final column is split off as the target.
last_col = df.columns[-1]
y = df[last_col]
X_raw = df.drop(columns=[last_col])

X_raw

In [None]:
evaluate_dataset_repeated("spam", X_raw, y)

## Letter dataset

In [None]:
from sklearn.preprocessing import LabelEncoder

# Header-less CSV, so the columns are labelled by position.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data",
    header=None,
)

# Replace every object-typed column by integer codes; other columns are untouched.
for col in df.columns:
    if df[col].dtype != "object":
        continue
    df[col] = LabelEncoder().fit_transform(df[col])

# NOTE(review): the final column is treated as the target. In the UCI file the
# letter label appears to be the first column - confirm this split is intended.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

X_raw

In [None]:
evaluate_dataset_repeated("letter", X_raw, y)

## Credit dataset

In [None]:
from sklearn.preprocessing import LabelEncoder

# Header-less CSV, so the columns are labelled by position.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data",
    header=None,
)

# Replace every object-typed column by integer codes; other columns are untouched.
for col in df.columns:
    if df[col].dtype != "object":
        continue
    df[col] = LabelEncoder().fit_transform(df[col])

# The final column is split off as the target.
last_col = df.columns[-1]
X_raw = df.drop(columns=[last_col])
y = df[last_col]

X_raw

In [None]:
evaluate_dataset_repeated("credit", X_raw, y)

# Plots

In [None]:
#!pip install xlrd
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_iris
from scipy import signal


def smooth_line(src: list) -> np.ndarray:
    """Smooth a 1-D series with a Savitzky-Golay filter (window 3, linear fit).

    Args:
        src: Sequence of numbers; it must contain at least three values because
            the filter window is three samples wide.

    Returns:
        A NumPy array of the same length as ``src`` (not a list, which the
        previous annotation claimed).
    """
    return signal.savgol_filter(src, window_length=3, polyorder=1)


# Feature tables of the scikit-learn bundled datasets; targets are discarded
# except for iris, whose target is kept in `y_raw_iris`.
X_raw_diab, _ = load_diabetes(as_frame=True, return_X_y=True)

X_raw_breast_cancer, _ = load_breast_cancer(as_frame=True, return_X_y=True)
X_raw_california, _ = fetch_california_housing(as_frame=True, return_X_y=True)
X_raw_iris, y_raw_iris = load_iris(as_frame=True, return_X_y=True)

# The climate file is parsed with NumPy, skipping its first line, and then
# wrapped in a DataFrame with positional column labels.
climate_model_samples = np.loadtxt(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat",
    skiprows=1,
)
climate_model_df = pd.DataFrame(climate_model_samples)

# Raw (un-split) tables for every dataset, keyed by the short dataset name.
raw_datasets = {
    "airfoil": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat",
        header=None,
        # Literal tab; the previous "\\t" was a regex that forced the slow Python parser.
        sep="\t",
    ),
    "blood": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data"
    ),
    "bc": X_raw_breast_cancer,
    "california": X_raw_california,
    "climate": climate_model_df,
    "compression": pd.read_excel(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls"
    ),
    "slump": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/slump/slump_test.data"
    ),
    "sonar": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data",
        header=None,
    ),
    "diabetes": X_raw_diab,
    "wine_red": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
        sep=";",
    ),
    "wine_white": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
        sep=";",
    ),
    "iris": X_raw_iris,
    "libras": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/libras/movement_libras.data",
        sep=",",
        header=None,
    ),
    "parkinsons": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data",
        sep=",",
    ),
    "yacht": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/00243/yacht_hydrodynamics.data",
        # Raw string: "\s" in a normal literal is an invalid escape sequence.
        sep=r"\s+",
        header=None,
    ),
    "ionosphere": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data",
        sep=",",
        header=None,
    ),
    "letter": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data",
        header=None,
    ),
    "spam": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data",
        # The file has no header row; without this the first record became the column names.
        header=None,
    ),
    "credit": pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data",
        header=None,
    ),
}

In [None]:
# Parse results

import pandas as pd
import numpy as np
from pathlib import Path
import json
import pandas as pd

# Directory written by save_results(); every entry in it is a candidate result file.
experiment = "experiments_01_hyperimpute_with_naive_search"
results = Path(experiment).glob("*")

# Column renames applied to the per-dataset tables before aggregation.
remap_models = {
    "Our method": "hyperimpute",
    "sklearn_missforest": "missforest",
    "sklearn_ice": "ice",
}
# NOTE(review): not referenced anywhere else in the visible notebook.
norm_cols = [
    "Our method",
    "mean",
    "sklearn_missforest",
    "sklearn_ice",
    "gain",
    "sinkhorn",
    "softimpute",
]

# Metric labels used as dictionary keys and as y-axis labels in the plots.
rmse_key = "Mean RMSE"
wass_key = "Mean Wasserstein distance"
# NOTE(review): not referenced anywhere else in the visible notebook.
pred_key = "Mean downstream prediction error"

# dataset name -> {metric label: (mean DataFrame, std DataFrame)}
data = {}

# Only result files whose name appears in this list are parsed.
df_names = [
    "airfoil",
    "bc",
    "compression",
    "diabetes",
    "ionosphere",
    "iris",
    "libras",
    "letter",
    "credit",
    "spam",
    "parkinsons",
    "wine_red",
    "wine_white",
]


def generate_mean_std(data, headers):
    """Split a table of ``[mean, std]`` cells into a mean frame and a std frame.

    Each row of ``data`` is a sequence of cells. A cell that is a list
    contributes its first element to the mean table and its second element to
    the std table; any other cell (for example the scenario name) is copied
    unchanged into both tables.

    Args:
        data: Iterable of rows.
        headers: Column labels applied to both resulting frames.

    Returns:
        Tuple ``(mean_df, std_df)`` of pandas DataFrames.
    """

    def _split(cell):
        # (value for the mean table, value for the std table)
        if isinstance(cell, list):
            return cell[0], cell[1]
        return cell, cell

    mean_rows = []
    std_rows = []
    for row in data:
        pairs = [_split(cell) for cell in row]
        mean_rows.append([pair[0] for pair in pairs])
        std_rows.append([pair[1] for pair in pairs])

    mean_frame = pd.DataFrame(mean_rows, columns=headers)
    std_frame = pd.DataFrame(std_rows, columns=headers)
    return mean_frame, std_frame


for res in results:
    # Skip debug runs and anything that is not one of the selected datasets.
    if "debug" in res.name or res.name not in df_names:
        continue

    with open(res) as f:
        local_data = json.load(f)

    # `headers` is read again after the loop, so the last parsed file determines it.
    headers = local_data["headers"]

    rmse_mean, rmse_std = generate_mean_std(local_data["rmse"], headers)
    distr_mean, distr_std = generate_mean_std(local_data["wasserstein"], headers)

    data[res.name] = {
        rmse_key: (rmse_mean, rmse_std),
        wass_key: (distr_mean, distr_std),
    }


# Fail early with a clear message: `headers` only exists if at least one result
# file was parsed, so an empty `data` used to surface as a confusing NameError.
if not data:
    raise RuntimeError(
        f"No benchmark results were parsed from '{experiment}'; run the experiments first."
    )

# Values are clipped to these ceilings before being stored for plotting.
mean_cap = 0.5
std_cap = 0.01

# results[scenario][miss][metric][model] -> {"mean": [...], "std": [...]},
# with one list entry per dataset, in the order of `df_names`.
results = {}
models_cnt = len(headers) - 2  # every header except the scenario and miss-ratio columns
df_names = sorted(data.keys())


def _select_cell(frame, scenario, miss):
    """Return the model columns of ``frame`` for one (scenario, miss) pair."""
    rows = frame[frame["Scenario"] == scenario].drop(columns=["Scenario"])
    rows = rows[rows["miss_pct [0, 1]"] == miss].drop(columns=["miss_pct [0, 1]"])
    return rows.rename(columns=remap_models)


for dataset in df_names:
    for metric in data[dataset]:
        df, df_std = data[dataset][metric]

        # Prediction norm: replace non-positive scores with a small epsilon.
        # Done through public API and an explicit assignment instead of mutating
        # the frame returned by the private `_get_numeric_data()`, which only
        # worked when that frame shared memory with `df`.
        numeric_cols = df.select_dtypes(include="number").columns
        df[numeric_cols] = df[numeric_cols].mask(df[numeric_cols] <= 0, 1e-6)

        for scenario in ["MAR", "MCAR", "MNAR"]:
            scenario_bucket = results.setdefault(scenario, {})

            for miss in [0.1, 0.3, 0.5, 0.7]:
                # The (possibly empty) bucket is created even when no rows match;
                # the plotting cells test for metric membership inside it.
                miss_bucket = scenario_bucket.setdefault(miss, {})

                local_df = _select_cell(df, scenario, miss)
                if len(local_df) == 0:
                    continue

                local_df_std = _select_cell(df_std, scenario, miss)

                metric_bucket = miss_bucket.setdefault(metric, {})
                for col in local_df.columns:
                    stats = metric_bucket.setdefault(col, {"mean": [], "std": []})
                    stats["mean"].append(min(local_df[col].values[0], mean_cap))
                    stats["std"].append(min(local_df_std[col].values[0], std_cap))

## General overview

In [None]:
# Directory that receives the PNG files saved by the plotting cells below.
output_dir = Path(f"diagrams_{experiment}")
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
import matplotlib.pyplot as plt

# Base font size for tick labels, legends and axis labels.
fontsize = 14
# Horizontal stride between consecutive dataset groups: one slot per model plus one spare slot.
df_graph_len = models_cnt + 1


def generate_plot_for_ax(ax, scenario, miss, metric):
    """Draw grouped bars for one metric: one group per dataset, one bar per model.

    Reads the notebook-level ``results``, ``data``, ``df_names``,
    ``models_cnt``, ``df_graph_len`` and ``fontsize``.

    Args:
        ax: Matplotlib axes to draw on.
        scenario: Key into ``results`` (e.g. ``"MAR"``).
        miss: Missingness-ratio key into ``results[scenario]``.
        metric: Metric label; also used as the y-axis label.

    Returns:
        The same ``ax``.
    """
    n_datasets = len(data)
    bar_width = 1

    max_val = 0
    slot = 0
    for model, stats in results[scenario][miss][metric].items():
        # The model's bar sits at offset `slot` inside every dataset group.
        pos = [slot + df_graph_len * i * bar_width for i in range(n_datasets)]

        if len(pos) == 0:
            continue

        mod_mean = stats["mean"]
        mod_std = stats["std"]
        max_val = max(max_val, max(mod_mean))

        ax.bar(
            pos,
            mod_mean,
            yerr=mod_std,
            width=bar_width,
            label=str(model),
            edgecolor="k",
        )
        slot += bar_width

    ax.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, 1),
        ncol=max(1, models_cnt),
        prop={"size": fontsize},
    )

    ax.set_xticks(
        [df_graph_len * r + models_cnt // 2 for r in range(n_datasets)],
        df_names,
        rotation=30,
        fontsize=fontsize,
    )
    # Text keyword arguments of set_yticks only apply when labels are passed
    # (newer matplotlib rejects them otherwise), so the tick-label size is set
    # through tick_params after the ticks are in place.
    ax.set_yticks(np.linspace(0, max_val + 0.1, num=5))
    ax.tick_params(axis="y", labelsize=fontsize)
    ax.set_ylabel(metric, fontsize=fontsize + 4)

    return ax


def generate_plot(scenario, miss):
    """Render and save the overview figure for one scenario / missingness ratio.

    One subplot per metric found in ``results[scenario][miss]``; the figure is
    written to ``output_dir / general_overview_<scenario>_<miss>.png`` and shown.
    Nothing is drawn when no metric was recorded for this combination.
    """
    # The seaborn style sheets were renamed in newer matplotlib releases;
    # prefer the new name and fall back to the legacy one.
    style = "seaborn-v0_8-whitegrid"
    if style not in plt.style.available:
        style = "seaborn-whitegrid"
    plt.style.use(style)

    metrics = list(results[scenario][miss].keys())
    if not metrics:
        return

    # squeeze=False keeps `axs` an array even when there is a single metric.
    fig, axs = plt.subplots(len(metrics), figsize=(20, 8), squeeze=False)
    axes = axs.ravel()

    for ax, metric in zip(axes, metrics):
        generate_plot_for_ax(ax, scenario, miss, metric)

    # The caption goes on the bottom subplot.
    axes[-1].set_xlabel(
        f"{scenario} simulation with {miss} missingness", fontsize=fontsize
    )
    fig.subplots_adjust(hspace=0.35)

    fig.savefig(output_dir / f"general_overview_{scenario}_{miss}.png")
    plt.show()
    # Release the figure; this function is called in a loop.
    plt.close(fig)


# One overview figure per scenario and missingness ratio.
# NOTE(review): 0.7 is aggregated above but not plotted here - confirm this is intended.
for scenario in ["MAR", "MCAR", "MNAR"]:
    for miss in [0.1, 0.3, 0.5]:
        generate_plot(scenario, miss)

## Plot by miss ratio

In [None]:
import numpy as np

# Missingness ratios shown on the x axis of the per-dataset line plots.
x_axis = [0.1, 0.3, 0.5]

# Base font size for labels, ticks and legends in this section.
fontsize = 14


def generate_plot_for_ax(ax, scenario, metric, df_idx):
    """Plot one metric against the missingness ratio for a single dataset.

    One error-bar line is drawn per model listed in
    ``results[scenario][0.1][metric]``. Only the ratios in the notebook-level
    ``x_axis`` are plotted, so every line has matching x and y lengths even
    when ``results`` holds additional ratios.

    Args:
        ax: Matplotlib axes to draw on.
        scenario: Key into ``results`` (e.g. ``"MAR"``).
        metric: Metric label; also used as the y-axis label.
        df_idx: Position of the dataset inside the per-model value lists.

    Returns:
        The same ``ax``.
    """
    for model in results[scenario][0.1][metric]:
        plotted_x = []
        datapoints = []
        datapoints_std = []

        for miss in x_axis:
            per_miss = results[scenario].get(miss, {})
            if metric not in per_miss:
                continue

            model_stats = per_miss[metric][model]
            plotted_x.append(miss)
            datapoints.append(model_stats["mean"][df_idx])
            datapoints_std.append(model_stats["std"][df_idx])

        # smooth_line uses a 3-sample window, so shorter series are plotted as-is.
        line = smooth_line(datapoints) if len(datapoints) >= 3 else datapoints

        ax.errorbar(
            plotted_x,
            line,
            yerr=datapoints_std,
            label=str(model),
            linewidth=2,
            marker="o",
        )

    # Tick-label size is handled by tick_params below; text keyword arguments to
    # set_xticks are only valid together with explicit labels.
    ax.set_xticks(x_axis)
    ax.set_ylabel(metric, fontsize=fontsize)
    ax.tick_params(axis="both", which="major", labelsize=fontsize)

    return ax


def generate_plot(scenario, df_idx, df_name):
    """Render and save the error-vs-missingness figure for one dataset.

    One subplot per metric found in ``results[scenario][0.1]``; the figure is
    written to ``output_dir / error_by_miss_<scenario>_<df_name>.png`` and shown.
    Nothing is drawn when no metric was recorded.

    Args:
        scenario: Key into ``results`` (e.g. ``"MAR"``).
        df_idx: Position of the dataset inside the per-model value lists.
        df_name: Dataset name, used only in the output file name.
    """
    # The seaborn style sheets were renamed in newer matplotlib releases;
    # prefer the new name and fall back to the legacy one.
    style = "seaborn-v0_8-whitegrid"
    if style not in plt.style.available:
        style = "seaborn-whitegrid"
    plt.style.use(style)

    metrics = list(results[scenario][0.1].keys())
    if not metrics:
        return

    # squeeze=False keeps `axs` an array even when there is a single metric.
    fig, axs = plt.subplots(len(metrics), figsize=(10, 11), squeeze=False)
    axes = axs.ravel()

    for ax, metric in zip(axes, metrics):
        generate_plot_for_ax(ax, scenario, metric, df_idx)

    axes[0].legend(
        loc="upper left",
        bbox_to_anchor=(0.15, 1.27),
        # At least one legend column, even with fewer than three models.
        ncol=max(1, models_cnt // 3),
        prop={"size": fontsize},
    )
    fig.suptitle(f"{scenario} simulation", fontsize=fontsize)
    fig.savefig(output_dir / f"error_by_miss_{scenario}_{df_name}.png")

    plt.show()
    # Release the figure; this function is called in a loop.
    plt.close(fig)


# Datasets for which a per-missingness figure is produced.
plot_df = ["airfoil", "compression", "letter", "wine_white", "wine_red"]
for scenario in ["MAR", "MCAR", "MNAR"]:
    # `idx` is the dataset's position in `df_names`; it is passed on as the
    # index into the per-model value lists.
    for idx, df_name in enumerate(df_names):
        if df_name not in plot_df:
            continue
        print("dataset ", df_name)
        generate_plot(scenario, idx, df_name)