In [28]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, train_test_split

from missingness import MissingnessParams, apply_missingness
from datasets import get_dataset

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
def generate_params(X, n_trials=10, random_state=None):
    """Yield a grid of ``MissingnessParams`` built around randomly drawn columns.

    Every trial draws one target column, two condition columns from the
    remaining columns, and an integer seed in ``[0, 1000)``. For that trial and
    each missing rate in 0.1, 0.2, ..., 0.9 the generator yields one ``'MAR'``
    configuration per strategy followed by one ``'MCAR'`` configuration, so the
    total number of items is ``n_trials * 9 * 5``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; only its column labels are used here.
    n_trials : int, default 10
        Number of independent (target, conditions, seed) draws.
    random_state : int or None, default None
        Seed for the draws. ``None`` keeps using NumPy's global legacy
        generator (so a prior ``np.random.seed(...)`` is still honoured);
        an integer makes the sequence of yielded configurations repeatable.

    Yields
    ------
    MissingnessParams
        One configuration per (trial, missing rate, strategy) combination.
    """
    strategies = ['basic', 'double_threshold', 'range_condition', 'nonlinear']
    # Round so the rates are exactly 0.1 ... 0.9 instead of values such as
    # 0.30000000000000004, which would fragment any later group-by on the rate.
    missing_rates = [round(0.1 * i, 1) for i in range(1, 10)]
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for _ in range(n_trials):
        target = rng.choice(X.columns)
        candidates = X.drop(columns=[target]).columns
        # Draw two *distinct* condition columns whenever that is possible;
        # sampling with replacement could hand 'double_threshold' the same
        # column twice. Fall back to replacement only if a single candidate
        # exists, which preserves the previous behaviour for tiny frames.
        cond1, cond2 = rng.choice(candidates, 2, replace=len(candidates) < 2)
        seed = rng.randint(0, 1000)
        for missing_rate in missing_rates:
            for strategy in strategies:
                # Only 'double_threshold' receives both condition columns.
                condition_feature = cond1 if strategy != 'double_threshold' else [cond1, cond2]
                yield MissingnessParams(
                    mechanism='MAR',
                    strategy=strategy,
                    random_state=seed,
                    target_feature=target,
                    missing_rate=missing_rate,
                    condition_feature=condition_feature
                )
            # One MCAR configuration per (trial, rate) for comparison.
            yield MissingnessParams(
                mechanism='MCAR',
                strategy='none',
                random_state=seed,
                target_feature=target,
                missing_rate=missing_rate,
                condition_feature=None
            )

def run_exp(ds_name):
    """Benchmark several imputation strategies on one dataset.

    For every configuration produced by ``generate_params`` the features are
    corrupted with ``apply_missingness``, repaired with each imputer (or by
    dropping incomplete rows), split 80/20, and scored with a
    ``RandomForestRegressor``. The collected rows are written to
    ``"{ds_name}_imputation_results.csv"`` and returned.

    Parameters
    ----------
    ds_name : str
        Dataset identifier passed to ``get_dataset``.

    Returns
    -------
    pandas.DataFrame
        One row per successful (configuration, imputer) pair. ``'acc'`` holds
        the value returned by ``RandomForestRegressor.score`` (R^2 for a
        regressor, despite the column name), ``'impute'`` the imputer label,
        and the remaining columns are the attributes of the configuration.
    """
    n_trials = 10
    df, label = get_dataset(ds_name)
    X, y = df.drop(columns=[label]), df[label]
    imputators = {
        'drop rows': None,
        'simple w/ mean': SimpleImputer(strategy='mean'),
        'simple w/ zero': SimpleImputer(strategy='constant', fill_value=0),
        'knn': KNNImputer(),
        'iterative w/ br': IterativeImputer(estimator=BayesianRidge(), sample_posterior=True),
        'iterative w/ knn': IterativeImputer(estimator=KNeighborsRegressor()),
        'iterative w/ lr': IterativeImputer(estimator=LinearRegression()),
    }

    records = []
    # Per trial: 9 missing rates x (4 MAR strategies + 1 MCAR) configurations.
    total = n_trials * 9 * 5
    for params in tqdm(generate_params(X, n_trials), total=total):
        seed = params.random_state
        try:
            X_missing = apply_missingness(X, params)
        except Exception as exc:
            # Best effort: report which configuration failed and move on.
            print(f"[{ds_name}] apply_missingness failed for {params}: {exc!r}")
            continue

        for impute_name, imputer in tqdm(imputators.items(), leave=False):
            # Guard each imputer separately so one failure (e.g. too few rows
            # left after dropping) does not discard the remaining imputers.
            try:
                if imputer is None:
                    X_imputed = X_missing.dropna(axis=0)
                    y_imputed = y.loc[X_imputed.index]
                else:
                    # Seed stochastic imputers with the configuration's seed so
                    # a rerun of the same configuration gives the same values.
                    if 'random_state' in imputer.get_params(deep=False):
                        imputer.set_params(random_state=seed)
                    # NOTE(review): the imputer is fit on all rows before the
                    # split, so test rows influence the imputed values —
                    # confirm this transductive setup is intended.
                    X_imputed = imputer.fit_transform(X_missing)
                    y_imputed = y
                X_train, X_test, y_train, y_test = train_test_split(
                    X_imputed, y_imputed, test_size=0.2, random_state=seed
                )
                model = RandomForestRegressor(random_state=seed)
                acc = model.fit(X_train, y_train).score(X_test, y_test)
            except Exception as exc:
                print(f"[{ds_name}] '{impute_name}' failed for {params}: {exc!r}")
                continue
            records.append({'acc': acc,
                            'impute': impute_name,
                            **params.__dict__})

    results = pd.DataFrame(records)
    results.to_csv(f"{ds_name}_imputation_results.csv", index=False)
    return results

In [30]:
# Run the imputation benchmark on the 'wine' dataset; the trailing bare
# expression renders the returned table as the cell output.
results = run_exp(ds_name='wine')
results



  0%|          | 0/450 [00:00<?, ?it/s]

0




drop rows




simple w/ mean




simple w/ zero




knn




iterative w/ br




iterative w/ knn




iterative w/ lr


  0%|          | 0/450 [01:18<?, ?it/s]


Unnamed: 0,acc,impute,mechanism,strategy,random_state,target_feature,missing_rate,condition_feature
0,0.471354,drop rows,MAR,basic,438,citric_acid,0.1,sulphates
1,0.454623,simple w/ mean,MAR,basic,438,citric_acid,0.1,sulphates
2,0.451704,simple w/ zero,MAR,basic,438,citric_acid,0.1,sulphates
3,0.459222,knn,MAR,basic,438,citric_acid,0.1,sulphates
4,0.449431,iterative w/ br,MAR,basic,438,citric_acid,0.1,sulphates
5,0.462843,iterative w/ knn,MAR,basic,438,citric_acid,0.1,sulphates
6,0.460953,iterative w/ lr,MAR,basic,438,citric_acid,0.1,sulphates


In [18]:
# Scratch check: rebuild the same imputer registry that run_exp defines and
# print its labels in insertion order.
imputators = dict([
    ('drop rows', None),
    ('simple w/ mean', SimpleImputer(strategy='mean')),
    ('simple w/ zero', SimpleImputer(strategy='constant', fill_value=0)),
    ('knn', KNNImputer()),
    ('iterative w/ br', IterativeImputer(estimator=BayesianRidge(), sample_posterior=True)),
    ('iterative w/ knn', IterativeImputer(estimator=KNeighborsRegressor())),
    ('iterative w/ lr', IterativeImputer(estimator=LinearRegression())),
])
for im_name, im in imputators.items():
    print(im_name)

drop rows
simple w/ mean
simple w/ zero
knn
iterative w/ br
iterative w/ knn
iterative w/ lr
