# Import libraries

In [1]:
!pip install fancyimpute -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import check_random_state
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.metrics import r2_score

from time import perf_counter

In [3]:
# Set print format
np.set_printoptions(
    precision=4,
    suppress=True
)

# seed for reproducibility
seed = 0

# Generate Dataset

In [4]:
from math import sqrt

def generate_dataset(
        n : int = 10000,
        d : int = 10,
        seed : int = 0,
        missing_rate : float = 0.5,
        test_size: float = 0.1
    ):
    """Generate synthesis dataset"""
    # Set random seed for reproducible
    rng = check_random_state(seed)

    # Parameters of Gaussian data
    B = rng.randn(d, d//2)
    cov = B.dot(B.T) + np.diag(rng.uniform(low=0.01, high=0.1, size=d))

    mean = rng.randn(d)

    # Generate Gaussian data
    X = rng.multivariate_normal(mean=mean, cov=cov, size=n, check_valid='raise')

    # Generate y
    beta = np.repeat(1., d + 1)
    var = beta[1:].dot(cov).dot(beta[1:])
    beta[1:] *= 1/sqrt(var)
    y = X.dot(beta[1:]) + beta[0]

    snr = 10
    noise = rng.normal(loc=0, scale=sqrt(np.var(y)/snr), size=n)
    y += noise

    # Add missing values, MCAR
    missing_rate = 0.5
    ber = rng.rand(n, d)
    mask = ber < missing_rate
    np.putmask(X, mask, np.nan)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=rng
    )
    return X_train, X_test, y_train, y_test

In [5]:
# Dataset with 10000 samples, 10 features and has missing rate 50% (MCAR)
n = 10000  # number of samples
d = 10  # number of features
X_train, X_test, y_train, y_test = generate_dataset(n=n, d=d, missing_rate=0.5)

In [6]:
def simulate(imputer, lr) -> None:
    start = perf_counter()
    X_train_imputed = imputer.fit_transform(X_train)
    lr.fit(X_train_imputed, y_train)

    # Training set evaluation
    y_train_pred = lr.predict(X_train_imputed)
    r2_train = r2_score(y_train, y_train_pred)

    # Test set evaluation
    X_test_imputed = imputer.fit_transform(X_test)
    y_test_pred = lr.predict(X_test_imputed)
    r2_test = r2_score(y_test, y_test_pred)

    print(f' - R2 score on training set: {r2_train:.4f}')
    print(f' - R2 score on test set: {r2_test:.4f}')
    print(f' - Took {(perf_counter() - start):.4f}s')

# KNN Imputation

In [7]:
n_neighbors = [5, 10, 100, 1000, 5000, n - 100, n]
for n_neighbor in n_neighbors:
    print(f'KNN-{n_neighbor} imputation + LR: ')
    simulate(
        KNNImputer(n_neighbors=n_neighbor),
        LinearRegression()
    )

KNN-5 imputation + LR: 
 - R2 score on training set: 0.6224
 - R2 score on test set: 0.6191
 - Took 16.8913s
KNN-10 imputation + LR: 
 - R2 score on training set: 0.6529
 - R2 score on test set: 0.6587
 - Took 18.6717s
KNN-100 imputation + LR: 
 - R2 score on training set: 0.6929
 - R2 score on test set: 0.7018
 - Took 12.6683s
KNN-1000 imputation + LR: 
 - R2 score on training set: 0.7091
 - R2 score on test set: 0.6372
 - Took 15.8950s
KNN-5000 imputation + LR: 
 - R2 score on training set: 0.6573
 - R2 score on test set: 0.6472
 - Took 20.6401s
KNN-9900 imputation + LR: 
 - R2 score on training set: 0.6573
 - R2 score on test set: 0.6472
 - Took 24.5570s
KNN-10000 imputation + LR: 
 - R2 score on training set: 0.6573
 - R2 score on test set: 0.6472
 - Took 25.8276s


# Simple Impute

In [8]:
strategies = ['mean', 'median', 'most_frequent', 'constant']
for strategy in strategies:
    print(f'Simple imputation ({strategy}) + LR: ')
    simulate(
        SimpleImputer(strategy=strategy),
        LinearRegression()
    )

Simple imputation (mean) + LR: 
 - R2 score on training set: 0.6574
 - R2 score on test set: 0.6478
 - Took 0.0855s
Simple imputation (median) + LR: 
 - R2 score on training set: 0.6573
 - R2 score on test set: 0.6446
 - Took 0.1279s
Simple imputation (most_frequent) + LR: 
 - R2 score on training set: 0.1509
 - R2 score on test set: 0.1756
 - Took 0.1528s
Simple imputation (constant) + LR: 
 - R2 score on training set: 0.6489
 - R2 score on test set: 0.6345
 - Took 0.0949s


# SoftImpute

In [9]:
import warnings
warnings.filterwarnings('ignore')

from fancyimpute import SoftImpute

start = perf_counter()
regularization_values = [0.1, 1, 10, 100]
for regularization in regularization_values:
    print(f'SoftImpute with lambda={regularization} + LR:')
    simulate(
        SoftImpute(shrinkage_value=regularization, verbose=False),
        LinearRegression()
    )

SoftImpute with lambda=0.1 + LR:
 - R2 score on training set: 0.6535
 - R2 score on test set: 0.6479
 - Took 2.2479s
SoftImpute with lambda=1 + LR:
 - R2 score on training set: 0.6869
 - R2 score on test set: 0.7061
 - Took 2.4577s
SoftImpute with lambda=10 + LR:
 - R2 score on training set: 0.7604
 - R2 score on test set: 0.7524
 - Took 2.3163s
SoftImpute with lambda=100 + LR:
 - R2 score on training set: 0.7229
 - R2 score on test set: 0.5991
 - Took 0.7562s


# MissForest

In [10]:
from sklearn.ensemble import RandomForestRegressor

n_estimators_values = [5, 10, 25, 50]  # Different numbers of trees in the Random Forest
for n_estimators in n_estimators_values:
    print(f'MissForest with {n_estimators} estimators + LR:')
    simulate(
        IterativeImputer(
            estimator=RandomForestRegressor(n_estimators=n_estimators, random_state=seed)
        ),
        LinearRegression()
    )

MissForest with 5 estimators + LR:
 - R2 score on training set: 0.7474
 - R2 score on test set: 0.7130
 - Took 37.2037s
MissForest with 10 estimators + LR:
 - R2 score on training set: 0.7586
 - R2 score on test set: 0.7204
 - Took 58.1140s
MissForest with 25 estimators + LR:
 - R2 score on training set: 0.7660
 - R2 score on test set: 0.7246
 - Took 148.2751s
MissForest with 50 estimators + LR:
 - R2 score on training set: 0.7708
 - R2 score on test set: 0.7442
 - Took 292.8146s


# MICE

In [11]:
simulate(
    IterativeImputer(random_state=0),
    LinearRegression()
)

 - R2 score on training set: 0.7496
 - R2 score on test set: 0.7313
 - Took 4.5647s


# MLP

In [12]:
from sklearn.neural_network import MLPRegressor

start = perf_counter()
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

mlp = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=seed)
mlp.fit(X_train_imputed, y_train)

y_train_pred = mlp.predict(X_train_imputed)
r2_train = r2_score(y_train, y_train_pred)

y_test_pred = mlp.predict(X_test_imputed)
r2_test = r2_score(y_test, y_test_pred)

print(f' - R2 score on training set: {r2_train:.4f}')
print(f' - R2 score on test set: {r2_test:.4f}')
print(f' - Took {(perf_counter() - start):.4f}s')

 - R2 score on training set: 0.8022
 - R2 score on test set: 0.7330
 - Took 14.1862s


# Different samples size, missing rates

In [13]:
ns = [1000, 10000, 50000]
ms = [0.1, 0.3, 0.5, 0.8]
for n in ns:
    print(f'Number of samples: {n = }')
    for m in ms:
        print(f'Missing rate: {m}')
        X_train, X_test, y_train, y_test = generate_dataset(n=n, d=10, missing_rate=0.5)
        print(' - KNN + LR: ')
        simulate(
            KNNImputer(n_neighbors=1000),
            LinearRegression()
        )

        print(' - Simple impute + LR: ')
        simulate(
            SimpleImputer(strategy=strategy),
            LinearRegression()
        )

Number of samples: n = 1000
Missing rate: 0.1
 - KNN + LR: 
 - R2 score on training set: 0.6448
 - R2 score on test set: 0.5743
 - Took 0.2508s
 - Simple impute + LR: 
 - R2 score on training set: 0.6410
 - R2 score on test set: 0.6842
 - Took 0.0065s
Missing rate: 0.3
 - KNN + LR: 
 - R2 score on training set: 0.6448
 - R2 score on test set: 0.5743
 - Took 0.2557s
 - Simple impute + LR: 
 - R2 score on training set: 0.6410
 - R2 score on test set: 0.6842
 - Took 0.0065s
Missing rate: 0.5
 - KNN + LR: 
 - R2 score on training set: 0.6448
 - R2 score on test set: 0.5743
 - Took 0.2422s
 - Simple impute + LR: 
 - R2 score on training set: 0.6410
 - R2 score on test set: 0.6842
 - Took 0.0057s
Missing rate: 0.8
 - KNN + LR: 
 - R2 score on training set: 0.6448
 - R2 score on test set: 0.5743
 - Took 0.2408s
 - Simple impute + LR: 
 - R2 score on training set: 0.6410
 - R2 score on test set: 0.6842
 - Took 0.0076s
Number of samples: n = 10000
Missing rate: 0.1
 - KNN + LR: 
 - R2 score on 