In [1]:
import os
import time
import sys

sys.path.append("src/")
from DIMVImputation import DIMVImputation
from utils import create_randomly_missing

import numpy as np 

In [2]:
def create_randomly_missing(data: np.ndarray, perc_del: float) -> np.ndarray:
    """
    Creates a randomly missing mask for the input data.

    Args:
        data (np.ndarray): The input data.
        perc_del (float): The percentage of missing values to create.

    Returns:
        np.ndarray: An array with the same shape as `data` where missing values are marked as NaN.
    """
    n = data.shape[0]
    # Flatten data into 1 row
    flatten_data = data.reshape(1, -1)
    # Uniform missing mask
    missing_mask = np.random.uniform(0, 1, flatten_data.shape[1]).reshape(1, -1)
    # Mark as missing if value in mask  < perc_del
    missing_data = flatten_data.copy().astype('float')
    missing_data[missing_mask <= perc_del] = np.nan

    return missing_data.reshape(n, -1)
 

In [3]:
def rmse_calc(ori_data, imp_data, missing_mask):
    """
    missing_mask: 1 is missing_data, 0 is observed_data
    """

    nominator = np.sum(
            (missing_mask * ori_data \
                    - missing_mask * imp_data
             ) ** 2
            )
    denominator = np.sum(missing_mask)

    return np.sqrt(nominator / denominator) 

In [4]:
#create a sample data
data = np.random.randint(0, 100, size=(100, 30)).astype('float64')
missing_rate = 0.5
missing_data = create_randomly_missing(data, missing_rate)

#train test split
test_size = .2
split_index = int(len(missing_data) * (1 - test_size))

X_train_ori, X_test_ori = data[:split_index, :], data[split_index:, :]

X_train_miss, X_test_miss = missing_data[:split_index, :], missing_data[
    split_index:, :]

In [6]:
imputer1 = DIMVImputation()
start = time.time()

imputer1.fit(X_train_miss, initializing=False, n_jobs=1)

imputer1.cross_validate(train_percent=1)
X_test_imp1 = imputer.transform(X_test_miss)


nan_mask = np.isnan(X_test_miss)
rmse1 = rmse_calc(X_test_ori, X_test_imp1, nan_mask)
duration1 = time.time() - start
print("Imputation done after: {} and have Rmse = {}".format(duration1, rmse1)) 

Start Cross Validation with alphas = [0.0, 0.01, 0.1, 1.0, 10.0, 100.0] and 100 % of training set
Running Cross Validation, alpha=0.0


100%|████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 29.53it/s]


Running Cross Validation, alpha=0.01


100%|████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 28.67it/s]


Running Cross Validation, alpha=0.1


100%|████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 23.47it/s]


Running Cross Validation, alpha=1.0


100%|████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 26.17it/s]


Running Cross Validation, alpha=10.0


100%|████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 25.10it/s]


Running Cross Validation, alpha=100.0


100%|████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 24.51it/s]
100%|███████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 230.52it/s]

Imputation done after: 7.1285200119018555 and have Rmse = 1954.9800598808072





In [7]:
imputer2 = DIMVImputation()
start = time.time()

imputer2.fit(X_train_miss, initializing=False, n_jobs=-1)

imputer2.cross_validate(train_percent=1)
X_test_imp2 = imputer2.transform(X_test_miss)


nan_mask = np.isnan(X_test_miss)
rmse2 = rmse_calc(X_test_ori, X_test_imp2, nan_mask)
duration2 = time.time() - start
print("Imputation done after: {} and have Rmse = {}".format(duration2, rmse2))  

100%|█████████████████████████████████████████████████████████| 435/435 [00:00<00:00, 479.00it/s]


Start Cross Validation with alphas = [0.0, 0.01, 0.1, 1.0, 10.0, 100.0] and 100 % of training set
Running Cross Validation, alpha=0.0


100%|████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 32.05it/s]


Running Cross Validation, alpha=0.01


100%|████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 24.39it/s]


Running Cross Validation, alpha=0.1


100%|████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 26.01it/s]


Running Cross Validation, alpha=1.0


100%|████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 24.27it/s]


Running Cross Validation, alpha=10.0


100%|████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 23.19it/s]


Running Cross Validation, alpha=100.0


100%|████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 18.71it/s]
100%|███████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 304.12it/s]

Imputation done after: 8.677681922912598 and have Rmse = 1954.9800598808072





In [11]:
imputer3 = DIMVImputation()
start = time.time()

imputer3.fit(X_train_miss, initializing=True, n_jobs=-1)

imputer3.cross_validate(train_percent=1)
X_test_imp3 = imputer2.transform(X_test_miss)


nan_mask = np.isnan(X_test_miss)
rmse3 = rmse_calc(X_test_ori, X_test_imp3, nan_mask)
duration3 = time.time() - start
print("Imputation done after: {} and have Rmse = {}".format(duration3, rmse3))   

100%|█████████████████████████████████████████████████████████| 435/435 [00:00<00:00, 521.37it/s]


Start Cross Validation with alphas = [0.0, 0.01, 0.1, 1.0, 10.0, 100.0] and 100 % of training set
Running Cross Validation, alpha=0.0


100%|███████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 393.66it/s]


Running Cross Validation, alpha=0.01


100%|███████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 937.27it/s]


Running Cross Validation, alpha=0.1


100%|███████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 737.28it/s]


Running Cross Validation, alpha=1.0


100%|███████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 400.85it/s]


Running Cross Validation, alpha=10.0


100%|███████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 762.00it/s]


Running Cross Validation, alpha=100.0


100%|███████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 702.32it/s]
100%|███████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 261.84it/s]

Imputation done after: 1.4606752395629883 and have Rmse = 1954.9800598808072



