In [3]:
import os
import time
import sys
import numpy as np  

sys.path.append("src/")
from DIMVImputation import DIMVImputation

We create some functions for create missing dataset and calculate RMSE

In [23]:
def create_randomly_missing(data: np.ndarray, perc_del: float) -> np.ndarray:
    """
    Creates a randomly missing mask for the input data.

    Args:
        data (np.ndarray): The input data.
        perc_del (float): The percentage of missing values to create.

    Returns:
        np.ndarray: An array with the same shape as `data` where missing values are marked as NaN.
    """
    n = data.shape[0]
    # Flatten data into 1 row
    flatten_data = data.reshape(1, -1)
    # Uniform missing mask
    missing_mask = np.random.uniform(0, 1, flatten_data.shape[1]).reshape(1, -1)
    # Mark as missing if value in mask  < perc_del
    missing_data = flatten_data.copy().astype('float')
    missing_data[missing_mask <= perc_del] = np.nan

    return missing_data.reshape(n, -1)
 

In [24]:
def rmse_calc(ori_data, imp_data, missing_mask):
    """
    missing_mask: 1 is missing_data, 0 is observed_data
    Args: 
        ori_data (np.ndarray): original non missing data 
        imp_data (np.ndarray): imputed data 
        missing_mask (np.ndarray): 1 is missing_data, 0 is observed_data 
    Return:
        np.ndarray: RMSE between original data (ground truth) and the imputed data 
    """

    nominator = np.sum(
            (missing_mask * ori_data \
                    - missing_mask * imp_data
             ) ** 2
            )
    denominator = np.sum(missing_mask)

    return np.sqrt(nominator / denominator) 

We create a synthetic missing datasets 

In [25]:
#create a sample data
data = np.random.randint(0, 100, size=(1000, 50)).astype('float64')


#create missingness on data 
missing_rate = 0.5
missing_data = create_randomly_missing(data, missing_rate)


#train test split
test_size = .2
split_index = int(len(missing_data) * (1 - test_size))

X_train_ori, X_test_ori = data[:split_index, :], data[split_index:, :]

X_train_miss = missing_data[:split_index, :] 
X_test_miss = missing_data[split_index:, :] 

**Imputation for missing dataset**

- **Example 1**: We use does not initialize missing data with zero ```initializing=False```

In [21]:
imputer1 = DIMVImputation()
start = time.time()

imputer1.fit(X_train_miss, initializing=False)

imputer1.cross_validate(train_percent=1)


X_test_imp1 = imputer1.transform(X_test_miss)


nan_mask = np.isnan(X_test_miss)
rmse1 = rmse_calc(X_test_ori, X_test_imp1, nan_mask)
duration1 = time.time() - start
print("Imputation done after: {} (seconds) and have Rmse = {}".format(duration1, rmse1))  

Start Cross Validation with alphas = [0.0, 0.01, 0.1, 1.0, 10.0, 100.0] and 100 % of training set
Running Cross Validation, alpha=0.0


100%|████████████████████████████████████████████████████████████| 50/50 [00:15<00:00,  3.30it/s]


Running Cross Validation, alpha=0.01


100%|████████████████████████████████████████████████████████████| 50/50 [00:16<00:00,  3.05it/s]


Running Cross Validation, alpha=0.1


100%|████████████████████████████████████████████████████████████| 50/50 [00:19<00:00,  2.63it/s]


Running Cross Validation, alpha=1.0


100%|████████████████████████████████████████████████████████████| 50/50 [00:18<00:00,  2.68it/s]


Running Cross Validation, alpha=10.0


100%|████████████████████████████████████████████████████████████| 50/50 [00:16<00:00,  3.00it/s]


Running Cross Validation, alpha=100.0


100%|████████████████████████████████████████████████████████████| 50/50 [00:18<00:00,  2.68it/s]
100%|████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 26.27it/s]

Imputation done after: 106.69820308685303 (seconds) and have Rmse = 30.942500505619762





- **Example 2**: We set initialize missing data with zero ```initializing=True``` (init_with_zeros is set to True)

In [20]:
imputer3 = DIMVImputation()
start = time.time()

imputer3.fit(X_train_miss, initializing=True, n_jobs=-1)

imputer3.cross_validate(train_percent=1)
X_test_imp3 = imputer2.transform(X_test_miss)


nan_mask = np.isnan(X_test_miss)
rmse3 = rmse_calc(X_test_ori, X_test_imp3, nan_mask)
duration3 = time.time() - start
print("Imputation done after: {} (seconds) and have Rmse = {}".format(duration3, rmse3))    

100%|██████████████████████████████████████████████████████| 1225/1225 [00:00<00:00, 1833.19it/s]


Start Cross Validation with alphas = [0.0, 0.01, 0.1, 1.0, 10.0, 100.0] and 100 % of training set
Running Cross Validation, alpha=0.0


100%|██████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 1010.68it/s]


Running Cross Validation, alpha=0.01


100%|███████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 902.48it/s]


Running Cross Validation, alpha=0.1


100%|██████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 1024.86it/s]


Running Cross Validation, alpha=1.0


100%|███████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 918.24it/s]


Running Cross Validation, alpha=10.0


100%|███████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 981.08it/s]


Running Cross Validation, alpha=100.0


100%|███████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 963.20it/s]
100%|████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 17.16it/s]

Imputation done after: 4.0854761600494385 (seconds) and have Rmse = 30.942500505619762



