In [1]:
# Comment out the following line if you have already installed the DIMVImputation library by cloning the repository.
!pip install git+https://github.com/maianhpuco/DIMVImputation.git

Collecting git+https://github.com/maianhpuco/DIMVImputation.git
  Cloning https://github.com/maianhpuco/DIMVImputation.git to /tmp/pip-req-build-8w0gk4i2
  Running command git clone --filter=blob:none --quiet https://github.com/maianhpuco/DIMVImputation.git /tmp/pip-req-build-8w0gk4i2
  Resolved https://github.com/maianhpuco/DIMVImputation.git to commit f6054bc46ae144a2a863a2d1d092248a8596bf30
  Preparing metadata (setup.py) ... done


In [2]:
import os
import time
import sys
import numpy as np

# Uncomment the following line if you have installed the DIMVImputation library by cloning the repository.
# sys.path.append("src/")
from DIMVImputation import DIMVImputation

We define helper functions to create a dataset with missing values and to calculate the RMSE.

In [3]:
def create_randomly_missing(data: np.ndarray, perc_del: float) -> np.ndarray:
    """
    Return a float copy of `data` with entries deleted completely at random.

    Every entry is independently replaced by NaN with probability `perc_del`.
    The input array itself is never modified.

    Args:
        data (np.ndarray): The input data (any number of dimensions).
        perc_del (float): Fraction of entries to delete, given in [0, 1]
            (e.g. 0.5 removes about half of the values) -- not in percent.

    Returns:
        np.ndarray: A float array with the same shape as `data` where missing
            values are marked as NaN.
    """
    # Work on a float copy: NaN cannot be stored in an integer array, and the
    # caller's data must stay intact.
    missing_data = data.astype('float')
    # One uniform draw in [0, 1) per entry, generated directly in the shape of
    # `data` so the result keeps that shape for 1-D, N-D and empty inputs.
    missing_mask = np.random.uniform(0, 1, size=data.shape)
    # Strict comparison: draws lie in [0, 1), so perc_del=0 deletes nothing
    # and perc_del=1 deletes everything.
    missing_data[missing_mask < perc_del] = np.nan

    return missing_data


In [4]:
def rmse_calc(ori_data, imp_data, missing_mask):
    """
    Root-mean-square error between the ground truth and the imputed values,
    computed only over the entries flagged as missing.

    Args:
        ori_data (np.ndarray): original non missing data
        imp_data (np.ndarray): imputed data
        missing_mask (np.ndarray): 1 (or True) is missing_data,
            0 (or False) is observed_data
    Return:
        float: RMSE between original data (ground truth) and the imputed data
            on the masked entries; NaN when the mask selects no entry.
    """
    mask = np.asarray(missing_mask).astype(bool)
    squared_error = (
        np.asarray(ori_data, dtype=float) - np.asarray(imp_data, dtype=float)
    ) ** 2

    # Select with the mask instead of multiplying by it: 0 * NaN is NaN, so a
    # NaN sitting in an entry that is NOT evaluated would otherwise turn the
    # whole score into NaN.
    nominator = np.sum(np.where(mask, squared_error, 0.0))
    denominator = np.count_nonzero(mask)

    # Nothing to evaluate: report NaN explicitly rather than dividing 0 by 0.
    if denominator == 0:
        return np.nan

    return np.sqrt(nominator / denominator)

We create a synthetic dataset with missing values.

In [5]:
# Fix the random seed so that the synthetic data and the missing-value
# pattern (both drawn from numpy's global generator) are the same on every run.
np.random.seed(42)

# create a sample data: 1000 rows x 50 columns of integers in [0, 100), stored as float
data = np.random.randint(0, 100, size=(1000, 50)).astype('float64')


# create missingness on data (missing_rate is a fraction, not a percentage)
missing_rate = 0.5
missing_data = create_randomly_missing(data, missing_rate)


# train test split: the first (1 - test_size) share of the rows is the training set
test_size = .2
split_index = int(len(missing_data) * (1 - test_size))

# Ground truth, split at the same row as the data with missing values below
X_train_ori, X_test_ori = data[:split_index, :], data[split_index:, :]

X_train_miss = missing_data[:split_index, :]
X_test_miss = missing_data[split_index:, :]

**Imputation for missing dataset**

- **Example 1**: We do not initialize the missing data with zeros (```initializing=False```)

In [6]:
imputer1 = DIMVImputation()
# perf_counter is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()

# Fit on the training rows, then impute the held-out test rows.
imputer1.fit(X_train_miss, initializing=False)
X_test_imp1 = imputer1.transform(X_test_miss)
# Stop the clock right after the imputation so that the reported duration
# does not include the evaluation below.
duration1 = time.perf_counter() - start

# Score only the entries that were missing in the test set.
nan_mask = np.isnan(X_test_miss)
rmse1 = rmse_calc(X_test_ori, X_test_imp1, nan_mask)
print("Imputation done after: {} (seconds) and have Rmse = {}".format(duration1, rmse1))

Start Cross Validation with alphas = [0.0, 0.01, 0.1, 1.0, 10.0, 100.0] and 100.0 % of training set
Running Cross Validation, alpha=0.0


100%|██████████| 50/50 [00:26<00:00,  1.90it/s]


Running Cross Validation, alpha=0.01


100%|██████████| 50/50 [00:25<00:00,  1.99it/s]


Running Cross Validation, alpha=0.1


100%|██████████| 50/50 [00:25<00:00,  1.98it/s]


Running Cross Validation, alpha=1.0


100%|██████████| 50/50 [00:24<00:00,  2.02it/s]


Running Cross Validation, alpha=10.0


100%|██████████| 50/50 [00:24<00:00,  2.01it/s]


Running Cross Validation, alpha=100.0


100%|██████████| 50/50 [00:22<00:00,  2.19it/s]


Validation result: best alpha 0.1, best score 27.14148795588147, scores {0.0: 27.21505524391791, 0.01: 27.203183864973955, 0.1: 27.14148795588147, 1.0: 27.44468312507921, 10.0: 28.409374541616472, 100.0: 28.674086858294753}
Value alpha used in for transforming is: 0.1


100%|██████████| 50/50 [00:03<00:00, 13.66it/s]

Imputation done after: 153.6686737537384 (seconds) and have Rmse = 30.8399043351438





- **Example 2**: We initialize the missing data with zeros (```initializing=True```, i.e. init_with_zeros is set to True)

In [7]:
imputer3 = DIMVImputation()
# perf_counter is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()

# Fit on the training rows, then impute the held-out test rows.
imputer3.fit(X_train_miss, initializing=True, n_jobs=-1)
X_test_imp3 = imputer3.transform(X_test_miss)
# Stop the clock right after the imputation so that the reported duration
# does not include the evaluation below.
duration3 = time.perf_counter() - start

# Score only the entries that were missing in the test set.
nan_mask = np.isnan(X_test_miss)
rmse3 = rmse_calc(X_test_ori, X_test_imp3, nan_mask)
print("Imputation done after: {} (seconds) and have Rmse = {}".format(duration3, rmse3))

100%|██████████| 1225/1225 [00:00<00:00, 4695.13it/s]


Start Cross Validation with alphas = [0.0, 0.01, 0.1, 1.0, 10.0, 100.0] and 100.0 % of training set
Running Cross Validation, alpha=0.0


100%|██████████| 50/50 [00:00<00:00, 543.21it/s]


Running Cross Validation, alpha=0.01


100%|██████████| 50/50 [00:00<00:00, 434.30it/s]


Running Cross Validation, alpha=0.1


100%|██████████| 50/50 [00:00<00:00, 548.56it/s]


Running Cross Validation, alpha=1.0


100%|██████████| 50/50 [00:00<00:00, 626.63it/s]


Running Cross Validation, alpha=10.0


100%|██████████| 50/50 [00:00<00:00, 579.64it/s]


Running Cross Validation, alpha=100.0


100%|██████████| 50/50 [00:00<00:00, 629.20it/s]


Validation result: best alpha 0.1, best score 27.219416582074913, scores {0.0: 27.656521692765303, 0.01: 27.579867856867306, 0.1: 27.219416582074913, 1.0: 27.403506476405003, 10.0: 28.40884562687523, 100.0: 28.674084913087043}
Value alpha used in for transforming is: 0.1


100%|██████████| 50/50 [00:00<00:00, 1390.06it/s]

Imputation done after: 1.0491135120391846 (seconds) and have Rmse = 31.46687205446924



