In [1]:
import numpy as np
import pandas as pd
import pyvinecopulib as pv

from gcimpute.gaussian_copula import GaussianCopula
from gcimpute.helper_evaluation import get_smae
from gcimpute.helper_data import generate_mixed_from_gc
from gcimpute.helper_mask import mask_MCAR
from vcimpute.helper_mdp import all_mdps, mdp_coords
from vcimpute.helper_datagen import make_complete_data_matrix

In [2]:
# Dimension handed to the data generator below.
d = 5

# Complete (fully observed) data matrix used as ground truth.
# NOTE(review): 1000 is presumably the number of rows and the trailing 1 a
# seed for the 'gaussian' generator -- confirm against vcimpute.helper_datagen.
X = make_complete_data_matrix(
    1000,
    d,
    'gaussian',
    1,
)

# Masked version of X produced by gcimpute's MCAR helper.
# NOTE(review): 0.2 is presumably the fraction of entries hidden -- confirm
# against gcimpute.helper_mask.mask_MCAR.
missing_fraction = 0.2
X_mask = mask_MCAR(X, missing_fraction)

In [3]:
# Encode each pattern returned by all_mdps(X_mask) as the set of 1-based
# positions of its truthy entries (presumably the missing columns -- the
# later imputation cell treats them that way).
Z = np.array([set(np.nonzero(flags)[0] + 1) for flags in all_mdps(X_mask)])

# Order the patterns from largest to smallest set ("try reverse sort").
pattern_sizes = [len(pattern) for pattern in Z]
Z = Z[np.argsort(pattern_sizes)[::-1]]

# Greedy cover: repeatedly pick the candidate pattern that contains the most
# remaining patterns as subsets, record it, and drop everything it covers.
# Z is consumed in the process and is empty when the loop finishes.
best_orders = []
while len(Z) > 0:
    # Only the leading 5% of the remaining patterns (at least one) compete.
    n_candidates = max(1, int(len(Z) * 0.05))

    # For every candidate: positions in Z of the patterns that are subsets of
    # it (`<=` on sets is the subset test, applied elementwise).
    covered_by = [np.where(Z <= Z[k])[0] for k in range(n_candidates)]

    # max() keeps the first maximal candidate, i.e. ties go to the earliest
    # one. Every pattern covers at least itself, so the winner's coverage is
    # never empty and Z shrinks on every pass.
    winner = max(range(n_candidates), key=lambda k: len(covered_by[k]))

    best_orders.append(Z[winner])
    Z = np.delete(Z, covered_by[winner])

In [4]:
# Rebuild the pattern array from scratch: the greedy search that produced
# best_orders consumes Z until it is empty.
Z = np.array(list(map(lambda x: set(1 + np.where(x)[0]), all_mdps(X_mask))))
Z = Z[np.argsort(list(map(len, Z)))[::-1]] # try reverse sort

# Dedicated, seeded generator so the imputed matrix (and any score computed
# from it) is reproducible under Restart & Run All.
rng = np.random.default_rng(0)

# Fit settings are identical for every vine, so build them once.
# Only Gaussian pair-copulas are considered.
controls = pv.FitControlsVinecop(family_set=[pv.BicopFamily.gaussian]) # parameter
all_vars = set(1 + np.arange(d))

# Work on a copy so X_mask keeps its original holes for later evaluation.
X_imp = np.copy(X_mask)
for order_vars in best_orders:
    # D-vine whose order lists this ordering's variables first, followed by
    # all remaining variables.
    obs_vars = all_vars.difference(order_vars)
    structure = pv.DVineStructure(order=list(order_vars) + list(obs_vars))

    cop = pv.Vinecop(structure=structure)
    # Fitted on the current state of X_imp, i.e. including values imputed
    # under earlier orderings.
    cop.select(X_imp, controls=controls)

    # Patterns that are subsets of this ordering are imputed with this vine.
    covered = Z <= order_vars
    for pattern in Z[covered]: # ensure sorted
        if not pattern:
            # An empty pattern has no columns to fill (and would otherwise
            # yield a float-dtype empty index array, which NumPy rejects).
            continue

        # 0-based column positions of the variables in this pattern.
        cols = np.array(list(pattern)) - 1
        mdp = np.zeros(d, dtype=bool)
        mdp[cols] = True

        # Rows of X_imp selected by mdp_coords for this pattern.
        # NOTE(review): assumed to be a 1-D collection of integer row
        # indices (it is used with len() and integer indexing) -- confirm.
        miss_rows = mdp_coords(X_imp, mdp)

        # Forward transform, replace the NaN entries with uniform draws,
        # then map back with the inverse transform.
        rb = cop.rosenblatt(X_imp[miss_rows])
        nan_cells = np.isnan(rb)
        rb[nan_cells] = rng.uniform(size=np.count_nonzero(nan_cells))
        irb = cop.inverse_rosenblatt(rb)

        # Write back only the pattern's columns, all rows at once.
        X_imp[np.ix_(miss_rows, cols)] = irb[:, cols]

    # These patterns are done; keep only the ones not covered yet.
    Z = Z[~covered]

In [8]:
# Display the list of pattern sets collected by the greedy search.
best_orders

[{1, 2, 3, 4}, {1, 3, 4, 5}, {1, 2, 3, 5}, {2, 3, 4, 5}, {1, 2, 4, 5}]

In [7]:
# Evaluate the imputed matrix against the complete data, passing the masked
# matrix as the third argument.
# NOTE(review): get_smae presumably returns one scaled mean absolute error
# per column (the stored output has five entries) -- confirm against
# gcimpute.helper_evaluation.
get_smae(X_imp, X, X_mask)

array([0.95161034, 1.03655314, 1.13584619, 1.31195052, 1.24382789])