In [1]:
import numpy as np
import pandas as pd
from vcimpute.helper_datagen import make_complete_data_matrix, mask_MCAR
from vcimpute.zeisberger import VineCopReg

In [2]:
X = make_complete_data_matrix(1000, 5, 'gaussian', seed=4)
X_mis = mask_MCAR(X, 'monotone', 0.2, seed=4, n_cols=2)

In [29]:
import numpy as np
import pyvinecopulib as pv

from vcimpute.helper_diagonalize import diagonalize_copula
from vcimpute.helper_mdp import all_mdps, select_by_mdp, idx_mis_by_col
from vcimpute.helper_subvines import find_subvine_structures, remove_var
from vcimpute.helper_vinestructs import generate_r_vine_structure, relabel_vine_matrix
from vcimpute.simulator import simulate_order_k
from vcimpute.utils import get, bicop_family_map, make_triangular_array, is_leaf_in_all_subtrees


class VineCopReg:
    def __init__(self, bicop_families, num_threads, vine_structure, seed):
        family_set = [bicop_family_map[k] for k in bicop_families]
        self.controls = pv.FitControlsVinecop(family_set=family_set, num_threads=num_threads)
        assert vine_structure in ['R', 'C', 'D']
        self.vine_structure = vine_structure
        self.seed = seed

    def fit_transform(self, X_mis):
        d = X_mis.shape[1]

        X_imp = np.copy(X_mis)

        mdps = all_mdps(X_imp)
        k = 0
        for mdp in mdps:
            X_sub = select_by_mdp(X_imp, mdp)
            miss_vars = list(1 + idx_mis_by_col(X_sub))
            obs_vars = list(set(1 + np.arange(d)).difference(miss_vars))
            if k == 1:
                return X_imp, X_sub, d, miss_vars, obs_vars
            k += 1
            self._impute(X_imp, X_sub, d, miss_vars, obs_vars)

        assert not np.any(np.isnan(X_imp)), 'invalid state, not all values imputed'
        return X_imp

    def _impute(self, X_imp, X_sub, d, miss_vars, obs_vars):
        assert (len(miss_vars) + len(obs_vars)) == d

        rng = np.random.default_rng(self.seed)
        miss_vars = miss_vars[::-1]  # decreasing missingness
        rng.shuffle(obs_vars)

        # simulate vine structure for sequential imputation
        structure = None
        if self.vine_structure == 'R':
            structure = generate_r_vine_structure(miss_vars, obs_vars)
        elif self.vine_structure == 'C':
            structure = pv.CVineStructure.simulate(order=miss_vars + obs_vars)
        elif self.vine_structure == 'D':
            structure = pv.DVineStructure.simulate(order=miss_vars + obs_vars)
        assert structure is not None

        # make copula with fixed structure
        pcs = make_triangular_array(d)
        for j in range(d - 1):
            for i in range(d - j - 1):
                pcs[i][j] = pv.Bicop()
        cop = pv.Vinecop(structure=structure, pair_copulas=pcs)

        for k, var_mis in enumerate(miss_vars):
            cop.select(X_imp, controls=self.controls)
            assert cop.order[k] == var_mis
            x_imp = simulate_order_k(cop, X_sub, k)
            assert not np.any(np.isnan(x_imp)), 'check imputation order'

            x_mis = get(X_sub, var_mis)
            is_missing = np.isnan(x_mis)
            x_mis[is_missing] = x_imp[is_missing]

In [30]:
cls = VineCopReg(['gaussian'], 10, 'R', 42)

In [35]:
from vcimpute.helper_mdp import all_mdps, n_miss_by_col

In [40]:
X_imp.base

In [48]:

def select_by_mdp(X_mis, mdp):
    return X_mis[(np.isnan(X_mis) == mdp).all(axis=1), :]

In [55]:
X_imp[(np.isnan(X_imp) == [False,False,False,False,True]).all(axis=1)].base

In [50]:
select_by_mdp(X_imp, [False,False,False,False,True]).base

In [57]:
X_mis = X_imp

In [58]:
mdp = [False,False,False,False,True]

In [72]:
X_imp.T.base

array([[0.789, 0.437, 0.15 , ..., 0.727, 0.792, 0.013],
       [0.193, 0.69 , 0.328, ..., 0.117, 0.61 , 0.899],
       [0.774, 0.415, 0.833, ..., 0.981, 0.21 , 0.17 ],
       [0.185, 0.603, 0.219, ..., 0.804, 0.53 , 0.06 ],
       [0.173,   nan, 0.891, ...,   nan, 0.03 , 0.888]])

In [65]:
X_imp[np.where((np.isnan(X_mis) == mdp).all(axis=1))[0], :]

In [42]:
X_imp[:,1].base

array([[0.789, 0.193, 0.774, 0.185, 0.173],
       [0.437, 0.69 , 0.415, 0.603,   nan],
       [0.15 , 0.328, 0.833, 0.219, 0.891],
       ...,
       [0.727, 0.117, 0.981, 0.804,   nan],
       [0.792, 0.61 , 0.21 , 0.53 , 0.03 ],
       [0.013, 0.899, 0.17 , 0.06 , 0.888]])

In [38]:
len(select_by_mdp(X_imp,[False,False,False,False,True]))

194

In [36]:
n_miss_by_col(X_mis)

array([  0,   0, 103,   0, 297])

In [None]:
n_

In [31]:
X_imp, X_sub, d, miss_vars, obs_vars = cls.fit_transform(X_mis)


In [15]:
miss_vars

[5]

In [34]:
all_mdps(X_imp)

array([[False, False, False, False,  True],
       [False, False,  True, False,  True]])

In [33]:
miss_vars

[3, 5]

In [16]:
X_imp

array([[0.789, 0.193, 0.774, 0.185, 0.173],
       [0.437, 0.69 , 0.415, 0.603,   nan],
       [0.15 , 0.328, 0.833, 0.219, 0.891],
       ...,
       [0.727, 0.117, 0.981, 0.804,   nan],
       [0.792, 0.61 , 0.21 , 0.53 , 0.03 ],
       [0.013, 0.899, 0.17 , 0.06 , 0.888]])

In [17]:
X_sub

array([[0.437, 0.69 , 0.415, 0.603,   nan],
       [0.946, 0.167, 0.946, 0.95 ,   nan],
       [0.714, 0.482, 0.564, 0.865,   nan],
       [0.065, 0.821, 0.44 , 0.473,   nan],
       [0.602, 0.775, 0.092, 0.111,   nan],
       [0.156, 0.497, 0.414, 0.08 ,   nan],
       [0.937, 0.102, 0.762, 0.353,   nan],
       [0.225, 0.23 , 0.661, 0.311,   nan],
       [0.494, 0.665, 0.411, 0.386,   nan],
       [0.998, 0.913, 0.019, 0.557,   nan],
       [0.03 , 0.326, 0.702, 0.232,   nan],
       [0.561, 0.658, 0.636, 0.188,   nan],
       [0.002, 0.707, 0.548, 0.025,   nan],
       [0.066, 0.278, 0.423, 0.037,   nan],
       [0.139, 0.03 , 0.991, 0.453,   nan],
       [0.483, 0.95 , 0.088, 0.278,   nan],
       [0.514, 0.355, 0.352, 0.3  ,   nan],
       [0.717, 0.591, 0.94 , 0.98 ,   nan],
       [0.253, 0.501, 0.331, 0.686,   nan],
       [0.63 , 0.477, 0.497, 0.451,   nan],
       [0.82 , 0.178, 0.7  , 0.894,   nan],
       [0.286, 0.947, 0.197, 0.83 ,   nan],
       [0.747, 0.818, 0.076, 0.2

In [10]:
obs_vars

[1, 2, 3, 4]