In [43]:
from scipy.stats import expon, norm
import numpy as np
import pyvinecopulib as pv

from gcimpute.helper_data import generate_mixed_from_gc
from vcimpute.helper_datagen import probability_integral_transform, mask_MCAR
from vcimpute.utils import make_triangular_array
from vcimpute.zeisberger import VineCopFit, VineCopReg
from vcimpute.sakuth import MdpFit
from gcimpute.gaussian_copula import GaussianCopula
from gcimpute.helper_evaluation import get_smae

In [44]:
# Experiment size. Both values are passed by keyword to the generators below.
d = 3     # number of variables (used as `d=` when drawing the vine structure)
n = 1000  # number of rows (used as `n=` by both data generators)

In [45]:
# Ingredients of the vine copula: a randomly drawn R-vine structure plus one
# Gaussian pair copula per edge.
#
# The pair-copula slots below are written out by hand for exactly three
# variables (two edges in the first tree, one in the second). Fail fast with
# a clear message if `d` is changed, instead of hitting an indexing error or
# leaving slots unfilled further down.
assert d == 3, f"pair copulas are hard-coded for d=3, got d={d}"

# NOTE(review): no seed is passed to simulate(); presumably the drawn
# structure can differ between runs -- confirm and seed if reproducibility
# of the displayed outputs matters.
structure = pv.RVineStructure().simulate(d=d)
pcs = make_triangular_array(d)

# First tree (unconditional pairs): Gaussian with parameter 0.5 on both edges.
pcs[0][0] = pv.Bicop(family=pv.BicopFamily.gaussian, parameters=[[0.5]])
pcs[0][1] = pv.Bicop(family=pv.BicopFamily.gaussian, parameters=[[0.5]])
# Second tree (conditional pair): Gaussian with parameter 0.9.
pcs[1][0] = pv.Bicop(family=pv.BicopFamily.gaussian, parameters=[[0.9]])

In [46]:
# Display the R-vine structure drawn above.
structure

<pyvinecopulib.RVineStructure>
2 2 2 
1 1 
3 

In [47]:
# Assemble the vine copula from the structure and the hand-set pair copulas,
# then display it so the tree-by-tree pair assignments can be checked.
cop = pv.Vinecop(structure, pcs)
cop

<pyvinecopulib.Vinecop>
** Tree: 0
3,2 <-> Gaussian, parameters = 0.5
1,2 <-> Gaussian, parameters = 0.5
** Tree: 1
3,1 | 2 <-> Gaussian, parameters = 0.9

In [48]:
# Draw n observations from the vine copula (one row per observation).
# NOTE(review): no seed argument is passed here; presumably the sample
# changes on every run -- confirm, and seed it if the outputs below must be
# reproducible.
dat1 = cop.simulate(n=n)
dat1

array([[0.99518483, 0.73330609, 0.99641264],
       [0.48230238, 0.34564437, 0.41356445],
       [0.40702339, 0.66615218, 0.45154099],
       ...,
       [0.05556498, 0.52191108, 0.07227599],
       [0.99416976, 0.99814403, 0.98677291],
       [0.15374492, 0.10818382, 0.19934993]])

In [49]:
# Empirical correlation matrix of the vine-copula sample, treating each
# column of dat1 as one variable. It is reused below as the target
# correlation for the Gaussian-copula sample.
sigma = np.corrcoef(dat1, rowvar=False)
sigma

array([[1.        , 0.50625717, 0.91878076],
       [0.50625717, 1.        , 0.51138756],
       [0.91878076, 0.51138756, 1.        ]])

In [50]:
# Second dataset: a sample generated with `sigma` as the correlation matrix,
# so its dependence can be compared with the vine-copula sample.
# All d columns are declared continuous; the column count follows the
# configured dimension instead of a hard-coded 3.
# norm.cdf is passed directly as the continuous transform (a lambda wrapper
# adds nothing).
dat2 = generate_mixed_from_gc(
    n=n,
    sigma=sigma,
    var_types={'cont': list(range(d))},
    cont_transform=norm.cdf,
)

In [51]:
# Relative Frobenius-norm gap between the correlation matrices of dat2 and
# dat1, scaled by dat1's matrix. `sigma` already holds np.corrcoef of dat1,
# so it is reused rather than recomputed.
np.linalg.norm(np.corrcoef(dat2, rowvar=False) - sigma) / np.linalg.norm(sigma)

0.01976976848598962

In [52]:
# Relative Frobenius-norm gap between the correlation matrices of dat2 and
# dat1, this time scaled by dat2's matrix. `sigma` already holds np.corrcoef
# of dat1.
(
    np.linalg.norm(np.corrcoef(dat2, rowvar=False) - sigma)
    / np.linalg.norm(np.corrcoef(dat2, rowvar=False))
)

0.019683390734061213

In [53]:
# Mask the vine-copula sample, then impose the identical missingness pattern
# on the Gaussian-copula sample so both datasets are scored on the same cells.
# NOTE(review): presumably 'univariate', 0.1 means 10% of a single column is
# set to NaN -- confirm against mask_MCAR.
dat1_mask = mask_MCAR(dat1, 'univariate', 0.1, seed=1)

# Index of the one column that contains NaNs; .item() raises unless exactly
# one such column exists.
idx = np.flatnonzero(np.isnan(dat1_mask).any(axis=0)).item()

# Copy dat2 and blank out the same rows of that column.
dat2_mask = np.array(dat2, copy=True)
dat2_mask[np.isnan(dat1_mask[:, idx]), idx] = np.nan

In [54]:
# Imputation models to compare, as (label, estimator) pairs. The label is
# what gets printed next to each model's scores.
# NOTE(review): the positional constructor arguments are opaque from here --
# presumably (copula family, an iteration/draw count, ..., seed); confirm
# against the vcimpute signatures and consider passing them by keyword.
model_lst = [
    ('gcimpute', GaussianCopula()),
    ('mdpfit', MdpFit('gaussian', 10, 1)),
    ('copfit', VineCopFit('gaussian', 10, True, 1)),
    ('copreg', VineCopReg('gaussian', 10, 'R', True, 1)),
]

In [55]:
# NOTE(review): scratch arithmetic. The origin of 0.49 and 0.518 is not shown
# anywhere in this notebook -- explain what the ratio measures or remove the cell.
.49/.518

0.9459459459459459

In [56]:
# Fit each model on both masked datasets and print its get_smae scores:
# first for the vine-copula data, then for the Gaussian-copula data.
# The same model object is fitted twice, on dat1_mask first and dat2_mask second.
# Entries for columns without masked cells are expected to print as nan.
for tag, model in model_lst:
    dat1_imp, dat2_imp = [model.fit_transform(masked) for masked in (dat1_mask, dat2_mask)]
    scores = [
        get_smae(imputed, truth, masked)
        for imputed, truth, masked in (
            (dat1_imp, dat1, dat1_mask),
            (dat2_imp, dat2, dat2_mask),
        )
    ]
    print(tag, *scores)

gcimpute [      nan 0.8569539       nan] [       nan 0.80467187        nan]
mdpfit [       nan 1.02193623        nan] [       nan 1.04539937        nan]
copfit [       nan 1.06410113        nan] [       nan 1.21082335        nan]
copreg [       nan 1.05827614        nan] [       nan 1.06625032        nan]
