In [56]:
from scipy.stats import expon, norm
import numpy as np
import pyvinecopulib as pv

from gcimpute.helper_data import generate_mixed_from_gc
from vcimpute.helper_datagen import probability_integral_transform, mask_MCAR
from vcimpute.utils import make_triangular_array
from vcimpute.zeisberger import VineCopFit, VineCopReg
from vcimpute.sakuth import MdpFit
from gcimpute.gaussian_copula import GaussianCopula
from gcimpute.helper_evaluation import get_smae

In [57]:
# Problem size shared by the cells below: d is the number of variables passed
# to the vine-structure helpers, n is the number of rows to simulate.
d = 3
n = 1000

In [58]:
# Simulate an R-vine structure on d variables and allocate the triangular
# array that holds one pair-copula per (tree, edge) position.
structure = pv.RVineStructure().simulate(d=d)
pcs = make_triangular_array(d)

# only works for d=3: exactly three pair-copulas are filled in, two in tree 0
# and one in tree 1, all Gaussian with the parameter given in the table.
gaussian_rho = {(0, 0): 0.5, (0, 1): 0.5, (1, 0): 0.9}
for (tree, edge), rho in gaussian_rho.items():
    pcs[tree][edge] = pv.Bicop(family=pv.BicopFamily.gaussian, parameters=[[rho]])

In [59]:
structure

<pyvinecopulib.RVineStructure>
2 2 2 
1 1 
3 

In [60]:
# Assemble the vine copula from the simulated structure and the pair-copulas
# defined above, then display its tree-by-tree summary.
cop = pv.Vinecop(structure, pcs)
cop

<pyvinecopulib.Vinecop>
** Tree: 0
3,2 <-> Gaussian, parameters = 0.5
1,2 <-> Gaussian, parameters = 0.5
** Tree: 1
3,1 | 2 <-> Gaussian, parameters = 0.9

In [61]:
# Draw n observations from the vine copula (one row per observation, one
# column per variable) and display them.
# NOTE(review): no seed is passed here, so dat1 differs between runs — confirm
# whether the installed pyvinecopulib lets simulate() take seeds and pin one.
dat1 = cop.simulate(n=n)
dat1

array([[0.39174905, 0.07371826, 0.51604085],
       [0.44212977, 0.29780965, 0.76760869],
       [0.37981239, 0.89924073, 0.38569533],
       ...,
       [0.18400469, 0.67113535, 0.05704565],
       [0.55924298, 0.46632172, 0.48005325],
       [0.01778223, 0.04786829, 0.03609329]])

In [62]:
# Empirical Pearson correlation between the columns of dat1 (columns are the
# variables), reused below as the correlation matrix of the Gaussian copula.
# NOTE(review): dat1 is on the copula scale; confirm that generate_mixed_from_gc
# expects sigma measured on this scale rather than on a latent-normal scale.
sigma = np.corrcoef(dat1, rowvar=False)
sigma

array([[1.        , 0.4911035 , 0.91489773],
       [0.4911035 , 1.        , 0.48572268],
       [0.91489773, 0.48572268, 1.        ]])

In [63]:
# Simulate a second data set from a Gaussian copula whose correlation matrix
# is the empirical one of dat1. Sample size and dimension are taken from the
# configuration cell (n, d) instead of repeating the literals 1000 and 3, so
# both data sets stay the same shape when those settings change.
dat2 = generate_mixed_from_gc(
    n=n,
    sigma=sigma,
    var_types={'cont': list(range(d))},
    # continuous columns are passed through the standard normal CDF
    cont_transform=norm.cdf,
)

In [64]:
np.linalg.norm(np.corrcoef(dat2.T)-np.corrcoef(dat1.T))/np.linalg.norm(np.corrcoef(dat1.T))

0.020980455093923968

In [65]:
np.linalg.norm(np.corrcoef(dat2.T)-np.corrcoef(dat1.T))/np.linalg.norm(np.corrcoef(dat2.T))

0.0208858256731524

In [66]:
# Mask dat1 with mask_MCAR, then copy the very same missingness pattern onto
# dat2 so that both data sets are missing the same cells.
dat1_mask = mask_MCAR(dat1, 'univariate', 0.1, seed=1)
nan_cells = np.isnan(dat1_mask)
# .item() only succeeds when exactly one column contains NaNs
idx = np.flatnonzero(nan_cells.any(axis=0)).item()
dat2_mask = np.copy(dat2)
dat2_mask[nan_cells[:, idx], idx] = np.nan

In [67]:
dat1_mask

array([[0.39174905,        nan, 0.51604085],
       [0.44212977, 0.29780965, 0.76760869],
       [0.37981239,        nan, 0.38569533],
       ...,
       [0.18400469, 0.67113535, 0.05704565],
       [0.55924298,        nan, 0.48005325],
       [0.01778223, 0.04786829, 0.03609329]])

In [68]:
dat2_mask

array([[0.42901268,        nan, 0.48552131],
       [0.91517678, 0.61812107, 0.94079086],
       [0.72006326,        nan, 0.76928428],
       ...,
       [0.60584333, 0.69626166, 0.33324633],
       [0.19290951,        nan, 0.34462737],
       [0.90995719, 0.07719122, 0.85704983]])

In [77]:
# Imputers to compare, as (label, estimator) pairs; the label is what the
# evaluation loop prints next to each score.
# NOTE(review): the positional constructor arguments are not explained in this
# notebook — consider passing them by keyword once their names are confirmed.
model_lst = [
    ('gcimpute', GaussianCopula()),
    ('mdpfit', MdpFit('gaussian', 10, 1)),
    ('copfit', VineCopFit('gaussian', 10, True, 1)),
    ('copreg', VineCopReg('gaussian', 10, 'R', True, 1)),
]

In [82]:
dat1_mask

array([[0.39174905,        nan, 0.51604085],
       [0.44212977, 0.29780965, 0.76760869],
       [0.37981239,        nan, 0.38569533],
       ...,
       [0.18400469, 0.67113535, 0.05704565],
       [0.55924298,        nan, 0.48005325],
       [0.01778223, 0.04786829, 0.03609329]])

In [80]:
# NOTE(review): `q` is not assigned anywhere in the visible notebook, and
# `model` is only bound as the loop variable of the evaluation cell, so this
# cell relies on leftover kernel state. The displayed result has dat1_mask's
# observed values, so q was presumably a copy of dat1_mask — confirm.
model.fit_transform(q)

array([[0.39174905, 0.27751547, 0.51604085],
       [0.44212977, 0.29780965, 0.76760869],
       [0.37981239, 0.47725616, 0.38569533],
       ...,
       [0.18400469, 0.67113535, 0.05704565],
       [0.55924298, 0.44291361, 0.48005325],
       [0.01778223, 0.04786829, 0.03609329]])

In [79]:
model.fit_transform(dat2_mask)

AssertionError: 

In [114]:
from vcimpute.helper_diagonalize import is_diagonal_matrix

In [116]:
def diagonalize_matrix2(T1):
    """Return a modified copy of ``T1`` with the bottom of column 0 exchanged.

    With ``d = T1.shape[0]``, ``a = T1[d-1, 0]`` and ``b = T1[d-2, 0]``, the
    returned matrix has

    * ``b`` at ``[d-1, 0]``,
    * ``a`` at ``[d-2, 0]`` and ``[d-2, 1]``,
    * rows ``0 .. d-3`` of columns 0 and 1 swapped,

    and equals ``T1`` everywhere else. ``T1`` itself is left untouched.

    Raises ``AssertionError`` when ``is_diagonal_matrix(T1)`` is falsy.
    """
    d = T1.shape[0]
    out = np.copy(T1)
    last, second_last = out[d - 1, 0], out[d - 2, 0]
    assert is_diagonal_matrix(T1)

    out[d - 1, 0] = second_last
    out[d - 2, :2] = last

    # Fancy indexing on the right-hand side produces a copy, so both columns
    # can be exchanged in one assignment without a temporary.
    upper = slice(None, d - 2)
    out[upper, [0, 1]] = out[upper, [1, 0]]
    return out

In [118]:
diagonalize_matrix(T)

array([[1, 1, 1],
       [2, 2, 0],
       [3, 0, 0]], dtype=uint64)

In [119]:
diagonalize_matrix2(T)

array([[1, 1, 1],
       [3, 3, 0],
       [2, 0, 0]], dtype=uint64)

In [117]:
T = np.array([
[1, 1, 1],
[2, 2, 0],
[3, 0, 0]
], dtype=np.uint64)
T

array([[1, 1, 1],
       [2, 2, 0],
       [3, 0, 0]], dtype=uint64)

In [92]:
def diagonalize_matrix(T1):
    """Rebuild ``T1`` column by column into a fresh ``d x d`` ``uint64`` matrix.

    ``d`` is ``T1.shape[0]``; entries below the anti-diagonal stay 0 and ``T1``
    is not modified.

    For output column ``j`` the anti-diagonal entry ``[d-j-1, j]`` is set first:
    taken from ``T1`` when ``j == 0``, otherwise copied from the output entry
    directly to its left. Entry ``[d-2, 0]`` is copied straight from ``T1``.
    Every other entry ``[i, j]`` above the anti-diagonal is looked up in ``T1``:
    for ``k = 0 .. d-i-2``, if one of ``T1[d-k-1, k]`` / ``T1[i, k]`` equals the
    column's anti-diagonal value and the other is not an anti-diagonal value
    already placed in the output (the current column's included), that other
    value is written. A later matching ``k`` overwrites an earlier one.
    """
    d = T1.shape[0]
    T2 = np.zeros((d, d), dtype=np.uint64)
    used = []  # anti-diagonal values placed in the output so far

    for col in range(d):
        diag_row = d - col - 1

        # Fix the anti-diagonal entry of this column before filling above it.
        if col == 0:
            T2[diag_row, col] = T1[diag_row, col]
        else:
            T2[diag_row, col] = T2[diag_row, col - 1]
        pivot = T2[diag_row, col]
        used.append(pivot)

        # Fill the remaining rows from the bottom up.
        for row in reversed(range(diag_row)):
            if col == 0 and row == d - 2:
                T2[row, col] = T1[row, col]
                continue
            for k in range(d - row - 1):
                src_diag = T1[d - k - 1, k]
                src_entry = T1[row, k]
                if src_diag == pivot and src_entry not in used:
                    T2[row, col] = src_entry
                elif src_entry == pivot and src_diag not in used:
                    T2[row, col] = src_diag
    return T2

In [78]:
# Fit every imputer on both masked data sets and print its get_smae result for
# each (get_smae receives the imputed, complete and masked arrays, in that order).
# NOTE(review): the saved output shows this loop stopping with an AssertionError
# after 'mdpfit', so the remaining models are never scored. `model` is read by
# other cells once the loop ends, so those cells depend on where it stopped.
for tag, model in model_lst:
    dat1_imp = model.fit_transform(dat1_mask)
    dat2_imp = model.fit_transform(dat2_mask)
    print(tag, get_smae(dat1_imp, dat1, dat1_mask), get_smae(dat2_imp, dat2, dat2_mask))

gcimpute [       nan 0.80838768        nan] [       nan 0.81447025        nan]
mdpfit [       nan 1.17982953        nan] [       nan 1.15196468        nan]


AssertionError: 

In [71]:
cop

<pyvinecopulib.Vinecop>
** Tree: 0
3,2 <-> Gaussian, parameters = 0.5
1,2 <-> Gaussian, parameters = 0.5
** Tree: 1
3,1 | 2 <-> Gaussian, parameters = 0.9

In [72]:
np.cov(dat1.T)

array([[0.08327159, 0.04051754, 0.07636132],
       [0.04051754, 0.08174163, 0.04016636],
       [0.07636132, 0.04016636, 0.08365749]])

In [73]:
np.cov(dat2.T)

array([[0.08348192, 0.04205458, 0.07463104],
       [0.04205458, 0.0830566 , 0.04249404],
       [0.07463104, 0.04249404, 0.08155707]])

In [74]:
dat1

array([[0.39174905, 0.07371826, 0.51604085],
       [0.44212977, 0.29780965, 0.76760869],
       [0.37981239, 0.89924073, 0.38569533],
       ...,
       [0.18400469, 0.67113535, 0.05704565],
       [0.55924298, 0.46632172, 0.48005325],
       [0.01778223, 0.04786829, 0.03609329]])

In [75]:
np.corrcoef(dat1.T)

array([[1.        , 0.4911035 , 0.91489773],
       [0.4911035 , 1.        , 0.48572268],
       [0.91489773, 0.48572268, 1.        ]])

In [76]:
np.corrcoef(dat2.T)

array([[1.        , 0.50504494, 0.90446645],
       [0.50504494, 1.        , 0.51630952],
       [0.90446645, 0.51630952, 1.        ]])