In [1]:
# Goal
# Step 1.   Fix (a) a vine structure, (b) the pair-copula families and (c) the copula parameters.
# Step 2.   Simulate data from that vine copula.
# Step 3.   Mask var5 with x% dropout.
# Step 4.   Re-estimate (a)-(c) from the masked data using complete cases.
# Step 5.1  If the fitted vine structure supports direct imputation of var5 -> include it in the comparison.
# Step 5.2  If the fitted vine structure does not support direct imputation of var5 -> skip it.
# Step 6.   Run Zhao/Udell's code (gcimpute) to impute the masked data.
# Step 7.   Compute the SMAE imputation error and the bias of the correlation for the imputations from steps 5.1 and 6.

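# Reference
# For a d-dimensional vine with structure matrix M, the pair copula of edge e in tree t
# has conditioned variables (M[d - 1 - e, e], M[t, e]) and conditioning set
# (M[t - 1, e], ..., M[0, e]); i.e. its index is
# (M[d - 1 - e, e], M[t, e]; M[t - 1, e], ..., M[0, e])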

In [2]:
import numpy as np
from matplotlib import pyplot as plt

import pyvinecopulib as pv
from gcimpute.gaussian_copula import GaussianCopula
from gcimpute.helper_evaluation import get_smae

In [3]:
def get_ced_cing(T, cop=None):
    """Enumerate the pair copulas encoded by a vine structure matrix.

    Columns are visited left to right and, within column ``j``, rows
    ``0 .. d-j-2`` top to bottom. Each visited cell ``(row, j)`` yields one
    edge whose conditioned pair is ``{T[row, j], T[d-j-1, j]}`` and whose
    conditioning set is made of the entries above ``row`` in the same column.

    Parameters
    ----------
    T : 2-D array
        Vine structure matrix; its number of columns is taken as the
        dimension ``d``.
    cop : object, optional
        If given, ``cop.get_parameters(row, j)[0][0]`` is recorded for every
        edge, i.e. only the first parameter of each pair copula is kept.

    Returns
    -------
    ced : list of list
        Sorted conditioned pair of every edge.
    cing : list of list
        Sorted conditioning set of every edge (same order as ``ced``).
    param : list
        First parameter of every edge, or an empty list when ``cop`` is None.
    """
    dim = T.shape[1]
    conditioned, conditioning, first_params = [], [], []
    for col in range(dim):
        # Row holding the "diagonal" variable of this column.
        anchor_row = dim - col - 1
        for row in range(anchor_row):
            pair = [T[row, col], T[anchor_row, col]]
            pair.sort()
            conditioned.append(pair)
            conditioning.append(sorted(T[k, col] for k in range(row)))
            if cop is not None:
                first_params.append(cop.get_parameters(row, col)[0][0])
    return conditioned, conditioning, first_params

def find(a, cing_len, ced, cing):
    """Return the partner of variable ``a`` in the first matching edge.

    An edge matches when ``a`` belongs to its conditioned pair and its
    conditioning set has exactly ``cing_len`` elements. Edges are scanned in
    list order and the first match wins.

    Parameters
    ----------
    a : int
        Variable label to look up.
    cing_len : int
        Required size of the conditioning set.
    ced : list of 2-element sequences
        Conditioned pairs, one per edge.
    cing : list of sequences
        Conditioning sets, aligned with ``ced``.

    Returns
    -------
    The other element of the matching conditioned pair.

    Raises
    ------
    AssertionError
        If no edge matches. Raised explicitly (rather than via ``assert``) so
        the check also runs under ``python -O``; previously a stripped assert
        let the function return the partner from a non-matching edge.
    """
    for idx, pair in enumerate(ced):
        if a in pair and len(cing[idx]) == cing_len:
            return pair[1] if pair[0] == a else pair[0]
    raise AssertionError(f'bad argument, a={a}, cing_len={cing_len}')

def diagonalize(T1, a):
    """Rebuild a vine structure matrix so that ``a`` sits at ``[d-1, 0]``.

    If ``a`` is already the bottom-left entry, ``T1`` itself is returned (not
    a copy). Otherwise ``a`` must be ``T1[d-2, 0]``; a new ``uint64`` matrix
    is then filled column by column from the edges listed by
    ``get_ced_cing(T1)``.

    Parameters
    ----------
    T1 : 2-D array
        Source vine structure matrix.
    a : int
        Variable label that should end up at position ``[d-1, 0]``.

    Returns
    -------
    2-D array
        ``T1`` unchanged, or a freshly built matrix of the same shape.

    Raises
    ------
    AssertionError
        If ``a`` is neither ``T1[d-1, 0]`` nor ``T1[d-2, 0]``, or if ``find``
        cannot locate a required edge.
    """
    d = T1.shape[1]
    if a == T1[d-1, 0]:
        return T1
    assert a == T1[d-2, 0], f'cannot be diagonalized with {a}'

    T2 = np.zeros(shape=T1.shape, dtype=np.uint64)
    T2[d-1, 0] = a
    # Variables already placed on the anti-diagonal of T2.
    order = [a]

    ced, cing, _ = get_ced_cing(T1)
    for j in range(d-1):
        # Row i of column j: partner of the column's diagonal variable in an
        # edge whose conditioning set has i elements.
        # NOTE(review): `find` takes the first match in list order when
        # several edges qualify.
        for i in range(d-j-1):
            T2[i,j] = find(T2[d-j-1, j], i, ced, cing)

        # Drop every edge involving an already placed variable. This is done
        # with an order-preserving filter: the previous set-based selection
        # left the order of the surviving edges (which `find` depends on) to
        # the set's iteration order.
        keep_idx = [
            i for i, c in enumerate(ced)
            if not any(k in c for k in order)
        ]
        ced = [ced[i] for i in keep_idx]
        cing = [cing[i] for i in keep_idx]

        # The next column's diagonal variable is the entry just above the
        # current column's diagonal.
        T2[d-j-2, j+1] = T2[d-j-2, j]
        order.append(T2[d-j-2, j+1])
    return T2

def make_diagonal_copula(cop1, a):
    """Build a Gaussian vine on the structure matrix rearranged around ``a``.

    The structure matrix of ``cop1`` is passed through ``diagonalize`` and
    every edge of the resulting matrix receives a Gaussian pair copula whose
    parameter is the one ``cop1`` holds for the same conditioned pair.

    Parameters
    ----------
    cop1 : pv.Vinecop
        Source vine; only the first parameter of each pair copula is read and
        the family of the new pair copulas is always Gaussian.
    a : int
        Variable label handed to ``diagonalize``.

    Returns
    -------
    pv.Vinecop
        Vine defined on the rearranged structure matrix.
    """
    source_matrix = cop1.matrix
    dim = source_matrix.shape[0]
    pairs, _, first_params = get_ced_cing(source_matrix, cop1)
    target = diagonalize(source_matrix, a)

    def _param_for(tree, edge):
        # Conditioned pair of (tree, edge) in the rearranged matrix, looked up
        # among the source vine's conditioned pairs.
        key = sorted((target[dim-1-edge, edge], target[tree, edge]))
        return first_params[pairs.index(key)]

    trees = [
        [
            pv.Bicop(
                family=pv.BicopFamily.gaussian,
                parameters=[_param_for(tree, edge)]
            )
            for edge in range(dim-1-tree)
        ]
        for tree in range(dim-1)
    ]
    return pv.Vinecop(matrix=target, pair_copulas=trees)

In [4]:
# Dimension of the vine (number of variables).
d = 5
# Gaussian pair-copula parameters: corr[t, e] is used for edge e of tree t when
# the ground-truth vine is built; tree t only has d-1-t edges, so the
# remaining slots are padded with np.nan.
corr = np.array([
    [0.5, 0.25, 0.25, 0.9], 
    [0.1, 0.9, 0.9, np.nan], 
    [0.125, 0.66, np.nan, np.nan],
    [0.55, np.nan, np.nan, np.nan]]
)
# Structure matrix handed to pv.Vinecop: variables are labelled 1..5 and the
# unused lower-right triangle is filled with 0.
vine_matrix = np.array([
    [3,2,3,3,3],
    [2,3,2,2,0],
    [4,4,4,0,0],
    [1,1,0,0,0],
    [5,0,0,0,0]
])

In [5]:
# One Gaussian pair copula per edge: tree t has d-1-t edges and edge e of
# tree t takes its parameter from corr[t, e].
pair_copulas = [
    [
        pv.Bicop(
            family=pv.BicopFamily.gaussian,
            parameters=[corr[t,e]]
        )
        for e in range(d-1-t)
    ]
    for t in range(d-1)
]
# Ground-truth vine and a seeded sample of 1000 rows drawn from it.
cop = pv.Vinecop(matrix=vine_matrix, pair_copulas=pair_copulas)
u = cop.simulate(1000, seeds=[1,2,3,4,5])

In [12]:
# Display the ground-truth vine: one line per pair copula, grouped by tree.
cop

<pyvinecopulib.Vinecop>
** Tree: 0
5,3 <-> Gaussian, parameters = 0.5
1,2 <-> Gaussian, parameters = 0.25
4,3 <-> Gaussian, parameters = 0.25
2,3 <-> Gaussian, parameters = 0.9
** Tree: 1
5,2 | 3 <-> Gaussian, parameters = 0.1
1,3 | 2 <-> Gaussian, parameters = 0.9
4,2 | 3 <-> Gaussian, parameters = 0.9
** Tree: 2
5,4 | 2,3 <-> Gaussian, parameters = 0.125
1,4 | 3,2 <-> Gaussian, parameters = 0.66
** Tree: 3
5,1 | 4,2,3 <-> Gaussian, parameters = 0.55

In [13]:
# NOTE(review): this cell carries execution count 13 but sits before the cell
# (In [7]) that defines `cop2`; on a fresh kernel it raises NameError unless
# the fitting cell further down has been run first.
cop2

<pyvinecopulib.Vinecop>
** Tree: 0
5,3 <-> Gaussian, parameters = 0.468025
4,2 <-> Gaussian, parameters = 0.625168
1,3 <-> Gaussian, parameters = 0.571803
2,3 <-> Gaussian, parameters = 0.900277
** Tree: 1
5,1 | 3 <-> Gaussian, parameters = 0.228236
4,3 | 2 <-> Gaussian, parameters = -0.844873
1,2 | 3 <-> Gaussian, parameters = -0.855547
** Tree: 2
5,2 | 1,3 <-> Gaussian, parameters = 0.465134
4,1 | 3,2 <-> Gaussian, parameters = 0.663084
** Tree: 3
5,4 | 2,1,3 <-> Gaussian, parameters = -0.296952

In [6]:
# Mask var5 (column index 4): each row is dropped independently with
# probability 0.2. A dedicated seeded generator is used so that the mask --
# and therefore the fitted vine and all errors reported below -- is the same
# after every Restart & Run All.
mask_rng = np.random.default_rng(0)
missing = mask_rng.binomial(n=1, p=0.2, size=u.shape[0])
present = (1 - missing)
u_mask = np.copy(u)
u_mask[:, 4] = np.where(missing > 0, np.nan, u_mask[:,4])

In [7]:
# Restrict the family set to Gaussian pair copulas and build cop1 from the
# masked data with those fit controls.
fit_controls = pv.FitControlsVinecop(family_set=[pv.BicopFamily.gaussian])
cop1 = pv.Vinecop(u_mask, controls=fit_controls)
# Re-express the fitted vine with variable 5 at the bottom-left corner of the
# structure matrix; this raises AssertionError when 5 is neither
# cop1.matrix[d-1, 0] nor cop1.matrix[d-2, 0].
cop2 = make_diagonal_copula(cop1, 5)

In [8]:
# Structure matrix of the rearranged vine; the hard-coded h-function chain in
# the imputation cell below follows exactly this layout.
cop2.matrix

array([[3, 2, 3, 3, 3],
       [1, 3, 2, 2, 0],
       [2, 1, 1, 0, 0],
       [4, 4, 0, 0, 0],
       [5, 0, 0, 0, 0]], dtype=uint64)

In [9]:
# Column vectors of shape (n, 1): observed variables 1-4 from the masked data
# and the unmasked values of variable 5 from the original sample.
u1, u2, u3, u4 = (u_mask[:, k:k+1] for k in range(4))
u5_true = u[:, 4:5]

### Impute

In [10]:
# Vine-based imputation of var5 given vars 1-4, following cop2.matrix.
# The chain is evaluated on every row; only rows where var5 is missing
# receive an imputed value further down.
# Naming: F_a_bc stands for the conditional CDF of variable a given b and c
# (this relies on pyvinecopulib's hfunc1/hfunc2 argument conventions).

# Forward pass: conditional CDFs of the observed variables.
F_1_3 = cop2.get_pair_copula(0,2).hfunc2(np.hstack([u1, u3]))[:, None]
F_2_3 = cop2.get_pair_copula(0,3).hfunc2(np.hstack([u2, u3]))[:, None]
F_2_13 = cop2.get_pair_copula(1,2).hfunc1(np.hstack([F_1_3, F_2_3]))[:, None]
F_1_32 = cop2.get_pair_copula(1,2).hfunc2(np.hstack([F_1_3, F_2_3]))[:, None]

F_3_2 = cop2.get_pair_copula(0,3).hfunc1(np.hstack([u2, u3]))[:, None]
F_4_2 = cop2.get_pair_copula(0,1).hfunc2(np.hstack([u4, u2]))[:, None]
F_4_32 = cop2.get_pair_copula(1,1).hfunc2(np.hstack([F_4_2, F_3_2]))[:, None]

F_4_123 = cop2.get_pair_copula(2,1).hfunc2(np.hstack([F_4_32, F_1_32]))[:, None]

# Backward pass: start from a uniform draw and invert the h-functions of
# column 0 from the top tree down to tree 0. The draw comes from its own
# seeded generator (for reproducibility) and is sized from the data instead of
# a hard-coded 1000 rows.
impute_rng = np.random.default_rng(1)
w = impute_rng.uniform(size=(u_mask.shape[0], 1))
inv1 = cop2.get_pair_copula(3,0).hinv2(np.hstack([w, F_4_123]))[:, None]
inv2 = cop2.get_pair_copula(2,0).hinv2(np.hstack([inv1, F_2_13]))[:, None]
inv3 = cop2.get_pair_copula(1,0).hinv2(np.hstack([inv2, F_1_3]))[:, None]
inv4 = cop2.get_pair_copula(0,0).hinv2(np.hstack([inv3, u3]))[:, None]

# Fill only the masked entries of var5 with the simulated values.
u_imp = np.copy(u_mask)
u_imp[missing==1,4] = np.ravel(inv4[missing==1])

get_smae(u_imp, u, u_mask)

There is no entry to be evaluated in variable 0.
There is no entry to be evaluated in variable 1.
There is no entry to be evaluated in variable 2.
There is no entry to be evaluated in variable 3.


array([       nan,        nan,        nan,        nan, 0.85507762])

In [11]:
# zhao/udell
# Baseline: gcimpute's GaussianCopula is fitted on the same masked data and
# its fit_transform output is taken as the imputed matrix.
model = GaussianCopula(verbose=1)
u_imp2 = model.fit_transform(X=u_mask)

# Same metric and argument order as for the vine-based imputation.
get_smae(u_imp2, u, u_mask)

Iter 1: copula parameter change 0.0390, likelihood -4.0209
Iter 2: copula parameter change 0.0084, likelihood -4.0117
Convergence achieved at iteration 2
There is no entry to be evaluated in variable 0.
There is no entry to be evaluated in variable 1.
There is no entry to be evaluated in variable 2.
There is no entry to be evaluated in variable 3.


array([      nan,       nan,       nan,       nan, 0.6828369])