In [1]:
import  torch
import  numpy as np
from dataTST import SynDataMetaTST
from MetaTST import Meta
import argparse
parser = argparse.ArgumentParser()
from utils import MatConvert, TST_MMD_u, MMDu

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
class myargs:
    def __init__(self):
        self.n=50
        self.n_te=150
        self.d=2
        self.K=10
        self.num_meta_tasks=100
        self.epoch=1000
        self.meta_lr=1e-2
        self.update_lr=0.8
        self.update_step=10
        self.closeness=0.3
args = myargs()

In [3]:
def mu_sigma(delta):
    Num_clusters = 2
    mu = np.zeros([Num_clusters, d])
    mu[1] = mu[1] + 0.5
    sigma = [np.identity(d), np.identity(d)]
    sigma[0][0, 1] = delta
    sigma[0][1, 0] = delta
    sigma[1][0, 1] = -delta
    sigma[1][1, 0] = -delta
    return mu, sigma

In [4]:
def gen_data(n, delta = 0.7, kk=0):
    mu_mx_1, sigma_mx_1 = mu_sigma(0)
    mu_mx_2, sigma_mx_2 = mu_sigma(delta)
    s1 = np.zeros([n * Num_clusters, d])
    s2 = np.zeros([n * Num_clusters, d])
    for i in range(Num_clusters):
        np.random.seed(seed=1102*kk + i + n)
        s1[n * (i):n * (i + 1), :] = np.random.multivariate_normal(mu_mx_1[i], sigma_mx_1[i], n)
    for i in range(Num_clusters):
        np.random.seed(seed=819*kk + 1 + i + n)
        s2[n * (i):n * (i + 1), :] = np.random.multivariate_normal(mu_mx_2[i], sigma_mx_2[i], n)
    S = np.concatenate((s1, s2), axis=0)
    return S, s1, s2

In [5]:
def gen_meta_data(num_meta_tasks, n, closeness):
    # generate meta-samples
    data_org = np.random.randn(num_meta_tasks,4*n,2)
    for nn in range(num_meta_tasks):
        delta= 0.6 - closeness + (0.1/num_meta_tasks) * (nn+1)
        data_org[nn] = gen_data(n,delta,nn)[0]
    
    return data_org

In [6]:
def alg_one(maml, db_train, epoch):
    for step in range(epoch):
        x_spt, y_spt, x_qry, y_qry = db_train.next()
        x_spt, y_spt, x_qry, y_qry = torch.from_numpy(x_spt).to(device), torch.from_numpy(y_spt).to(device), \
                                     torch.from_numpy(x_qry).to(device), torch.from_numpy(y_qry).to(device)
        # train meta kernels
        J_value, model_u, sigma, sigma0_u, ep = maml(x_spt, y_spt, x_qry, y_qry) 
        # print objectives from epoch
        if step % 10 == 0:
            print('step:', step, '\ttraining J value:', J_value.item())
    return model_u, sigma, sigma0_u, ep

In [7]:
def get_no_trainable_tensors(model):
    tmp = filter(lambda x: x.requires_grad, model.parameters())
    num = sum(map(lambda x: np.prod(x.shape), tmp))
    print('Total trainable tensors:', num)
    return num

In [8]:
def fine_tune(model, it, learning_rate, S, sigma, sigma0, ep, device, dtype):
    n_te = int(S.shape[0]/4)
    # setup optimizer for training deep kernel
    optimizer = torch.optim.Adam(list(model.parameters()) + [ep] + [sigma] + [sigma0],
                                   lr=learning_rate)
    J_star = np.zeros([it])
    for t in range(it):
        # one way to train kernel with limited data
        n_random = int(n_te*Num_clusters/5)
        selected_cls1 = np.random.choice(n_te * Num_clusters, n_random, False)
        selected_cls2 = np.random.choice(n_te * Num_clusters, n_random, False)
        s1_te_random = s1_te[selected_cls1,:]
        s2_te_random = s2_te[selected_cls2, :]
        S_random = np.concatenate((s1_te_random, s2_te_random), axis=0)
        S_random = MatConvert(S_random, device, dtype)
        # another way to train kernel with limited data (similar performance)
        # S_random = S
        # n_random = N1

        # Compute epsilon, sigma and sigma_0
        ep_ = ep ** 2
        sigma_ = sigma ** 2
        sigma0_ = sigma0 ** 2

        # Compute output of the deep network
        model_output = model(S_random)

        # Compute J (STAT_u)
        TEMP = MMDu(model_output, n_random, S_random, sigma_, sigma0_, ep_)
        mmd_value_temp = -1 * (TEMP[0] + 10 ** (-8))
        mmd_std_temp = torch.sqrt(TEMP[1] + 10 ** (-8))
        if mmd_std_temp.item() == 0:
            print('error!!')
        if np.isnan(mmd_std_temp.item()):
            print('error!!')
        STAT = torch.div(mmd_value_temp, mmd_std_temp)
        J_star[t] = STAT.item()

        # Initialize optimizer and Compute gradient
        optimizer.zero_grad()
        STAT.backward(retain_graph=True)

        # Update weights using gradient descent
        optimizer.step()
        # Print MMD, std of MMD and J
        if t % 100 == 0:
            print("mmd_value: ", -1 * mmd_value_temp.item(), "mmd_std: ", mmd_std_temp.item(), "Statistic: ",
                  -1 * STAT.item())
    return model, sigma, sigma0, ep

In [9]:
def DkTST(model, epoch, learning_rate, S, sigma, sigma0, ep, device, dtype):
    # Setup optimizer for training init kernel
    optimizer = torch.optim.Adam(list(model.parameters()) + [ep] + [sigma] + [sigma0],
                               lr=learning_rate)
    J_star = np.zeros([N_epoch])
    N1 = int(S.shape[0]/2)
    for t in range(epoch):
        # Compute epsilon, sigma and sigma_0
        ep_ = ep ** 2
        sigma_ = sigma ** 2
        sigma0_ = sigma0 ** 2
        # Compute output of the deep network
        model_output = model(S)
        # Compute J (STAT_u)
        TEMP = MMDu(model_output, N1, S, sigma_, sigma0_, ep_)
        mmd_value_temp = -1 * (TEMP[0] + 10 ** (-8))
        mmd_std_temp = torch.sqrt(TEMP[1] + 10 ** (-8))
        if mmd_std_temp.item() == 0:
            print('error!!')
        if np.isnan(mmd_std_temp.item()):
            print('error!!')
        STAT = torch.div(mmd_value_temp, mmd_std_temp)
        # STAT_u = mmd_value_temp # D+M
        J_star[t] = STAT.item()
        # Initialize optimizer and Compute gradient
        optimizer.zero_grad()
        STAT.backward(retain_graph=True)
        # Update weights using gradient descent
        optimizer.step()
        # Print MMD, std of MMD and J
        if t % 100 == 0:
            print("mmd_value_init: ", -1 * mmd_value_temp.item(), "mmd_std_init: ", mmd_std_temp.item(), "Statistic_init: ",
                  -1 * STAT.item())
    return model, sigma, sigma0, ep

In [10]:
torch.manual_seed(222)
torch.cuda.manual_seed_all(222)
np.random.seed(222)

dtype = torch.float
device = torch.device("cuda:0")

d = args.d  # dimension of data
n = args.n  # number of samples in per mode
n_te = args.n_te # number of training samples for the target task
K = args.K  # number of trails
num_meta_tasks = args.num_meta_tasks # number of meta-samples
print('n: ' + str(n) + ' d: ' + str(d))

N_per = 100  # permutation times
alpha = 0.05  # test threshold
x_in = d  # number of neurons in the input layer, i.e., dimension of data
H = 30  # number of neurons in the hidden layer
x_out = 3 * d  # number of neurons in the output layer
learning_rate = 0.00005 # learning rate for MMD-D
N_epoch = 1000  # maximim number of epochs for training
N = 100  # # number of test sets
N_f = 100.0  # number of test sets (float)
list_nte = [50, 80, 100, 120, 150, 200, 250] # number of test samples for the target task

config = [
    ('linear', [H, x_in]),
    ('softplus', [True]),
    ('linear', [H, H]),
    ('softplus', [True]),
    ('linear', [H, H]),
    ('softplus', [True]),
    ('linear', [x_out, H]),
]

print(args)

# Generate variance and co-variance matrix of Q (target task)
Num_clusters = 2

# Naming variables
s1 = np.zeros([n * Num_clusters, d])
s2 = np.zeros([n * Num_clusters, d])
s1_te = np.zeros([n_te * Num_clusters, d])
s2_te = np.zeros([n_te * Num_clusters, d])
J_star_MetaKL = np.zeros([N_epoch])
J_star_DK = np.zeros([N_epoch])
Results = np.zeros([len(list_nte), 2, K])

<torch._C.Generator at 0x7f00f00ef6f0>

n: 50 d: 2
<__main__.myargs object at 0x7f004e9f9ac0>


In [None]:
kk=0
maml_MetaKL = Meta(args, config).to(device)
maml_DK = Meta(args, config).to(device)

get_no_trainable_tensors(maml_MetaKL)
get_no_trainable_tensors(maml_DK)

# get the same init parameters for MMD-D
_,model_DK, sigma_DK, sigma0_DK, ep_DK = maml_DK.get_init()

# generate meta-samples
data_org = gen_meta_data(num_meta_tasks, n, args.closeness)
# get training loader for meta-samples
db_train = SynDataMetaTST(data_org, 10, 2, 150, 50)

# train meta kernels using the generated training loader
# this is the for loop of algorithm1
model_MetaKL, sigma_MetaKL, sigma0_MetaKL, ep_MetaKL = alg_one(maml_MetaKL, db_train, args.epoch)

Total trainable tensors: 2136


2136

Total trainable tensors: 2136


2136

DB: train (80, 200, 2) test (20, 200, 2)
sigma: 0.20559516549110413 sigma0: 0.01671348512172699 epsilon: 2.767058942513399e-17
J_value: 0.010120658203959465
step: 0 	training J value: 0.010120658203959465
sigma: 0.1966267079114914 sigma0: 0.014227871783077717 epsilon: 5.245743182058504e-07
J_value: -0.004747361410409212
sigma: 0.20249001681804657 sigma0: 0.015475654043257236 epsilon: 6.66746636852622e-05
J_value: -0.0004521567316260189
sigma: 0.20700708031654358 sigma0: 0.016662057489156723 epsilon: 0.0002396562194917351
J_value: 0.02556450478732586
sigma: 0.21307934820652008 sigma0: 0.01764904148876667 epsilon: 0.00034253718331456184
J_value: 0.011304693296551704
sigma: 0.21772420406341553 sigma0: 0.019145991653203964 epsilon: 0.000581011117901653
J_value: -0.013781196437776089
sigma: 0.22256694734096527 sigma0: 0.02080332301557064 epsilon: 0.0009742077672854066
J_value: -0.011179816909134388
sigma: 0.22246505320072174 sigma0: 0.022120624780654907 epsilon: 0.0009574690484441817
J_valu

In [29]:
#page 5
# setup meta kernels
torch.manual_seed(1 * 19 + n)
torch.cuda.manual_seed(kk * 19 + n)
epsilonOPT_MetaKL = MatConvert(np.ones(1) * np.sqrt(ep_MetaKL.detach().cpu().numpy()), device, dtype)
epsilonOPT_MetaKL.requires_grad = True
sigmaOPT_MetaKL = MatConvert(np.ones(1) * np.sqrt(sigma_MetaKL.detach().cpu().numpy()), device, dtype)
sigmaOPT_MetaKL.requires_grad = True
sigma0OPT_MetaKL = MatConvert(np.ones(1) * np.sqrt(sigma0_MetaKL.detach().cpu().numpy()), device, dtype)
sigma0OPT_MetaKL.requires_grad = True
print(epsilonOPT_MetaKL.item())

# Generate training data for target tasks
S = gen_data(n_te)[0]
S = MatConvert(S, device, dtype)
N1 = Num_clusters*n_te

# Meta Kernels as init when training with training set from the target task
np.random.seed(seed=1102)
torch.manual_seed(1102)
torch.cuda.manual_seed(1102)
it=101
model_MetaKL, sigmaOPT_MetaKL, sigma0OPT_MetaKL, epsilonOPT_MetaKL = fine_tune(model_MetaKL, it, learning_rate/3, S, sigmaOPT_MetaKL,  sigma0OPT_MetaKL, epsilonOPT_MetaKL, device, dtype)

<torch._C.Generator at 0x7f167cb17690>

1.14047639452312e-16


<torch._C.Generator at 0x7f167cb17690>

error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
mmd_value:  9.99999993922529e-09 mmd_std:  9.999999747378752e-05 Statistic:  0.00010000000474974513
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0., device='cuda:0', grad_fn=<DivBackward0>)
error!!tensor(0

In [46]:
#page 6
# random kernel as init when training with training set from the target task --
# --> validate the consistence of performance of MMD-D
# setup init Kernels 
torch.manual_seed(1 * 19 + n)
torch.cuda.manual_seed(kk * 19 + n)
epsilonOPT_DK = MatConvert(np.ones(1) * np.sqrt(ep_DK.detach().cpu().numpy()), device, dtype)
epsilonOPT_DK.requires_grad = True
sigmaOPT_DK = MatConvert(np.ones(1) * np.sqrt(sigma_DK.detach().cpu().numpy()), device, dtype)
sigmaOPT_DK.requires_grad = True
sigma0OPT_DK = MatConvert(np.ones(1) * np.sqrt(sigma0_DK.detach().cpu().numpy()), device, dtype)
sigma0OPT_DK.requires_grad = True
print(epsilonOPT_DK.item())
np.random.seed(seed=1102)
torch.manual_seed(1102)
torch.cuda.manual_seed(1102)
epoch = 1000
model_DK, sigmaOPT_DK, sigma0OPT_DK, epsilonOPT_DK = DkTST(model_DK, epoch, learning_rate, S, sigmaOPT_DK, sigma0OPT_DK, epsilonOPT_DK, device, dtype)

<torch._C.Generator at 0x7f167cb17690>

6.537652641469549e-09


<torch._C.Generator at 0x7f167cb17690>

mmd_value_init:  0.00010027350799646229 mmd_std_init:  0.004481685347855091 Statistic_init:  0.02237406186759472
mmd_value_init:  0.00016742291336413473 mmd_std_init:  0.005034811794757843 Statistic_init:  0.03325306251645088
mmd_value_init:  0.00020742938795592636 mmd_std_init:  0.005126380827277899 Statistic_init:  0.040463123470544815
mmd_value_init:  0.00023890960437711328 mmd_std_init:  0.005071706138551235 Statistic_init:  0.04710635915398598
mmd_value_init:  0.0002677026204764843 mmd_std_init:  0.005043688230216503 Statistic_init:  0.05307675898075104
mmd_value_init:  0.00029157311655581 mmd_std_init:  0.005000464152544737 Statistic_init:  0.05830920860171318
mmd_value_init:  0.0003124646609649062 mmd_std_init:  0.004842504858970642 Statistic_init:  0.06452541798353195
mmd_value_init:  0.0003327984595671296 mmd_std_init:  0.004418324679136276 Statistic_init:  0.07532231509685516
mmd_value_init:  0.0002969273482449353 mmd_std_init:  0.0030128411017358303 Statistic_init:  0.098553

I should change the name of *_init, what the hell is that it should refer to the method that we are using like
h_MetaKL,h_MetaMKL, h_DK,...

In [47]:
#page 7
# test the trained kernel on the target task (with different sample size: 50, 80, 100, 120, 150, 200, 250)
sigma_MetaKL, sigma0_MetaKL, ep_MetaKL = sigmaOPT_MetaKL**2, sigma0OPT_MetaKL**2, epsilonOPT_MetaKL**2
sigma_DK, sigma0_DK, ep_Dk = sigmaOPT_DK**2, sigma0OPT_DK**2, epsilonOPT_DK**2
for i_test in range(len(list_nte)):
    n_te2 = list_nte[i_test]
    N1_te2 = Num_clusters * n_te2
    H_MetaKL = np.zeros(N)
    T_MetaKL = np.zeros(N)
    M_MetaKL = np.zeros(N)
    H_DK = np.zeros(N)
    T_DK = np.zeros(N)
    M_DK = np.zeros(N)
    np.random.seed(1102)
    count_MetaKL = 0
    count_DK = 0
    for k in range(N):
        # Generate target tasks
        S = gen_data(n_te2)[0]
        S = MatConvert(S, device, dtype)

        # Run two sample test (deep kernel) on generated data
        h_MetaKL, threshold_MetaKL, mmd_value_MetaKL = TST_MMD_u(model_MetaKL(S), N_per, N1_te2, S, sigma_MetaKL, sigma0_MetaKL, ep_MetaKL, alpha, device, dtype)
        h_DK, threshold_DK, mmd_value_DK = TST_MMD_u(model_DK(S), N_per, N1_te2, S, sigma_DK, sigma0_DK, ep_Dk, alpha, device, dtype)

        # Gather results
        count_MetaKL = count_MetaKL + h_MetaKL
        count_DK = count_DK + h_DK
        print("Meta_KL:", count_MetaKL, "MMD-DK:", count_DK)
        H_MetaKL[k] = h_MetaKL
        T_MetaKL[k] = threshold_MetaKL
        M_MetaKL[k] = mmd_value_MetaKL
        H_DK[k] = h_DK
        T_DK[k] = threshold_DK
        M_DK[k] = mmd_value_DK

    # Print test power of MetaKL and MMD-D
    print("Test Power of Meta MMD: ", H_MetaKL.sum() / N_f)
    Results[i_test, 0, kk] = H_MetaKL.sum() / N_f
    print("Test Power of Meta MMD (K times): ", Results[i_test, 0])
    print("Average Test Power of Meta MMD: ", Results[i_test, 0].sum() / (kk + 1))

    print("Test Power of deep MMD: ", H_DK.sum() / N_f)
    Results[i_test, 1, kk] = H_DK.sum() / N_f
    print("Test Power of deep MMD (K times): ", Results[i_test, 1])
    print("Average Test Power of deep MMD: ", Results[i_test, 1].sum() / (kk + 1))

Meta_KL: 0 MMD-DK: 0
Meta_KL: 0 MMD-DK: 0
Meta_KL: 0 MMD-DK: 0
Meta_KL: 0 MMD-DK: 0
Meta_KL: 0 MMD-DK: 0
Meta_KL: 0 MMD-DK: 0
Meta_KL: 0 MMD-DK: 0
Meta_KL: 0 MMD-DK: 0
Meta_KL: 0 MMD-DK: 0
Meta_KL: 0 MMD-DK: 0
Meta_KL: 0 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MMD-DK: 0
Meta_KL: 1 MM

In [48]:
print(Results[:,:,kk])

[[0.04 0.  ]
 [0.   0.  ]
 [0.   0.  ]
 [0.   0.  ]
 [0.   0.  ]
 [0.   0.  ]
 [0.   1.  ]]


In [6]:
import numpy
import torch 
n=2
perm = 10
K = torch.rand(2*n,2*n)
K = K@K.T
w_X = 1.
w_Y = -1.
ws = torch.full((perm+1, 2*n), w_Y)
ws[-1,:n] = w_X
for i in range(perm):
    ws[i, torch.torch.randperm(2*n)[:n].numpy()] = w_X
biased_ests = torch.einsum("pi,ij,pj->p", ws, K, ws)

ws, K, biased_ests

is_X = ws > 0
X_inds = is_X.nonzero()[:, 1].view(perm + 1, n)
Y_inds = (~is_X).nonzero()[:, 1].view(perm + 1, n)

cross_terms = K.take(Y_inds * 2* n + X_inds).sum(1)

cross_terms

(biased_ests - K.trace() + 2 * cross_terms)

Kx = K[:n,:n]
Ky = K[n:,n:]
Kxy = K[:n,n:]
Kyx = K[n:,:n]
H = Kx+Ky-Kxy-Kyx
H.fill_diagonal_(0)
torch.sum(H)