# ZSL target-shift: synthetic data generation
<br>

In [1]:
import pickle
import numpy as np
import os, sys
import csv

-----------
### Synthetic data generation.

In [2]:
# Problem size: each attribute owns `num_feat` features, and there are
# `num_attr` binary attributes.
num_feat = 5
num_attr = 2

# Identity covariance shared by every class-conditional Gaussian.
cov_matrix = np.eye(num_feat)

# For every attribute build a (2, num_feat) array of class means:
# row 0 is drawn from a standard normal, row 1 is row 0 shifted by one unit
# along every coordinate.
origin = np.zeros(num_feat)
unit_shift = np.ones(num_feat)
means = []
for _ in range(num_attr):
    class0_mean = np.random.multivariate_normal(origin, cov_matrix)
    means.append(np.vstack((class0_mean, class0_mean + unit_shift)))
    
# The distance between the two class means of an attribute controls how hard
# the classification task is: the larger the distance, the easier the task.
def mean_alteration(means, d1=1, d2=1):
    """Return class means rescaled so the two classes sit a given distance apart.

    For the first two attributes, the class-1 mean is moved along the line
    through the class-0 and class-1 means so that their Euclidean distance
    becomes ``d1`` (first attribute) or ``d2`` (second attribute). The
    class-0 means are left where they are.

    Args:
        means: sequence whose entry ``a`` holds the two class means of
            attribute ``a`` as rows (``means[a][0]`` and ``means[a][1]``).
            Arrays and nested lists are both accepted.
        d1: target distance between the two class means of attribute 0.
        d2: target distance between the two class means of attribute 1.

    Returns:
        A new list of float arrays. The input is not modified, so the same
        base ``means`` can be reused safely with different distances.

    Raises:
        ValueError: if the two class means of an attribute coincide, because
            the direction along which to separate them is then undefined.
    """
    # Work on float copies so the caller's arrays are never mutated.
    altered = [np.array(pair, dtype=float) for pair in means]
    for index, (pair, target) in enumerate(zip(altered, (d1, d2))):
        direction = pair[1] - pair[0]
        gap = np.linalg.norm(direction)
        if gap == 0:
            raise ValueError(
                'class means of attribute %d coincide; cannot rescale' % index)
        pair[1] = pair[0] + target * direction / gap
    return altered
# Place the two class means of each attribute 1.5 apart.
means_alt = mean_alteration(means, d1=1.5, d2=1.5)

In [3]:
# Co-occurrence of classes. This ratio varies between the train and test sets.
# NOTE(review): within the code visible here, gen_data_cond does not read this
# module-level name, so changing it has no effect on the generated data.
prob0 = 0.5
def get_conditional_prob(corr_prob):
    """Build the attribute co-occurrence matrix for a given agreement level.

    The result is a ``(num_attr, num_attr)`` array of zeros whose top-left
    2x2 corner is filled with ``corr_prob`` on the diagonal and
    ``1 - corr_prob`` off the diagonal. ``gen_data_cond`` indexes it as
    ``cond_prob[y0, 1]`` to decide the second label given the first.
    """
    agree = corr_prob
    disagree = 1 - corr_prob

    cond_mat = np.zeros((num_attr, num_attr))
    cond_mat[0, 0] = cond_mat[1, 1] = agree
    cond_mat[0, 1] = cond_mat[1, 0] = disagree
    return cond_mat

# Every entry is 0.5, i.e. the second label is a fair coin flip regardless of
# the first label.
cond_best = np.full((num_attr, num_attr), 0.5)

def gen_data_cond(num_data, cond_prob, means, prob0=0.5):
    """Sample features and two binary labels with controlled co-occurrence.

    For each sample the first label ``y0`` is 1 with probability ``prob0``.
    The second label ``y1`` is 1 with probability ``cond_prob[y0, 1]``.
    Features for each attribute are drawn from a unit-covariance Gaussian
    centred on ``means[a][label]`` and the two feature vectors are
    concatenated.

    Args:
        num_data: number of samples to draw.
        cond_prob: 2-D array (or nested list); only column 1 is read, as
            ``cond_prob[y0, 1]``.
        means: ``means[a][y]`` is the mean vector of attribute ``a`` for
            label ``y``. The feature dimension of each attribute is taken
            from these vectors rather than from a module-level constant.
        prob0: probability that the first label is 1. Defaults to 0.5,
            the value that was previously hard-coded.

    Returns:
        ``(X, Y)`` where, for ``num_data > 0``, ``X`` has one row per sample
        holding attribute-0 features followed by attribute-1 features, and
        ``Y`` has one ``[y0, y1]`` row per sample. With ``num_data == 0``
        both are empty 1-D arrays.
    """
    cond_prob = np.asarray(cond_prob)

    # The identity covariances do not change between samples, so build them
    # once, sized from the means that were actually passed in.
    cov0 = np.eye(len(means[0][0]))
    cov1 = np.eye(len(means[1][0]))

    X = []
    Y = []
    # The order of the random draws (y0, y1, x0, x1 per sample) is kept so a
    # seeded run reproduces the same data as before.
    for _ in range(num_data):
        y0 = int(np.random.rand() < prob0)
        y1 = int(np.random.rand() < cond_prob[y0, 1])
        Y.append([y0, y1])

        x0 = np.random.multivariate_normal(means[0][y0], cov0, 1)
        x1 = np.random.multivariate_normal(means[1][y1], cov1, 1)
        X.append(np.append(x0, x1))
    return np.array(X), np.array(Y)

# Earlier train/val/test draws, kept for reference only (disabled):
# Xtrain, Ytrain = gen_data_cond(1000, cond_train, means_alt)
# Xval, Yval = gen_data_cond(1000, cond_train, means_alt)
# Xtest, Ytest = gen_data_cond(1000, cond_test, means_alt)

# 1000 samples drawn with the uniform 0.5 co-occurrence matrix.
Xbest, Ybest = gen_data_cond(num_data=1000, cond_prob=cond_best, means=means_alt)

In [4]:
# ---- Experiment configuration ----
num_train = 1000
dp = 1.5   # distance between the class means of attribute 0
da = 1.5   # distance between the class means of attribute 1
corr_train = 0.8
corr_test = 0.5   # placeholder; overwritten by the loop over test correlations
test_data_size = 50000

data_dir = '../synthetic_data/'

# The rescaled means depend only on (dp, da), so compute them once instead of
# once per generated file.
means_alt = mean_alteration(means, dp, da)

# training set (always regenerated and overwritten)
dfilename = 'train_data_n-' + str(num_train) + '_cor-' + str(corr_train) + '_dp-' + str(dp) + '_da-' + str(da) + '.pckl'
cond_train = get_conditional_prob(corr_train)
Xtrain, Ytrain = gen_data_cond(num_train, cond_train, means_alt)
# Pickle data is binary: open in 'wb' so this works on Python 3 and with any
# pickle protocol.
with open(os.path.join(data_dir, dfilename), 'wb') as fp:
    pickle.dump({'X': Xtrain, 'Y': Ytrain}, fp)
print(dfilename)

# test sets with different correlation between attributes; a test file that
# already exists is left untouched.
for corr_test in [0.1, 0.2, 0.3, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]:
    # NOTE(review): this name has a double underscore after 'test_data' and no
    # underscore before 'dp-', unlike the training file name. It is kept
    # byte-for-byte because existing files and loaders may rely on it.
    dfilename = 'test_data_' + '_cor-' + str(corr_test) + 'dp-' + str(dp) + '_da-' + str(da) + '.pckl'
    dpath = os.path.join(data_dir, dfilename)
    if not os.path.exists(dpath):
        cond_test = get_conditional_prob(corr_test)
        Xtest, Ytest = gen_data_cond(test_data_size, cond_test, means_alt)
        # Context manager guarantees the file is flushed and closed.
        with open(dpath, 'wb') as fp:
            pickle.dump({'X': Xtest, 'Y': Ytest}, fp)
        print(dfilename)

train_data_n-1000_cor-0.8_dp-1.5_da-1.5.pckl
test_data__cor-0.1dp-1.5_da-1.5.pckl
test_data__cor-0.2dp-1.5_da-1.5.pckl
test_data__cor-0.3dp-1.5_da-1.5.pckl
test_data__cor-0.4dp-1.5_da-1.5.pckl
test_data__cor-0.45dp-1.5_da-1.5.pckl
test_data__cor-0.5dp-1.5_da-1.5.pckl
test_data__cor-0.55dp-1.5_da-1.5.pckl
test_data__cor-0.6dp-1.5_da-1.5.pckl
test_data__cor-0.65dp-1.5_da-1.5.pckl
test_data__cor-0.7dp-1.5_da-1.5.pckl
test_data__cor-0.75dp-1.5_da-1.5.pckl
test_data__cor-0.8dp-1.5_da-1.5.pckl
test_data__cor-0.85dp-1.5_da-1.5.pckl
test_data__cor-0.9dp-1.5_da-1.5.pckl
