In [1]:
# imports
import numpy as np

In [64]:
def generate_data(n, p, num_samples, sigma, noise, degree, rng=None):
    '''
    Generate synthetic (feature, cost) data for an nxn grid

    A random binary matrix B* of shape [d, p] is drawn once per call; each
    cost vector is then a polynomial of B* x, scaled by multiplicative noise:

        c = ((B* x / sqrt(p) + 3) ** degree + 1) * eps,   eps ~ U[1-noise, 1+noise]

    Parameters:
        int n: Size of nxn grid
        int p: number of features
        int num_samples: number of data points to generate
        array sigma: standard deviation of each feature dimension (scalar or array of length p).
            It is passed as `scale` to the normal sampler, so x_i ~ N(0, sigma_i**2).
            NOTE: this is a standard deviation, not a variance.
        float noise: half-width of the multiplicative noise term applied to the cost vector,
            sampled from a uniform distribution in [1-noise, 1+noise]
        int degree: polynomial degree of generated cost vector. When degree=1, expected value
            of c is linear in x. Degree > 1 controls the amount of model misspecification.
        rng: optional source of randomness, for reproducibility.
            - None (default): draw from the global `np.random` state (legacy behaviour,
              still controllable through `np.random.seed`).
            - int: used as a seed for `np.random.default_rng`.
            - `np.random.Generator` / `np.random.RandomState`: used as-is.

    Returns:
        np.array X: feature data of dimension [num_samples, p]
        np.array C: cost data of dimension [num_samples, d], where d = 2*n*(n-1)
    '''
    # Resolve the random source. Falling back to the np.random module keeps the exact
    # draw order and values of the original implementation for existing callers.
    if rng is None:
        rng = np.random
    elif isinstance(rng, (int, np.integer)):
        rng = np.random.default_rng(rng)

    # Define number of edges based on grid size, i.e. size of cost vector
    d = n * (n - 1) * 2

    # Define the parameters of the true model:
    # each entry of B* is a Bernoulli RV equal to 1 with probability 0.5
    B_star = rng.binomial(n=1, p=0.5, size=[d, p])

    # Generate feature data: independent zero-mean Gaussian entries whose standard
    # deviation is given per column by sigma. Each row is one data point of size p.
    X = rng.normal(loc=0, scale=sigma, size=[num_samples, p])

    # Generate cost data
    noise_vector = rng.uniform(low=1 - noise, high=1 + noise, size=[num_samples, d])  # i.i.d noise terms

    # Same operation order as the original one-liner (scale B*, then multiply by X^T),
    # so results are unchanged for a given random state.
    linear_term = (1 / np.sqrt(p) * B_star) @ X.T     # shape [d, num_samples]
    noiseless_cost = ((linear_term + 3) ** degree + 1).T  # shape [num_samples, d]
    C = noiseless_cost * noise_vector

    return X, C

In [65]:
# Demo configuration for a small grid instance
n, p = 5, 5  # grid side length (nxn grid) and number of features
num_samples = 10  # number of data points to draw
sigma = [0.1, 0.2, 0.3, 0.4, 0.5]  # one entry per feature; used as `scale` of the normal draw
noise = 0.25  # costs are multiplied by a factor drawn from [1-noise, 1+noise]
degree = 3  # polynomial degree of the cost model

X, C = generate_data(
    n=n,
    p=p,
    num_samples=num_samples,
    sigma=sigma,
    noise=noise,
    degree=degree,
)
print("X dim: ", X.shape)
print("C dim: ", C.shape)

X dim:  (10, 5)
C dim:  (10, 40)


In [66]:
# Display the sampled feature matrix: one row per data point, one column per feature
X

array([[-0.14252375,  0.08956791, -0.07488203, -0.24252375,  0.3988807 ],
       [ 0.14308866, -0.14129513, -0.2797642 , -0.02655548, -0.54917618],
       [ 0.11881205,  0.22328956,  0.03273052,  0.56717117, -0.05293378],
       [ 0.21921007,  0.13098133,  0.10269018, -0.31600689, -0.16701512],
       [-0.11736495,  0.39061607, -0.0400777 ,  0.02381611, -1.06366179],
       [ 0.04202911,  0.41344886,  0.00862802, -0.32510424, -0.33476599],
       [ 0.10353665, -0.11750462,  0.0896954 , -0.02197983, -0.01743142],
       [-0.156595  , -0.15153743, -0.36610813,  0.33766014,  0.49862375],
       [-0.07491527, -0.00213333,  0.20651231,  0.12647256, -0.36521422],
       [-0.06226342, -0.01922638, -0.11215412,  0.20668124,  0.26729641]])

In [67]:
# Display the generated cost matrix: one row per data point, one column per entry of the cost vector
C

array([[25.03202216, 25.15969043, 32.35718048, 22.37391577, 27.41964448,
        29.75896583, 31.20131744, 26.16440084, 30.84755794, 21.28578763,
        18.63889797, 31.51545461, 31.40775054, 33.10842513, 28.41036205,
        21.83907518, 23.83683501, 23.95040237, 31.63938607, 29.23882205,
        30.37066849, 26.14885096, 24.63828788, 38.42050896, 28.74157347,
        31.03294449, 28.51513564, 25.70748383, 28.19996802, 30.1598491 ,
        30.41032121, 24.7329057 , 23.40955117, 32.06830781, 20.1292397 ,
        27.46829134, 21.87626142, 21.53207003, 35.59618818, 34.32833394],
       [18.0951556 , 25.35669189, 24.63990139, 21.57467531, 32.73971526,
        23.52586212, 15.95393763, 20.97471697, 29.32880312, 24.61358381,
        23.05586052, 22.86964975, 19.60191461, 22.34092855, 32.29564945,
        28.17690608, 21.71618446, 13.70796194, 16.87825505, 17.88647836,
        21.26364578, 17.26240356, 15.86493157, 18.775289  , 25.27512756,
        18.73455774, 32.08148454, 26.4187617 , 19.

In [None]:
|