In [1]:
import numpy as np

In [2]:
def generate_cov_matrix(SDs, rng=None):
  """Build a random, symmetric *candidate* covariance matrix from standard deviations.

  The diagonal holds the variances (SD squared). Every off-diagonal pair (i, j)
  gets a single value drawn uniformly between -SDs[i]*SDs[j] and +SDs[i]*SDs[j]
  (the range implied by |correlation| <= 1), mirrored across the diagonal.
  The result is symmetric but is NOT guaranteed to be positive semidefinite;
  the caller has to test that.

  Input - SDs (1-D array-like of non-negative standard deviations of features)
          rng (optional object exposing a numpy-style uniform(low=, high=) method,
               e.g. np.random.default_rng(seed); None uses the global np.random
               state, which is what this function always did before)
  Output - a square float np array of shape (len(SDs), len(SDs))
  Raises - ValueError if SDs is not 1-D or contains a negative value
  """
  #Accept lists/tuples as well as arrays
  sds = np.asarray(SDs, dtype=float)
  if sds.ndim != 1:
    raise ValueError(f"SDs must be 1-D, got an array with shape {sds.shape}")
  if np.any(sds < 0):
    raise ValueError("SDs must be non-negative")

  sampler = np.random if rng is None else rng
  n_features = sds.shape[0]

  #Start from the diagonal: variance = SD squared, zeros everywhere else
  cov = np.diag(sds**2)
  #Fill the upper triangle row by row (same draw order as the original loop)
  #and mirror every value into the lower triangle to keep the matrix symmetric
  for i in range(n_features):
    for j in range(i + 1, n_features):
      #Maximum covariance magnitude is SD[first variable] * SD[second variable]
      max_cov = sds[i] * sds[j]
      covariance = sampler.uniform(low=-max_cov, high=max_cov)
      cov[i, j] = covariance
      cov[j, i] = covariance
  return cov

In [3]:
def generate_valid_cov_matrix(SDs, max_attempts=None):
  """Rejection-sample a valid covariance matrix for the given standard deviations.

  Repeatedly calls generate_cov_matrix(SDs) and keeps the first candidate whose
  eigenvalues are all strictly positive. That makes the result positive definite,
  which is slightly stronger than positive semidefinite.

  Input - SDs (np array of standard deviations of features)
          max_attempts (optional int; stop with a RuntimeError after this many
                        rejected candidates. None, the default, keeps trying
                        indefinitely as before)
  Output - tuple of: (
    a valid, symmetric (square) positive definite covariance matrix;
    eigenvalue array, of same shape as SD array, real and in ascending order;
    matrix whose columns are the matching unit-length eigenvectors
    )
  Raises - RuntimeError if max_attempts candidates were all rejected
  """
  #Counter for candidates tried so far
  attempts = 0
  #Loop until a positive definite matrix is generated (or the budget runs out)
  while max_attempts is None or attempts < max_attempts:
    attempts += 1
    cov = generate_cov_matrix(SDs)
    try:
      #The candidate is symmetric, so use the symmetric solver: eigh returns real
      #eigenvalues (ascending) and orthonormal eigenvectors as columns
      w, v = np.linalg.eigh(cov)
    except np.linalg.LinAlgError:
      #Eigenvalue computation did not converge - discard this candidate and retry.
      #Only this error is caught so Ctrl-C / real bugs are not silently swallowed.
      continue
    #Stop the loop if all eigenvalues are positive
    if np.all(w > 0):
      break
  else:
    raise RuntimeError(
      f"No valid covariance matrix found within {max_attempts} attempts"
    )

  #Print some basic info
  print(f"Valid covariance matrix generated after {attempts} attempts!")
  print(f"Number of negative covariances: {np.count_nonzero(cov < 0)} (of {cov.size} total covariances)")
  return cov, w, v

In [4]:
#Per-feature standard deviations, carried over from the old notebook
SDs = np.array([
  0.7,
  21.1,
  27.9,
  1.0,
  1135.8,
  4.3,
  36.3,
])

In [5]:
#Test out!
#Note: the generator retries random candidates until one passes its eigenvalue
#check, so this cell can be slow - the recorded run below needed 273118 attempts
cov, w, v = generate_valid_cov_matrix(SDs)

Valid covariance matrix generated after 273118 attempts!
Number of negative covariances: 18 (of 49 total covariances)


In [6]:
#Try out random data generation with the covariance matrix from above
#Per-feature means, carried over from the old notebook
means = np.array([1.7, 87.6, 71.4, 1.6, 777.5, 5.2, 76.0])
#Scale each eigenvector column of v by the square root of its eigenvalue.
#Assuming v holds orthonormal eigenvectors of cov, sigma @ sigma.T equals cov,
#so sigma plays the role that the SD plays in one dimension
sigma = v * np.sqrt(w)

#Univariate analogy: this "undoes" a z-score.
#Multiply "z-scores" (randomly sampled from a standard normal distribution) by
#sigma, then add the means, to generate random data vectors (one row per example)
ten_test_examples = (
  sigma @ np.random.normal(loc=0, scale=1, size=(means.shape[0], 10))
  + means[:, np.newaxis]
).T

In [7]:
#Note - some of our features are CLEARLY not Gaussian: they have large SDs, yet
#lab readings can't be negative (the recorded output below contains negative
#values) - how do we handle this behavior?

#Ideas:
# 1. Standardize each variable (e.g. subtract mean from all values, divide by SD),
#    calculate the covariance matrix on that, and generate non-standard Gaussian
#    "seeds"? But lots of work.
# 2. (preferred) Start with a non-Gaussian "seed" in the above code instead of
#    np.random.normal().
# 3. Set the "scale" argument to be <1 - that makes negative values less likely,
#    although it cannot rule them out (hacky but probably ok).
ten_test_examples

array([[ 2.79557737e+00,  8.56643552e+01,  1.44511554e+01,
         1.77894055e+00,  1.44728752e+03,  8.84895631e+00,
         5.74099312e+01],
       [ 2.33041564e+00,  1.05766704e+02,  8.85271280e+01,
         3.45416600e-01, -3.10445037e+02,  5.90187846e+00,
         1.32553275e+01],
       [ 1.95729089e+00,  1.05584186e+02,  5.18764280e+01,
         3.04675258e-01,  2.15542973e+03,  8.57695501e+00,
         6.47662954e+01],
       [ 1.72973892e+00,  1.05234266e+02,  3.41179187e+00,
         6.92008715e-01,  7.57246740e+02,  8.03368054e+00,
         6.41372230e+01],
       [ 1.89000503e+00,  1.08610227e+02,  6.85910369e+01,
         2.02683131e+00,  1.43525767e+03,  1.38224662e+00,
         3.84158174e+01],
       [ 1.78158738e+00,  7.70589609e+01,  4.35952546e+01,
         9.43654923e-01, -2.47613558e+02,  5.02542232e+00,
         5.14556473e+01],
       [ 1.65641900e+00,  8.25360102e+01,  2.76517776e+01,
         2.93852364e+00,  9.06384704e+02,  1.68858363e+00,
         9.9282286