# Generate artificial data sets
Here, we generate data sets to explore the behavior of scores describing differences between high-dimensional point clouds in the presence of noise and outliers.

In [1]:
import numpy as np

In [2]:
# Sample size of the reference (control) dataset R
NR = 2_000
# Sample size of every other dataset (N, PS, PR)
N = 1_000
# Dimensionality (number of features) shared by all datasets
D = 100

## Dataset 1 - Reference R
We assume our data of interest follow a multivariate normal distribution: in morphological profiling, components are to some extent *independent* (after removing correlated morphological features) and *normally distributed* (after a log-transformation).

In [3]:
# Fix the global NumPy RNG so the reference dataset is reproducible
np.random.seed(seed=1)

### Data center

In [4]:
# Mean vector of the reference: the origin of the D-dimensional space
mu = [0 for _ in range(D)]

### Data covariance

In [5]:
# Diagonal: variances are drawn from a Gamma distribution (shape=2, scale=2)
# Rationale: strictly positive values, varied scales, occasional large variances
sigma_diag = np.diag(np.random.gamma(shape=2, scale=2, size=D))

The following may lead to non positive-semidefinite matrices:

```
# Lower and upper triangles: some relation but no strong covariances
# Rationale: in a morphological profiling, we remove highly correlated features
# E.g. for a correlation threshold of 0.5, since cov(x,y) = corr(x,y) * sigma(x) * sigma(y)
# This corresponds to a covariance threshold of 0.5 * sigma(x) * sigma(y) which is 2 in average
# We select a centered normal distribution of variance 0.75 based on the 99-th percentile
sigma_tril = np.random.normal(0, 0.75, [D,D])
sigma_tril = np.tril(sigma_tril ) + np.tril(sigma_tril , -1).T
# Possible improvement: rescale all values based on the actual standard deviations of the features studied 
# (instead of using the expectation)
```

In [6]:
# Draw a full D x D matrix of weak covariances (standard deviation 0.05) ...
sigma_tril = np.random.normal(loc=0, scale=0.05, size=(D, D))
# ... and symmetrize it by mirroring the strict lower triangle onto the upper one
sigma_tril = np.tril(sigma_tril) + np.triu(sigma_tril.T, k=1)

In [7]:
# Combine variances and covariances; the diagonal of sigma_tril is subtracted
# so that only its off-diagonal entries contribute to sigma
sigma = sigma_diag + sigma_tril - np.diag(np.diag(sigma_tril))

In [8]:
# A valid covariance matrix must be exactly symmetric ...
assert np.array_equal(sigma, sigma.T), "sigma is not symmetric"
# ... and positive definite, i.e. all eigenvalues are strictly positive.
# This is the property Sylvester's criterion (all leading principal minors > 0)
# establishes, checked here without computing D determinants, which are costly
# and can overflow or underflow as D grows.
assert np.linalg.eigvalsh(sigma).min() > 0, "sigma is not positive definite"

In [9]:
# Sample NR points of the reference from the multivariate normal N(mu, sigma)
matR = np.random.multivariate_normal(mean=mu, cov=sigma, size=NR)

In [10]:
# Persist the reference as comma-separated values (one row per point)
np.savetxt(fname="Data/matR.csv", X=matR, delimiter=",")

## Dataset 2 - Negative control N
This is drawn from the same distribution (same mean and covariance) as the reference data set R, using a different random seed and sample size.

In [11]:
# New seed: same distribution as the reference, but independent draws
np.random.seed(seed=2)

### Data covariance

In [12]:
# Sample N points using the current mu and sigma
matN = np.random.multivariate_normal(mean=mu, cov=sigma, size=N)

In [13]:
# Persist the negative control as comma-separated values
np.savetxt(fname="Data/matN.csv", X=matN, delimiter=",")

## Dataset 3 - Positive control (shifted) PS

In [14]:
# New seed for the shifted positive control
np.random.seed(seed=3)

### Data center

In [15]:
# The shifted positive control is no longer centered on 0:
# each coordinate of its mean is drawn from a standard normal distribution
mu = np.random.normal(loc=0, scale=1, size=D)

In [16]:
# Sample N points using the current mu and sigma
matPS = np.random.multivariate_normal(mean=mu, cov=sigma, size=N)

In [17]:
# Persist the shifted positive control as comma-separated values
np.savetxt(fname="Data/matPS.csv", X=matPS, delimiter=",")

## Dataset 4 - Positive control (reshaped) PR
The covariance matrix is built with the same procedure and hyperparameters as for the reference data set, but from a different random seed, so the resulting covariance is different.

In [18]:
# New seed so that the covariance built in this section differs from the one built under seed 1
np.random.seed(seed=4)

### Data center

In [19]:
# The reshaped positive control is centered on 0, like the reference
# (this resets the `mu` that the shifted dataset overwrote)
mu = [0 for _ in range(D)]

### Data covariance

In [20]:
# Diagonal: variances are drawn from a Gamma distribution (shape=2, scale=2)
# Rationale: strictly positive values, varied scales, occasional large variances
sigma_diag = np.diag(np.random.gamma(shape=2, scale=2, size=D))

In [21]:
# Draw a full D x D matrix of weak covariances (standard deviation 0.05) ...
sigma_tril = np.random.normal(loc=0, scale=0.05, size=(D, D))
# ... and symmetrize it by mirroring the strict lower triangle onto the upper one
sigma_tril = np.tril(sigma_tril) + np.triu(sigma_tril.T, k=1)

In [22]:
# Combine variances and covariances; the diagonal of sigma_tril is subtracted
# so that only its off-diagonal entries contribute to sigma
sigma = sigma_diag + sigma_tril - np.diag(np.diag(sigma_tril))

In [23]:
# A valid covariance matrix must be exactly symmetric ...
assert np.array_equal(sigma, sigma.T), "sigma is not symmetric"
# ... and positive definite, i.e. all eigenvalues are strictly positive.
# This is the property Sylvester's criterion (all leading principal minors > 0)
# establishes, checked here without computing D determinants, which are costly
# and can overflow or underflow as D grows.
assert np.linalg.eigvalsh(sigma).min() > 0, "sigma is not positive definite"

In [24]:
# Sample N points using the current mu and sigma
matPR = np.random.multivariate_normal(mean=mu, cov=sigma, size=N)

In [25]:
# Persist the reshaped positive control as comma-separated values
np.savetxt(fname="Data/matPR.csv", X=matPR, delimiter=",")

## Dataset 5 - Uniform noise U

## Dataset 6 - Negative with noise NO

## Dataset 7 - Positive with noise PO