# Generate artificial data sets
Here, we generate data sets to explore the behavior of scores describing differences between high-dimensional point clouds in the presence of noise and outliers.

In [None]:
import os

import numpy as np

In [None]:
# Dimensionality shared by every generated dataset
D = 100
# Sample size of the control (reference) dataset
NR = 2000
# Sample size of each of the other datasets
N = 1000

## Dataset 1 - Reference R
We assume that our data of interest follow a multivariate normal distribution: in morphological profiling, components are to some extent *independent (after removing correlated morphological features) and* normally distributed (after a log-transformation).

In [None]:
# Seed NumPy's global random generator so that the reference dataset is reproducible
np.random.seed(1)

### Data center

In [None]:
# Mean vector of the reference distribution: the origin of the D-dimensional space
refMu = [0] * D

### Data covariance

In [None]:
# Diagonal: variances follow a Gamma distribution with shape and scale parameters both equal to 2
# Rationale: Some variability in scales with some high values, and no negative values
sigma_diag = np.diag(np.random.gamma(2, 2, D))

The following may lead to non positive-semidefinite matrices:

```
# Lower and upper triangles: some relation but no strong covariances
# Rationale: in a morphological profiling, we remove highly correlated features
# E.g. for a correlation threshold of 0.5, since cov(x,y) = corr(x,y) * sigma(x) * sigma(y)
# This corresponds to a covariance threshold of 0.5 * sigma(x) * sigma(y), which is 2 on average
# We select a centered normal distribution of variance 0.75 based on the 99-th percentile
sigma_tril = np.random.normal(0, 0.75, [D,D])
sigma_tril = np.tril(sigma_tril ) + np.tril(sigma_tril , -1).T
# Possible improvement: rescale all values based on the actual standard deviations of the features studied 
# (instead of using the expectation)
```

In [None]:
# Off-diagonal terms: weak Gaussian covariances (centered, standard deviation 0.05)
# Keep the lower triangle of the random draw, then mirror its strictly lower part to get a symmetric matrix
sigma_tril = np.tril(np.random.normal(0, 0.05, [D, D]))
sigma_tril = sigma_tril + np.tril(sigma_tril, -1).T

In [None]:
# Assemble the reference covariance: sampled variances on the diagonal, mirrored covariances elsewhere
# (the diagonal of sigma_tril is subtracted again so that it does not alter the variances)
refSigma = sigma_diag + sigma_tril - np.diag(np.diag(sigma_tril))

In [None]:
# Sanity checks: a usable covariance matrix must be symmetric and positive definite
# Symmetry: the matrix must be exactly equal to its transpose
assert np.array_equal(refSigma, refSigma.T)
# Positive definiteness: all eigenvalues of the symmetric matrix are strictly positive.
# This is equivalent to Sylvester's criterion (all leading principal minors > 0) but avoids
# computing D determinants, which is costly and can overflow or underflow for large D.
assert np.all(np.linalg.eigvalsh(refSigma) > 0)

In [None]:
# Draw the NR reference points from the multivariate normal distribution of mean refMu and covariance refSigma
matR = np.random.multivariate_normal(refMu, refSigma, NR)

In [None]:
# Make sure the output directory exists before the first file is written
os.makedirs("Data", exist_ok=True)
# Store the reference dataset as comma-separated values (one point per row)
np.savetxt("Data/matR.csv", matR, delimiter=",")

## Dataset 2 - Negative control N
This is generated with the same generator (same mean and covariance) as the reference data set.

In [None]:
# Seed NumPy's global random generator so that the negative control is reproducible
np.random.seed(2)

### Data covariance

In [None]:
# Draw N points from the reference distribution (mean refMu, covariance refSigma)
matN = np.random.multivariate_normal(refMu, refSigma, N)

In [None]:
# Store the negative control as comma-separated values (one point per row)
np.savetxt("Data/matN.csv", matN, delimiter=",")

## Dataset 3 - Positive control (shifted) PS

In [None]:
# Seed NumPy's global random generator so that the shifted positive control is reproducible
np.random.seed(3)

### Data center

In [None]:
# Unlike the reference, this dataset is not centered on 0:
# each component of its mean is drawn from a standard normal distribution
mu = np.random.normal(0, 1, D)

In [None]:
# Draw N points with the shifted mean mu and the reference covariance refSigma
matPS = np.random.multivariate_normal(mu, refSigma, N)

In [None]:
# Store the shifted positive control as comma-separated values (one point per row)
np.savetxt("Data/matPS.csv", matPS, delimiter=",")

## Dataset 4 - Positive control (reshaped) PR
This is generated with the same parameters as the reference data set but with a differently seeded generator, so its covariance matrix is different.

In [None]:
# Seed NumPy's global random generator so that the reshaped positive control is reproducible
np.random.seed(4)

### Data covariance

In [None]:
# Diagonal: variances follow a Gamma distribution with shape and scale parameters both equal to 2
# Rationale: Some variability in scales with some high values, and no negative values
sigma_diag = np.diag(np.random.gamma(2, 2, D))

In [None]:
# Off-diagonal terms: weak Gaussian covariances (centered, standard deviation 0.05)
# Keep the lower triangle of the random draw, then mirror its strictly lower part to get a symmetric matrix
sigma_tril = np.tril(np.random.normal(0, 0.05, [D, D]))
sigma_tril = sigma_tril + np.tril(sigma_tril, -1).T

In [None]:
# Assemble the new covariance: sampled variances on the diagonal, mirrored covariances elsewhere
# (the diagonal of sigma_tril is subtracted again so that it does not alter the variances)
sigma = sigma_diag + sigma_tril - np.diag(np.diag(sigma_tril))

In [None]:
# Sanity checks: a usable covariance matrix must be symmetric and positive definite
# Symmetry: the matrix must be exactly equal to its transpose
assert np.array_equal(sigma, sigma.T)
# Positive definiteness: all eigenvalues of the symmetric matrix are strictly positive.
# This is equivalent to Sylvester's criterion (all leading principal minors > 0) but avoids
# computing D determinants, which is costly and can overflow or underflow for large D.
assert np.all(np.linalg.eigvalsh(sigma) > 0)

In [None]:
# Draw N points centered like the reference (refMu) but with the newly sampled covariance sigma
matPR = np.random.multivariate_normal(refMu, sigma, N)

In [None]:
# Store the reshaped positive control as comma-separated values (one point per row)
np.savetxt("Data/matPR.csv", matPR, delimiter=",")

## Dataset 5 - Uniform noise U

In [None]:
# Seed NumPy's global random generator so that the uniform-noise dataset is reproducible
np.random.seed(5)

In [None]:
# Draw N points from the reference distribution, then add independent uniform noise in [-1, 1) to every coordinate
matU = np.random.multivariate_normal(refMu, refSigma, N) + np.random.uniform(-1,1, size = [N,D])

In [None]:
# Store the uniform-noise dataset as comma-separated values (one point per row)
np.savetxt("Data/matU.csv", matU, delimiter=",")

## Dataset 6 - Negative with few outliers NF

In [None]:
# Seed NumPy's global random generator so that this dataset is reproducible
np.random.seed(6)

In [None]:
# Draw N points from the reference distribution (mean refMu, covariance refSigma)
matNF = np.random.multivariate_normal(refMu, refSigma, N)

In [None]:
# Mean, median and standard deviation over all entries, one value per line
print(matNF.mean(), np.median(matNF), np.std(matNF), sep="\n")

In [None]:
# Turn 1 out of 10 points into outliers (round(N/10) distinct rows, drawn without replacement):
# for each of them, multiply 1 out of 10 features by 2 (round(D/10) distinct columns, redrawn for every outlier)
for iOutlier in np.random.choice(N, round(N/10), replace = False):
    matNF[iOutlier,np.random.choice(D, round(D/10), replace = False)] *= 2

In [None]:
# Mean, median and standard deviation over all entries, one value per line
print(matNF.mean(), np.median(matNF), np.std(matNF), sep="\n")

In [None]:
# Store the dataset as comma-separated values (one point per row)
np.savetxt("Data/matNF.csv", matNF, delimiter=",")

## Dataset 7 - Negative with many outliers NM

In [None]:
# Seed NumPy's global random generator so that this dataset is reproducible
np.random.seed(7)

In [None]:
# Draw N points from the reference distribution (mean refMu, covariance refSigma)
matNM = np.random.multivariate_normal(refMu, refSigma, N)

In [None]:
# Mean, median and standard deviation over all entries, one value per line
print(matNM.mean(), np.median(matNM), np.std(matNM), sep="\n")

In [None]:
# Turn 1 out of 3 points into outliers (round(N/3) distinct rows, drawn without replacement):
# for each of them, multiply 1 out of 3 features by 2 (round(D/3) distinct columns, redrawn for every outlier)
for iOutlier in np.random.choice(N, round(N/3), replace = False):
    matNM[iOutlier,np.random.choice(D, round(D/3), replace = False)] *= 2

In [None]:
# Mean, median and standard deviation over all entries, one value per line
print(matNM.mean(), np.median(matNM), np.std(matNM), sep="\n")

In [None]:
# Store the dataset as comma-separated values (one point per row)
np.savetxt("Data/matNM.csv", matNM, delimiter=",")

## Dataset 8 - Positive with few outliers PF

In [None]:
# Seed NumPy's global random generator so that this dataset is reproducible
np.random.seed(8)

### Data center

In [None]:
# Unlike the reference, this dataset is not centered on 0:
# each component of its mean is drawn from a standard normal distribution
mu = np.random.normal(0, 1, D)

In [None]:
# Draw N points with the shifted mean mu and the reference covariance refSigma
matPF = np.random.multivariate_normal(mu, refSigma, N)

In [None]:
# Mean, median and standard deviation over all entries, one value per line
print(matPF.mean(), np.median(matPF), np.std(matPF), sep="\n")

In [None]:
# Turn 1 out of 10 points into outliers (round(N/10) distinct rows, drawn without replacement):
# for each of them, multiply 1 out of 10 features by 2 (round(D/10) distinct columns, redrawn for every outlier)
for iOutlier in np.random.choice(N, round(N/10), replace = False):
    matPF[iOutlier,np.random.choice(D, round(D/10), replace = False)] *= 2

In [None]:
# Mean, median and standard deviation over all entries, one value per line
print(matPF.mean(), np.median(matPF), np.std(matPF), sep="\n")

In [None]:
# Store the dataset as comma-separated values (one point per row)
np.savetxt("Data/matPF.csv", matPF, delimiter=",")

## Dataset 9 - Positive with many outliers PM

In [None]:
# Seed NumPy's global random generator so that this dataset is reproducible
np.random.seed(9)

### Data center

In [None]:
# Unlike the reference, this dataset is not centered on 0:
# each component of its mean is drawn from a standard normal distribution
mu = np.random.normal(0, 1, D)

In [None]:
# Draw N points with the shifted mean mu and the reference covariance refSigma
matPM = np.random.multivariate_normal(mu, refSigma, N)

In [None]:
# Mean, median and standard deviation over all entries, one value per line
print(matPM.mean(), np.median(matPM), np.std(matPM), sep="\n")

In [None]:
# Turn 1 out of 3 points into outliers (round(N/3) distinct rows, drawn without replacement):
# for each of them, multiply 1 out of 3 features by 2 (round(D/3) distinct columns, redrawn for every outlier)
for iOutlier in np.random.choice(N, round(N/3), replace = False):
    matPM[iOutlier,np.random.choice(D, round(D/3), replace = False)] *= 2

In [None]:
# Mean, median and standard deviation over all entries, one value per line
print(matPM.mean(), np.median(matPM), np.std(matPM), sep="\n")

In [None]:
# Store the dataset as comma-separated values (one point per row)
np.savetxt("Data/matPM.csv", matPM, delimiter=",")