# Generate artificial data sets
Here, we generate data sets to explore the behavior of scores that describe differences between high-dimensional point clouds in the presence of noise and outliers.  
NB: We might want to keep the exact same generator and chang

In [2]:
import os

import numpy as np

In [3]:
# Dimensionality of every simulated point cloud (number of features per point)
D = 100
# Sample size: how many points each simulated data set contains
N = 1000

## Dataset 1 - Reference R
We assume that our data of interest follow a multivariate normal distribution: in morphological profiling, components are to some extent *independent (after removing correlated morphological features) and* normally distributed (after applying a log-transformation).

In [4]:
# Fix the state of NumPy's global random generator so the reference draws are repeatable
np.random.seed(seed=1)

### Data center

In [5]:
# Mean vector of the reference: the origin of the D-dimensional space
mu = [0 for _ in range(D)]

### Data covariance

In [6]:
# Diagonal: variances are drawn from a Gamma distribution with shape 2 and scale 2
# Rationale: values are never negative and vary in scale, with a few large variances
sigma_diag = np.diag(np.random.gamma(2, 2, D))

The following may lead to non positive-semidefinite matrices:

```
# Lower and upper triangles: some relation but no strong covariances
# Rationale: in a morphological profiling, we remove highly correlated features
# E.g. for a correlation threshold of 0.5, since cov(x,y) = corr(x,y) * sigma(x) * sigma(y)
# This corresponds to a covariance threshold of 0.5 * sigma(x) * sigma(y), which is 2 on average
# We select a centered normal distribution of variance 0.75 based on the 99-th percentile
sigma_tril = np.random.normal(0, 0.75, [D,D])
sigma_tril = np.tril(sigma_tril ) + np.tril(sigma_tril , -1).T
# Possible improvement: rescale all values based on the actual standard deviations of the features studied 
# (instead of using the expectation)
```

In [7]:
# Draw weak covariances (normal, mean 0, standard deviation 0.05) and keep the lower triangle
sigma_tril = np.tril(np.random.normal(loc=0, scale=0.05, size=[D, D]))
# Mirror the strictly-lower triangle above the diagonal to make the matrix symmetric
sigma_tril = sigma_tril + np.tril(sigma_tril, k=-1).T

In [8]:
# Assemble the covariance matrix: the diagonal of sigma_tril is subtracted again,
# so the variances on the diagonal come from sigma_diag only
sigma = sigma_diag + sigma_tril - np.diag(np.diag(sigma_tril))

In [9]:
# The covariance matrix must be exactly symmetric
assert np.array_equal(sigma, sigma.T), "sigma is not symmetric"
# The covariance matrix must be positive definite, i.e. all eigenvalues strictly positive.
# For a symmetric matrix this is equivalent to Sylvester's criterion (all leading principal
# minors > 0), but it avoids computing one determinant per leading sub-matrix, which is
# slower and can overflow or underflow when the matrix is large or badly scaled.
assert np.all(np.linalg.eigvalsh(sigma) > 0), "sigma is not positive definite"

In [10]:
# Sample N points from the multivariate normal distribution defined by mu and sigma
matR = np.random.multivariate_normal(mean=mu, cov=sigma, size=N)

In [11]:
# Create the output folder if needed so this cell also runs on a fresh checkout
os.makedirs("Data", exist_ok=True)
# Write the reference point cloud as comma-separated values
np.savetxt("Data/matR.csv", matR, delimiter=",")

## Dataset 2 - Negative control N
This data set is generated with the same procedure and distribution parameters as the reference R, but with a different random seed.

In [21]:
# Use a different fixed seed than the reference so the negative control is an independent, repeatable draw
np.random.seed(seed=2)

In [22]:
# Mean vector of the negative control: the origin of the D-dimensional space
mu = [0 for _ in range(D)]

### Data covariance

In [23]:
# Diagonal: variances are drawn from a Gamma distribution with shape 2 and scale 2
# Rationale: values are never negative and vary in scale, with a few large variances
sigma_diag = np.diag(np.random.gamma(2, 2, D))

In [25]:
# Draw weak covariances (normal, mean 0, standard deviation 0.05) and keep the lower triangle
sigma_tril = np.tril(np.random.normal(loc=0, scale=0.05, size=[D, D]))
# Mirror the strictly-lower triangle above the diagonal to make the matrix symmetric
sigma_tril = sigma_tril + np.tril(sigma_tril, k=-1).T

In [16]:
# Assemble the covariance matrix: the diagonal of sigma_tril is subtracted again,
# so the variances on the diagonal come from sigma_diag only
sigma = sigma_diag + sigma_tril - np.diag(np.diag(sigma_tril))

In [17]:
# The covariance matrix must be exactly symmetric
assert np.array_equal(sigma, sigma.T), "sigma is not symmetric"
# The covariance matrix must be positive definite, i.e. all eigenvalues strictly positive.
# For a symmetric matrix this is equivalent to Sylvester's criterion (all leading principal
# minors > 0), but it avoids computing one determinant per leading sub-matrix, which is
# slower and can overflow or underflow when the matrix is large or badly scaled.
assert np.all(np.linalg.eigvalsh(sigma) > 0), "sigma is not positive definite"

In [18]:
# Sample N points from the multivariate normal distribution defined by mu and sigma
matN = np.random.multivariate_normal(mean=mu, cov=sigma, size=N)

In [19]:
# Create the output folder if needed so this cell also runs on a fresh checkout
os.makedirs("Data", exist_ok=True)
# Write the negative-control point cloud as comma-separated values
np.savetxt("Data/matN.csv", matN, delimiter=",")

In [24]:
# Display the diagonal matrix of variances for a visual check
sigma_diag

array([[2.3689073 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 3.19015343, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.51599864, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 2.02328286, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 7.26938248,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        2.2926934 ]])

In [26]:
# Display the symmetric matrix of covariance draws for a visual check
sigma_tril

array([[ 0.02571854,  0.01138837, -0.06374812, ...,  0.06145864,
        -0.06970232,  0.03289393],
       [ 0.01138837,  0.03638444,  0.05242452, ...,  0.00722283,
         0.02652863, -0.05766976],
       [-0.06374812,  0.05242452,  0.08626016, ...,  0.05385503,
        -0.01055927, -0.0617021 ],
       ...,
       [ 0.06145864,  0.00722283,  0.05385503, ..., -0.00786091,
        -0.0336918 , -0.09713829],
       [-0.06970232,  0.02652863, -0.01055927, ..., -0.0336918 ,
        -0.00055249,  0.03202692],
       [ 0.03289393, -0.05766976, -0.0617021 , ..., -0.09713829,
         0.03202692,  0.05724172]])

In [27]:
# Display the assembled covariance matrix (the same combination of parts that is assigned to `sigma`):
# variances from sigma_diag on the diagonal, values from sigma_tril everywhere else
sigma_diag + sigma_tril - np.eye(D)*np.diag(sigma_tril)

array([[ 2.36890730e+00,  1.13883686e-02, -6.37481218e-02, ...,
         6.14586407e-02, -6.97023197e-02,  3.28939302e-02],
       [ 1.13883686e-02,  3.19015343e+00,  5.24245169e-02, ...,
         7.22283304e-03,  2.65286326e-02, -5.76697575e-02],
       [-6.37481218e-02,  5.24245169e-02,  5.15998637e-01, ...,
         5.38550277e-02, -1.05592689e-02, -6.17021022e-02],
       ...,
       [ 6.14586407e-02,  7.22283304e-03,  5.38550277e-02, ...,
         2.02328286e+00, -3.36918009e-02, -9.71382858e-02],
       [-6.97023197e-02,  2.65286326e-02, -1.05592689e-02, ...,
        -3.36918009e-02,  7.26938248e+00,  3.20269216e-02],
       [ 3.28939302e-02, -5.76697575e-02, -6.17021022e-02, ...,
        -9.71382858e-02,  3.20269216e-02,  2.29269340e+00]])

## Dataset 3 - Positive control P

## Dataset 4 - Uniform noise U

## Dataset 5 - Negative with noise NO

## Dataset 6 - Positive with noise PO