# Generate artificial data sets
Here, we generate data sets to explore the behavior of scores describing differences between high-dimensional point clouds in presence of noise and outliers.  

This script outputs the following datasets:

* `matR.csv` - Reference dataset, following a centered multivariate normal distribution
* `matN.csv` - Negative control dataset, using the same generator used for reference dataset
* `matPS.csv` - Positive control dataset, shifted (center coordinates now follow a centered normal distribution with standard deviation 0.5) while the covariance is kept identical to reference dataset
* `matPR.csv` - Positive control dataset, centered but with the squared covariance matrix compared to the reference dataset

As well as `matRo.csv`, `matNo.csv`, `matPSo.csv` and `matPRo.csv` which follow the same rules but include outliers.

In [None]:
using Random, Distributions, RCall
using DataFrames, CSV

using LinearAlgebra: I, Diagonal, diag, det, qr, Symmetric

In [None]:
@rlibrary ggplot2
@rlibrary extrafont

In [None]:
R"""
# robustbase is attached here because it is used later for the MCD computation
library(robustbase)

# Plotting stack used to customize the ggplot appearance
library(ggplot2)
library(extrafont)

# Import the TrueType fonts found under /tmp/.fonts/, then load them
ttf_import(paths = "/tmp/.fonts/")
loadfonts()

# Base theme of the notebook: theme_light() without minor grid lines,
# dark-grey Arial text and lines, and a faint grey legend background
customTheme <- theme_light() +
    theme(
        panel.grid.minor = element_blank(),
        text = element_text(size = 17, family = "Arial", colour = "#333333"),
        line = element_line(colour = "#333333"),
        legend.background = element_rect(fill = alpha('#CCCCCC', 0.1)),
        legend.key = element_blank()
    )

# Change default colors
# Replacement for ggplot2's default continuous colour scale.
#   ...                          forwarded to the underlying scale function
#   begin, end, direction, option  passed to scale_colour_viridis_c()
#   type                         "viridis" or "gradient"; defaults to the
#                                ggplot2.continuous.colour option, or "viridis"
# Any other `type` stops with "Unknown scale type".
scale_colour_continuous <- function(...,
                                    begin = 0.1,
                                    end = 0.9,
                                    direction = -1,
                                    option = "plasma",
                                    type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(
        type,
        gradient = scale_colour_gradient(...),
        viridis = scale_colour_viridis_c(
            option = option, begin = begin, end = end, direction = direction, ...
        ),
        stop("Unknown scale type", call. = FALSE)
    )
}
# Same scale under the American spelling
scale_color_continuous <- scale_colour_continuous

# Replacement for ggplot2's default continuous fill scale.
#   ...                          forwarded to the underlying scale function
#   begin, end, direction, option  passed to scale_fill_viridis_c()
#   type                         "viridis" or "gradient"
# The default `type` reads the fill-specific option ggplot2.continuous.fill
# first; it falls back to ggplot2.continuous.colour (the option this function
# used to read) and finally to "viridis", so existing setups keep working.
# Any other `type` stops with "Unknown scale type".
scale_fill_continuous <- function(...,
                                  begin = 0.1,
                                  end = 0.9,
                                  direction = -1,
                                  option = "plasma",
                                  type = getOption(
                                      "ggplot2.continuous.fill",
                                      default = getOption("ggplot2.continuous.colour", default = "viridis")
                                  )) {
    switch(
        type,
        gradient = scale_fill_gradient(...),
        viridis = scale_fill_viridis_c(
            option = option, begin = begin, end = end, direction = direction, ...
        ),
        stop("Unknown scale type", call. = FALSE)
    )
}

# CeMM palette: a function of n returning n colours interpolated between the
# seven anchor colours below
cemm_pal <- colorRampPalette(c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00"))

# Replacement for ggplot2's default discrete fill scale.
#   type = "CeMM" (default) uses cemm_pal; any other value uses the hue
#   palette built from h, c, l, h.start and direction.
#   na.value, aesthetics and ... are passed on to discrete_scale().
# hue_pal is called as scales::hue_pal because the scales package is never
# attached in this script (same call as in scale_color_discrete).
scale_fill_discrete <- function(..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0,
                                direction = 1, na.value = "grey50", aesthetics = "fill") {
    if (type == "CeMM") {
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else {
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, direction),
                       na.value = na.value, ...)
    }
}

# Replacement for ggplot2's default discrete colour scale.
#   type = "CeMM" (default) uses cemm_pal; any other value uses the hue
#   palette built from h, c, l, h.start and direction.
#   na.value, aesthetics and ... are passed on to discrete_scale().
scale_color_discrete <- function(..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0,
                                 direction = 1, na.value = "grey50", aesthetics = "colour") {
    if (type == "CeMM") {
        scale_name <- "CeMM"
        palette <- cemm_pal
    } else {
        scale_name <- "hue"
        palette <- scales::hue_pal(h, c, l, h.start, direction)
    }
    discrete_scale(aesthetics, scale_name, palette, na.value = na.value, ...)
}
# Same scale under the British spelling
scale_colour_discrete <- scale_color_discrete

# Theme add-on without major grid lines or panel border: thin dark axis lines
# and size-12 axis tick labels. Extra arguments are forwarded to theme().
noGridTheme <- function(...) {
    theme(
        panel.grid.major = element_blank(),
        axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.line = element_line(color = "#333333", size = 0.2),
        panel.border = element_blank(),
        ...
    )
}

# Theme add-on for dark figures: dark-grey panel and plot backgrounds with
# light-grey text, lines and axis lines.
# Extra arguments are forwarded to theme(), as noGridTheme() does; they were
# previously accepted but silently ignored.
darkTheme <- function(...) {
    theme(
        panel.background = element_rect(fill = '#333333'),
        plot.background = element_rect(fill = '#333333'),
        axis.line = element_line(color = "#CCCCCC", size = 0.2),
        text = element_text(size = 17, family = "Arial", colour = "#CCCCCC"),
        line = element_line(colour = "#CCCCCC"),
        ...
    )
}

# Make customTheme the default theme for every subsequent ggplot
theme_set(customTheme)

# Default figure size for the notebook display.
# NOTE(review): repr.plot.* options are presumably read by the notebook's R
# display layer; confirm they have an effect when plotting through RCall.
options(repr.plot.width=10, repr.plot.height=10)
"""

In [None]:
# Number of points in the reference datasets (matR and matRo)
NR = 4500
# Number of points in every other dataset (negative and positive controls)
N = 1500
# Number of dimensions in each dataset
D = 100
# Fraction of the points drawn from the outlier distribution in the datasets
# with outliers (matRo, matNo, matPSo, matPRo)
pOutliers = 1/3
# Scaling of the transformation for positive controls: the shift of the center
# has standard deviation 0.5*posScaling and the covariance matrix is raised to
# the power 1+posScaling
posScaling = 1.0;

## Dataset 1 - Reference R
We assume our data of interest to follow a multivariate normal distribution: In a morphological profiling, components are to some extent *independent (by removing correlated morphological features) and* normally distributed (by using a log-transformation).

In [None]:
# Seed the global random number generator so the draws of this section are reproducible
Random.seed!(1);

### Data center

In [None]:
# The reference is centered on 0: its mean is a vector of D zeros
µ = zeros(D);

### Data covariance

In [None]:
# Diagonal: variances follow a Gamma distribution of shape and scale parameters equal to 1 and 2
# Rationale: Some variability in scales with some high values, and no negative values
# NB: Beta distribution could be used instead of Gamma distribution if long-tail is not needed
# NB: Effects are smoothed by the orthogonal transformation anyway
# NB: the global `distrib` is reassigned by later cells
distrib = Gamma(1,2)
# One variance per dimension
sigma_diag = rand(distrib, D);

In [None]:
# Histogram of the D variances sampled above
ggplot(DataFrame(x = sigma_diag)) + geom_histogram(aes(:x))

In [None]:
# Now we transform this space by multiplying by a random orthogonal matrix:
# Q is the orthogonal factor of the QR decomposition of a random D×D matrix
# (the factor R is not used in the cells shown here)
s = rand(D,D)
Q, R = qr(s);

In [None]:
# Covariance matrix: the diagonal matrix of variances expressed in the basis
# given by Q, so its eigenvalues are the values of sigma_diag
# NB: becomes really slow, do not try with D > 500
∑ = Q' * Diagonal(sigma_diag) * Q;

In [None]:
# Check that the matrix is symmetrical (up to machine error)
@assert all(∑[i,j] ≈ ∑[j,i] for i in 1:D for j in (i+1):D)
# Make it perfectly symmetrical by copying the lower triangle onto the upper one
for i in 1:D, j in (i+1):D
    ∑[i,j] = ∑[j,i]
end
# Sylvester's criterion for positive definite matrices:
# every leading principal minor must be strictly positive
@assert all(det(∑[1:k,1:k]) > 0 for k in 1:D)

In [None]:
# Histogram of the per-dimension variances (diagonal of ∑) after the orthogonal transformation
ggplot(DataFrame(x = diag(∑))) + geom_histogram(aes(:x)) 

### Output reference dataset

In [None]:
# The data will follow a multivariate normal distribution with the parameters
# we generated previously: mean µ (all zeros) and covariance ∑
distrib = MvNormal(µ, ∑);

In [None]:
# Draw NR points; the sample matrix is transposed before building the table
# (presumably so that each row is one point and each column one dimension)
matR = DataFrame(rand(distrib, NR)')

In [None]:
# Create the output folder if it does not exist yet, then write the dataset
# as comma-separated values without a header line
mkpath("Data")
CSV.write("Data/matR.csv", matR; delim=",", writeheader = false)

### Covariance matrix (for exploration only)
By definition, $\text{cov}(x,y) = E[xy] - E[x]E[y]$.
Here because the distribution is centered, $E[x] = E[y] = 0$, so the covariance between 2 variables can be seen as the propensity for both variables to jointly yield big values of the same sign.

In [None]:
# Convert the covariance matrix to a correlation matrix
# NB: despite its name, sds holds the inverse standard deviations 1/sqrt(variance)
sds = 1 ./(sqrt.(diag(∑)))
# Scale rows and columns by the inverse standard deviations
corSig = Diagonal(sds)*∑*Diagonal(sds)

In [None]:
# Histogram of all entries of corSig - I (the correlation matrix minus the identity).
# vec flattens the matrix in column-major order, giving the same values in the
# same order as splatting the D*D entries into vcat, without a 10,000-argument call.
ggplot(DataFrame(x = vec(corSig - I))) + geom_histogram(aes(:x))

In [None]:
# Print the largest entry, the smallest entry and the variance of the entries
# of corSig - I (the correlation matrix minus the identity)
let residual = corSig - I
    println(maximum(residual))
    println(minimum(residual))
    println(var(residual))
end

## Dataset 2 - Negative control N
This is generated with the same generator as the reference dataset.

In [None]:
# Seed the global random number generator so the draws of this section are reproducible
Random.seed!(2);

### Output negative control dataset

In [None]:
# The negative control uses the reference generator. It is rebuilt here
# explicitly because the global `distrib` is reassigned throughout the notebook
# and must not depend on which cell ran last.
distrib = MvNormal(µ, ∑)
# Draw N points; the sample matrix is transposed before building the table
matN = DataFrame(rand(distrib, N)')

In [None]:
# Create the output folder if it does not exist yet, then write the dataset
# as comma-separated values without a header line
mkpath("Data")
CSV.write("Data/matN.csv", matN; delim=",", writeheader = false)

## Dataset 3 - Positive control (shifted) PS

In [None]:
# Seed the global random number generator so the draws of this section are reproducible
Random.seed!(3);

### Data center

In [None]:
# The positive control is not centered on 0 anymore: each coordinate of its
# center is drawn from a normal distribution with mean 0 and standard
# deviation 0.5*posScaling
distrib = Normal(0, 0.5*posScaling)
µmod = rand(distrib, D)

### Output shifted dataset

In [None]:
# The data will follow a multivariate normal distribution with the parameters
# we generated previously: shifted center µmod and reference covariance ∑
distrib = MvNormal(µmod, ∑);

In [None]:
# Draw N points; the sample matrix is transposed before building the table
matPS = DataFrame(rand(distrib, N)')

In [None]:
# Create the output folder if it does not exist yet, then write the dataset
# as comma-separated values without a header line
mkpath("Data")
CSV.write("Data/matPS.csv", matPS; delim=",", writeheader = false)

## Dataset 4 - Positive control (reshaped) PR
This is generated with a covariance increased in all directions, so the reshaped data is more spread out than the reference.

In [None]:
# Seed the global random number generator so the draws of this section are reproducible
Random.seed!(4);

### Data covariance

In [None]:
# We increase the variation in all direction
# The reference covariance is raised to the matrix power 1+posScaling
# (i.e. squared when posScaling is 1.0)
∑mod = ∑^(1+posScaling)

In [None]:
# Check that the matrix is symmetrical (up to machine error)
@assert all(∑mod[i,j] ≈ ∑mod[j,i] for i in 1:D for j in (i+1):D)
# Make it perfectly symmetrical by copying the lower triangle onto the upper
# one (skipped when ∑mod is already a Symmetric wrapper)
if !(∑mod isa Symmetric)
    for i in 1:D, j in (i+1):D
        ∑mod[i,j] = ∑mod[j,i]
    end
end
# Sylvester's criterion for positive definite matrices:
# every leading principal minor must be strictly positive
@assert all(det(∑mod[1:k,1:k]) > 0 for k in 1:D)

In [None]:
# Histogram of the per-dimension variances (diagonal of ∑mod)
ggplot(DataFrame(x = diag(∑mod))) + geom_histogram(aes(:x)) 

### Output reshaped dataset

In [None]:
# The data will follow a multivariate normal distribution with the parameters
# we generated previously: reference center µ and modified covariance ∑mod
distrib = MvNormal(µ, ∑mod);

In [None]:
# Draw N points; the sample matrix is transposed before building the table
matPR = DataFrame(rand(distrib, N)')

In [None]:
# Create the output folder if it does not exist yet, then write the dataset
# as comma-separated values without a header line
mkpath("Data")
CSV.write("Data/matPR.csv", matPR; delim=",", writeheader = false)

## Dataset 5 - Reference with outliers Ro
We add outliers from a different distribution to our reference set: In a morphological profiling, some outliers often result from technical artifacts (e.g. dye precipitation, bubbles) or biological confounders (e.g. cell cycle).

In [None]:
# Seed the global random number generator so the draws of this section are reproducible
Random.seed!(5);

### Data center

In [None]:
# The reference still is centered (center µ)
# The outliers are not centered on 0 anymore: each coordinate of their center
# is drawn from a standard normal distribution
distrib = Normal(0, 1)
µOutliers = rand(distrib, D)

### Data covariance

In [None]:
# The covariance of the outliers is similar but independent of the reference points:
# variances are drawn again from a Gamma distribution with shape 1 and scale 2
distrib = Gamma(1,2)
sigma_diagOutliers = rand(distrib, D);

In [None]:
# Now we transform this space by multiplying by a random orthogonal matrix:
# a new random D×D matrix is drawn and Q, R are overwritten with its QR factors
# (the factor R is not used in the cells shown here)
s = rand(D,D)
Q, R = qr(s);

In [None]:
# Outlier covariance matrix: the diagonal matrix of outlier variances expressed
# in the basis given by the new Q
# NB: becomes really slow, do not try with D > 500
∑Outliers = Q' * Diagonal(sigma_diagOutliers) * Q;

In [None]:
# Check that the matrix is symmetrical (up to machine error)
@assert all(∑Outliers[i,j] ≈ ∑Outliers[j,i] for i in 1:D for j in (i+1):D)
# Make it perfectly symmetrical by copying the lower triangle onto the upper one
for i in 1:D, j in (i+1):D
    ∑Outliers[i,j] = ∑Outliers[j,i]
end
# Sylvester's criterion for positive definite matrices:
# every leading principal minor must be strictly positive
@assert all(det(∑Outliers[1:k,1:k]) > 0 for k in 1:D)

In [None]:
# Histogram of the per-dimension variances (diagonal of ∑Outliers)
ggplot(DataFrame(x = diag(∑Outliers))) + geom_histogram(aes(:x)) 

### Output reference dataset

In [None]:
# A fraction 1 - pOutliers of the data will follow a multivariate normal
# distribution with the parameters we generated previously (center µ, covariance ∑)
distrib = MvNormal(µ, ∑);
# The remaining fraction pOutliers follows the outlier distribution
distribOutliers = MvNormal(µOutliers, ∑Outliers);

In [None]:
# Reference with outliers: regular points first, then outliers, NR points in total.
# The number of regular points is NR minus the (rounded) number of outliers, so
# the two counts always add up to NR instead of being rounded independently.
matRo = DataFrame(hcat(rand(distrib, NR - round(Int, NR*pOutliers)),
                      rand(distribOutliers, round(Int, NR*pOutliers)))')

In [None]:
# Create the output folder if it does not exist yet, then write the dataset
# as comma-separated values without a header line
mkpath("Data")
CSV.write("Data/matRo.csv", matRo; delim=",", writeheader = false)

## Dataset 6 - Negative control with outliers No
This is generated with the same generators (regular points and outliers) as the reference dataset with outliers.

In [None]:
# Seed the global random number generator so the draws of this section are reproducible
Random.seed!(6);

### Output negative control dataset

In [None]:
# The negative control uses the reference generator. It is rebuilt here
# explicitly because the global `distrib` is reassigned throughout the notebook
# and must not depend on which cell ran last.
distrib = MvNormal(µ, ∑)
# Regular points first, then outliers, N points in total.
# The number of regular points is N minus the (rounded) number of outliers, so
# the two counts always add up to N instead of being rounded independently.
matNo = DataFrame(hcat(rand(distrib, N - round(Int, N*pOutliers)),
                      rand(distribOutliers, round(Int, N*pOutliers)))')

In [None]:
# Create the output folder if it does not exist yet, then write the dataset
# as comma-separated values without a header line
mkpath("Data")
CSV.write("Data/matNo.csv", matNo; delim=",", writeheader = false)

## Dataset 7 - Positive control (shifted) with outliers PSo

In [None]:
# Seed the global random number generator so the draws of this section are reproducible
Random.seed!(7);

### Data center
The reference is centered on µmod as previously generated

### Output shifted dataset

In [None]:
# The data will follow a multivariate normal distribution with the parameters
# we generated previously: shifted center µmod and reference covariance ∑
distrib = MvNormal(µmod, ∑);

In [None]:
# Shifted positive control with outliers: regular points first, then outliers,
# N points in total.
# The number of regular points is N minus the (rounded) number of outliers, so
# the two counts always add up to N instead of being rounded independently.
matPSo = DataFrame(hcat(rand(distrib, N - round(Int, N*pOutliers)),
                        rand(distribOutliers, round(Int, N*pOutliers)))')

In [None]:
# Create the output folder if it does not exist yet, then write the dataset
# as comma-separated values without a header line
mkpath("Data")
CSV.write("Data/matPSo.csv", matPSo; delim=",", writeheader = false)

## Dataset 8 - Positive control (reshaped) with outliers PRo

In [None]:
# Seed the global random number generator so the draws of this section are reproducible
Random.seed!(8);

### Data covariance
We use the larger covariance matrix ∑mod previously generated.

### Output reshaped dataset

In [None]:
# The data will follow a multivariate normal distribution with the parameters
# we generated previously: reference center µ and modified covariance ∑mod
distrib = MvNormal(µ, ∑mod);

In [None]:
# Reshaped positive control with outliers: regular points first, then outliers,
# N points in total.
# The number of regular points is N minus the (rounded) number of outliers, so
# the two counts always add up to N instead of being rounded independently.
matPRo = DataFrame(hcat(rand(distrib, N - round(Int, N*pOutliers)),
                        rand(distribOutliers, round(Int, N*pOutliers)))')

In [None]:
# Create the output folder if it does not exist yet, then write the dataset
# as comma-separated values without a header line
mkpath("Data")
CSV.write("Data/matPRo.csv", matPRo; delim=",", writeheader = false)