In [None]:
using CSV, StatsBase, Statistics, DataFrames, UMAP, RCall
using Distributed, RMP, Random, Distributions
using MultivariateStats, MultipleTesting, Dates
using LinearAlgebra: I, Diagonal, diag, det, qr, Symmetric

In [None]:
@rlibrary ggplot2
@rlibrary extrafont

In [None]:
R"""
# Used later for MCD computation

library(robustbase)

# Customize ggplot appearance

library(ggplot2)
library(extrafont)


# Load extra fonts
ttf_import(paths = "/tmp/.fonts/")
loadfonts()

# Base theme applied to every plot: light theme without minor grid lines,
# dark-grey Arial text and lines, a faint translucent legend background
# and no boxes around legend keys
customTheme <- theme_light() +
  theme(
    panel.grid.minor = element_blank(),
    text = element_text(size = 17, family = "Arial", colour = "#333333"),
    line = element_line(colour = "#333333"),
    legend.background = element_rect(fill = alpha("#CCCCCC", 0.1)),
    legend.key = element_blank()
  )

# Override ggplot2's default continuous colour scale.
# `type` selects the scale family: "gradient" forwards `...` to
# scale_colour_gradient(); "viridis" (the default when the
# `ggplot2.continuous.colour` option is unset) uses the viridis scale with the
# given `option`, `begin`, `end` and `direction`. Any other value is an error.
scale_colour_continuous <- function(..., begin = 0.1, end = 0.9, direction = -1, option = "plasma",
                                    type = getOption("ggplot2.continuous.colour", default = "viridis")) {
  switch(
    type,
    gradient = scale_colour_gradient(...),
    viridis = scale_colour_viridis_c(
      option = option, begin = begin, end = end, direction = direction, ...
    ),
    stop("Unknown scale type", call. = FALSE)
  )
}
# American-spelling alias
scale_color_continuous <- scale_colour_continuous

# Override ggplot2's default continuous fill scale.
# `type` selects the scale family: "gradient" forwards `...` to
# scale_fill_gradient(); "viridis" uses the viridis fill scale with the given
# `option`, `begin`, `end` and `direction`. Any other value is an error.
# The default `type` is read from the fill-specific option
# `ggplot2.continuous.fill`; the colour option is kept as a fallback so that
# setups which only defined `ggplot2.continuous.colour` behave as before.
scale_fill_continuous <- function(..., begin = 0.1, end = 0.9, direction = -1, option = "plasma",
                                  type = getOption(
                                    "ggplot2.continuous.fill",
                                    default = getOption("ggplot2.continuous.colour", default = "viridis")
                                  )) {
  switch(
    type,
    gradient = scale_fill_gradient(...),
    viridis = scale_fill_viridis_c(
      option = option, begin = begin, end = end, direction = direction, ...
    ),
    stop("Unknown scale type", call. = FALSE)
  )
}

# CeMM palette: a function returning `n` colours interpolated between the
# seven anchor colours below
cemm_pal <- colorRampPalette(c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00"))

# Override ggplot2's default discrete fill scale.
# With type = "CeMM" (default) the CeMM palette is used; any other value falls
# back to a hue palette built from `h`, `c`, `l`, `h.start` and `direction`.
# hue_pal() is namespace-qualified because only ggplot2 is attached here, so
# the bare name is not guaranteed to resolve.
scale_fill_discrete <- function(..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0,
                                direction = 1, na.value = "grey50", aesthetics = "fill") {
  if (type == "CeMM") {
    discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
  } else {
    discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, direction),
                   na.value = na.value, ...)
  }
}

# Override ggplot2's default discrete colour scale.
# With type = "CeMM" (default) the CeMM palette is used; any other value falls
# back to a hue palette built from `h`, `c`, `l`, `h.start` and `direction`.
scale_color_discrete <- function(..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0,
                                 direction = 1, na.value = "grey50", aesthetics = "colour") {
  use_cemm <- type == "CeMM"
  scale_label <- if (use_cemm) "CeMM" else "hue"
  # The hue palette is only constructed when it is actually needed
  pal <- if (use_cemm) cemm_pal else scales::hue_pal(h, c, l, h.start, direction)
  discrete_scale(aesthetics, scale_label, pal, na.value = na.value, ...)
}
# British-spelling alias
scale_colour_discrete <- scale_color_discrete

# Theme add-on without major grid lines or panel border: thin dark axis lines
# and size-12 axis tick labels. Extra arguments are forwarded to theme().
noGridTheme <- function(...) {
  theme(
    panel.grid.major = element_blank(),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.line = element_line(color = "#333333", size = 0.2),
    panel.border = element_blank(),
    ...
  )
}

# Theme add-on for dark figures: dark-grey panel and plot backgrounds with
# light-grey text, lines and axis lines.
# Extra arguments are forwarded to theme(), as noGridTheme() does; previously
# they were accepted but silently ignored.
darkTheme <- function(...) {
  theme(
    panel.background = element_rect(fill = "#333333"),
    plot.background = element_rect(fill = "#333333"),
    axis.line = element_line(color = "#CCCCCC", size = 0.2),
    text = element_text(size = 17, family = "Arial", colour = "#CCCCCC"),
    line = element_line(colour = "#CCCCCC"),
    ...
  )
}

# Make customTheme the default theme for all subsequent plots
theme_set(customTheme)

# Set the repr.plot.width / repr.plot.height options (plot size in the notebook front end)
options(repr.plot.width=10, repr.plot.height=10)
"""

## Dataset generation
We assume our data of interest to follow a multivariate normal distribution: in morphological profiling, components are to some extent *independent* (by removing correlated morphological features) and *normally distributed* (by using a log-transformation).

In [None]:
"""Generate reference and positive control datasets with the following parameters:
* `NR` Number of points in control dataset  
* `N` Number of points in other datasets  
* `D` Number of dimensions in each dataset  
* `pOutliers` Fraction (between 0 and 1) of each dataset contaminated with outliers  
* `posScaling` Scaling of the transformation for positive controls  
* `dimUMAP` Number of dimensions kept for UMAP distance computation (not used by this function)

Returns the tuple `(matR, matP)`: the reference dataset (`NR` rows) and the
positive control dataset (`N` rows), one column per dimension. In both, the
regular points come first and the outliers last.

The random generator is re-seeded inside the function, so results are reproducible."""
generateDatasets = function(; NR = 4500, N = 1500, D = 100, pOutliers = 1/3, posScaling = 1.0, dimUMAP = 10)
    # Draw a random D×D covariance matrix.
    # Diagonal: variances follow a Gamma distribution of shape and scale parameters equal to 1 and 2
    # Rationale: Some variability in scales with some high values, and no negative values
    # NB: Beta distribution could be used instead of Gamma distribution if long-tail is not needed
    # NB: Effects are smoothed by the orthogonal transformation anyway
    randomCovariance = function()
        variances = rand(Gamma(1,2), D)

        # Now we transform this space by multiplying by a random orthogonal matrix
        Q = qr(rand(D,D)).Q

        # NB: becomes really slow, do not try with D > 500
        covMat = Q' * Diagonal(variances) * Q

        # Check that the matrix is symmetrical (up to machine error)
        @assert all(covMat[i,j] ≈ covMat[j,i] for i in 1:D for j in 1:D if j>i)

        # Make it perfectly symmetrical (copy the lower triangle onto the upper one)
        for i in 1:D, j in (i+1):D
            covMat[i,j] = covMat[j,i]
        end

        # Sylvester's criterion: a symmetric matrix is positive definite
        # if and only if all its leading principal minors are positive
        @assert all(det(covMat[1:k,1:k]) > 0 for k in 1:D)

        return covMat
    end

    # Split a dataset size into (regular, outlier) counts.
    # The regular count is the remainder, so both always sum to `total`
    # (rounding both parts independently can be off by one on ties).
    splitCounts = function(total)
        nOutliers = round(Int, total*pOutliers)
        return (total - nOutliers, nOutliers)
    end

    Random.seed!(1);

    # The reference is centered on 0
    µ = zeros(D);
    ∑ = randomCovariance()

    # The reference still is centered (center µ)
    # The outliers are not centered on 0 anymore
    µOutliers = rand(Normal(0, 1), D)

    # The covariance of the outliers is similar but independent of the reference points
    ∑Outliers = randomCovariance()

    # `100*pOutliers`% of the data will follow a multivariate normal distribution
    # with the outlier parameters we generated previously
    distrib = MvNormal(µ, ∑);
    distribOutliers = MvNormal(µOutliers, ∑Outliers);
    nRegularR, nOutliersR = splitCounts(NR)
    # NOTE(review): recent DataFrames releases may require `DataFrame(mat, :auto)` here - confirm
    matR = DataFrame(hcat(rand(distrib, nRegularR),
                          rand(distribOutliers, nOutliersR))')

    Random.seed!(2);

    # The positive control is not centered on 0 anymore
    µmod = rand(Normal(0, 0.5*posScaling), D)

    # The data will follow a multivariate normal distribution with the
    # shifted center and the reference covariance generated previously
    distrib = MvNormal(µmod, ∑);
    nRegularP, nOutliersP = splitCounts(N)
    matP = DataFrame(hcat(rand(distrib, nRegularP),
                          rand(distribOutliers, nOutliersP))')
    return(matR, matP)
end


## Define distances of interest

In [None]:
""" Mahalanobis Distance to Center (MDC):
    distance between the mean of the rows `iPert` of `data` (the perturbation)
    and the mean of the rows `iRef` (the reference), using the covariance
    of the reference rows."""
function MDC(data, iPert, iRef)
    refMat = Matrix(data[iRef,:])
    pertMat = Matrix(data[iPert,:])

    # Column-wise means, flattened from 1×D matrices to length-D vectors
    refCentroid = vec(mean(refMat, dims = 1))
    pertCentroid = vec(mean(pertMat, dims = 1))

    return mahalanobis(pertCentroid, refCentroid, cov(refMat))
end

In [None]:
""" Median Mahalanobis Distance (MD):
    median, over the rows `iPert` of `data` (the perturbation), of the
    Mahalanobis distance of each row to the mean of the rows `iRef`
    (the reference), using the covariance of the reference rows."""
function MD(data, iPert, iRef)
    refMat = Matrix(data[iRef,:])
    refCentroid = dropdims(mean(refMat, dims = 1), dims = 1)
    refCov = cov(refMat)

    # One distance per perturbation row
    pertRows = data[iPert,:]
    distances = [mahalanobis(row, refCentroid, refCov) for row in eachrow(pertRows)]
    return median(distances)
end

In [None]:
""" Median Robust Mahalanobis Distance (RMD):
    median, over the rows `iPert` of `data` (the perturbation), of the
    Mahalanobis distance of each row to the reference rows `iRef`, where the
    reference center and covariance are the Minimum Covariance Determinant
    estimates computed in R by `covMcd`.
    Returns `missing` when either set has fewer rows than twice the number
    of columns of `data`.
    See https://e-archivo.uc3m.es/bitstream/handle/10016/24613/ws201710.pdf """
function RMD(data, iPert, iRef)
    setPert = data[iPert,:]
    setRef = data[iRef,:]

    # NB: having less points than twice the number of dimensions leads to singularity,
    # so bail out early when either set is too small
    minPoints = 2*size(data, 2)
    if size(setPert, 1) < minPoints || size(setRef, 1) < minPoints
        return missing
    end

    # Send the reference set to R, compute the MCD estimates there
    # (fixed seed for reproducibility) and fetch them back
    @rput setRef

    R"""
    set.seed(3895)
    mcd <- covMcd(setRef)
    mcdCenter <- mcd$center
    mcdCov <- mcd$cov
    """
    @rget mcdCenter
    @rget mcdCov

    # One robust distance per perturbation row
    distances = [mahalanobis(row, mcdCenter, mcdCov) for row in eachrow(setPert)]
    return median(distances)
end

In [None]:
""" Robust Hellinger Distance (RHD):
    `hellinger` distance between the reference rows `iRef` and the perturbation
    rows `iPert` of `data`, each summarised by the Minimum Covariance
    Determinant center and covariance computed in R by `covMcd`.
    Returns `missing` when either set has fewer rows than twice the number
    of columns of `data`."""
function RHD(data, iPert, iRef)
    setPert = data[iPert,:]
    setRef = data[iRef,:]

    # NB: having less points than twice the number of dimensions leads to singularity,
    # so bail out early when either set is too small
    minPoints = 2*size(data, 2)
    if size(setPert, 1) < minPoints || size(setRef, 1) < minPoints
        return missing
    end

    # Send both sets to R, compute their MCD estimates there and fetch them back
    @rput setRef
    @rput setPert

    R"""
    set.seed(3895)
    mcd1 <- covMcd(setRef)
    mcdCenter1 <- mcd1$center
    mcdCov1 <- mcd1$cov
    
    # We set the seed twice to always
    # find the same estimators given
    # the same sample
    set.seed(3895)
    mcd2 <- covMcd(setPert)
    mcdCenter2 <- mcd2$center
    mcdCov2 <- mcd2$cov
    """
    @rget mcdCenter1
    @rget mcdCov1
    @rget mcdCenter2
    @rget mcdCov2

    return hellinger(mcdCenter1, mcdCov1, mcdCenter2, mcdCov2)
end

## Compare raw distances

In [None]:
"""Plot evolution of multiple metrics based on a previous exploration of the parameter space

* `resMetrics` Collection of `name => values` pairs, one value per element of `paramSpace`
* `paramSpace` Explored parameter values (x axis)
* `refMetrics` Optional collection indexed by the same names, holding the values
  obtained for the reference; when given, both series are drawn and colored
* `plotTitle` Title of every plot
* `plot` Print each plot
* `save` Save each plot as `Fig/0c/<timestamp>/<name>.pdf`

The timestamped output folder is created on every call, even if `save` is false."""
function savePlotSeries(resMetrics, paramSpace; refMetrics = nothing,
                        plotTitle = "Distance dynamics", plot = false, save = true)
    outFolder = Dates.format(Dates.now(), "yyyy-mm-dd_HH-MM-SS")
    # mkpath also creates the intermediate "Fig" and "Fig/0c" folders
    mkpath("Fig/0c/"*outFolder)
    for (s,m) in resMetrics
        if !isnothing(refMetrics)
            # Stack perturbation and reference values and label them for coloring
            ggDF = DataFrame(x = repeat(paramSpace, 2), y = vcat(m, refMetrics[s]), 
                             c = repeat(["Pert","Ref"], inner = length(paramSpace)))
            ggp = ggplot(ggDF) + aes(x = :x, y = :y, color = :c) + labs(color = "Distance to")
        else
            ggDF = DataFrame(x = paramSpace, y = m)
            ggp = ggplot(ggDF) + aes(x = :x, y = :y) 
        end
        ggp = ggp + geom_point() + xlab("Perturbation scaling") + 
              ylab(s) + ggtitle(plotTitle)
        if plot
            print(ggp)
        end
        if save
            # Explicit concatenation: the `*` between '/' and `s` was missing
            ggsave("Fig/0c/"*outFolder*'/'*s*".pdf", ggp)
        end
    end
end

In [None]:
# Parameter space to parse: positive-control scaling from 0 to 1
paramSpace = range(0,1,step = 0.02)
# Dataset sizes, dimensionality and outlier fraction used for every run
NR, N, D, pOutliers = 300, 150, 2, 0

print(now())

# One value per explored parameter: distances for the positive control (all*)
# and for the reference compared with itself (ref*)
allMDCraw = Float64[]
allMDraw = Float64[]
allRMDraw = Float64[]
allRHDraw = Float64[]
refMDCraw = Float64[]
refMDraw = Float64[]
refRMDraw = Float64[]
refRHDraw = Float64[]

for param in paramSpace
    datasets = generateDatasets(posScaling = param, pOutliers = pOutliers, NR = NR, N = N, D = D)
    df = vcat(datasets...)
    # Remember how these records were generated
    origDataset = vcat(fill("Reference", NR), fill("Positive control", N))

    # Evaluate a metric for every group ("Reference" first, then "Positive control")
    groups = unique(origDataset)
    isReference = origDataset .== "Reference"
    perGroup = metric -> map(g -> metric(df, origDataset .== g, isReference), groups)

    MDCraw = perGroup(MDC)
    MDraw = perGroup(MD)
    RMDraw = perGroup(RMD)
    RHDraw = perGroup(RHD)

    # Store results: index 2 is the positive control, index 1 the reference
    for (pertAcc, refAcc, values) in ((allMDCraw, refMDCraw, MDCraw),
                                      (allMDraw, refMDraw, MDraw),
                                      (allRMDraw, refRMDraw, RMDraw),
                                      (allRHDraw, refRHDraw, RHDraw))
        append!(pertAcc, values[2])
        append!(refAcc, values[1])
    end

    # Display results
    println(now(), " - ", param)
    println(MDCraw, MDraw, RMDraw, RHDraw)
end

resMetrics = Dict("Mahalanobis distance from center" => allMDCraw,
                  "Median Mahalanobis distance" => allMDraw,
                  "Robust Mahalanobis distance" => allRMDraw,
                  "Robust Hellinger distance" => allRHDraw)
refMetrics = Dict("Mahalanobis distance from center" => refMDCraw,
                  "Median Mahalanobis distance" => refMDraw,
                  "Robust Mahalanobis distance" => refRMDraw,
                  "Robust Hellinger distance" => refRHDraw)
plotTitle = "NR/N/D/pO: $NR/$N/$D/$pOutliers"

savePlotSeries(resMetrics, paramSpace, refMetrics = refMetrics,
               plotTitle = plotTitle, save = false, plot = true)