In [None]:
using CSV, StatsBase, Statistics, DataFrames, UMAP, RCall, FreqTables
using MultipleTesting, Random, MultivariateStats, Distributed
using RMP

In [None]:
# Timestamp this run: display the current date and time
using Dates: now
now()

## R Calls

In [None]:
@rlibrary ggplot2
@rlibrary extrafont
@rlibrary viridis
@rlibrary heatmaply
@rlibrary ggrepel

In [None]:
# Configure the embedded R session used for plotting and robust statistics:
#   * load robustbase (its `covMcd` is called by the distance functions defined
#     later in this notebook),
#   * import extra fonts and install a customized ggplot2 theme,
#   * override the default continuous and discrete colour/fill scales.
# Both discrete scales call the namespace-qualified `scales::hue_pal`: `hue_pal`
# is not exported by ggplot2 and this block does not attach `scales`, so the
# unqualified call could not be resolved when `type` is not "CeMM".
R"""
# Used later for MCD computation

library(robustbase)

# Customize ggplot appearance

library(ggplot2)
library(extrafont)


# Load extra fonts
ttf_import("/tmp/.fonts")
loadfonts()

# Change theme
customTheme <- theme_light() + 
               theme(panel.grid.minor=element_blank(), text=element_text(size=17, family="Arial", colour = "#333333"),
                     line=element_line(colour = "#333333"), 
                     legend.background = element_rect(fill=alpha('#CCCCCC', 0.1)), legend.key = element_blank())

# Change default colors
scale_colour_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(type, gradient = scale_colour_gradient(...), 
        viridis = scale_colour_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))
}
scale_color_continuous <- scale_colour_continuous

scale_fill_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(type, gradient = scale_fill_gradient(...), 
        viridis = scale_fill_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))

}

cemm_pal = colorRampPalette(c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00"))
scale_fill_discrete <- function (..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "fill") 
{
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else {
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, 
            direction), na.value = na.value, ...)
    }
}

scale_color_discrete <- function (..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "colour") {
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else {
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, 
            direction), na.value = na.value, ...)
    }
}
scale_colour_discrete <- scale_color_discrete

noGridTheme <- function(...){
    theme(panel.grid.major=element_blank(), axis.text.x=element_text(size=12), axis.text.y=element_text(size=12),
                      axis.line=element_line(color="#333333", size = 0.2), panel.border = element_blank(), ...)
}

darkTheme <- function(...){
    theme(panel.background = element_rect(fill = '#333333'), plot.background = element_rect(fill = '#333333'), 
          axis.line=element_line(color="#CCCCCC", size = 0.2), 
          text=element_text(size=17, family="Arial", colour = "#CCCCCC"),
          line=element_line(colour = "#CCCCCC"))
}

theme_set(customTheme)

options(repr.plot.width=10, repr.plot.height=10)
"""

## Load measurements

### Load annotations

In [None]:
# Read the tab-delimited transfer list; the first line holds the column names
annotations = CSV.read("data/transferList.txt", DataFrame, header = 1, delim = "\t")
# Convert plate numbers to strings to make clear they are IDs and should not be used for computations
annotations[!,:DestinationPlate] = string.(annotations[:,:DestinationPlate])
# Wells in the transfer list without any compound name are filled with DMSO only
annotations[!,:CompoundName][ismissing.(annotations[:,:CompoundName])] .= "DMSO"
# Non-ASCII characters need to be converted: the byte 0xb1 is replaced by "±"
annotations.CompoundName = replace.(annotations.CompoundName, "\xb1" => s"±");

In [None]:
# Display the annotation table
annotations

### Load image data

In [None]:
# Load the per-image measurements
image = CSV.read("data/compiled_PilotDrugScreen_Image_750.csv", DataFrame)
# Show the names of the first eight columns
println(string.(names(image)[1:8]))
# Number of images (rows) and of features (columns) available
println(size(image, 1))
println(size(image, 2))

## Coarse-grain aggregation

In [None]:
# Load the aggregated measurements from CSV (trailing `;` suppresses the cell output)
aggregatedData = CSV.read("data/aggregatedData_750cells.csv", DataFrame);

### Transform aggregated data - Normalization
We want to focus on variables that vary more across the whole dataset than within the reference condition (untreated WT).

In [None]:
# Wrap the aggregated measurements in an `Experiment` object with a description
expAgg = Experiment(aggregatedData, description = "Median values for aggregated FOV measurements")

In [None]:
# Feature selectors, applied in this order to `expAgg`
filters = Vector{RMP.AbstractSelector}()

# 1. Drop metadata: any column whose name contains one of these substrings
strToRemove = ["Metadata_Well", "CompoundName", "Metadata_Field", "Metadata_Row", "Metadata_Column"]
push!(filters, NameSelector(x -> !any(s -> occursin(s, String(x)), strToRemove)))

# 2. Drop features that are constant (zero normalized MAD) over all entries
push!(filters, Selector(x -> mad(x, normalize = true) != 0,
                        description = "Remove constant features"))

# 3. Drop features that are constant (zero normalized MAD) within the DMSO entries
push!(filters, Selector(x -> mad(x, normalize = true) != 0,
                        subset = x -> x.CompoundName .== "DMSO",
                        description = "Remove features constant for reference"))

selectFeaturesExperiment!(expAgg, filters)

In [None]:
# Work on an independent copy so that `expAgg` keeps the untransformed values
expTransformed = deepcopy(expAgg)
# Log-transform the copy in place
logtransform!(expTransformed)
# Update the description to reflect the transformation
expTransformed.description = "Transformed values for aggregated FOV measurements"

Here we apply a correction based on the specific details of the experimental design:
All rows and columns include DMSO (negative) controls, and we normalize all values based on the matching controls (DMSO wells in the same row or in the same column).  

This exemplifies how to directly modify the data of an `Experiment` object.  

NB: One might want to check that more iterations are not needed (cf. Median-polish method).

In [None]:
# Normalize on matching DMSO wells median values:
# for every entry, subtract the per-feature median of the DMSO entries that
# share its plate row or its plate column.

# Entries in both data frames are matching
@assert nrow(aggregatedData) == nrow(getdata(expTransformed))

# Snapshot of the data before correction, used to compute all medians
# NOTE(review): assumes `getdata` returns a copy rather than a view of
# `expTransformed.data` — TODO confirm
ndf = getdata(expTransformed)

# The DMSO mask does not depend on the entry: compute it once
dmsoWells = aggregatedData.CompoundName .== "DMSO"

for (i, (fx, fy)) in enumerate(zip(aggregatedData.Metadata_Row, aggregatedData.Metadata_Column))
    sameRow = aggregatedData.Metadata_Row .== fx
    sameColumn = aggregatedData.Metadata_Column .== fy
    matchingControls = dmsoWells .& (sameRow .| sameColumn)
    # Every entry must have at least one matching control
    @assert any(matchingControls)
    expTransformed.data[i:i, expTransformed.selectedFeatures] .-=
        mapcols(median, ndf[matchingControls, :])
end

## Visualization

In [None]:
# Fix the seed so that the embedding is reproducible
Random.seed!(3895)
umTPM = umap(expTransformed)
# Transpose to one row per entry and name the two embedding dimensions
umTPM = DataFrame(umTPM', [:UMAP1, :UMAP2])
umTPM.Compound = aggregatedData.CompoundName

subsetCompounds = ["Vinblastine", "Pentamidine", "JFD00244", "DMSO"]
# Keep the entries whose compound name contains at least one of the names above
subsetEntries = [any(name -> occursin(name, cmpd), subsetCompounds) for cmpd in umTPM.Compound]
ggplot(umTPM[subsetEntries,:], aes(:UMAP1, :UMAP2)) + 
    geom_point(aes(color = :Compound), alpha = 0.8) +
    coord_fixed() + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) + 
    guides(color=guide_legend(nrow=3,byrow=true))

## Distance to DMSO

In [None]:
"""
    MDC(data, iPert, iRef)

Compute the Mahalanobis Distance to Center (MDC): the Mahalanobis distance between
the mean of the perturbation rows `iPert` of `data` and the mean of the reference
rows `iRef`, using the covariance of the reference rows.
"""
function MDC(data, iPert, iRef)
    refMatrix = Matrix(data[iRef, :])
    pertMatrix = Matrix(data[iPert, :])

    # Column-wise means, flattened from 1×p matrices to vectors
    refCenter = dropdims(mean(refMatrix, dims = 1), dims = 1)
    pertCenter = dropdims(mean(pertMatrix, dims = 1), dims = 1)
    refCov = cov(refMatrix)

    return mahalanobis(pertCenter, refCenter, refCov)
end

"""
    shuffMDC(data, iPert, iRef; nbRep = 250)

Build an empirical distribution of the Mahalanobis Distance to Center (MDC) under
label permutation: the rows `iRef` and `iPert` of `data` are pooled, shuffled and
split into pseudo-perturbation and pseudo-reference sets of the original sizes,
`nbRep` times. Return the vector of the `nbRep` resulting distances.
"""
function shuffMDC(data, iPert, iRef; nbRep = 250)
    setPert = data[iPert, :]
    setRef = data[iRef, :]
    pooled = Matrix(vcat(setRef, setPert))

    function permutedDistance()
        nPooled = size(pooled, 1)
        nPert = nrow(setPert)
        nRef = nrow(setRef)

        # Random permutation of the pooled rows, split into sets of matching sizes
        permuted = pooled[sample(1:nPooled, nPooled; replace = false), :]
        pseudoPert = permuted[1:nPert, :]
        pseudoRef = permuted[(nPert + 1):(nPert + nRef), :]

        refCenter = dropdims(mean(pseudoRef, dims = 1), dims = 1)
        refCov = cov(pseudoRef)
        pertCenter = dropdims(mean(pseudoPert, dims = 1), dims = 1)

        return mahalanobis(pertCenter, refCenter, refCov)
    end

    return map(_ -> permutedDistance(), 1:nbRep)
end

"""
    MD(data, iPert, iRef)

Compute the median Mahalanobis Distance (MD): the median, over the perturbation
rows `iPert` of `data`, of the Mahalanobis distance between each row and the mean
of the reference rows `iRef`, using the covariance of the reference rows.
"""
function MD(data, iPert, iRef)
    setPert = data[iPert, :]
    refMatrix = Matrix(data[iRef, :])

    refCenter = dropdims(mean(refMatrix, dims = 1), dims = 1)
    refCov = cov(refMatrix)

    distances = map(eachrow(setPert)) do row
        mahalanobis(row, refCenter, refCov)
    end
    return median(distances)
end

"""
    shuffMD(data, iPert, iRef; nbRep = 250)

Build an empirical distribution of the median Mahalanobis Distance (MD) under label
permutation: the rows `iRef` and `iPert` of `data` are pooled, shuffled and split
into pseudo-perturbation and pseudo-reference sets of the original sizes, `nbRep`
times. Return the vector of the `nbRep` resulting median distances.
"""
function shuffMD(data, iPert, iRef; nbRep = 250)
    setPert = data[iPert, :]
    setRef = data[iRef, :]
    pooled = Matrix(vcat(setRef, setPert))

    function permutedDistance()
        nPooled = size(pooled, 1)
        nPert = nrow(setPert)
        nRef = nrow(setRef)

        # Random permutation of the pooled rows, split into sets of matching sizes
        permuted = pooled[sample(1:nPooled, nPooled; replace = false), :]
        pseudoPert = permuted[1:nPert, :]
        pseudoRef = permuted[(nPert + 1):(nPert + nRef), :]

        refCenter = dropdims(mean(pseudoRef, dims = 1), dims = 1)
        refCov = cov(pseudoRef)

        distances = map(eachrow(DataFrame(pseudoPert))) do row
            mahalanobis(row, refCenter, refCov)
        end
        return median(distances)
    end

    return map(_ -> permutedDistance(), 1:nbRep)
end


"""
    RMD(data, iPert, iRef)

Compute the median Robust Mahalanobis Distance (RMD): the median, over the
perturbation rows `iPert` of `data`, of the Mahalanobis distance to the reference
rows `iRef`, whose center and covariance are estimated in R with `covMcd`
(Minimum Covariance Determinant, R seed fixed to 3895).

Return `missing` when either set has fewer rows than twice the number of columns
of `data`.

See https://e-archivo.uc3m.es/bitstream/handle/10016/24613/ws201710.pdf
"""
function RMD(data, iPert, iRef)
    setPert = data[iPert, :]
    setRef = data[iRef, :]

    # Fewer points than twice the number of dimensions leads to singularity
    minPoints = 2 * size(data, 2)
    if size(setPert, 1) < minPoints || size(setRef, 1) < minPoints
        return missing
    end

    # Robust center and covariance of the reference, computed on the R side
    @rput setRef

    R"""
    set.seed(3895)
    mcd <- covMcd(setRef)
    mcdCenter <- mcd$center
    mcdCov <- mcd$cov
    """
    @rget mcdCenter
    @rget mcdCov

    distances = map(eachrow(setPert)) do row
        mahalanobis(row, mcdCenter, mcdCov)
    end
    return median(distances)
end

"""
    shuffRMD(data, iPert, iRef; nbRep = 250)

Build an empirical distribution of the median Robust Mahalanobis Distance (RMD)
under label permutation: the rows `iRef` and `iPert` of `data` are pooled, shuffled
and split into pseudo-perturbation and pseudo-reference sets of the original sizes,
`nbRep` times. Return the vector of the `nbRep` resulting distances.

Return a vector of `nbRep` `missing` values when either set has fewer rows than
twice the number of columns of `data`.
"""
function shuffRMD(data, iPert, iRef; nbRep = 250)
    setPert = data[iPert, :]
    setRef = data[iRef, :]
    set = vcat(setRef, setPert)

    # Fewer points than twice the number of dimensions leads to singularity
    minPoints = 2 * size(data, 2)
    if size(setPert, 1) < minPoints || size(setRef, 1) < minPoints
        return repeat([missing], nbRep)
    end

    function iterShufRMD()
        nPert = nrow(setPert)
        nRef = nrow(setRef)

        # Random permutation of the pooled rows, split into sets of matching sizes
        shuffSet = set[sample(1:nrow(set), nrow(set); replace = false), :]
        shuffSetPert = shuffSet[1:nPert, :]
        shuffSetRef = shuffSet[(nPert + 1):(nPert + nRef), :]

        # Robust center and covariance of the pseudo-reference, computed on the R side
        @rput shuffSetRef

        R"""
        set.seed(3895)
        mcd <- covMcd(shuffSetRef)
        mcdCenter <- mcd$center
        mcdCov <- mcd$cov
        """
        @rget mcdCenter
        @rget mcdCov

        distances = map(eachrow(shuffSetPert)) do row
            mahalanobis(row, mcdCenter, mcdCov)
        end
        return median(distances)
    end

    return map(_ -> iterShufRMD(), 1:nbRep)
end

"""
    RHD(data, iPert, iRef)

Compute the Robust Hellinger Distance (RHD) between the perturbation rows `iPert`
and the reference rows `iRef` of `data`. Center and covariance of each set are
estimated in R with `covMcd` (Minimum Covariance Determinant) and passed to
`hellinger`.

Return `missing` when either set has fewer rows than twice the number of columns
of `data`.
"""
function RHD(data, iPert, iRef)
    setPert = data[iPert, :]
    setRef = data[iRef, :]

    # Fewer points than twice the number of dimensions leads to singularity
    minPoints = 2 * size(data, 2)
    if size(setPert, 1) < minPoints || size(setRef, 1) < minPoints
        return missing
    end

    # Robust center and covariance of both sets, computed on the R side
    @rput setRef
    @rput setPert

    R"""
    set.seed(3895)
    mcd1 <- covMcd(setRef)
    mcdCenter1 <- mcd1$center
    mcdCov1 <- mcd1$cov
    
    # We set the seed twice to always
    # find the same estimators given
    # the same sample
    set.seed(3895)
    mcd2 <- covMcd(setPert)
    mcdCenter2 <- mcd2$center
    mcdCov2 <- mcd2$cov
    """
    @rget mcdCenter1
    @rget mcdCov1
    @rget mcdCenter2
    @rget mcdCov2

    return hellinger(mcdCenter1, mcdCov1, mcdCenter2, mcdCov2)
end

"""
    shuffRHD(data, iPert, iRef; nbRep = 250)

Build an empirical distribution of the Robust Hellinger Distance (RHD) under label
permutation: the rows `iRef` and `iPert` of `data` are pooled, shuffled and split
into pseudo-perturbation and pseudo-reference sets of the original sizes, `nbRep`
times. Return the vector of the `nbRep` resulting distances.

Return a vector of `nbRep` `missing` values when either set has fewer rows than
twice the number of columns of `data`.
"""
function shuffRHD(data, iPert, iRef; nbRep = 250)
    setPert = data[iPert, :]
    setRef = data[iRef, :]
    set = vcat(setRef, setPert)

    # Fewer points than twice the number of dimensions leads to singularity
    minPoints = 2 * size(data, 2)
    if size(setPert, 1) < minPoints || size(setRef, 1) < minPoints
        return repeat([missing], nbRep)
    end

    function iterShufRHD()
        nPert = nrow(setPert)
        nRef = nrow(setRef)

        # Random permutation of the pooled rows, split into sets of matching sizes
        shuffSet = set[sample(1:nrow(set), nrow(set); replace = false), :]
        shuffSetPert = shuffSet[1:nPert, :]
        shuffSetRef = shuffSet[(nPert + 1):(nPert + nRef), :]

        # Robust center and covariance of both pseudo-sets (Minimum Covariance
        # Determinant), used for the Robust Hellinger Distance
        @rput shuffSetRef
        @rput shuffSetPert

        R"""
        set.seed(3895)
        mcd <- covMcd(shuffSetRef)
        mcdCenter1 <- mcd$center
        mcdCov1 <- mcd$cov
        
        # We set the seed twice to always
        # find the same estimators given
        # the same sample
        set.seed(3895)
        mcd <- covMcd(shuffSetPert)
        mcdCenter2 <- mcd$center
        mcdCov2 <- mcd$cov
        """
        @rget mcdCenter1
        @rget mcdCov1
        @rget mcdCenter2
        @rget mcdCov2

        return hellinger(mcdCenter1, mcdCov1, mcdCenter2, mcdCov2)
    end

    return map(_ -> iterShufRHD(), 1:nbRep)
end

In [None]:
# Placeholder for the Robust Morphological Perturbation Value computation.
# Only the signature is declared: the body is empty, so calling this returns `nothing`.
# NOTE(review): the keyword names suggest the intended options (distance measure,
# reference condition, number of permutations, multiple-testing correction) — none
# of them is used yet.
function rmpv(e::Experiment, on::Symbol; 
                    distance = :RobustHellinger,
                    reference = "DMSO",
                    iterations = 100,
                    correction = :FDR)
end

### Start with UMAP-based distance

In [None]:
# Wrap the UMAP coordinates (and compound labels) in an `Experiment` object
expUMAP = Experiment(umTPM, description = "UMAP projection of profiling data")

In [None]:
filters = Array{RMP.AbstractReduce,1}()
# Remove (categorical) compound column from analysis.
# The name is converted with `String` (as done for the metadata selector earlier in
# this notebook) so the comparison holds whether names are Symbols or Strings:
# a Symbol never equals a String, which would silently keep the column.
push!(filters, NameSelector(x -> String(x) != "Compound"))
# Remove entries for compounds not present often enough (fewer than 4 entries)
cmpd_to_keep = levels(umTPM.Compound)[freqtable(umTPM.Compound) .>= 4]
# Membership test whose second argument must be used as a whole when broadcasting
compare_in(x,y) = x in y
Broadcast.broadcasted(::typeof(compare_in), x, y) = broadcast(in, x, Ref(y))
push!(filters, Filter(cmpd_to_keep, :Compound, compare = compare_in))
# Apply filters
filterExperiment!(expUMAP, filters)
expUMAP

NB: because the `compare` function of Filter `f` is applied as f.compare.(entries, f.value)
the value provided needs to be of the length of the entries or of length 1.  
Otherwise, when broadcasting (calling `f.compare` on all elements with `f.compare.`), we need to specify that the `f.value` should be used "as is". This is done by overloading the broadcasting of the function. See:   
https://discourse.julialang.org/t/how-to-broadcast-over-only-certain-function-arguments/19274/5  
Another "trick" is to use a function that takes a length 1 value that is in practice not used:  
```julia
push!(filters, Filter("NotUsed", :Compound, compare = (x,y) -> (x in cmpd_to_keep)))
```

In [None]:
# The RHD values computed below are reported in the following order:
# compound label of every selected entry, then the distinct labels
selectedCompounds = expUMAP.data[expUMAP.selectedEntries, :Compound]
levels(selectedCompounds)

In [None]:
# Actual observed RHD of every compound against the DMSO reference,
# in the order of `levels(selectedCompounds)`
allRHD = [RHD(getdata(expUMAP),
              selectedCompounds .== x,
              selectedCompounds .== "DMSO")
          for x in levels(selectedCompounds)]

In [None]:
# DMSO should be at a distance 0 to itself
@assert allRHD[findall(==("DMSO"), levels(selectedCompounds))] == [0]

### Optimize rmpv runtime

In [None]:
# Install (if needed) and load the benchmarking macros used below
import Pkg; Pkg.add("BenchmarkTools")
using BenchmarkTools

In [None]:
# Time the serial permutation test (10 permutations per compound).
# The assignment is placed outside `@btime`: a variable assigned inside the
# benchmarked expression is local to the benchmark and is not defined afterwards,
# whereas `@btime` returns the value of the expression.
allShuffRHD = @btime map(x -> shuffRHD(getdata(expUMAP), selectedCompounds .== x, 
                                      selectedCompounds .== "DMSO", nbRep = 10), 
                        levels(selectedCompounds))

In [None]:
# Worker pool over all available worker processes, used by `pmap` below
pool = CachingPool(workers())

In [None]:
# Time the same permutation test distributed over the worker pool with `pmap`.
# The assignment is placed outside `@btime`: a variable assigned inside the
# benchmarked expression is local to the benchmark and is not defined afterwards,
# whereas `@btime` returns the value of the expression.
allShuffRHD2 = @btime pmap(x -> shuffRHD(getdata(expUMAP), selectedCompounds .== x, 
                                      selectedCompounds .== "DMSO", nbRep = 10), pool,
                        levels(selectedCompounds))

In [None]:
# Distance functions packaged in a module and defined on every process, so that
# workers can call them as `StatDistances.<name>`.
@everywhere module StatDistances
    # `mean`, `median` and `cov` are defined in Statistics: load it explicitly
    # instead of relying on another package re-exporting them
    using RMP, RCall, DataFrames, StatsBase, Statistics

    # Return `true` when either set has fewer rows than twice the number of columns
    # of `data` (too few points lead to a singular covariance estimate).
    function _toofewpoints(data, setPert, setRef)
        minPoints = 2 * size(data, 2)
        return size(setPert, 1) < minPoints || size(setRef, 1) < minPoints
    end

    # Randomly permute the rows of `set` and return the first `nPert` rows and the
    # following `nRef` rows, as (pseudo-perturbation, pseudo-reference).
    function _permutesplit(set, nPert, nRef)
        nset = size(set, 1)
        shuffSet = set[sample(1:nset, nset; replace = false), :]
        return shuffSet[1:nPert, :], shuffSet[(nPert + 1):(nPert + nRef), :]
    end

    """
        MDC(data, iPert, iRef)

    Compute the Mahalanobis Distance to Center (MDC): the Mahalanobis distance
    between the mean of the perturbation rows `iPert` of `data` and the mean of the
    reference rows `iRef`, using the covariance of the reference rows.
    """
    function MDC(data, iPert, iRef)
        setPert = Matrix(data[iPert,:])
        setRef = Matrix(data[iRef,:])

        mdCenter = dropdims(mean(setRef, dims = 1), dims = 1)
        mdCov = cov(setRef)

        pertCenter = dropdims(mean(setPert, dims = 1), dims = 1)

        return mahalanobis(pertCenter, mdCenter, mdCov)
    end

    """
        shuffMDC(data, iPert, iRef; nbRep = 250)

    Permute labels and compute the Mahalanobis Distance to Center (MDC) `nbRep`
    times, to create an empirical distribution. Rows `iRef` and `iPert` of `data`
    are pooled, shuffled and split into sets of the original sizes.
    """
    function shuffMDC(data, iPert, iRef; nbRep = 250)
        setPert = data[iPert,:]
        setRef = data[iRef,:]
        set = Matrix(vcat(setRef, setPert))

        function iterShufMD()
            # Take random subsets of corresponding sizes
            shuffSetPert, shuffSetRef = _permutesplit(set, nrow(setPert), nrow(setRef))

            # Compute Mahalanobis Distance
            mdCenter = dropdims(mean(shuffSetRef, dims = 1), dims = 1)
            mdCov = cov(shuffSetRef)

            pertCenter = dropdims(mean(shuffSetPert, dims = 1), dims = 1)

            return mahalanobis(pertCenter, mdCenter, mdCov)
        end

        return map(x -> iterShufMD(), 1:nbRep)
    end

    """
        MD(data, iPert, iRef)

    Compute the median Mahalanobis Distance (MD): the median, over the perturbation
    rows `iPert` of `data`, of the Mahalanobis distance between each row and the
    mean of the reference rows `iRef`, using the covariance of the reference rows.
    """
    function MD(data, iPert, iRef)
        setPert = data[iPert,:]
        setRef = Matrix(data[iRef,:])

        mdCenter = dropdims(mean(setRef, dims = 1), dims = 1)
        mdCov = cov(setRef)

        return median(map(x -> mahalanobis(x, mdCenter, mdCov), eachrow(setPert)))
    end

    """
        shuffMD(data, iPert, iRef; nbRep = 250)

    Permute labels and compute the median Mahalanobis Distance (MD) `nbRep` times,
    to create an empirical distribution. Rows `iRef` and `iPert` of `data` are
    pooled, shuffled and split into sets of the original sizes.
    """
    function shuffMD(data, iPert, iRef; nbRep = 250)
        setPert = data[iPert,:]
        setRef = data[iRef,:]
        set = Matrix(vcat(setRef, setPert))

        function iterShufMD()
            # Take random subsets of corresponding sizes
            shuffSetPert, shuffSetRef = _permutesplit(set, nrow(setPert), nrow(setRef))

            # Compute Mahalanobis Distance
            mdCenter = dropdims(mean(shuffSetRef, dims = 1), dims = 1)
            mdCov = cov(shuffSetRef)

            return median(map(x -> mahalanobis(x, mdCenter, mdCov), eachrow(DataFrame(shuffSetPert))))
        end

        return map(x -> iterShufMD(), 1:nbRep)
    end

    """
        RMD(data, iPert, iRef)

    Compute the median Robust Mahalanobis Distance (RMD): the median, over the
    perturbation rows `iPert` of `data`, of the Mahalanobis distance to the
    reference rows `iRef`, whose center and covariance are estimated in R with
    `covMcd` (Minimum Covariance Determinant).

    Return `missing` when either set has fewer rows than twice the number of
    columns of `data`.

    See https://e-archivo.uc3m.es/bitstream/handle/10016/24613/ws201710.pdf
    """
    function RMD(data, iPert, iRef)
        setPert = data[iPert,:]
        setRef = data[iRef,:]

        # Ensure that we have enough points to compute distance
        if _toofewpoints(data, setPert, setRef)
            return missing
        end

        # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance
        @rput setRef

        R"""
        set.seed(3895)
        mcd <- covMcd(setRef)
        mcdCenter <- mcd$center
        mcdCov <- mcd$cov
        """
        @rget mcdCenter
        @rget mcdCov

        return median(map(x -> mahalanobis(x, mcdCenter, mcdCov), eachrow(setPert)))
    end

    """
        shuffRMD(data, iPert, iRef; nbRep = 250)

    Permute labels and compute the median Robust Mahalanobis Distance (RMD) `nbRep`
    times, to create an empirical distribution. Rows `iRef` and `iPert` of `data`
    are pooled, shuffled and split into sets of the original sizes.

    Return a vector of `nbRep` `missing` values when either set has fewer rows than
    twice the number of columns of `data`.
    """
    function shuffRMD(data, iPert, iRef; nbRep = 250)
        setPert = data[iPert,:]
        setRef = data[iRef,:]
        set = vcat(setRef, setPert)

        # Ensure that we have enough points to compute distance
        if _toofewpoints(data, setPert, setRef)
            return repeat([missing], nbRep)
        end

        function iterShufRMD()
            # Take random subsets of corresponding sizes
            shuffSetPert, shuffSetRef = _permutesplit(set, nrow(setPert), nrow(setRef))

            # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance
            @rput shuffSetRef

            R"""
            set.seed(3895)
            mcd <- covMcd(shuffSetRef)
            mcdCenter <- mcd$center
            mcdCov <- mcd$cov
            """
            @rget mcdCenter
            @rget mcdCov

            return median(map(x -> mahalanobis(x, mcdCenter, mcdCov), eachrow(shuffSetPert)))
        end

        return map(x -> iterShufRMD(), 1:nbRep)
    end

    """
        RHD(data, iPert, iRef)

    Compute the Robust Hellinger Distance (RHD) between the perturbation rows
    `iPert` and the reference rows `iRef` of `data`. Center and covariance of each
    set are estimated in R with `covMcd` (Minimum Covariance Determinant) and
    passed to `hellinger`.

    Return `missing` when either set has fewer rows than twice the number of
    columns of `data`.
    """
    function RHD(data, iPert, iRef)
        setPert = data[iPert,:]
        setRef = data[iRef,:]

        # Ensure that we have enough points to compute distance
        if _toofewpoints(data, setPert, setRef)
            return missing
        end

        # Compute Minimum Covariance Determinant and corresponding Robust Hellinger Distance
        @rput setRef
        @rput setPert

        R"""
        set.seed(3895)
        mcd1 <- covMcd(setRef)
        mcdCenter1 <- mcd1$center
        mcdCov1 <- mcd1$cov

        # We set the seed twice to always
        # find the same estimators given
        # the same sample
        set.seed(3895)
        mcd2 <- covMcd(setPert)
        mcdCenter2 <- mcd2$center
        mcdCov2 <- mcd2$cov
        """
        @rget mcdCenter1
        @rget mcdCov1
        @rget mcdCenter2
        @rget mcdCov2

        return hellinger(mcdCenter1, mcdCov1, mcdCenter2, mcdCov2)
    end

    """
        shuffRHD(data, iPert, iRef; nbRep = 250)

    Permute labels and compute the Robust Hellinger Distance (RHD) `nbRep` times,
    to create an empirical distribution. Rows `iRef` and `iPert` of `data` are
    pooled, shuffled and split into sets of the original sizes.

    Return a vector of `nbRep` `missing` values when either set has fewer rows than
    twice the number of columns of `data`.
    """
    function shuffRHD(data, iPert, iRef; nbRep = 250)
        setPert = data[iPert,:]
        setRef = data[iRef,:]
        set = vcat(setRef, setPert)

        # Ensure that we have enough points to compute distance
        if _toofewpoints(data, setPert, setRef)
            return repeat([missing], nbRep)
        end

        function iterShufRHD()
            # Take random subsets of corresponding sizes
            shuffSetPert, shuffSetRef = _permutesplit(set, nrow(setPert), nrow(setRef))

            # Compute Minimum Covariance Determinant and corresponding Robust Hellinger Distance
            @rput shuffSetRef
            @rput shuffSetPert

            R"""
            set.seed(3895)
            mcd <- covMcd(shuffSetRef)
            mcdCenter1 <- mcd$center
            mcdCov1 <- mcd$cov

            # We set the seed twice to always
            # find the same estimators given
            # the same sample
            set.seed(3895)
            mcd <- covMcd(shuffSetPert)
            mcdCenter2 <- mcd$center
            mcdCov2 <- mcd$cov
            """
            @rget mcdCenter1
            @rget mcdCov1
            @rget mcdCenter2
            @rget mcdCov2

            return hellinger(mcdCenter1, mcdCov1, mcdCenter2, mcdCov2)
        end

        return map(x -> iterShufRHD(), 1:nbRep)
    end
end;

What to share:
* [DONE] levels(selectedCompounds)
* [DONE] getdata(expUMAP) -> dataUMAP
* [DONE] shuffRHD -> StatDistances.shuffRHD
* [DONE] RMP
* [DONE] allShuffRHD3

In [None]:
# Install the package providing `sendto`, used below to share variables with workers
Pkg.add("ParallelDataTransfer")

In [None]:
# Load required packages in all workers and share variables used in RMPV computation:
# `selectedCompounds` under the same name and the UMAP data under the name `dataUMAP`
@everywhere using RMP, DataFrames, ParallelDataTransfer
sendto(workers(), selectedCompounds=selectedCompounds, 
                  dataUMAP=getdata(expUMAP))

The following commands indeed show that these variables are accessible from all workers:
```julia
@btime @sync @distributed for i in 1:length(allShuffRHD3)
    allShuffRHD3[i] = length(selectedCompounds[(i%length(selectedCompounds))])
end
```
```julia
@btime for i in 1:length(allShuffRHD3)
    allShuffRHD2[i] = du.UMAP1[i]
end
```

In [None]:
using SharedArrays
# Result buffer writable from the workers: one row per compound level,
# one column per permutation (25 permutations are drawn below)
allShuffRHD2 = SharedArray{Float64}((length(levels(selectedCompounds)),25));

In [None]:
# Time the permutation test distributed over the workers, one compound per iteration.
# The loop runs over the compound levels, i.e. over the rows of `allShuffRHD2`;
# the previous bound relied on `allShuffRHD3`, which is not defined in this notebook.
@btime @sync @distributed for i in 1:length(levels(selectedCompounds))
    # robustbase must be loaded in the R session of the process running the iteration
    R"""
    # Used later for MCD computation
    require(robustbase)
    """
    lvl = levels(selectedCompounds)[i]
    allShuffRHD2[i,:] = StatDistances.shuffRHD(dataUMAP, 
                                             selectedCompounds .== lvl, 
                                             selectedCompounds .== "DMSO", 
                                             nbRep = 25)
end

In [None]:
# Time the serial permutation test (25 permutations per compound) and keep its
# result, which is used for the RMPV computation further down.
# The assignment is placed outside `@btime`: a variable assigned inside the
# benchmarked expression is local to the benchmark and is not defined afterwards,
# whereas `@btime` returns the value of the expression.
allShuffRHD = @btime map(x -> shuffRHD(getdata(expUMAP), 
                                    selectedCompounds .== x, 
                                    selectedCompounds .== "DMSO", 
                                    nbRep = 25),
                      levels(selectedCompounds))

In [None]:
# Plain (non-shared) result buffer: one row per compound level, 25 columns
allShuffRHD4 = zeros((length(levels(selectedCompounds)),25));

In [None]:
# Extract the UMAP data once, to be reused in the loop below
du = getdata(expUMAP);

In [None]:
# Time the serial equivalent of the distributed loop, one compound per iteration.
# The loop runs over the compound levels, i.e. over the rows of `allShuffRHD4`;
# the previous bound relied on `allShuffRHD3`, which is not defined in this notebook.
@btime for i in eachindex(levels(selectedCompounds))
    R"""
    # Used later for MCD computation
    require(robustbase)
    """
    lvl = levels(selectedCompounds)[i]
    allShuffRHD4[i,:] = shuffRHD(du, 
                                 selectedCompounds .== lvl, 
                                 selectedCompounds .== "DMSO", 
                                 nbRep = 25)
end

In [None]:
# Missing values need to be handled in real case applications
@assert all(!ismissing, allRHD)

In [None]:
# Compute the Robust Morphological Perturbation Value
plateRMPV = DataFrame()
# Empirical p-value per compound: fraction of permuted distances larger than the
# observed one, adjusted for multiple testing with Benjamini-Hochberg
plateRMPV[!, :RMPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allRHD, allShuffRHD)], BenjaminiHochberg())
plateRMPV[!, :RHD] = allRHD
# One row per compound, in the order in which `allRHD` was computed
plateRMPV[!, :Condition] = levels(selectedCompounds)

In [None]:
# Display the permuted distances
allShuffRHD

In [None]:
# Display the RMPV table
plateRMPV

In [None]:
# Display number of positive tests:
# RMPV against RHD for each condition, with a dashed line at the 0.05 threshold
gp = ggplot(plateRMPV) + geom_point(aes(:RMPV, :RHD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
# NOTE(review): `savePlot` is not defined in this notebook — presumably a helper
# provided elsewhere; TODO confirm
savePlot("UMAP_RMPV_test_hellinger.pdf", gp);