# Comparison of differential analysis methods
In this notebook, we compare the ability of different approaches to quantify and identify significant changes in statistical distributions of samples, corresponding to synthetic datasets having similarities to actual morphological measurement datasets from high-content imaging. 

We check whether, and by how much, a perturbed condition induces changes compared to a reference condition. We refer to all (simulated) measurements in a given condition as the *perturbed or reference sample*; correspondingly, an underlying *perturbed or reference distribution* can be inferred from the samples.

We include the following methods:
* Comparing Mahalanobis distance from the center (mean) of the perturbed samples to the reference distribution *(WIP!)*
* Comparing median Mahalanobis distance from the perturbed samples to the reference distribution *(WIP!)*
* Comparing median robust Mahalanobis distance from the perturbed samples to the reference distribution
* Comparing robust Hellinger distance from the perturbed distribution to the reference distribution

We tried these approaches using different pre-processing approaches:
* Based on raw measurements *(WIP!)*
* Based on variation-scaled PCA
* Based on UMAP

## Settings

Load packages

In [None]:
using CSV, StatsBase, Statistics, DataFrames, UMAP, RCall
using Distributed, RMP, Random 
using MultivariateStats, MultipleTesting
using Dates: now

In [None]:
@rlibrary ggplot2
@rlibrary extrafont

In [None]:
R"""
# Used later for MCD computation

library(robustbase)

# Customize ggplot appearance

library(ggplot2)
library(extrafont)


# Load extra fonts
# ttf_import/loadfonts come from extrafont (loaded above); the themes defined
# below request family = "Arial", so the fonts found under /tmp/.fonts/ are
# imported and registered first.
ttf_import(paths = "/tmp/.fonts/")
loadfonts()

# Change theme
# Base theme for every plot: theme_light without minor grid lines, dark grey
# (#333333) 17pt Arial text and lines, a nearly transparent grey legend
# background and no legend key boxes. Installed as default by theme_set() below.
customTheme <- theme_light() + 
               theme(panel.grid.minor=element_blank(), text=element_text(size=17, family="Arial", colour = "#333333"),
                     line=element_line(colour = "#333333"), 
                     legend.background = element_rect(fill=alpha('#CCCCCC', 0.1)), legend.key = element_blank())

# Change default colors
# Replacement for ggplot2's default continuous colour scale.
#   ...        forwarded to the underlying ggplot2 scale constructor
#   begin/end  portion of the viridis range that is used
#   direction  -1 reverses the palette
#   option     viridis palette name ("plasma" here)
#   type       "viridis" or "gradient"; the default is read from the
#              ggplot2.continuous.colour option and falls back to "viridis"
# Any other `type` stops with "Unknown scale type".
scale_colour_continuous <- function(...,
                                    begin = 0.1,
                                    end = 0.9,
                                    direction = -1,
                                    option = "plasma",
                                    type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(
        type,
        gradient = scale_colour_gradient(...),
        viridis = scale_colour_viridis_c(
            option = option, begin = begin, end = end, direction = direction, ...
        ),
        stop("Unknown scale type", call. = FALSE)
    )
}
# American-spelling alias
scale_color_continuous <- scale_colour_continuous

# Replacement for ggplot2's default continuous fill scale.
#   ...        forwarded to the underlying ggplot2 scale constructor
#   begin/end  portion of the viridis range that is used
#   direction  -1 reverses the palette
#   option     viridis palette name ("plasma" here)
#   type       "viridis" or "gradient"
# The default `type` is read from the fill-specific option
# (ggplot2.continuous.fill), so fill and colour defaults can be configured
# independently; it falls back to "viridis" when the option is unset.
# Any other `type` stops with "Unknown scale type".
scale_fill_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma",
                                   type = getOption("ggplot2.continuous.fill", default = "viridis")) {
    switch(type,
        gradient = scale_fill_gradient(...),
        viridis = scale_fill_viridis_c(option = option, begin = begin, end = end, direction = direction, ...),
        stop("Unknown scale type", call. = FALSE))
}

# CeMM palette: colorRampPalette returns a function that, given n, interpolates
# n colours between the seven anchor colours below.
cemm_pal <- colorRampPalette(c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00"))

# Replacement for ggplot2's default discrete fill scale.
#   type = "CeMM" (default) uses the CeMM palette defined above; any other
#   value falls back to a hue palette built from h, c, l, h.start and direction.
#   na.value, aesthetics and ... are passed on to discrete_scale().
scale_fill_discrete <- function (..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0,
    direction = 1, na.value = "grey50", aesthetics = "fill")
{
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else {
        # hue_pal belongs to the scales package, which is not attached by the
        # library() calls of this cell, so it must be namespace-qualified
        # (as scale_color_discrete already does).
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start,
            direction), na.value = na.value, ...)
    }
}

# Replacement for ggplot2's default discrete colour scale.
#   type = "CeMM" (default) uses cemm_pal; any other value uses a hue palette
#   built from h, c, l, h.start and direction.
#   na.value, aesthetics and ... are passed on to discrete_scale().
scale_color_discrete <- function(..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0,
                                 direction = 1, na.value = "grey50", aesthetics = "colour") {
    use_cemm <- type == "CeMM"
    scale_name <- if (use_cemm) "CeMM" else "hue"
    # The hue palette is only built when it is actually needed
    pal <- if (use_cemm) cemm_pal else scales::hue_pal(h, c, l, h.start, direction)
    discrete_scale(aesthetics, scale_name, pal, na.value = na.value, ...)
}
# British-spelling alias
scale_colour_discrete <- scale_color_discrete

# Theme add-on without major grid lines or panel border: axes are drawn as thin
# dark-grey lines and tick labels are set to size 12.
# Extra arguments are forwarded to theme().
noGridTheme <- function(...) {
    tick_text <- element_text(size = 12)
    theme(
        panel.grid.major = element_blank(),
        axis.text.x = tick_text,
        axis.text.y = tick_text,
        axis.line = element_line(color = "#333333", size = 0.2),
        panel.border = element_blank(),
        ...
    )
}

# Theme add-on for dark figures: dark-grey (#333333) panel and plot
# backgrounds with light-grey (#CCCCCC) text, lines and axes.
# NOTE: `...` is accepted but not used by the body.
darkTheme <- function(...) {
    dark_fill <- element_rect(fill = '#333333')
    theme(
        panel.background = dark_fill,
        plot.background = dark_fill,
        axis.line = element_line(color = "#CCCCCC", size = 0.2),
        text = element_text(size = 17, family = "Arial", colour = "#CCCCCC"),
        line = element_line(colour = "#CCCCCC")
    )
}

theme_set(customTheme)

options(repr.plot.width=10, repr.plot.height=10)
"""

## Loading

Load generated datasets and combine them

In [None]:
# Read the four generated samples; the files have no header row.
# Rows are treated as individual records (the per-record labels further down
# are built from the row counts).
R = CSV.read("Data/matR.csv", header = false) # Reference
N = CSV.read("Data/matN.csv", header = false) # Negative control
PS = CSV.read("Data/matPS.csv", header = false) # Shifted
PR = CSV.read("Data/matPR.csv", header = false) # Rescaled
1 # Do not display last element

In [None]:
dataset = vcat(R, N, PS, PR)

In [None]:
# One condition label per record, in the same order in which the four tables
# are stacked into `dataset` (Reference, Negative, Shifted, Rescaled)
origDataset = vcat(fill("Reference", size(R, 1)),
                   fill("Negative control", size(N, 1)),
                   fill("Shifted control", size(PS, 1)),
                   fill("Rescaled control", size(PR, 1)))

## Feature filtering

In [None]:
# Correlation threshold handed to `decorrelate` below
FILT_MAX_CORR = 0.6 # Keep uncorrelated variables
# Number of UMAP dimensions used for the distance computations further down
dimUMAP = 5

In [None]:
# Center and scale on control values
# Each column is transformed with `normtransform` (RMP package), using the
# values of the "Reference" records of that same column as the reference
indRef = origDataset .== "Reference"
normDataset = DataFrame(map(x -> normtransform(x, x[indRef]), eachcol(dataset)))
1

In [None]:
# Order features from biggest mad to smallest mad
# Since features have mad(reference) = 1, this ranks features by how much more
# variable they are across all conditions than within the reference
# (assumes `normtransform` scales by the reference MAD — TODO confirm in RMP)
orderFt = sortperm(convert(Array, map(x -> mad(x, normalize = true), eachcol(normDataset))), rev=true)

In [None]:
# Select a subset of features with `decorrelate` (RMP package), visiting them
# in the order computed above and using FILT_MAX_CORR as threshold; presumably
# the first feature of each correlated group is kept — verify against RMP.
uncorrFt = decorrelate(normDataset, orderCol = orderFt, threshold = FILT_MAX_CORR)
# Keep only the selected feature columns
normDataset = normDataset[uncorrFt]

In [None]:
# Visualization
# 2-D UMAP embedding of the filtered, normalized features, used for plotting only.
# The seed is fixed so that the embedding is reproducible.
Random.seed!(3895)
# import Pkg; Pkg.add("Distances")
# import Distances
# umND = umap(convert(Matrix, normDataset)', 2; metric = Distances.CosineDist(), 
#     repulsion_strength = 10, n_epochs = 200)
# `umap` is given the transposed matrix (features x records) and its result is
# transposed back so that each row is one record again
umND = umap(convert(Matrix, normDataset)', 2; min_dist = 0.01, n_epochs = 200)
umND = convert(DataFrame, umND')
names!(umND, [:UMAP1, :UMAP2])

# Attach the condition label of every record
umND[:Condition] = origDataset

In [None]:
# Copy the embedding to R and draw, per condition, a hexagonal-bin density map
@rput umND
R"""
gp3 <- ggplot(umND, aes(UMAP1, UMAP2)) + geom_hex(aes(fill = ..density..), bins = 12) + facet_wrap("Condition")
print(gp3)
""";

In [None]:
# Scatter plot of all records coloured by condition; saved under Fig/
# (the directory is created if needed)
gp = ggplot(umND, aes(:UMAP1, :UMAP2)) + geom_point(aes(color = :Condition), alpha = 0.3)
print(gp)
mkpath("Fig")
ggsave("Fig/UMAP_test.pdf", gp)
1

In [None]:
# Conditions present in the embedding, in order of first appearance
unique(umND.Condition)

In [None]:
# Same scatter plot without the Reference records
gp = ggplot(umND[umND.Condition .!= "Reference",:], 
            aes(:UMAP1, :UMAP2)) + geom_point(aes(color = :Condition), alpha = 0.3)
print(gp)

In [None]:
# Same scatter plot without the Rescaled control records
gp = ggplot(umND[umND.Condition .!= "Rescaled control",:], 
            aes(:UMAP1, :UMAP2)) + geom_point(aes(color = :Condition), alpha = 0.3)
print(gp)

## Function definition
The following can be partially moved to the RMP package once validated.

In [None]:
""" Compute the Mahalanobis Distance to center (MDC)
    in a dataset 'data' for a given perturbation of indices 'iPert' 
    compared to a reference of indices 'iRef'."""
function MDC(data, iPert, iRef)
    # Rows of the perturbed and reference samples as plain matrices
    pertMat = Matrix(data[iPert,:])
    refMat = Matrix(data[iRef,:])

    # Column-wise mean, flattened from a 1 x p matrix to a length-p vector
    colMeans(m) = dropdims(mean(m, dims = 1), dims = 1)

    # Distance from the perturbed center to the reference distribution,
    # described by the reference mean and covariance
    return mahalanobis(colMeans(pertMat), colMeans(refMat), cov(refMat))
end

In [None]:
""" Permute labels and compute the Mahalanobis Distance to center (MDC)
    in a dataset 'data' for a given perturbation of indices 'iPert' 
    compared to a reference of indices 'iRef', to create an empirical distribution."""
function shuffMDC(data, iPert, iRef; nbRep = 250)
    setPert = data[iPert,:]
    setRef = data[iRef,:]
    # Pool both samples; labels are then re-drawn at random from this pool
    pooled = Matrix(vcat(setRef, setPert))
    nPooled = size(pooled, 1)
    nPert = nrow(setPert)
    nRef = nrow(setRef)

    # Column-wise mean, flattened from a 1 x p matrix to a length-p vector
    colMeans(m) = dropdims(mean(m, dims = 1), dims = 1)

    # One permutation: shuffle the pooled rows, split them into pseudo-samples
    # of the original sizes and compute the distance between their centers
    function permutedMDC()
        shuffled = pooled[sample(1:nPooled, nPooled; replace = false),:]
        fakePert = shuffled[1:nPert,:]
        fakeRef = shuffled[(nPert+1):(nPert+nRef),:]
        return mahalanobis(colMeans(fakePert), colMeans(fakeRef), cov(fakeRef))
    end

    # `nbRep` values forming the empirical distribution
    return [permutedMDC() for _ in 1:nbRep]
end

In [None]:
""" Compute the median Mahalanobis Distance (MD)
    in a dataset 'data' for a given perturbation of indices 'iPert' 
    compared to a reference of indices 'iRef'."""
function MD(data, iPert, iRef)
    # Reference distribution summarised by its mean vector and covariance matrix
    refMat = Matrix(data[iRef,:])
    refCenter = dropdims(mean(refMat, dims = 1), dims = 1)
    refCov = cov(refMat)

    # Distance of every perturbed record (iterated as data-frame rows) to the
    # reference distribution; the median over records is returned
    perRow = [mahalanobis(row, refCenter, refCov) for row in eachrow(data[iPert,:])]
    return median(perRow)
end

In [None]:
""" Permute labels and compute the median Mahalanobis Distance (RMD)
    in a dataset 'data' for a given perturbation of indices 'iPert' 
    compared to a reference of indices 'iRef', to create an empirical distribution."""
function shuffMD(data, iPert, iRef; nbRep = 250)
    setPert = data[iPert,:]
    setRef = data[iRef,:]
    # Pool both samples; labels are then re-drawn at random from this pool
    pooled = Matrix(vcat(setRef, setPert))
    nPooled = size(pooled, 1)
    nPert = nrow(setPert)
    nRef = nrow(setRef)

    # One permutation: shuffle the pooled rows, split them into pseudo-samples
    # of the original sizes and compute the median (non-robust) Mahalanobis
    # distance of the pseudo-perturbed rows to the pseudo-reference
    function permutedMedianMD()
        shuffled = pooled[sample(1:nPooled, nPooled; replace = false),:]
        fakePert = shuffled[1:nPert,:]
        fakeRef = shuffled[(nPert+1):(nPert+nRef),:]

        center = dropdims(mean(fakeRef, dims = 1), dims = 1)
        covariance = cov(fakeRef)

        # The matrix is wrapped in a DataFrame so rows can be iterated with `eachrow`
        perRow = [mahalanobis(row, center, covariance) for row in eachrow(DataFrame(fakePert))]
        return median(perRow)
    end

    # `nbRep` values forming the empirical distribution
    return [permutedMedianMD() for _ in 1:nbRep]
end

In [None]:
""" Compute the median Robust Mahalanobis Distance (RMD)
    in a dataset 'data' for a given perturbation of indices 'iPert' 
    compared to a reference of indices 'iRef'.
    See https://e-archivo.uc3m.es/bitstream/handle/10016/24613/ws201710.pdf """
function RMD(data, iPert, iRef)
    setPert = data[iPert,:]
    setRef = data[iRef,:]

    # Both samples need at least twice as many points as there are dimensions,
    # otherwise the covariance estimate becomes singular: return `missing`
    minPoints = 2*size(data, 2)
    if size(setPert, 1) < minPoints || size(setRef, 1) < minPoints
        return missing
    end

    # Minimum Covariance Determinant estimate of the reference center and
    # covariance, computed in R (robustbase::covMcd) with a fixed seed
    @rput setRef
    R"""
    set.seed(3895)
    mcd <- covMcd(setRef)
    mcdCenter <- mcd$center
    mcdCov <- mcd$cov
    """
    @rget mcdCenter
    @rget mcdCov

    # Median over perturbed records of their distance to the robust estimate
    perRow = [mahalanobis(row, mcdCenter, mcdCov) for row in eachrow(setPert)]
    return median(perRow)
end

In [None]:
""" Permute labels and compute the median Robust Mahalanobis Distance (RMD)
    in a dataset 'data' for a given perturbation of indices 'iPert' 
    compared to a reference of indices 'iRef', to create an empirical distribution."""
function shuffRMD(data, iPert, iRef; nbRep = 250)
    setPert = data[iPert,:]
    setRef = data[iRef,:]
    # Pool both samples; labels are then re-drawn at random from this pool
    pooled = vcat(setRef, setPert)
    nPooled = nrow(pooled)
    nPert = nrow(setPert)
    nRef = nrow(setRef)

    # Both samples need at least twice as many points as there are dimensions,
    # otherwise the covariance estimate becomes singular: return only missings
    minPoints = 2*size(data, 2)
    if size(setPert, 1) < minPoints || size(setRef, 1) < minPoints
        return fill(missing, nbRep)
    end

    # One permutation: shuffle the pooled rows, split them into pseudo-samples
    # of the original sizes and compute the median robust Mahalanobis distance
    function iterShufRMD()
        shuffSet = pooled[sample(1:nPooled, nPooled; replace = false),:]
        shuffSetPert = shuffSet[1:nPert,:]
        shuffSetRef = shuffSet[(nPert+1):(nPert+nRef),:]

        # Minimum Covariance Determinant estimate of the pseudo-reference,
        # computed in R (robustbase::covMcd) with a fixed seed
        @rput shuffSetRef
        R"""
        set.seed(3895)
        mcd <- covMcd(shuffSetRef)
        mcdCenter <- mcd$center
        mcdCov <- mcd$cov
        """
        @rget mcdCenter
        @rget mcdCov

        perRow = [mahalanobis(row, mcdCenter, mcdCov) for row in eachrow(shuffSetPert)]
        return median(perRow)
    end

    # `nbRep` values forming the empirical distribution
    return [iterShufRMD() for _ in 1:nbRep]
end

In [None]:
""" Compute the Robust Hellinger Distance (RHD)
    in a dataset `data` for a given perturbation of indices `iPert` 
    compared to a reference of indices `iRef`."""
function RHD(data, iPert, iRef)
    setPert = data[iPert,:]
    setRef = data[iRef,:]

    # Both samples need at least twice as many points as there are dimensions,
    # otherwise the covariance estimate becomes singular: return `missing`
    minPoints = 2*size(data, 2)
    if size(setPert, 1) < minPoints || size(setRef, 1) < minPoints
        return missing
    end

    # Minimum Covariance Determinant estimates (robustbase::covMcd in R) of the
    # center and covariance of each sample
    @rput setRef
    @rput setPert
    R"""
    set.seed(3895)
    mcd1 <- covMcd(setRef)
    mcdCenter1 <- mcd1$center
    mcdCov1 <- mcd1$cov

    # We set the seed twice to always
    # find the same estimators given
    # the same sample
    set.seed(3895)
    mcd2 <- covMcd(setPert)
    mcdCenter2 <- mcd2$center
    mcdCov2 <- mcd2$cov
    """
    @rget mcdCenter1
    @rget mcdCov1
    @rget mcdCenter2
    @rget mcdCov2

    # Hellinger distance between the two robustly estimated distributions
    return hellinger(mcdCenter1, mcdCov1, mcdCenter2, mcdCov2)
end

In [None]:
""" Permute labels and compute the Robust Hellinger Distance (RHD)
    in a dataset `data` for a given perturbation of indices `iPert` 
    compared to a reference of indices `iRef`, to create an empirical distribution."""
function shuffRHD(data, iPert, iRef; nbRep = 250)
    setPert = data[iPert,:]
    setRef = data[iRef,:]
    # Pool both samples; labels are then re-drawn at random from this pool
    pooled = vcat(setRef, setPert)
    nPooled = nrow(pooled)
    nPert = nrow(setPert)
    nRef = nrow(setRef)

    # Both samples need at least twice as many points as there are dimensions,
    # otherwise the covariance estimate becomes singular: return only missings
    minPoints = 2*size(data, 2)
    if size(setPert, 1) < minPoints || size(setRef, 1) < minPoints
        return fill(missing, nbRep)
    end

    # One permutation: shuffle the pooled rows, split them into pseudo-samples
    # of the original sizes and compute their Robust Hellinger Distance
    function iterShufRHD()
        shuffSet = pooled[sample(1:nPooled, nPooled; replace = false),:]
        shuffSetPert = shuffSet[1:nPert,:]
        shuffSetRef = shuffSet[(nPert+1):(nPert+nRef),:]

        # Minimum Covariance Determinant estimates (robustbase::covMcd in R)
        # used for the Robust Hellinger Distance
        @rput shuffSetRef
        @rput shuffSetPert
        R"""
        set.seed(3895)
        mcd <- covMcd(shuffSetRef)
        mcdCenter1 <- mcd$center
        mcdCov1 <- mcd$cov

        # We set the seed twice to always
        # find the same estimators given
        # the same sample
        set.seed(3895)
        mcd <- covMcd(shuffSetPert)
        mcdCenter2 <- mcd$center
        mcdCov2 <- mcd$cov
        """
        @rget mcdCenter1
        @rget mcdCov1
        @rget mcdCenter2
        @rget mcdCov2

        return hellinger(mcdCenter1, mcdCov1, mcdCenter2, mcdCov2)
    end

    # `nbRep` values forming the empirical distribution
    return [iterShufRHD() for _ in 1:nbRep]
end

## UMAP pre-processing

In [None]:
# Re-run UMAP with more dimensions (to preserve more of the total information)
# This overwrites the 2-D `umND` used for the plots above with a
# `dimUMAP`-dimensional embedding; the seed is fixed for reproducibility.
Random.seed!(3895)
# `umap` is given the transposed matrix (features x records) and its result is
# transposed back so that each row is one record again
umND = umap(convert(Matrix, normDataset)', dimUMAP; min_dist = 0.01, n_epochs = 200)
umND = convert(DataFrame, umND')
# Columns are named UMAP1 ... UMAP<dimUMAP>
names!(umND, Symbol.(string.("UMAP", 1:dimUMAP)))

# Attach the condition label of every record
umND[:Condition] = origDataset

### Mahalanobis Distance to Center

In [None]:
# The following MD values are displayed in the following order:
unique(umND.Condition)

In [None]:
# Actual observed MD to center
allMDC = map(x -> MDC(umND[:,1:dimUMAP], umND.Condition.==x, umND.Condition.=="Reference"), unique(umND.Condition))

We then compute the expected distribution of MDCs (Mahalanobis distances to center) under the assumption that the condition does not matter.

In [None]:
@time allShuffMDC = map(x -> shuffMDC(umND[:,1:dimUMAP], umND.Condition .== x, 
                        umND.Condition .== "Reference", nbRep = 500), 
    unique(umND.Condition))

In [None]:
# Missing values need to be handled in real case applications
@assert !any(ismissing.(allMDC))

In [None]:
# Compute the Robust Morphological Perturbation Value
plateRMPV = DataFrame()
plateRMPV[:MPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allMDC, allShuffMDC)], BenjaminiHochberg())
plateRMPV[:MD] = allMDC
plateRMPV[:Condition] = unique(umND.Condition);

In [None]:
plateRMPV

In [None]:
# Display number of positive tests
gp = ggplot(plateRMPV) + geom_point(aes(:MPV, :MD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
ggsave("Fig/MPV_center_test_mahalanobis.pdf", gp)

### Median Mahalanobis Distance

In [None]:
# The following MD values are displayed in the following order:
unique(umND.Condition)

In [None]:
# Actual observed MD
allMD = map(x -> MD(umND[:,1:dimUMAP], umND.Condition.==x, umND.Condition.=="Reference"), unique(umND.Condition))

We then compute the expected distribution of MDs under the assumption that the condition does not matter.

In [None]:
@time allShuffMD = map(x -> shuffMD(umND[:,1:dimUMAP], umND.Condition .== x, 
                        umND.Condition .== "Reference", nbRep = 500), 
    unique(umND.Condition))

In [None]:
# Missing values need to be handled in real case applications
@assert !any(ismissing.(allMD))

In [None]:
# Compute the Robust Morphological Perturbation Value
plateRMPV = DataFrame()
plateRMPV[:MPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allMD, allShuffMD)], BenjaminiHochberg())
plateRMPV[:MD] = allMD
plateRMPV[:Condition] = unique(umND.Condition);

In [None]:
plateRMPV

In [None]:
# Display number of positive tests
gp = ggplot(plateRMPV) + geom_point(aes(:MPV, :MD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
ggsave("Fig/MPV_test_mahalanobis.pdf", gp)

### Median Robust Mahalanobis Distance 

In [None]:
# The following RMD values are displayed in the following order:
unique(umND.Condition)

In [None]:
# Actual observed RMD
allRMD = map(x -> RMD(umND[:,1:dimUMAP], umND.Condition.==x, umND.Condition.=="Reference"), unique(umND.Condition))

We then compute the expected distribution of RMDs under the assumption that the condition does not matter.

In [None]:
@time allShuffRMD = map(x -> shuffRMD(umND[:,1:dimUMAP], umND.Condition .== x, 
                        umND.Condition .== "Reference", nbRep = 500), 
    unique(umND.Condition))

In [None]:
# Missing values need to be handled in real case applications
@assert !any(ismissing.(allRMD))

In [None]:
# Compute the Robust Morphological Perturbation Value
plateRMPV = DataFrame()
plateRMPV[:RMPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allRMD, allShuffRMD)], BenjaminiHochberg())
plateRMPV[:RMD] = allRMD
plateRMPV[:Condition] = unique(umND.Condition);

In [None]:
plateRMPV

In [None]:
# Display number of positive tests
gp = ggplot(plateRMPV) + geom_point(aes(:RMPV, :RMD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
ggsave("Fig/RMPV_test_mahalanobis.pdf", gp)

### Robust Hellinger Distance 

In [None]:
# The following RHD values are displayed in the following order:
unique(umND.Condition)

In [None]:
# Actual observed RHD
allRHD = map(x -> RHD(umND[:,1:dimUMAP], umND.Condition.==x, umND.Condition.=="Reference"), unique(umND.Condition))

NB: as expected, Hellinger distance is symmetrical.

    map(x -> RHD(umND[:,1:dimUMAP], umND.Condition.==x, umND.Condition.=="Negative control"), unique(umND.Condition))
    map(x -> RHD(umND[:,1:dimUMAP], umND.Condition.==x, umND.Condition.=="Shifted control"), unique(umND.Condition))
    map(x -> RHD(umND[:,1:dimUMAP], umND.Condition.==x, umND.Condition.=="Rescaled control"), unique(umND.Condition))
    
We then compute the expected distribution of RHDs under the assumption that the condition does not matter.

In [None]:
@time allShuffRHD = map(x -> shuffRHD(umND[:,1:dimUMAP], umND.Condition .== x, 
                        umND.Condition .== "Reference", nbRep = 500), 
    unique(umND.Condition))

In [None]:
# Missing values need to be handled in real case applications
@assert !any(ismissing.(allRHD))

In [None]:
# Compute the Robust Morphological Perturbation Value
plateRMPV = DataFrame()
plateRMPV[:RMPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allRHD, allShuffRHD)], BenjaminiHochberg())
plateRMPV[:RHD] = allRHD
plateRMPV[:Condition] = unique(umND.Condition)

In [None]:
# Display number of positive tests
gp = ggplot(plateRMPV) + geom_point(aes(:RMPV, :RHD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
ggsave("Fig/RMPV_test_hellinger.pdf", gp)

## PCA pre-processing

In [None]:
Random.seed!(3895)
# Fit a PCA on the filtered, normalized features. The matrix is passed
# transposed (features x records).
# NOTE(review): pratio = 0.9 presumably keeps the components explaining 90% of
# the variance — confirm against the MultivariateStats documentation.
modelPCA = fit(PCA, convert(Matrix, normDataset)'; pratio = 0.9)
# Number of retained components
dimPCA = outdim(modelPCA)
# Project the records and transpose back so that each row is one record
pcaND = transform(modelPCA, convert(Matrix, normDataset)')
pcaND = convert(DataFrame, pcaND')
# Scale by importance of each principal component
# (each component column is multiplied by its principal variance)
pcaND = DataFrame(principalvars(modelPCA) .* eachcol(pcaND))
# Columns are named PC1 ... PC<dimPCA>
names!(pcaND, Symbol.(string.("PC", 1:dimPCA)))


# Attach the condition label of every record
pcaND[:Condition] = origDataset

In [None]:
gp = ggplot(pcaND, aes(:PC1, :PC2)) + geom_point(aes(color = :Condition), alpha = 0.3)
print(gp)
ggsave("Fig/PCA_test.pdf", gp)
1

### Mahalanobis Distance to Center

In [None]:
# The following MD values are displayed in the following order:
unique(pcaND.Condition)

In [None]:
# Actual observed MD to center
allMDCpca = map(x -> MDC(pcaND[:,1:dimPCA], pcaND.Condition.==x,
             pcaND.Condition.=="Reference"), unique(pcaND.Condition))

We then compute the expected distribution of MDCs (Mahalanobis distances to center) under the assumption that the condition does not matter.

In [None]:
@time allShuffMDCpca = map(x -> shuffMDC(pcaND[:,1:dimPCA], pcaND.Condition .== x, 
                        pcaND.Condition .== "Reference", nbRep = 500), 
    unique(pcaND.Condition))

In [None]:
# Missing values need to be handled in real case applications
@assert !any(ismissing.(allMDCpca))

In [None]:
# Compute the Robust Morphological Perturbation Value
plateRMPV = DataFrame()
plateRMPV[:MPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allMDCpca, allShuffMDCpca)], BenjaminiHochberg())
plateRMPV[:MD] = allMDCpca
plateRMPV[:Condition] = unique(pcaND.Condition);

In [None]:
plateRMPV

In [None]:
# Display number of positive tests
gp = ggplot(plateRMPV) + geom_point(aes(:MPV, :MD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
ggsave("Fig/PCA_MPV_center_test_mahalanobis.pdf", gp)

### Median Mahalanobis Distance

In [None]:
# The following MD values are displayed in the following order:
unique(pcaND.Condition)

In [None]:
# Actual observed MD
allMDpca = map(x -> MD(pcaND[:,1:dimPCA], pcaND.Condition.==x, 
                       pcaND.Condition.=="Reference"), unique(pcaND.Condition))

We then compute the expected distribution of MDs under the assumption that the condition does not matter.

In [None]:
@time allShuffMDpca = map(x -> shuffMD(pcaND[:,1:dimPCA], pcaND.Condition .== x, 
                        pcaND.Condition .== "Reference", nbRep = 500), 
    unique(pcaND.Condition))

In [None]:
# Missing values need to be handled in real case applications
@assert !any(ismissing.(allMDpca))

In [None]:
# Compute the Robust Morphological Perturbation Value
plateRMPV = DataFrame()
plateRMPV[:MPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allMDpca, allShuffMDpca)], BenjaminiHochberg())
plateRMPV[:MD] = allMDpca
plateRMPV[:Condition] = unique(umND.Condition);

In [None]:
plateRMPV

In [None]:
# Display number of positive tests
gp = ggplot(plateRMPV) + geom_point(aes(:MPV, :MD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
ggsave("Fig/PCA_MPV_test_mahalanobis.pdf", gp)

### Median Robust Mahalanobis Distance 

In [None]:
# The following RMD values are displayed in the following order:
unique(pcaND.Condition)

In [None]:
# Actual observed RMD
allRMDpca = map(x -> RMD(pcaND[:,1:dimPCA], pcaND.Condition.==x, 
                pcaND.Condition.=="Reference"), unique(pcaND.Condition))

We then compute the expected distribution of RMDs under the assumption that the condition does not matter.

In [None]:
# Permutation distribution of the median RMD for every condition of the PCA
# table (timed). Conditions are enumerated from `pcaND` itself, so the order
# matches `allRMDpca`, which is also computed over `unique(pcaND.Condition)`.
@time allShuffRMDpca = map(x -> shuffRMD(pcaND[:,1:dimPCA], pcaND.Condition .== x, 
                        pcaND.Condition .== "Reference", nbRep = 20), 
    unique(pcaND.Condition))

In [None]:
# Missing values need to be handled in real case applications
@assert !any(ismissing.(allRMDpca))

In [None]:
# Compute the Robust Morphological Perturbation Value (RMPV):
# for each condition, the fraction of permuted RMDs that exceed the observed
# RMD, adjusted for multiple testing with Benjamini-Hochberg
plateRMPV = DataFrame()
plateRMPV[:RMPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allRMDpca, allShuffRMDpca)], BenjaminiHochberg())
# These values are median Robust Mahalanobis Distances, so the column (and the
# axis label of the plot below) is RMD, not RHD
plateRMPV[:RMD] = allRMDpca
plateRMPV[:Condition] = unique(pcaND.Condition)

In [None]:
# Observed RMD against its adjusted p-value; the dashed line marks 0.05
gp = ggplot(plateRMPV) + geom_point(aes(:RMPV, :RMD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
ggsave("Fig/PCA_test_mahalanobis.pdf", gp)

### Robust Hellinger Distance 

In [None]:
# Actual observed RHD
# The PCA table has `dimPCA` feature columns (PC1 ... PC<dimPCA>); all of them
# are used, matching the columns given to `shuffRHD` for the permutation
# distribution. Slicing with `dimUMAP` would compare on a different feature set.
allRHDpca = map(x -> RHD(pcaND[:,1:dimPCA], pcaND.Condition.==x, pcaND.Condition.=="Reference"),
                unique(pcaND.Condition))

NB: as expected, Hellinger distance is symmetrical.

    map(x -> RHD(umND[:,1:dimUMAP], umND.Condition.==x, umND.Condition.=="Negative control"), unique(umND.Condition))
    map(x -> RHD(umND[:,1:dimUMAP], umND.Condition.==x, umND.Condition.=="Shifted control"), unique(umND.Condition))
    map(x -> RHD(umND[:,1:dimUMAP], umND.Condition.==x, umND.Condition.=="Rescaled control"), unique(umND.Condition))
    
We then compute the expected distribution of RHDs under the assumption that the condition does not matter.

In [None]:
# Permutation distribution of the RHD for every condition of the PCA table
# (timed). Conditions are enumerated from `pcaND` itself, so the order matches
# `allRHDpca`, which is also computed over `unique(pcaND.Condition)`.
@time allShuffRHDpca = map(x -> shuffRHD(pcaND[:,1:dimPCA], pcaND.Condition .== x, 
                        pcaND.Condition .== "Reference", nbRep = 12), 
    unique(pcaND.Condition))

In [None]:
# Missing values need to be handled in real case applications
@assert !any(ismissing.(allRHDpca))

In [None]:
# Compute the Robust Morphological Perturbation Value
plateRMPV = DataFrame()
plateRMPV[:RMPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allRHDpca, allShuffRHDpca)], BenjaminiHochberg())
plateRMPV[:RHD] = allRHDpca
plateRMPV[:Condition] = unique(pcaND.Condition)

In [None]:
# Display number of positive tests
gp = ggplot(plateRMPV) + geom_point(aes(:RMPV, :RHD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
ggsave("Fig/PCA_test_hellinger.pdf", gp)

## Full size dataset

In [None]:
Random.seed!(3895)
dimRaw = size(normDataset, 2)

In [None]:
odCode = Dict(x => i for (i,x) in enumerate(unique(origDataset)))
gp = ggplot(normDataset, aes(:x1, :x2))  + 
     geom_point(color = [odCode[x] for x in origDataset], alpha = 0.3)
print(gp)
ggsave("Fig/Raw_test.pdf", gp)
1

### Mahalanobis Distance to Center

In [None]:
# The following MD values are displayed in the following order:
unique(origDataset)

In [None]:
# Actual observed MD to center
allMDCraw = map(x -> MDC(normDataset, origDataset.==x,
                origDataset.=="Reference"), unique(origDataset))

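Why exactly 0? (Note: the list of conditions includes "Reference" itself; for that entry the perturbed and reference samples are the same records, so both centers coincide and the distance to center is 0 by construction.)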

We then compute the expected distribution of MDCs (Mahalanobis distances to center) under the assumption that the condition does not matter.

In [None]:
@time allShuffMDCraw = map(x -> shuffMDC(normDataset, origDataset .== x, 
                        origDataset .== "Reference", nbRep = 10), unique(origDataset))

In [None]:
# Missing values need to be handled in real case applications
@assert !any(ismissing.(allMDCraw))

In [None]:
# Compute the Robust Morphological Perturbation Value
plateRMPV = DataFrame()
plateRMPV[:MPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allMDCraw, allShuffMDCraw)], BenjaminiHochberg())
plateRMPV[:MD] = allMDCraw
plateRMPV[:Condition] = unique(origDataset);

In [None]:
plateRMPV

In [None]:
# Display number of positive tests
gp = ggplot(plateRMPV) + geom_point(aes(:MPV, :MD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
ggsave("Fig/Raw_MPV_center_test_mahalanobis.pdf", gp)

### Median Mahalanobis Distance

In [None]:
# The following MD values are displayed in the following order:
unique(origDataset)

In [None]:
# Actual observed MD
allMDraw = map(x -> MD(normDataset, origDataset.==x,
                       origDataset.=="Reference"), unique(origDataset))

Why exactly same distance?!

We then compute the expected distribution of MDs under the assumption that the condition does not matter.

In [None]:
@time allShuffMDraw = map(x -> shuffMD(normDataset, origDataset .== x, 
                        origDataset .== "Reference", nbRep = 10), unique(origDataset))

In [None]:
# Missing values need to be handled in real case applications
@assert !any(ismissing.(allMDraw))

In [None]:
# Compute the Robust Morphological Perturbation Value
plateRMPV = DataFrame()
plateRMPV[:MPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allMDraw, allShuffMDraw)], BenjaminiHochberg())
plateRMPV[:MD] = allMDraw
plateRMPV[:Condition] = unique(origDataset);

In [None]:
plateRMPV

In [None]:
# Display number of positive tests
gp = ggplot(plateRMPV) + geom_point(aes(:MPV, :MD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
ggsave("Fig/Raw_MPV_test_mahalanobis.pdf", gp)

### Median Robust Mahalanobis Distance 

In [None]:
# The following RMD values are displayed in the following order:
unique(origDataset)

In [None]:
# Actual observed RMD
allRMDraw = map(x -> RMD(normDataset, origDataset.==x, 
                origDataset.=="Reference"), unique(origDataset))

We then compute the expected distribution of RMDs under the assumption that the condition does not matter.

In [None]:
@time allShuffRMDraw = map(x -> shuffRMD(normDataset, origDataset .== x, 
                        origDataset .== "Reference", nbRep = 10), unique(origDataset))

In [None]:
# Missing values need to be handled in real case applications
@assert !any(ismissing.(allRMDraw))

In [None]:
# Compute the Robust Morphological Perturbation Value
plateRMPV = DataFrame()
plateRMPV[:RMPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allRMDraw, allShuffRMDraw)], BenjaminiHochberg())
plateRMPV[:RMD] = allRMDraw
plateRMPV[:Condition] = unique(origDataset)

In [None]:
# Display number of positive tests
gp = ggplot(plateRMPV) + geom_point(aes(:RMPV, :RMD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
ggsave("Fig/Raw_test_mahalanobis.pdf", gp)

### Robust Hellinger Distance 

In [None]:
# Actual observed RHD
allRHDraw = map(x -> RHD(normDataset, origDataset.==x, origDataset.=="Reference"),
                unique(origDataset))

In [None]:
# Permutation distribution of the RHD on the full (filtered, normalized)
# feature set (timed). Conditions are enumerated from `origDataset`, the same
# label vector used for the masks and for `allRHDraw`, rather than from the
# unrelated UMAP table.
@time allShuffRHDraw = map(x -> shuffRHD(normDataset, origDataset .== x, 
                           origDataset .== "Reference", nbRep = 6), unique(origDataset))

In [None]:
# Missing values need to be handled in real case applications
@assert !any(ismissing.(allRHDraw))

In [None]:
# Compute the Robust Morphological Perturbation Value
plateRMPV = DataFrame()
plateRMPV[:RMPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allRHDraw, allShuffRHDraw)], BenjaminiHochberg())
plateRMPV[:RHD] = allRHDraw
plateRMPV[:Condition] = unique(origDataset)

In [None]:
# Display number of positive tests
gp = ggplot(plateRMPV) + geom_point(aes(:RMPV, :RHD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
ggsave("Fig/Raw_test_hellinger.pdf", gp)

In [None]:
1+1