using Pkg
Pkg.add("MultivariateStats")
Pkg.add("MultipleTesting")

## Settings

Load packages

In [None]:
using CSV, StatsBase, Statistics, DataFrames, UMAP, RCall, Random, MultivariateStats, Distributed, MultipleTesting
using Dates: now

In [None]:
@rlibrary ggplot2
@rlibrary extrafont

In [None]:
R"""
# robustbase provides covMcd(), used later for the MCD computation

library(robustbase)

# Customize ggplot appearance

library(ggplot2)
library(extrafont)


# Load extra fonts: import the TrueType fonts found in /tmp/.fonts/ and load them
ttf_import(paths = "/tmp/.fonts/")
loadfonts()

# Change theme: light base theme without minor grid lines, dark-grey Arial text and lines,
# nearly transparent legend background and no legend key boxes
customTheme <- theme_light() + 
               theme(panel.grid.minor=element_blank(), text=element_text(size=17, family="Arial", colour = "#333333"),
                     line=element_line(colour = "#333333"), 
                     legend.background = element_rect(fill=alpha('#CCCCCC', 0.1)), legend.key = element_blank())

# Change default colors
# Continuous colour scale: overrides the ggplot2 default so that the "plasma" viridis
# palette (reversed, restricted to [0.1, 0.9]) is used unless type = "gradient" is requested.
scale_colour_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(type, gradient = scale_colour_gradient(...), 
        viridis = scale_colour_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))
}
scale_color_continuous <- scale_colour_continuous

# Continuous fill scale: same defaults as the colour scale above.
# The scale type is read from the fill-specific option (ggplot2.continuous.fill), which is
# the option ggplot2 itself consults for fill scales, rather than from the colour option.
scale_fill_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.fill", default = "viridis")) {
    switch(type, gradient = scale_fill_gradient(...), 
        viridis = scale_fill_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))
}

# Discrete palette interpolated between the seven colours listed below
cemm_pal <- colorRampPalette(c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00"))

# Discrete fill scale: type = "CeMM" uses cemm_pal, any other type falls back to the hue palette.
# hue_pal is qualified with scales:: because the scales package is never attached here
# (the colour variant below already qualifies it the same way).
scale_fill_discrete <- function (..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "fill") 
{
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else {
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, 
            direction), na.value = na.value, ...)
    }
}

# Discrete colour scale: same logic as scale_fill_discrete, for the "colour" aesthetic
scale_color_discrete <- function (..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "colour") {
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else {
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, 
            direction), na.value = na.value, ...)
    }
}
scale_colour_discrete <- scale_color_discrete

# Theme add-on: no major grid, smaller axis labels, thin axis lines, no panel border.
# Extra arguments are forwarded to theme().
noGridTheme <- function(...){
    theme(panel.grid.major=element_blank(), axis.text.x=element_text(size=12), axis.text.y=element_text(size=12),
                      axis.line=element_line(color="#333333", size = 0.2), panel.border = element_blank(), ...)
}

# Theme add-on: dark panel and plot background with light-grey text and lines.
# Extra arguments are forwarded to theme(), as in noGridTheme (they used to be dropped).
darkTheme <- function(...){
    theme(panel.background = element_rect(fill = '#333333'), plot.background = element_rect(fill = '#333333'), 
          axis.line=element_line(color="#CCCCCC", size = 0.2), 
          text=element_text(size=17, family="Arial", colour = "#CCCCCC"),
          line=element_line(colour = "#CCCCCC"), ...)
}

# Make customTheme the default theme for every subsequent plot
theme_set(customTheme)

# Default size of plots rendered in the notebook
options(repr.plot.width=10, repr.plot.height=10)
"""

## Loading

Load generated datasets and combine them

In [None]:
# Read the four generated record sets; the CSV files have no header row
R = CSV.read("Data/matR.csv"; header = false)   # reference records
N = CSV.read("Data/matN.csv"; header = false)   # negative control records
PS = CSV.read("Data/matPS.csv"; header = false) # shifted records
PR = CSV.read("Data/matPR.csv"; header = false) # rescaled records
1 # dummy last value so that the notebook does not print the last table

In [None]:
# Stack the four record sets into a single table, in the order R, N, PS, PR
dataset = vcat(R, N, PS, PR)

In [None]:
# One label per record, in the same R, N, PS, PR order used to build the combined table,
# so that the origin of every row stays known
origDataset = vcat(
    fill("Reference", size(R, 1)),
    fill("Negative control", size(N, 1)),
    fill("Shifted control", size(PS, 1)),
    fill("Rescaled control", size(PR, 1)),
)

## Feature filtering

In [None]:
# Correlation threshold passed to `decorrelate` when filtering features
FILT_MAX_CORR = 0.6 # Keep uncorrelated variables
# Number of output dimensions of the UMAP embedding used for the distance computations
dimUMAP = 5

In [None]:
# Center and scale on control values: every feature is shifted by the median and divided
# by the MAD of its reference records.
# `normalize = true` is passed explicitly so that this MAD is the same estimator as the one
# used to rank the features afterwards, instead of depending on the package default.
transfNorm(x, y) = (x .- median(y)) ./ mad(y, normalize = true)
# Boolean mask of the records labelled "Reference"
indRef = origDataset .== "Reference"
normDataset = DataFrame(map(x -> transfNorm(x, x[indRef]), eachcol(dataset)))
1 # dummy last value so that the notebook does not print the table

In [None]:
# Order features from largest MAD to smallest MAD.
# The features were scaled on the MAD of the reference records, so this ranks them by how
# much more variable they are across all conditions than within the reference.
# `orderFt` holds the column indices in that order (sortperm with rev = true).
orderFt = sortperm(convert(Array, map(x -> mad(x, normalize = true), eachcol(normDataset))), rev=true)

In [None]:
"""
    decorrelate(data; orderCol = nothing, threshold = 0.8)

Return the indices of the columns of `data` that are never pairwise-correlated at or above
`threshold` (in absolute value), prioritizing columns in the given order `orderCol`.

Columns are visited in priority order: the first remaining candidate is kept, every other
candidate whose absolute correlation with it is not strictly below `threshold` is
discarded, and the process repeats on what is left.

# Arguments
- `data`: table or matrix supporting `size(data, 2)` and column access `data[:, j]`.

# Keywords
- `orderCol`: integer column indices in decreasing priority. Defaults to left to right
  over all columns of `data`. The collection is copied and never modified.
- `threshold`: a candidate survives only if `abs(cor(kept, candidate)) < threshold`.

# Returns
A `Vector{Int}` with the kept column indices, in the order in which they were selected.
"""
function decorrelate(data; orderCol = nothing, threshold = 0.8)
    # Work on a private copy: the caller's ordering must not be consumed, and ranges
    # (including the default one) do not support `popfirst!`
    candidates = isnothing(orderCol) ? collect(1:size(data, 2)) : collect(Int, orderCol)
    kept = Int[]
    while !isempty(candidates)
        refFt = popfirst!(candidates)
        push!(kept, refFt)
        refCol = data[:, refFt]
        # A NaN correlation (e.g. constant column) fails the comparison and is discarded
        filter!(ft -> abs(cor(refCol, data[:, ft])) < threshold, candidates)
    end
    return kept
end

In [None]:
# Select the feature columns to keep, giving priority to the features ranked first in
# `orderFt` and using FILT_MAX_CORR as the correlation threshold
uncorrFt = decorrelate(normDataset, orderCol = orderFt, threshold = FILT_MAX_CORR)
# Keep only the selected columns. NOTE: this overwrites `normDataset`, so re-running this
# cell filters the already filtered table.
normDataset = normDataset[uncorrFt]

In [None]:
# Visualization
# Seed the Julia RNG before running UMAP
Random.seed!(3895)
# 2-dimensional embedding; the data matrix is transposed before being passed to `umap`
# (presumably because it expects one record per column -- TODO confirm)
umND = umap(convert(Matrix, normDataset)', 2; min_dist = 0.01, n_epochs = 200)
# Transpose back and wrap the embedding in a DataFrame with two named columns
umND = convert(DataFrame, umND')
names!(umND, [:UMAP1, :UMAP2])

# Attach the origin label of every record, used to color the points
umND[:Condition] = origDataset

# Scatter plot of the embedding colored by condition; printed and saved as a PDF
gp = ggplot(umND, aes(:UMAP1, :UMAP2)) + geom_point(aes(color = :Condition), alpha = 0.3)
print(gp)
ggsave("Fig/UMAP_test.pdf", gp)
1

In [None]:
"""
    mahalanobis(x, µ, S)

Return the squared Mahalanobis distance of the row `x` for the covariance estimator `S`
and the center `µ`, i.e. `(x - µ)' * inv(S) * (x - µ)`.

# Arguments
- `x`: one row of a data frame, converted to a vector of values.
- `µ`: center vector, with one entry per column of `x`.
- `S`: square covariance matrix matching the length of `µ`.
"""
function mahalanobis(x::DataFrameRow, µ::AbstractVector{<:Real}, S::AbstractMatrix{<:Real})
    # Deviation from the center, computed once
    delta = convert(Vector, x) - µ
    # Solve the linear system instead of forming the explicit inverse of S
    return delta' * (S \ delta)
end

In [None]:
"""
    RMD(data, iPert, iRef)

Compute the median Robust Mahalanobis Distance (RMD) in a dataset `data` for a given
perturbation of indices `iPert` compared to a reference of indices `iRef`.
See https://e-archivo.uc3m.es/bitstream/handle/10016/24613/ws201710.pdf

The center and covariance are estimated on the reference rows with the Minimum Covariance
Determinant (`covMcd`, called in R with a fixed seed); the result is the median of the
squared Mahalanobis distances of the perturbation rows to that estimate.

# Arguments
- `data`: table with one record per row.
- `iPert`: row selector of the perturbation records.
- `iRef`: row selector of the reference records.

# Returns
The median distance, or `missing` when either group has fewer rows than twice the number
of columns of `data`.
"""
function RMD(data, iPert, iRef)
    setPert = data[iPert, :]
    setRef = data[iRef, :]

    # Ensure that we have enough points to compute the distance.
    # NB: having fewer points than twice the number of dimensions leads to singularity
    minPoints = 2 * size(data, 2)
    if size(setPert, 1) < minPoints || size(setRef, 1) < minPoints
        return missing
    end

    # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance
    @rput setRef

    R"""
    set.seed(3895)
    mcd <- covMcd(setRef)
    mcdCenter <- mcd$center
    mcdCov <- mcd$cov
    """
    @rget mcdCenter
    @rget mcdCov

    distances = map(x -> mahalanobis(x, mcdCenter, mcdCov), eachrow(setPert))
    return median(distances)
end

In [None]:
"""
    shuffRMD(data, iPert, iRef; nbRep = 250)

Permute labels and compute the median Robust Mahalanobis Distance (RMD) in a dataset
`data` for a given perturbation of indices `iPert` compared to a reference of indices
`iRef`, to create an empirical distribution.

For each of the `nbRep` repetitions, the reference and perturbation rows are pooled and
shuffled, then split again into two groups of the original sizes; the median distance is
computed on these shuffled groups exactly as for the observed ones.

# Arguments
- `data`: table with one record per row.
- `iPert`: row selector of the perturbation records.
- `iRef`: row selector of the reference records.

# Keywords
- `nbRep`: number of label permutations.

# Returns
A vector of `nbRep` median distances, or `nbRep` `missing` values when either group has
fewer rows than twice the number of columns of `data`.
"""
function shuffRMD(data, iPert, iRef; nbRep = 250)
    setPert = data[iPert, :]
    setRef = data[iRef, :]

    # Ensure that we have enough points to compute the distance.
    # NB: having fewer points than twice the number of dimensions leads to singularity
    minPoints = 2 * size(data, 2)
    if size(setPert, 1) < minPoints || size(setRef, 1) < minPoints
        return fill(missing, nbRep)
    end

    pooled = vcat(setRef, setPert)
    nPert = nrow(setPert)
    nRef = nrow(setRef)

    # One permutation: shuffle the pooled rows and recompute the median distance
    function iterShufRMD()
        shuffSet = pooled[sample(1:nrow(pooled), nrow(pooled); replace = false), :]
        # Take random subsets of corresponding sizes
        shuffSetPert = shuffSet[1:nPert, :]
        shuffSetRef = shuffSet[(nPert + 1):(nPert + nRef), :]

        # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance
        @rput shuffSetRef

        R"""
        set.seed(3895)
        mcd <- covMcd(shuffSetRef)
        mcdCenter <- mcd$center
        mcdCov <- mcd$cov
        """
        @rget mcdCenter
        @rget mcdCov

        distances = map(x -> mahalanobis(x, mcdCenter, mcdCov), eachrow(shuffSetPert))
        return median(distances)
    end

    return map(_ -> iterShufRMD(), 1:nbRep)
end

In [None]:
# Re-run UMAP with more dimensions (to preserve more of the total information)
# Same seed as for the 2-dimensional embedding
Random.seed!(3895)
umND = umap(convert(Matrix, normDataset)', dimUMAP; min_dist = 0.01, n_epochs = 200)
# Transpose back and name the columns UMAP1 ... UMAP<dimUMAP>
umND = convert(DataFrame, umND')
names!(umND, Symbol.(string.("UMAP", 1:dimUMAP)))

# Attach the origin label of every record (overwrites the 2-dimensional `umND`)
umND[:Condition] = origDataset

In [None]:
# Observed median RMD of every condition against the "Reference" records, computed on the
# UMAP coordinates only (the Condition column is excluded); one value per unique condition
allRMD = map(x -> RMD(umND[:,1:dimUMAP], umND.Condition.==x, umND.Condition.=="Reference"), unique(umND.Condition))

In [None]:
# Shuffled-label RMD values for the same conditions, in the same order as `allRMD`
# (250 permutations per condition); the call is timed
@time allShuffRMD = map(x -> shuffRMD(umND[:,1:dimUMAP], umND.Condition .== x, umND.Condition .== "Reference", nbRep = 250), 
    unique(umND.Condition))

In [None]:
# Missing values need to be handled in real case applications
# (only the observed values in `allRMD` are checked here)
@assert !any(ismissing.(allRMD))

In [None]:
# Compute the Robust Morphological Perturbation Value
# For each condition, take the fraction of shuffled RMD values strictly greater than the
# observed RMD, then adjust these fractions across conditions with Benjamini-Hochberg.
plateRMPV = DataFrame()
plateRMPV[:RMPV] = adjust([mean(obs .< sim) for (obs, sim) 
            in zip(allRMD, allShuffRMD)], BenjaminiHochberg())
plateRMPV[:RMD] = allRMD
# Same `unique(umND.Condition)` call as the one used to build `allRMD`, so rows line up
plateRMPV[:Condition] = unique(umND.Condition)

In [None]:
# Display the shuffled-label RMD values of every condition
allShuffRMD

In [None]:
# Display the per-condition table of RMPV and RMD
plateRMPV

In [None]:
# Plot RMD against RMPV for every condition, with a dashed vertical line at RMPV = 0.05;
# the plot is printed and saved as a PDF
gp = ggplot(plateRMPV) + geom_point(aes(:RMPV, :RMD, color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed")
print(gp)
ggsave("Fig/RMPV_test.pdf", gp)

In [None]:
# PCA of the normalized, decorrelated features; the matrix is transposed before fitting
# (presumably one record per column -- TODO confirm). `pratio = 0.8` presumably keeps the
# components needed to reach 80% of the variance -- verify against MultivariateStats docs.
modelPCA = fit(PCA, convert(Matrix, normDataset)'; pratio = 0.8)
# Project the records on the principal components and transpose back into a DataFrame
pcaND = transform(modelPCA, convert(Matrix, normDataset)')
pcaND = convert(DataFrame, pcaND')
# Scale by importance of each principal component
# (each column is multiplied by the matching entry of `principalvars(modelPCA)`)
pcaND = DataFrame(principalvars(modelPCA) .* eachcol(pcaND))
names!(pcaND, Symbol.(string.("PC", 1:outdim(modelPCA))))

# Attach the origin label of every record, used to color the points
pcaND[:Condition] = origDataset

# Scatter plot of the first two scaled components; printed and saved as a PDF
gp = ggplot(pcaND, aes(:PC1, :PC2)) + geom_point(aes(color = :Condition), alpha = 0.3)
print(gp)
ggsave("Fig/PCA_test.pdf", gp)
1