In [None]:
# Install BioProfiling.jl from its GitHub repository (revision "renaming").
# Only needed when the package is not yet available in the active environment.
import Pkg
bioprofilingSpec = Pkg.PackageSpec(url = "https://github.com/menchelab/BioProfiling.jl.git", rev = "renaming")
Pkg.add(bioprofilingSpec)

In [None]:
using CSV, StatsBase, Statistics, DataFrames, RCall, FreqTables
using MultipleTesting, Random, MultivariateStats, Distributed
using ParallelDataTransfer
using BioProfiling, UMAP

In [None]:
# Display the current date and time (handy to time the notebook run)
using Dates: now
now()

In [None]:
# Set the number of processes to use for parallel computing
# (16 worker processes are added to the current session)
addprocs(16)
# Pool over all available workers; it is passed later as `process_pool`
# to the RMPV computation
pool = CachingPool(workers())

## R Calls

In [None]:
@rlibrary ggplot2
@rlibrary extrafont
@rlibrary viridis
@rlibrary heatmaply
@rlibrary ggrepel

In [None]:
R"""
# Used later for MCD computation

library(robustbase)

# Customize ggplot appearance

library(ggplot2)
library(extrafont)


# Load extra fonts
# ttf_import("/tmp/.fonts")
# loadfonts()

# Change theme
# Base look shared by all figures: light theme without minor grid lines,
# dark-grey Arial text and lines, faint grey legend background, no key boxes.
customTheme <- theme_light() +
  theme(
    panel.grid.minor = element_blank(),
    text = element_text(size = 17, family = "Arial", colour = "#333333"),
    line = element_line(colour = "#333333"),
    legend.background = element_rect(fill = alpha("#CCCCCC", 0.1)),
    legend.key = element_blank()
  )

# Change default colors
# Continuous colour scale defaulting to the viridis "plasma" palette, trimmed
# to [begin, end] and reversed. `type` is read from the
# "ggplot2.continuous.colour" option ("viridis" when unset): "gradient"
# delegates to scale_colour_gradient(), any other value raises an error.
scale_colour_continuous <- function(..., begin = 0.1, end = 0.9, direction = -1, option = "plasma",
                                    type = getOption("ggplot2.continuous.colour", default = "viridis")) {
  switch(
    type,
    gradient = scale_colour_gradient(...),
    viridis = scale_colour_viridis_c(
      option = option, begin = begin, end = end, direction = direction, ...
    ),
    stop("Unknown scale type", call. = FALSE)
  )
}
# Alias for the alternative spelling
scale_color_continuous <- scale_colour_continuous

# Continuous fill scale defaulting to the viridis "plasma" palette, trimmed to
# [begin, end] and reversed. `type` is read from the "ggplot2.continuous.fill"
# option ("viridis" when unset): "gradient" delegates to scale_fill_gradient(),
# any other value raises an error.
# The fill scale reads its own option rather than the colour one, so that
# fill and colour defaults can be configured independently.
scale_fill_continuous <- function(..., begin = 0.1, end = 0.9, direction = -1, option = "plasma",
                                  type = getOption("ggplot2.continuous.fill", default = "viridis")) {
  switch(
    type,
    gradient = scale_fill_gradient(...),
    viridis = scale_fill_viridis_c(
      option = option, begin = begin, end = end, direction = direction, ...
    ),
    stop("Unknown scale type", call. = FALSE)
  )
}

# Palette function interpolating between the seven anchor colours below;
# calling cemm_pal(n) returns n colours.
cemm_pal <- colorRampPalette(c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00"))

# Discrete fill scale. With type = "CeMM" (the default) the colours come from
# cemm_pal; any other type builds a hue palette from h, c, l, h.start and
# direction. na.value and ... are forwarded to discrete_scale().
scale_fill_discrete <- function(..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0,
                                direction = 1, na.value = "grey50", aesthetics = "fill") {
  if (type == "CeMM") {
    discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
  } else {
    # hue_pal is namespace-qualified: the scales package is not attached by
    # this chunk, so a bare hue_pal() call would not be found.
    discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, direction),
                   na.value = na.value, ...)
  }
}

# Discrete colour scale. type = "CeMM" (the default) takes its colours from
# cemm_pal; any other type builds a hue palette from h, c, l, h.start and
# direction. na.value and ... are forwarded to discrete_scale().
scale_color_discrete <- function(..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0,
                                 direction = 1, na.value = "grey50", aesthetics = "colour") {
  if (type != "CeMM") {
    return(discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, direction),
                          na.value = na.value, ...))
  }
  discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
}
# Alias for the alternative spelling
scale_colour_discrete <- scale_color_discrete

# Theme add-on without major grid lines or panel border: thin dark axis lines
# and size-12 axis labels. Extra arguments are passed on to theme().
noGridTheme <- function(...) {
  theme(
    panel.grid.major = element_blank(),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.line = element_line(color = "#333333", size = 0.2),
    panel.border = element_blank(),
    ...
  )
}

# Theme add-on for dark figures: dark-grey panel and plot backgrounds with
# light-grey text, lines and axis lines.
# Extra arguments are forwarded to theme() (as noGridTheme does) instead of
# being silently discarded.
darkTheme <- function(...) {
  theme(
    panel.background = element_rect(fill = "#333333"),
    plot.background = element_rect(fill = "#333333"),
    axis.line = element_line(color = "#CCCCCC", size = 0.2),
    text = element_text(size = 17, family = "Arial", colour = "#CCCCCC"),
    line = element_line(colour = "#CCCCCC"),
    ...
  )
}

# Make customTheme the default theme for the plots created afterwards
theme_set(customTheme)

# Set the repr plot width and height options to 10
# (presumably inches, used when figures are rendered in the notebook - confirm)
options(repr.plot.width=10, repr.plot.height=10)
"""

## Load measurements

### Load annotations

In [None]:
# Read the tab-separated transfer list (first line holds the column names)
annotations = CSV.read("data/transferList.txt", DataFrame, header = 1, delim = "\t")
# Plate numbers are identifiers, not quantities: store them as strings
annotations.DestinationPlate = string.(annotations.DestinationPlate)
# Wells listed in the transfer list without a compound name only contain DMSO
replace!(annotations.CompoundName, missing => "DMSO")
# A stray non-ASCII byte (0xB1) needs to be converted to a proper "±"
annotations.CompoundName = replace.(annotations.CompoundName, "\xb1" => s"±");

In [None]:
# Display the annotation table
annotations

### Load image data

In [None]:
# Load the image-level measurements
image = CSV.read("data/compiled_PilotDrugScreen_Image_750.csv", DataFrame)
# Peek at the first eight column names
println(string.(names(image)[1:8]))
# Number of images and features available (rows, then columns)
foreach(println, size(image))

## Coarse-grain aggregation

In [None]:
# Load the pre-computed aggregated measurements
# (the trailing `;` suppresses the cell output)
aggregatedData = CSV.read("data/aggregatedData_750cells_noSparse.csv", DataFrame);

### Transform aggregated data - Normalization
We want to focus on variables that change more across the whole dataset than within the reference condition (DMSO controls).

In [None]:
# Wrap the aggregated table in a BioProfiling `Experiment`, the object on which
# the feature selectors and transformations below operate
expAgg = Experiment(aggregatedData, description = "Median values for aggregated FOV measurements")

In [None]:
# Column-name fragments identifying metadata (non-feature) columns
strToRemove = ["Metadata_Well", "CompoundName", "Metadata_Field", "Metadata_Row", "Metadata_Column"]

# Feature selectors, applied in this order
filters = BioProfiling.AbstractSelector[
    # Remove metadata
    NameSelector(name -> all(fragment -> !occursin(fragment, String(name)), strToRemove)),
    # Remove constant columns
    Selector(x -> mad(x, normalize = true) != 0, description = "Remove constant features"),
    # Remove columns that are constant within the DMSO reference
    Selector(x -> mad(x, normalize = true) != 0,
             subset = x -> x.CompoundName .== "DMSO",
             description = "Remove features constant for reference"),
]
select_features!(expAgg, filters)

In [None]:
# Work on a deep copy so that `expAgg` keeps the untransformed values
expTransformed = deepcopy(expAgg)
# Apply BioProfiling's log transformation to the copy
logtransform!(expTransformed)
# Update the description to reflect the transformation
expTransformed.description = "Transformed values for aggregated FOV measurements"

Here we apply a correction based on the specific details of the experimental design:
All rows and columns include DMSO (negative) controls and we normalize all values based on these matching controls (same row or same column).  

This exemplifies how to directly modify the data of an `Experiment` object.  

NB: One might want to check that more iterations are not needed (cf. Median-polish method).

In [None]:
# Normalize on matching DMSO wells median values:
# each entry is centered on the median and scaled by the MAD of the DMSO
# entries that share its plate row or its plate column.

# Entries in both data frames are matching
@assert nrow(aggregatedData) == nrow(getdata(expTransformed))

# Copy data before correction, so that the reference statistics are always
# computed on uncorrected values
ndf = getdata(expTransformed)

# The DMSO mask does not depend on the entry: compute it once
isDMSO = aggregatedData.CompoundName .== "DMSO"

for (i, (fx, fy)) in enumerate(zip(aggregatedData.Metadata_Row, aggregatedData.Metadata_Column))
    # DMSO entries located in the same row or in the same column as entry i
    controls = isDMSO .& ((aggregatedData.Metadata_Row .== fx) .| (aggregatedData.Metadata_Column .== fy))
    @assert any(controls)
    reference = ndf[controls, :]
    expTransformed.data[i:i, expTransformed.selected_features] .-=
        mapcols(median, reference)
    # `normalize` is set explicitly (as in the feature selectors) rather than
    # relying on the deprecated implicit default of `mad`
    expTransformed.data[i:i, expTransformed.selected_features] ./=
        mapcols(x -> mad(x, normalize = true), reference)
end

# NOTE(review): a zero MAD can also yield NaN (0/0) when the value equals the
# reference median; such values are not caught by `isinf` - confirm whether
# they should be filtered as well.
select_features!(expTransformed,
                 Selector(x -> !any(isinf.(x)), 
                        description = "Remove features with infinite values " *
                            "(i.e. with no variation for a subset of the DMSO images)"));

decorrelate_by_mad!(expTransformed);

## Visualization

In [None]:
using Distances

In [None]:
# Seed the random number generator before computing the UMAP embedding
Random.seed!(3895)
umTPM = umap(expTransformed, metric = CosineDist(), min_dist = 2)
# Transpose so that each UMAP dimension becomes a column
# NOTE(review): `convert(DataFrame, matrix)` may be unavailable in recent
# DataFrames releases - confirm against the version in use.
umTPM = convert(DataFrame, umTPM')
rename!(umTPM, [:UMAP1, :UMAP2]);
umTPM.Compound = aggregatedData.CompoundName

subsetCompounds = ["Vinblastine", "Pentamidine", "Furamidine", "DMSO"]
# Keep every entry whose compound name contains at least one of the patterns.
# Unlike converting a sum of masks to Bool, this also works when a name
# matches several patterns.
subsetEntries = [any(occursin(pattern, compound) for pattern in subsetCompounds)
                 for compound in umTPM.Compound]
ggplot(umTPM[subsetEntries,:], aes(:UMAP1, :UMAP2)) + 
    geom_point(aes(color = :Compound), alpha = 0.8) +
    coord_fixed() + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) + 
    guides(color=guide_legend(nrow=3,byrow=true))

## Distance to DMSO

### UMAP-based distance

In [None]:
# Four-dimensional UMAP embedding (same seed as for the 2D projection),
# stored with one column per dimension plus the compound name of each entry
Random.seed!(3895)
umTPM = convert(DataFrame, umap(expTransformed, 4, metric = CosineDist(), min_dist = 2)')
rename!(umTPM, Symbol.("UMAP", 1:4))
umTPM.Compound = aggregatedData.CompoundName;

In [None]:
# Wrap the UMAP coordinates (and compound names) in their own `Experiment`
expUMAP = Experiment(umTPM, description = "UMAP projection of profiling data")

In [None]:
filters = Array{BioProfiling.AbstractReduce,1}()
# Remove (categorical) compound column from analysis
# (`String(x)` keeps the comparison valid whether names are strings or symbols)
push!(filters, NameSelector(x -> String(x) != "Compound"))
# Remove entries for compounds not present often enough (fewer than 8 entries).
# Names are read from the frequency table itself, so that names and counts
# cannot get misaligned.
cmpd_counts = freqtable(umTPM.Compound)
cmpd_to_keep = names(cmpd_counts, 1)[cmpd_counts .>= 8]
compare_in(x,y) = x in y
# Broadcasting `compare_in` tests each entry against the whole collection `y`
Broadcast.broadcasted(::typeof(compare_in), x, y) = broadcast(in, x, Ref(y))
push!(filters, Filter(cmpd_to_keep, :Compound, compare = compare_in))
# Apply filters
filter!(expUMAP, filters)
expUMAP

NB: because the `compare` function of Filter `f` is applied as f.compare.(entries, f.value)
the value provided needs to be of the length of the entries or of length 1.  
Otherwise, when broadcasting (calling `f.compare` on all elements with `f.compare.`), we need to specify that the `f.value` should be used "as is". This is done by overloading the broadcasting of the function. See:   
https://discourse.julialang.org/t/how-to-broadcast-over-only-certain-function-arguments/19274/5  
Another "trick" is to use a function that takes a length 1 value that is in practice not used:  
```julia
push!(filters, Filter("NotUsed", :Compound, compare = (x,y) -> (x in cmpd_to_keep)))
```

In [None]:
# Compound names of the entries kept after filtering.
# The values computed below are displayed in the following order
# (NOTE(review): "RMD" in the original comment presumably refers to the RMPV results - confirm):
selectedCompounds = expUMAP.data[expUMAP.selected_entries, :Compound]
levels(selectedCompounds)

### Optimize rmpv runtime

Note: with previous versions of Julia, `pmap` was slower than using SharedArrays and `@sync @distributed for` loops. This is no longer the case with Julia 1.5, so we use `pmap` for the sake of readability.

In [None]:
# Timestamp displayed right before the RMPV computation
now()

Takes ~1h20min with 16 cores.

In [None]:
# Load BioProfiling on every process so that the workers of `pool` can use it
@everywhere using BioProfiling
# Robust morphological perturbation value of each compound (grouped by the
# :Compound column) against the "DMSO" reference, using the :RobHellinger
# distance and the worker pool created earlier.
# `nb_rep = 5000` is presumably the number of repetitions used to derive the
# empirical p-values - see the BioProfiling documentation.
plateRMPV = robust_morphological_perturbation_value(expUMAP, :Compound, "DMSO"; 
    nb_rep = 5000, dist = :RobHellinger, process_pool = pool)

In [None]:
# Timestamp displayed right after the RMPV computation
now()

In [None]:
# Missing values might need to be handled:
# stop here if any RMPV could not be computed
@assert !any(ismissing.(plateRMPV.RMPV))

In [None]:
# Export to avoid recomputing if not necessary
# (the results can be reloaded from this file instead of rerunning the RMPV computation)
CSV.write("data/RMPV.csv", plateRMPV);

In [None]:
# Reload the exported RMPV results (entry point when skipping the computation above)
plateRMPV = CSV.read("data/RMPV.csv", DataFrame)

In [None]:
# Compounds to annotate on the plot; every other condition gets an empty label
cpd_to_label = [
    "DMSO", "Vinblastine", "Pentamidine", "Wiskostatin",
    "Gefitinib", "Imatinib", "Hydroxychloroquine",
]
label_or_blank(condition) = condition in cpd_to_label ? condition : ""
plateRMPV.Label = [label_or_blank(condition) for condition in plateRMPV.Condition];

In [None]:
# Scatter plot of the robust Hellinger distance against the RMPV
# (FDR-corrected p-value) of each condition, with the 0.05 threshold shown as
# a dashed line and the selected compounds labelled.
# The colour legend is hidden with "none", which replaces the deprecated
# `false` value in `guides`.
gp = ggplot(plateRMPV, aes(x = :RMPV, y = :Distance)) + geom_point(aes(color = :Condition)) + 
    geom_vline(xintercept = 0.05, linetype = "dashed") +
    geom_label_repel(aes(label = :Label), size = 5) + 
    guides(color = "none") +
    xlab("FDR-corrected p-value") +
    ylab("Robust Hellinger Distance") +
    RObject(nothing)
print(gp);

In [None]:
# Make sure the output directory exists, then export the annotated plot as PDF
mkpath("fig")
ggsave("fig/UMAP_RMPV_test_hellinger_annotated.pdf", gp);

With this method, we find 232 strong hits (FDR .05) or 248 (FDR .1).

In [None]:
# Print, one per line, the conditions with an RMPV below 0.1
foreach(println, plateRMPV[plateRMPV.RMPV .< 0.1, :Condition]);

In [None]:
# Print, one per line, the conditions with an RMPV below 0.05
foreach(println, plateRMPV[plateRMPV.RMPV .< 0.05, :Condition]);

## Display tops and flops

In [None]:
# The 10 conditions with the smallest distance to the reference
distanceOrder = sortperm(plateRMPV.Distance)
plateRMPV.Condition[distanceOrder[1:10]]

In [None]:
# The 10 conditions with the largest distance to the reference (ascending order)
distanceOrder = sortperm(plateRMPV.Distance)
plateRMPV.Condition[distanceOrder[(end-9):end]]

In [None]:
# Recompute the two-dimensional UMAP embedding (same seed as before) and
# store it with one column per dimension plus the compound name of each entry
Random.seed!(3895)
umTPM = convert(DataFrame, umap(expTransformed, metric = CosineDist(), min_dist = 2)')
rename!(umTPM, Symbol.("UMAP", 1:2));
umTPM.Compound = aggregatedData.CompoundName;

In [None]:
# UMAP view restricted to a few compounds of interest and the DMSO controls
subsetCompounds = ["Iodoacetamide", "Ellipticine", "Loperamide", "DMSO", "ML324"]
# Keep every entry whose compound name contains at least one of the patterns.
# Unlike converting a sum of masks to Bool, this also works when a name
# matches several patterns.
subsetEntries = [any(occursin(pattern, compound) for pattern in subsetCompounds)
                 for compound in umTPM.Compound]
ggplot(umTPM[subsetEntries,:], aes(:UMAP1, :UMAP2)) + 
    geom_point(aes(color = :Compound), alpha = 0.8) +
    coord_fixed() + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) + 
    guides(color=guide_legend(nrow=3,byrow=true))

In [None]:
# UMAP view restricted to a few compounds of interest and the DMSO controls
subsetCompounds = ["Mitoxantrone", "U-62066", "5HPP-33", "DMSO", "Dipyridamole"]
# Keep every entry whose compound name contains at least one of the patterns.
# Unlike converting a sum of masks to Bool, this also works when a name
# matches several patterns.
subsetEntries = [any(occursin(pattern, compound) for pattern in subsetCompounds)
                 for compound in umTPM.Compound]
ggplot(umTPM[subsetEntries,:], aes(:UMAP1, :UMAP2)) + 
    geom_point(aes(color = :Compound), alpha = 0.8) +
    coord_fixed() + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) + 
    guides(color=guide_legend(nrow=3,byrow=true))