In [None]:
using CSV, StatsBase, Statistics, DataFrames, UMAP, RCall, FreqTables
using MultipleTesting, Random, MultivariateStats, Distributed, CategoricalArrays
using HTTP, JSON
using RMP

In [None]:
using Dates: now
now()

## R Calls

In [None]:
@rlibrary ggplot2
@rlibrary extrafont
@rlibrary viridis
@rlibrary heatmaply
@rlibrary ggrepel

In [None]:
R"""
# Used later for MCD computation

library(robustbase)

# Customize ggplot appearance

library(ggplot2)
library(extrafont)


# Load extra fonts
ttf_import("/tmp/.fonts")
loadfonts()

# Change theme
customTheme <- theme_light() +
               theme(panel.grid.minor=element_blank(), text=element_text(size=17, family="Arial", colour = "#333333"),
                     line=element_line(colour = "#333333"),
                     legend.background = element_rect(fill=alpha('#CCCCCC', 0.1)), legend.key = element_blank())

# Change default colors
# Continuous colour scale: viridis "plasma" (trimmed to [begin, end], reversed) unless
# the `ggplot2.continuous.colour` option asks for "gradient".
scale_colour_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma",
                                     type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(type, gradient = scale_colour_gradient(...),
        viridis = scale_colour_viridis_c(option = option, begin = begin, end = end, direction = direction, ...),
        stop("Unknown scale type", call. = FALSE))
}
scale_color_continuous <- scale_colour_continuous

# Continuous fill scale, same defaults as the colour scale above.
# The scale type is read from `ggplot2.continuous.fill` (the option ggplot2 itself uses
# for fills); the colour option is kept as a fallback so existing settings still apply.
scale_fill_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma",
                                     type = getOption("ggplot2.continuous.fill",
                                                      default = getOption("ggplot2.continuous.colour", default = "viridis"))) {
    switch(type, gradient = scale_fill_gradient(...),
        viridis = scale_fill_viridis_c(option = option, begin = begin, end = end, direction = direction, ...),
        stop("Unknown scale type", call. = FALSE))
}

# Discrete palette interpolated between the CeMM colours
cemm_pal = colorRampPalette(c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00"))

# Discrete fill scale: CeMM palette by default, ggplot2-like hue palette for any other `type`.
scale_fill_discrete <- function (..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0,
    direction = 1, na.value = "grey50", aesthetics = "fill")
{
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else {
        # `scales` is not attached by this script, so the palette must be namespace-qualified
        # (as already done in scale_color_discrete below)
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start,
            direction), na.value = na.value, ...)
    }
}

# Discrete colour scale: CeMM palette by default, ggplot2-like hue palette for any other `type`.
scale_color_discrete <- function (..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0,
    direction = 1, na.value = "grey50", aesthetics = "colour") {
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else {
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start,
            direction), na.value = na.value, ...)
    }
}
scale_colour_discrete <- scale_color_discrete

# Theme add-on without major grid or panel border; extra arguments are passed to theme()
noGridTheme <- function(...){
    theme(panel.grid.major=element_blank(), axis.text.x=element_text(size=12), axis.text.y=element_text(size=12),
                      axis.line=element_line(color="#333333", size = 0.2), panel.border = element_blank(), ...)
}

# Dark-background theme add-on; extra arguments are passed to theme()
# (they used to be accepted but silently ignored)
darkTheme <- function(...){
    theme(panel.background = element_rect(fill = '#333333'), plot.background = element_rect(fill = '#333333'),
          axis.line=element_line(color="#CCCCCC", size = 0.2),
          text=element_text(size=17, family="Arial", colour = "#CCCCCC"),
          line=element_line(colour = "#CCCCCC"), ...)
}

theme_set(customTheme)

options(repr.plot.width=10, repr.plot.height=10)
"""

## Load pre-computed statistical distances to DMSO

In [None]:
RMPV = CSV.read("data/DrugScreen/RMPV.csv", DataFrame) 

In [None]:
RMPV.Condition[RMPV.RMPV .< 0.05]

## Get annotations

In [None]:
# Salt / counter-ion suffixes (each with its leading space) stripped from compound names
salt_to_remove = [
    " maleate",
    " hydrochloride",
    " nitrate",
    " dihydrochloride",
    " chloride",
    " sulfate",
    " hydrate",
    " mesylate",
    " oxalate",
    " salt",
    " from Penicillium brefeldianum",
    " monohydrate",
    " trifluoroacetate",
    " acetate",
    " isethionate",
    " hemisulfate",
    " angular",
    " sodium",
    " fumarate",
    " methanesulfonate",
    " hemihydrate",
    " (MW = 374.83)",
]

#= Other compounds might include salts but are anyway not found in the LINCS database at all:
    AC-93253 iodide
    N-p-Tosyl-L-phenylalanine chloromethyl ketone
    4-(2-Aminoethyl)benzenesulfonyl fluoride hydrochloride
    UNC0379 trifluoroacetate salt =#

# Substitution table: every salt suffix is deleted, remaining spaces become dashes
salt_dict = Dict(salt => s"" for salt in salt_to_remove)
push!(salt_dict, " " => s"-");

In [None]:
"""
This function takes the canonical name of a compound and extracts the mechanisms of action (MOA)
annotated in the LINCS perturbation database, when provided. Returns the list of annotated MOAs.
See https://clue.io/developer-resources#apisection
"""
function getMOA(cpd::String)
    # Query the clue.io `perts` endpoint for the "moa" field of compound `cpd`.
    # Returns whatever the API stores under "moa" for the first matching record, or the
    # empty tuple `()` when nothing could be retrieved (callers test `length(moa) > 0`).

    # NOTE(review): an API key committed to the notebook is a credential leak. It can now
    # be supplied through the CLUE_USER_KEY environment variable; the historical key is
    # only kept as a fallback so existing runs behave as before.
    user_key = get(ENV, "CLUE_USER_KEY", "3a73a242e38f9fb6375a1100354e4107")
    rootURL = "https://api.clue.io/api/perts?filter={\"where\":{\"pert_iname\":\""
    typeURL = "\"},\"fields\":{\"moa\":true}}&user_key="
    # NB: case-dependent. Dashes are handled. Spaces are usually replaced by dashes.

    # Every salt pattern starts with a space, so the " " => "-" rule must run last.
    # `salt_dict` is a Dict, whose iteration order is unspecified: applying the rules
    # in iteration order could replace the spaces first and leave the salts in place.
    cpd_no_salt = cpd
    for (pattern, substitute) in salt_dict
        pattern == " " && continue
        cpd_no_salt = replace(cpd_no_salt, pattern => substitute)
    end
    if haskey(salt_dict, " ")
        cpd_no_salt = replace(cpd_no_salt, " " => salt_dict[" "])
    end

    # The lookup is case-dependent: try the lowercase name first, then the uppercase one.
    for query in (lowercase(cpd_no_salt), uppercase(cpd_no_salt))
        try
            resRq = HTTP.get(rootURL * query * typeURL * user_key)
            hits = JSON.Parser.parse(String(resRq.body))
            # No record for this casing: move on to the next one
            isempty(hits) && continue
            moa = get(hits[1], "moa", nothing)
            moa === nothing || return moa
        catch e
            if isa(e, HTTP.ExceptionRequest.StatusError)
                # Report the status actually received (it used to always print 502)
                println(cpd * " raises a $(e.status) error.")
            else
                # Best effort: keep going with the other compounds, but leave a trace
                @warn "MOA lookup failed" compound = cpd exception = e
            end
            return ()
        end
    end

    println(cpd*" does not have an MOA annotation.")
    return ()
end

In [None]:
# Compound name => MOA annotation, for every compound that has one
MOA = Dict()
for cpd in RMPV.Condition
    moa = getMOA(cpd)
    # `getMOA` returns an empty collection when no annotation was retrieved
    isempty(moa) && continue
    MOA[cpd] = moa
end

In [None]:
# Compounds still without an annotation: start from all of them...
cpd_list = Set(RMPV.Condition)
# ...and drop every compound already present in `MOA`
for annotated in collect(keys(MOA))
    pop!(cpd_list, annotated)
end

In [None]:
# Second pass over the compounds that are still missing from `MOA`
for cpd in cpd_list
    moa = getMOA(cpd)
    isempty(moa) && continue
    MOA[cpd] = moa
end

In [None]:
length(MOA)

In [None]:
MOA

You can save the MOA dictionary for later use:
```julia
using HDF5, JLD
save("Data/MOA.jld", "data", MOA)
MOA = load("Data/MOA.jld")["data"]
```

In [None]:
using HDF5, JLD
MOA = load("Data/MOA.jld")["data"]

In [None]:
freqMOA = sort(freqtable(vcat(collect(values(MOA))...)), rev = true);

In [None]:
# Annotations of the hit compounds (RMPV below 0.1) that have an entry in `MOA`
hitMOA = Dict(
    cpd => MOA[cpd]
    for cpd in RMPV.Condition[RMPV.RMPV .< 0.1]
    if haskey(MOA, cpd)
)
# Number of hit compounds per MOA, most frequent first
freqHitMOA = sort(freqtable(vcat(values(hitMOA)...)), rev = true);

In [None]:
# Long-format table: each MOA annotating more than 3 compounds appears twice,
# first in the block of hit rows (Hits = true), then in the block of non-hit rows
dfMOA = let topMOA = names(freqMOA)[1][freqMOA .> 3]
    DataFrame(
        MOA = repeat(topMOA, 2),
        Hits = repeat([true, false], inner = length(topMOA)),
        Count = zeros(2 * length(topMOA)),
    )
end;

In [None]:
# Fill `Count`: number of hit compounds on the Hits rows, number of non-hit compounds
# (all compounds minus hits) on the other rows.
# The name => count lookups are built once here; they used to be rebuilt for every row.
allCounts = Dict(freqMOA)
hitCounts = Dict(freqHitMOA)
for row in eachrow(dfMOA)
    # An MOA absent from the hit table has no hit compound
    nHits = get(hitCounts, row.MOA, 0)
    row.Count = row.Hits ? nHits : allCounts[row.MOA] - nHits
end
# Ordered levels (reversed frequency order) fix the order of the bars in the plot
dfMOA.MOA = CategoricalArray{String,1}(dfMOA.MOA, levels = reverse(names(freqMOA)[1][freqMOA .> 3]), ordered=true);

In [None]:
gp = ggplot(dfMOA, aes(x = :MOA, y = :Count, fill = :Hits)) + 
     geom_bar(position="stack", stat="identity") + coord_flip() + 
     theme(var"legend.position"="bottom")

In [None]:
ggsave("Fig/HitEnrichment.pdf", gp)

## UMAP of top MOAs

In [None]:
cpdTopMOA = Set([k for (k,v) in MOA for m in v if m in names(freqMOA)[1][freqMOA .> 3]])

.
```
["Serotonin receptor antagonist"
"Serotonin reuptake inhibitor"
"Tubulin inhibitor"
```


```
@0.05 FDR
Norepinephrine reuptake inhibitor
Tubulin inhibitor
Calcium channel blocker
```


```
@0.1 FDR
Tubulin inhibitor
Dopamine receptor antagonist
Calcium channel blocker
```

Is there overlap in drugs? Overlap in morphological space? Map of the 9 Compounds.

In [None]:
freqHitMOA

### Load aggregated data

In [None]:
aggregatedData = CSV.read("data/aggregatedData_750cells.csv", DataFrame);

### Transform aggregated data - Normalization
We want to focus on variables that are changing more overall than inside of reference condition (untreated WT).

In [None]:
expAgg = Experiment(aggregatedData, description = "Median values for aggregated FOV measurements")

In [None]:
# Columns holding metadata rather than measurements
strToRemove = ["Metadata_Well", "CompoundName", "Metadata_Field", "Metadata_Row", "Metadata_Column"]

filters = RMP.AbstractSelector[
    # Remove metadata
    NameSelector(x -> !any(s -> occursin(s, String(x)), strToRemove)),
    # Remove constant columns
    Selector(x -> mad(x, normalize = true) != 0, description = "Remove constant features"),
    # Same criterion, evaluated on the DMSO entries only
    Selector(
        x -> mad(x, normalize = true) != 0,
        subset = x -> x.CompoundName .== "DMSO",
        description = "Remove features constant for reference",
    ),
]
selectFeaturesExperiment!(expAgg, filters)

In [None]:
# Work on a deep copy: `expAgg` is reused as the starting point of later analyses
expTransformed = deepcopy(expAgg)
# In-place transformation provided by RMP (presumably a log-like transform of the
# selected features; see the RMP documentation for the exact definition)
logtransform!(expTransformed)
expTransformed.description = "Transformed values for aggregated FOV measurements"

Here we apply a correction based on the specific details of the experimental design:
All rows and columns include DMSO (negative) controls and we normalize all values based on these matching controls (same row and column).  

This exemplifies how to directly modify the data of an `Experiment` object.  

NB: One might want to check that more iterations are not needed (cf. Median-polish method).

In [None]:
# Normalize on matching DMSO wells median values

# Entries in both data frames are matching
@assert nrow(aggregatedData) == nrow(getdata(expTransformed))

# Values before correction, used as reference for all the medians
# NOTE(review): relies on `getdata` returning a copy and not a view - confirm in RMP
ndf = getdata(expTransformed)

# Which entries are DMSO controls does not depend on the entry being corrected:
# compute the mask once instead of once per entry
isDMSO = aggregatedData.CompoundName .== "DMSO"

for (i, (fx, fy)) in enumerate(eachrow(aggregatedData[:, [:Metadata_Row, :Metadata_Column]]))
    # DMSO controls sharing the plate row OR the plate column of entry i
    ctrl = isDMSO .& ((aggregatedData.Metadata_Row .== fx) .| (aggregatedData.Metadata_Column .== fy))
    @assert any(ctrl)
    # Subtract the per-feature median of these controls from entry i
    expTransformed.data[i:i, expTransformed.selectedFeatures] .-= mapcols(median, ndf[ctrl, :])
end

In [None]:
# Membership test used as comparison function by the entry filter
function compare_in(x, y)
    return x in y
end
# When broadcast, each element of `x` is tested against the whole collection `y`
function Broadcast.broadcasted(::typeof(compare_in), x, y)
    return in.(x, Ref(y))
end
# We now only keep compounds having one of the most common MOA
filterEntriesExperiment!(expTransformed, Filter(cpdTopMOA, :CompoundName, compare = compare_in));

## Visualization

In [None]:
# Fixed seed: the UMAP layout is stochastic
Random.seed!(3895)
umTPM = umap(expTransformed)
# The transposed embedding has one row per entry and one column per UMAP dimension.
# `convert(DataFrame, ::Matrix)` and `names!` were removed from DataFrames.jl:
# build the table and name its columns in a single constructor call instead.
umTPM = DataFrame(umTPM', [:UMAP1, :UMAP2])
umTPM.Compound = expTransformed.data.CompoundName[expTransformed.selectedEntries]
# One level per distinct MOA annotation of a compound
umTPM.MOA = CategoricalArray([MOA[x] for x in umTPM.Compound]);

In [None]:
umTPM

In [None]:
ggplot(umTPM, aes(:UMAP1, :UMAP2)) + 
    geom_point(aes(color = :MOA), alpha = 0.8) +
    coord_fixed() + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) + 
    guides(color=guide_legend(nrow=3,byrow=true))

NB: Morphological distance between MOA?
Start with 2 MOAs that could be similar and 2 that should be distinct.

In [None]:
"Serotonin receptor antagonist"
"Serotonin reuptake inhibitor"
"Tubulin inhibitor"

## MOA similarity network

### Based on median cosine similarity at the image-level.

In [None]:
# We focus on MOAs with at least 2 compounds
selected_moa = names(freqHitMOA)[1][freqHitMOA .>= 2]
# Hit compounds annotated with at least one of the selected MOAs
selected_cpd = [cpd for (cpd, moas) in hitMOA if any(in(moas), selected_moa)]

In [None]:
# Start again from a deep copy of the aggregated experiment, leaving `expAgg` untouched
expTransformed = deepcopy(expAgg)
# In-place transformation provided by RMP (presumably a log-like transform of the
# selected features; see the RMP documentation for the exact definition)
logtransform!(expTransformed)
expTransformed.description = "Transformed values for aggregated FOV measurements"

Here we apply a correction based on the specific details of the experimental design:
All rows and columns include DMSO (negative) controls and we normalize all values based on these matching controls (same row and column).  

This exemplifies how to directly modify the data of an `Experiment` object.  

NB: One might want to check that more iterations are not needed (cf. Median-polish method).

In [None]:
# Normalize on matching DMSO wells median values

# Entries in both data frames are matching
@assert nrow(aggregatedData) == nrow(getdata(expTransformed))

# Values before correction, used as reference for all the medians
# NOTE(review): relies on `getdata` returning a copy and not a view - confirm in RMP
ndf = getdata(expTransformed)

# Which entries are DMSO controls does not depend on the entry being corrected:
# compute the mask once instead of once per entry
isDMSO = aggregatedData.CompoundName .== "DMSO"

for (i, (fx, fy)) in enumerate(eachrow(aggregatedData[:, [:Metadata_Row, :Metadata_Column]]))
    # DMSO controls sharing the plate row OR the plate column of entry i
    ctrl = isDMSO .& ((aggregatedData.Metadata_Row .== fx) .| (aggregatedData.Metadata_Column .== fy))
    @assert any(ctrl)
    # Subtract the per-feature median of these controls from entry i
    expTransformed.data[i:i, expTransformed.selectedFeatures] .-= mapcols(median, ndf[ctrl, :])
end

In [None]:
# Membership test used as comparison function by the entry filter
function compare_in(x, y)
    return x in y
end
# When broadcast, each element of `x` is tested against the whole collection `y`
function Broadcast.broadcasted(::typeof(compare_in), x, y)
    return in.(x, Ref(y))
end
# We now only keep the hit compounds annotated with one of the selected MOAs
filterEntriesExperiment!(expTransformed, Filter(selected_cpd, :CompoundName, compare = compare_in));

In [None]:
# Image cosine similarities
imgCosSim = 1 .- pairwise(CosineDist(), Array(getdata(expTransformed))');

In [None]:
# Store MOA cosine similarity: (moa1, moa2) => median similarity between their images
moaCosSim = Dict()
cmp_names = expTransformed.data[expTransformed.selectedEntries, :CompoundName]
# Indices of the images belonging to each MOA (images of the hit compounds annotated
# with it). Computed once per MOA; they used to be recomputed for every pair of MOAs.
moaImgInd = Dict(
    moa => findall(in([cpd for (cpd, moas) in hitMOA if moa in moas]), cmp_names)
    for moa in selected_moa
)
# For all combinations (avoiding duplicates: only pairs with moa1 >= moa2 are stored)
for moa1 in selected_moa, moa2 in selected_moa
    moa1 >= moa2 || continue
    # Take median value over all image pairs (one image from each MOA)
    moaCosSim[(moa1, moa2)] = median(imgCosSim[moaImgInd[moa1], moaImgInd[moa2]])
end

In [None]:
# Display for MOA self-similarity
for moa in selected_moa
    println(moa, ": ", moaCosSim[(moa, moa)])
end

In [None]:
# Keep the MOA pairs whose similarity is above the threshold
edgesMoaNet = Dict(pair => sim for (pair, sim) in moaCosSim if sim > 0.35)
# Number of distinct MOAs taking part in at least one retained link
println(length(Set(Iterators.flatten(keys(edgesMoaNet)))))
# Number of retained links
length(edgesMoaNet)

In [None]:
# Vertex id of each MOA: its position in `selected_moa`
moaToInt = Dict(x => i for (i, x) in enumerate(selected_moa))
# One vertex per selected MOA (the count used to be hard-coded to 11)
g = SimpleWeightedGraph(length(selected_moa))
# NB: self-edges are kept
for ((moa1, moa2), similarity) in edgesMoaNet
    add_edge!(g, moaToInt[moa1], moaToInt[moa2], similarity)
end

In [None]:
moaToInt

In [None]:
savegraph("Fig/MOA_similarity_per_image.gml", g, "MOA_similarity", GraphIO.GML.GMLFormat())

### Based on cosine similarity of median profile at the MOA-level.

In [None]:
cmp_moas = [MOA[x] for x in cmp_names];

In [None]:
""" For a given `moa`, return the median profile across all
    selected images.
"""
function moaMedProfile(moa::String)
    # Images whose compound lists `moa` among its annotations
    hasMoa = [moa in annotations for annotations in cmp_moas]
    profiles = getdata(expTransformed)[hasMoa, :]
    # Median of every feature (column) over these images
    return [median(feature) for feature in eachcol(profiles)]
end

In [None]:
moaProf = hcat(map(moaMedProfile, selected_moa)...)
moaCosSim2 = 1 .- pairwise(CosineDist(), moaProf);

In [None]:
# Get all links with a similarity above the threshold.
# Only the upper triangle (x < y) is scanned: each pair once, no self-links.
# The index ranges follow the size of the similarity matrix (they used to be hard-coded to 1:11).
edgesMoaNet2 = Dict(
    (x, y) => moaCosSim2[x, y]
    for x in axes(moaCosSim2, 1) for y in axes(moaCosSim2, 2)
    if x < y && moaCosSim2[x, y] > 0.74
)
# Number of distinct MOAs taking part in at least one retained link
println(length(unique([y for x in keys(edgesMoaNet2) for y in x ])))
# Number of retained links
length(edgesMoaNet2)

In [None]:
# One vertex per row of the MOA similarity matrix (the count used to be hard-coded to 11)
g = SimpleWeightedGraph(size(moaCosSim2, 1))
# Weighted edge for every retained pair of MOA indices
for ((src, dst), similarity) in edgesMoaNet2
    add_edge!(g, src, dst, similarity)
end

We export "manually" to GML format as the GraphIO library does not export edge weights.

In [None]:
# Output file of the manual GML export; it stays open across the next cells
# and is only closed by the final `close(io)` cell
io = open("Fig/MOA_similarity_per_moa.gml", "w")

# GML header: opens the top-level `graph [` record and labels the graph
str_pre = """graph
[
label "MOA_similarity" """

write(io, str_pre);

In [None]:
# Node records: one `node [ id <int> moa "<name>" ]` entry per selected MOA,
# using the integer ids stored in `moaToInt`
str1 = "\n	node\n	[\n		id "
str2 = "\n		moa \""
str3 = "\"\n	]"
for (k,v) in moaToInt
    # k is the MOA name, v its integer id
    str_moa = str1 * string(v) * str2 * k * str3
    write(io, str_moa)
end

In [None]:
# Edge records: one `edge [ source <int> target <int> weight <value> ]` entry
# per edge of `g`, so that the weights end up in the exported file
str1 = "\n	edge\n	[\n		source "
str2 = "\n		target "
str3 = "\n		weight "
str4 = "\n	]"

for e in edges(g)
    str_moa = str1 * string(e.src) * str2 *
        string(e.dst) * str3 * string(e.weight) * str4
    write(io, str_moa)
end

In [None]:
str_post = "\n]\n"
write(io, str_post);

In [None]:
close(io);

## UMAP of 3 hits per top 3 hit MOAs

In [None]:
# The three MOAs displayed in this section
selected_moa = [
    "Tubulin inhibitor",
    "Dopamine receptor antagonist",
    "Calcium channel blocker",
]
# Hit compounds annotated with at least one of them
selected_cpd = [cpd for (cpd, moas) in hitMOA if any(in(moas), selected_moa)]

In [None]:
[MOA[x] for x in selected_cpd]

In [None]:
selected_cpd

In [None]:
# Start again from a deep copy of the aggregated experiment, leaving `expAgg` untouched
expTransformed = deepcopy(expAgg)
# In-place transformation provided by RMP (presumably a log-like transform of the
# selected features; see the RMP documentation for the exact definition)
logtransform!(expTransformed)
expTransformed.description = "Transformed values for aggregated FOV measurements"

Here we apply a correction based on the specific details of the experimental design:
All rows and columns include DMSO (negative) controls and we normalize all values based on these matching controls (same row and column).  

This exemplifies how to directly modify the data of an `Experiment` object.  

NB: One might want to check that more iterations are not needed (cf. Median-polish method).

In [None]:
# Normalize on matching DMSO wells median values

# Entries in both data frames are matching
@assert nrow(aggregatedData) == nrow(getdata(expTransformed))

# Values before correction, used as reference for all the medians
# NOTE(review): relies on `getdata` returning a copy and not a view - confirm in RMP
ndf = getdata(expTransformed)

# Which entries are DMSO controls does not depend on the entry being corrected:
# compute the mask once instead of once per entry
isDMSO = aggregatedData.CompoundName .== "DMSO"

for (i, (fx, fy)) in enumerate(eachrow(aggregatedData[:, [:Metadata_Row, :Metadata_Column]]))
    # DMSO controls sharing the plate row OR the plate column of entry i
    ctrl = isDMSO .& ((aggregatedData.Metadata_Row .== fx) .| (aggregatedData.Metadata_Column .== fy))
    @assert any(ctrl)
    # Subtract the per-feature median of these controls from entry i
    expTransformed.data[i:i, expTransformed.selectedFeatures] .-= mapcols(median, ndf[ctrl, :])
end

In [None]:
# Membership test used as comparison function by the entry filter
function compare_in(x, y)
    return x in y
end
# When broadcast, each element of `x` is tested against the whole collection `y`
function Broadcast.broadcasted(::typeof(compare_in), x, y)
    return in.(x, Ref(y))
end
# We now only keep the hit compounds annotated with one of the selected MOAs
filterEntriesExperiment!(expTransformed, Filter(selected_cpd, :CompoundName, compare = compare_in));

In [None]:
# Fixed seed: the UMAP layout is stochastic
Random.seed!(3895)
umTPM = umap(expTransformed, 2, n_neighbors = 5, spread = 1, min_dist = 0.5)
# The transposed embedding has one row per entry and one column per UMAP dimension.
# `convert(DataFrame, ::Matrix)` and `names!` were removed from DataFrames.jl:
# build the table and name its columns in a single constructor call instead.
umTPM = DataFrame(umTPM', [:UMAP1, :UMAP2])
umTPM.Compound = expTransformed.data.CompoundName[expTransformed.selectedEntries]
umTPM.allMOAs = [MOA[x] for x in umTPM.Compound]
# Single MOA label per entry: the second annotation when there are several, else the only one
# NOTE(review): picking the second annotation looks specific to this compound set - confirm
umTPM.MOA = [length(m) > 1 ? m[2] : m[1] for m in umTPM.allMOAs];

In [None]:
umTPM.Compound = replace.(umTPM.Compound, " dihydrochloride" => s"")
umTPM.Compound = replace.(umTPM.Compound, " hydrochloride" => s"");

In [None]:
# Order levels by MOA and alphabetical order
cpd_lvl = ["Albendazole", "Nocodazole", "Vinblastine", 
           "Domperidone", "Pimozide", "Triflupromazine", 
           "Cilnidipine", "Flunarizine", "Nisoldipine"]
umTPM.Compound = CategoricalArray(umTPM.Compound, levels = cpd_lvl, ordered = true);

In [None]:
# Highlight MOAs of each compound
moa_palette = ["#5A463C","#8B726A","#BDA7A3","#463C5A","#7D768B","#B5B1BD","#3C5A46","#768B7D","#B1BDB5"];

In [None]:
moa_palette = ["#FF1998","#FF5FB9","#FFA3D7","#1B9CFF","#5FB9FF","#A3D7FF","#9CFF1B","#B9FF5F","#D7FFA3"];

In [None]:
gp = ggplot(umTPM, aes(:UMAP1, :UMAP2)) + 
    geom_point(aes(color = :Compound), alpha = 1) +
    coord_fixed() + scale_color_manual(values = moa_palette) + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) + 
    guides(color=guide_legend(nrow=3,byrow=false))

In [None]:
ggsave("Fig/topMOA_UMAP2.pdf", gp)