In [None]:
using CSV, StatsBase, Statistics, DataFrames, UMAP, RCall, FreqTables
using MultipleTesting, Random, MultivariateStats, Distributed, CategoricalArrays
using LightGraphs, SimpleWeightedGraphs
using HTTP, JSON
using BioProfiling
using LightGraphs

In [None]:
using Dates: now
now()

## R Calls

In [None]:
@rlibrary ggplot2
@rlibrary extrafont
@rlibrary viridis
@rlibrary heatmaply
@rlibrary ggrepel

In [None]:
R"""
# Used later for MCD computation

library(robustbase)

# Customize ggplot appearance

library(ggplot2)
library(extrafont)


# Load extra fonts
# ttf_import("/tmp/.fonts")
# loadfonts()

# Change theme
customTheme <- theme_light() + 
               theme(panel.grid.minor=element_blank(), text=element_text(size=17, family="Arial", colour = "#333333"),
                     line=element_line(colour = "#333333"), 
                     legend.background = element_rect(fill=alpha('#CCCCCC', 0.1)), legend.key = element_blank())

# Change default colors
# Replacement for the default continuous colour scale.
#   ...                    forwarded to the underlying ggplot2 scale constructor
#   begin, end, direction  forwarded to scale_colour_viridis_c
#   option                 viridis palette name ("plasma" by default)
#   type                   "gradient" or "viridis"; read from the
#                          "ggplot2.continuous.colour" option, "viridis" if unset
# Any other `type` stops with "Unknown scale type".
scale_colour_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(type, gradient = scale_colour_gradient(...), 
        viridis = scale_colour_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))
}
# Same scale under the American spelling
scale_color_continuous <- scale_colour_continuous

# Replacement for the default continuous fill scale.
#   ...                    forwarded to the underlying ggplot2 scale constructor
#   begin, end, direction  forwarded to scale_fill_viridis_c
#   option                 viridis palette name ("plasma" by default)
#   type                   "gradient" or "viridis"; read from the
#                          "ggplot2.continuous.fill" option, "viridis" if unset
# Any other `type` stops with "Unknown scale type".
# The fill scale reads the *fill* option: the colour option belongs to
# scale_colour_continuous and must not silently drive the fill scale as well.
scale_fill_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.fill", default = "viridis")) {
    switch(type, gradient = scale_fill_gradient(...), 
        viridis = scale_fill_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))
}

cemm_pal = colorRampPalette(c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00"))
# Replacement for the default discrete fill scale.
#   type = "CeMM"   use the interpolated CeMM palette (cemm_pal)
#   any other type  fall back to the hue palette built from h, c, l, h.start, direction
#   na.value, aesthetics and ... are forwarded to discrete_scale
# hue_pal is qualified with scales:: because the scales package is never attached
# in this session; the unqualified call failed as soon as type != "CeMM".
scale_fill_discrete <- function (..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "fill") 
{
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else {
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, 
            direction), na.value = na.value, ...)
    }
}

# Replacement for the default discrete colour scale.
#   type = "CeMM"   use the interpolated CeMM palette (cemm_pal)
#   any other type  fall back to the hue palette built from h, c, l, h.start, direction
#   na.value, aesthetics and ... are forwarded to discrete_scale
scale_color_discrete <- function (..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "colour") {
    use_cemm <- type == "CeMM"
    if (!use_cemm) {
        return(discrete_scale(aesthetics, "hue",
                              scales::hue_pal(h, c, l, h.start, direction),
                              na.value = na.value, ...))
    }
    discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
}
# Same scale under the British spelling
scale_colour_discrete <- scale_color_discrete

# Theme add-on without major grid lines or panel border, with thin dark axis lines
# and size-12 axis tick labels. Extra arguments are forwarded to theme().
noGridTheme <- function(...){
    theme(panel.grid.major=element_blank(), axis.text.x=element_text(size=12), axis.text.y=element_text(size=12),
                      axis.line=element_line(color="#333333", size = 0.2), panel.border = element_blank(), ...)
}

# Theme add-on with dark panel and plot backgrounds and light text, lines and axes.
# Extra arguments are forwarded to theme(), as noGridTheme does; they used to be
# accepted and silently ignored.
darkTheme <- function(...){
    theme(panel.background = element_rect(fill = '#333333'), plot.background = element_rect(fill = '#333333'), 
          axis.line=element_line(color="#CCCCCC", size = 0.2), 
          text=element_text(size=17, family="Arial", colour = "#CCCCCC"),
          line=element_line(colour = "#CCCCCC"), ...)
}

theme_set(customTheme)

options(repr.plot.width=10, repr.plot.height=10)
"""

## Load pre-computed statistical distances to DMSO

In [None]:
RMPV = CSV.read("data/RMPV.csv", DataFrame) 

In [None]:
RMPV.Condition[RMPV.RMPV .< 0.1]

## Get annotations

In [None]:
# Substrings removed from compound names before querying the annotation API:
# salt and hydrate forms, a supplier note, a molecular-weight note and
# stereo-descriptor prefixes.
salt_to_remove = [" maleate", " hydrochloride", " nitrate", 
                  " dihydrochloride", " chloride", " sulfate", 
                  " hydrate", " mesylate", " oxalate", " salt",
                  " from Penicillium brefeldianum", " monohydrate",
                  " trifluoroacetate", " acetate", " isethionate",
                  " hemisulfate", " angular", " sodium", " fumarate",
                  " methanesulfonate", " hemihydrate", " (MW = 374.83)",
                  "(+/-)-", "(+)-", "(-)-", "S-(+)-", "(S)-", "(±)-", "D-"]

# It seems removing this set of enantiomer indications leads
# to retrieving the correct compound annotation, as of the current release of the API.

"""Other compounds might include salts but are anyway not found in the LINCS database at all:
    AC-93253 iodide
    N-p-Tosyl-L-phenylalanine chloromethyl ketone
    4-(2-Aminoethyl)benzenesulfonyl fluoride hydrochloride
    UNC0379 trifluoroacetate salt"""

# Map every pattern to an empty replacement, so that the whole collection can be
# folded over a compound name with `replace`.
# NOTE(review): a Dict has no guaranteed iteration order, so overlapping patterns
# (e.g. "(+)-" and "S-(+)-") may be removed in either order — confirm this is acceptable.
salt_dict = Dict(s => s"" for s in salt_to_remove)

```
https://api.clue.io/api/perts?filter={"where":{"pert_iname":"(-)-Quinpirole"}}&user_key=1fee664f310c86c5cb009c323de941db
```

In [None]:
"""
    getMOA(cpd::String)

Query the LINCS perturbation database (clue.io API) for the single compound `cpd` and
return the mechanisms of action (MOA) annotated for it.

The substrings listed in `salt_dict` are stripped from the name first. The API is
case-dependent, so the lowercase name is queried first and the uppercase name second.

Returns the `"moa"` field of the first matching record. Returns an empty tuple `()` when
no record matches, when the record has no `"moa"` field, or when a request fails (a
message is printed in the last two situations where relevant).

See https://clue.io/developer-resources#apisection
"""
function getMOA(cpd::String)
    # NOTE(review): this API key is committed in the notebook. Set ENV["CLUE_USER_KEY"]
    # to use another key, and consider revoking the hard-coded default.
    user_key = get(ENV, "CLUE_USER_KEY", "1fee664f310c86c5cb009c323de941db")
    rootURL = "https://api.clue.io/api/perts?filter="

    # Strip the longest patterns first, so that the result does not depend on the
    # unspecified iteration order of `salt_dict` (e.g. "S-(+)-" before "(+)-").
    patterns = sort!(collect(salt_dict), by = p -> (-length(first(p)), first(p)))
    cpd_no_salt = foldl(replace, patterns, init = cpd)

    # NB: case-dependent. Dashes are handled. Spaces are usually replaced by dashes.
    for name in (lowercase(cpd_no_salt), uppercase(cpd_no_salt))
        filterJSON = "{\"where\":{\"pert_iname\":\"" * name *
                     "\"},\"fields\":{\"moa\":true}}"
        # The filter holds quotes, braces and possibly spaces: percent-encode it
        rq = rootURL * HTTP.escapeuri(filterJSON) * "&user_key=" * user_key

        records = try
            JSON.Parser.parse(String(HTTP.get(rq).body))
        catch e
            if isa(e, HTTP.ExceptionRequest.StatusError)
                println(cpd * " raises a " * string(e.status) * " error.")
            else
                println(cpd * " could not be queried: " * sprint(showerror, e))
            end
            return ()
        end

        # An empty answer means "unknown under this spelling": try the next one
        if records isa AbstractVector && !isempty(records)
            record = first(records)
            return record isa AbstractDict ? get(record, "moa", ()) : ()
        end
    end

    println(cpd * " does not have an MOA annotation.")
    return ()
end

In [None]:
"""
    get_MOA_and_target(cpd::String)

Query the LINCS perturbation database (clue.io API) for the single compound `cpd`,
requesting both the mechanism of action (`"moa"`) and the `"target"` fields.

The substrings listed in `salt_dict` are stripped from the name first. The API is
case-dependent, so the lowercase name is queried first and the uppercase name second.

Returns the first matching record as parsed from JSON (the `"moa"` and `"target"` keys
are present only when the API provides them). Returns an empty tuple `()` when no record
matches or when a request fails.

See https://clue.io/developer-resources#apisection
"""
function get_MOA_and_target(cpd::String)
    # NOTE(review): this API key is committed in the notebook. Set ENV["CLUE_USER_KEY"]
    # to use another key, and consider revoking the hard-coded default.
    user_key = get(ENV, "CLUE_USER_KEY", "1fee664f310c86c5cb009c323de941db")
    rootURL = "https://api.clue.io/api/perts?filter="

    # Strip the longest patterns first, so that the result does not depend on the
    # unspecified iteration order of `salt_dict` (e.g. "S-(+)-" before "(+)-").
    patterns = sort!(collect(salt_dict), by = p -> (-length(first(p)), first(p)))
    cpd_no_salt = foldl(replace, patterns, init = cpd)

    # NB: case-dependent. Dashes are handled. Spaces are usually replaced by dashes.
    for name in (lowercase(cpd_no_salt), uppercase(cpd_no_salt))
        filterJSON = "{\"where\":{\"pert_iname\":\"" * name *
                     "\"},\"fields\":{\"moa\":true,\"target\":true}}"
        # The filter holds quotes, braces and possibly spaces: percent-encode it
        rq = rootURL * HTTP.escapeuri(filterJSON) * "&user_key=" * user_key

        records = try
            JSON.Parser.parse(String(HTTP.get(rq).body))
        catch e
            if isa(e, HTTP.ExceptionRequest.StatusError)
                println(cpd * " raises a " * string(e.status) * " error.")
            else
                println(cpd * " could not be queried: " * sprint(showerror, e))
            end
            return ()
        end

        # An empty answer means "unknown under this spelling": try the next one
        if records isa AbstractVector && !isempty(records)
            record = first(records)
            return record isa AbstractDict ? record : ()
        end
    end

    println(cpd * " does not have an MOA annotation.")
    return ()
end

In [None]:
# Annotation tables keyed by the original compound name
MOA = Dict()
targets = Dict()
for compound in RMPV.Condition
    annotation = get_MOA_and_target(compound)
    # An empty result means that nothing was retrieved for this compound
    length(annotation) > 0 || continue
    haskey(annotation, "moa") && (MOA[compound] = annotation["moa"])
    haskey(annotation, "target") && (targets[compound] = annotation["target"])
end

In [None]:
cpd_list = RMPV.Condition
cpd_list = map(x -> reduce(replace, salt_dict, init=x), cpd_list)
cpd_list = map(x -> replace(x,  " " => s"-"), cpd_list)

In [None]:
short_to_initial_cpd = Dict(y => x for (x,y) in zip(RMPV.Condition, cpd_list))

In [None]:
# Second pass with the simplified names (salts stripped, spaces turned into dashes);
# results are stored under the original compound name.
for shortName in cpd_list
    annotation = get_MOA_and_target(shortName)
    length(annotation) > 0 || continue
    fullName = short_to_initial_cpd[shortName]
    haskey(annotation, "moa") && (MOA[fullName] = annotation["moa"])
    haskey(annotation, "target") && (targets[fullName] = annotation["target"])
end

In [None]:
using HDF5, JLD
save("data/MOA2.jld", "data", MOA)
save("data/target2.jld", "data", targets)

In [None]:
MOA

In [None]:
targets

You can save the MOA dictionary for later use:
```julia
using HDF5, JLD
save("data/MOA.jld", "data", MOA)
save("data/target.jld", "data", targets)
MOA = load("data/MOA.jld")["data"]
targets = load("data/target.jld")["data"]
```

In [None]:
freqMOA = sort(freqtable(vcat(collect(values(MOA))...)), rev = true);

In [None]:
hitMOA = Dict(x => MOA[x] for x in RMPV.Condition[RMPV.RMPV .< 0.1] if x in keys(MOA))
freqHitMOA = sort(freqtable(vcat(collect(values(hitMOA))...)), rev = true);

In [None]:
# One row per (frequent MOA, hit status): MOAs annotated more than 3 times are listed
# twice, first for hits and then for non-hits. Counts are filled in afterwards.
dfMOA = let frequent = names(freqMOA)[1][freqMOA .> 3]
    DataFrame(MOA = repeat(frequent, 2),
              Hits = repeat([true, false], inner = length(frequent)),
              Count = zeros(2 * length(frequent)))
end;

In [None]:
# Fill the counts: number of hit compounds for the `Hits` rows, number of remaining
# (non-hit) compounds for the other rows.
# The frequency tables are converted to dictionaries once, rather than for every row.
let hitCounts = Dict(freqHitMOA), allCounts = Dict(freqMOA)
    for row in eachrow(dfMOA)
        # MOAs without any hit are absent from the hit table: they count as 0
        nHits = get(hitCounts, row.MOA, 0)
        row.Count = row.Hits ? nHits : allCounts[row.MOA] - nHits
    end
end
# Order the MOAs by overall frequency for plotting
dfMOA.MOA = CategoricalArray{String,1}(dfMOA.MOA, levels = reverse(names(freqMOA)[1][freqMOA .> 3]), ordered=true);

In [None]:
gp = ggplot(dfMOA, aes(x = :MOA, y = :Count, fill = :Hits)) + 
     geom_bar(position="stack", stat="identity") + coord_flip() + 
     scale_y_continuous(breaks = 0:2:12) +
     theme(var"legend.position"="bottom") +
     RObject(nothing)

In [None]:
ggsave("fig/HitEnrichment.pdf", gp);

## MOA analysis

Now we focus on MOAs with at least 2 hit compounds:

In [None]:
top_moa = names(freqHitMOA)[1][freqHitMOA .>= 2]

This means focusing on the following hit compounds:

In [None]:
# Compounds annotated with at least one of the top MOAs, restricted to hits (RMPV < 0.1)
top_moa_hit_cpd = intersect(Set([k for (k, v) in MOA if any(in(top_moa), v)]),
                            RMPV.Condition[RMPV.RMPV .< 0.1])

Targets are known for all these compounds:

In [None]:
# Every selected compound must come with a target annotation
@assert all(haskey(targets, cpd) for cpd in top_moa_hit_cpd)

### MOA morphological similarity

#### Load aggregated data

In [None]:
aggregatedData = CSV.read("data/aggregatedData_750cells.csv", DataFrame);

#### Transform aggregated data - Normalization
We want to focus on features that vary more across the whole experiment than within the reference condition (here, the DMSO negative controls).

In [None]:
expAgg = Experiment(aggregatedData, description = "Median values for aggregated FOV measurements")

In [None]:
# Columns holding metadata rather than measurements
strToRemove = ["Metadata_Well", "CompoundName", "Metadata_Field", "Metadata_Row", "Metadata_Column"]
filters = BioProfiling.AbstractSelector[
    # Remove metadata
    NameSelector(x -> !any(occursin.(strToRemove, String(x)))),
    # Remove constant columns
    Selector(x -> mad(x, normalize = true) != 0, description = "Remove constant features"),
    # Remove columns that do not vary within the DMSO entries
    Selector(x -> mad(x, normalize = true) != 0,
             subset = x -> x.CompoundName .== "DMSO",
             description = "Remove features constant for reference"),
]
select_features!(expAgg, filters)

In [None]:
expTransformed = deepcopy(expAgg)
logtransform!(expTransformed)
expTransformed.description = "Transformed values for aggregated FOV measurements"

Here we apply a correction based on the specific details of the experimental design:
All rows and columns include DMSO (negative) controls and we normalize all values based on these matching controls (same row or same column).  

This exemplifies how to directly modify the data of an `Experiment` object.  

NB: One might want to check that more iterations are not needed (cf. Median-polish method).

In [None]:
# Normalize on matching DMSO wells median values

# Entries in both data frames are matching
@assert nrow(aggregatedData) == nrow(getdata(expTransformed))

# Copy data before correction
ndf = getdata(expTransformed)

# The DMSO mask does not depend on the entry: compute it once, outside of the loop
isDMSO = aggregatedData.CompoundName .== "DMSO"

for (i, (fx, fy)) in enumerate(eachrow(aggregatedData[:,[:Metadata_Row, :Metadata_Column]])) 
    # DMSO entries sharing the row or the column of entry i
    controls = isDMSO .& ((aggregatedData.Metadata_Row .== fx) .|
                          (aggregatedData.Metadata_Column .== fy))
    @assert any(controls)
    # Subtract the per-feature median of these controls from entry i
    expTransformed.data[i:i, expTransformed.selected_features] .-= 
        mapcols(median, ndf[controls,:]) 
end

In [None]:
# Membership test used as the comparison operator of the `Filter` below
compare_in(x,y) = x in y
# When `compare_in` is broadcast, treat `y` as a single collection (wrapped in `Ref`)
# so that each element of `x` is tested against the whole of `y`.
Broadcast.broadcasted(::typeof(compare_in), x, y) = broadcast(in, x, Ref(y)) 
# We now only keep hit compounds
filter_entries!(expTransformed, Filter(top_moa_hit_cpd, :CompoundName, compare = compare_in));

In [None]:
expTransformed

### Dimensionality reduction

In [None]:
using Distances
# Fixed seed for a reproducible projection
Random.seed!(3895)
# 4-dimensional UMAP projection using the cosine distance
umTPM = umap(expTransformed, 4, metric = CosineDist())
# One row per entry and one column per UMAP dimension. The data frame is built with
# the constructor and explicit column names: `convert(DataFrame, matrix)` is deprecated.
umTPM = DataFrame(umTPM', Symbol.("UMAP" .* string.(1:size(umTPM, 1))));

In [None]:
top_moa

In [None]:
umTPM.Compound = expTransformed.data.CompoundName[expTransformed.selected_entries]
umTPM.MOA = [[y for y in MOA[x] if y in top_moa] for x in umTPM.Compound]
umTPM.MOA2 = CategoricalArray(string.(umTPM.MOA));

In [None]:
ggplot(umTPM, aes(:UMAP1, :UMAP2)) + 
    geom_point(aes(color = :MOA2), alpha = 0.8) +
    coord_fixed() + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) + 
    guides(color=guide_legend(nrow=3,byrow=true))

In [None]:
ggplot(umTPM, aes(:UMAP3, :UMAP4)) + 
    geom_point(aes(color = :MOA2), alpha = 0.8) +
    coord_fixed() + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) + 
    guides(color=guide_legend(nrow=3,byrow=true))

In [None]:
expUMAP = Experiment(umTPM, description = "UMAP projection of profiling data")
filters = Array{BioProfiling.AbstractReduce,1}()
# Remove (categorical) compound column from analysis
push!(filters, NameSelector(x -> x != "Compound"))
push!(filters, NameSelector(x -> x != "MOA"))
push!(filters, NameSelector(x -> x != "MOA2"))
# Apply filters
filter!(expUMAP, filters)
expUMAP

In [None]:
# Compound of every selected entry, in the row order of the UMAP projection
exp_cpd = expTransformed.data[expTransformed.selected_entries, :CompoundName]
# All ordered pairs of distinct compounds
pairwise_hellinger = DataFrame([(x, y) for x in levels(top_moa_hit_cpd) for y in levels(top_moa_hit_cpd) if x != y]);
# Robust Hellinger distance between the entries of both compounds in UMAP space
pairwise_hellinger.distance = map(eachrow(pairwise_hellinger)) do (cpdX, cpdY)
    distance_robust_hellinger(getdata(expUMAP), exp_cpd .== cpdX, exp_cpd .== cpdY)
end
rename!(pairwise_hellinger, ["X", "Y", "distance"]);

In [None]:
RCall.rcall_p(:options, rcalljl_options=Dict(:width => 1000, :height => 800))
ggplot(pairwise_hellinger, aes(x = :X, y = :Y, fill = :distance)) +
    geom_tile() +
    xlab("") +
    ylab("") + 
    theme(var"axis.text.x" = element_text(angle = 45, hjust = 1)) +
    RObject(nothing)

In [None]:
MOA["Nisoldipine"]

In [None]:
MOA["Paroxetine hydrochloride hemihydrate (MW = 374.83)"]

In [None]:
MOA["Fluoxetine hydrochloride"]

In [None]:
MOA["Cilnidipine"]

In [None]:
MOA["Flunarizine dihydrochloride"]

In [None]:
top_hits_per_moa = Dict(moa => [x for x in top_moa_hit_cpd if moa in MOA[x]] for moa in top_moa)

In [None]:
moa1 = "Selective serotonin reuptake inhibitor (SSRI)"
moa2 = "Calcium channel blocker"

In [None]:
# Mean pairwise distance between the hit compounds of the two MOAs selected above
# (`moa1` and `moa2`). This cell used to iterate over `moa_pairwise_hellinger`, which
# is only created in the next cell.
mean(pairwise_hellinger[
        [x in top_hits_per_moa[moa1] for x in pairwise_hellinger.X] .&
        [x in top_hits_per_moa[moa2] for x in pairwise_hellinger.Y],
        :distance])

In [None]:
# All ordered pairs of distinct top MOAs
moa_pairwise_hellinger = DataFrame([(x, y) for x in top_moa for y in top_moa if x != y]);
# Distance between two MOAs: mean distance over the compound pairs whose first
# compound belongs to the first MOA and whose second compound belongs to the second
moa_pairwise_hellinger.distance = map(eachrow(moa_pairwise_hellinger)) do (moaX, moaY)
    fromX = in.(pairwise_hellinger.X, Ref(top_hits_per_moa[moaX]))
    toY = in.(pairwise_hellinger.Y, Ref(top_hits_per_moa[moaY]))
    mean(pairwise_hellinger[fromX .& toY, :distance])
end
rename!(moa_pairwise_hellinger, ["X", "Y", "distance"]);

In [None]:
RCall.rcall_p(:options, rcalljl_options=Dict(:width => 1000, :height => 800))
ggplot(moa_pairwise_hellinger, aes(x = :X, y = :Y, fill = :distance)) +
    geom_tile() +
    xlab("") +
    ylab("") + 
    theme(var"axis.text.x" = element_text(angle = 45, hjust = 1)) +
    RObject(nothing)

### MOA target PPI similarity

#### Load PPI from HIPPIE database

In [None]:
# Download the current HIPPIE release (2.2 when this notebook was written) and parse
# the response body directly as a header-less table
hippie = CSV.read(
    HTTP.get("http://cbdm-01.zdv.uni-mainz.de/~mschaefer/hippie/hippie_current.txt").body,
    DataFrame, header = false)
rename!(hippie, ["Uniprot1", "Entrez1", "Uniprot2", "Entrez2", "Confidence", "Experiments"])

HIPPIE's Q&A section suggests:  
```medium confidence (0.63 - second quartile of the HIPPIE score distribution) or high confidence (0.73 - third quartile)```

In [None]:
ggplot(hippie, aes(x = :Confidence, y = "All")) +
    geom_violin() +
    geom_vline(xintercept = 0.63)

In [None]:
# Keep interactions of at least medium confidence, and only the four identifier columns
hippie = hippie[hippie.Confidence .>= 0.63, 1:4]
# Discard interactions lacking an Entrez identifier for either partner
hippie = hippie[.!ismissing.(hippie.Entrez1) .& .!ismissing.(hippie.Entrez2), :]

#### Construct PPI network

In [None]:
# Every gene taking part in at least one interaction, numbered in order of appearance
allgenes = union(hippie.Entrez1, hippie.Entrez2)
# Gene -> vertex index, and the reverse lookup
gene_to_ID = Dict(zip(allgenes, eachindex(allgenes)))
ID_to_gene = Dict(enumerate(allgenes));

In [None]:
# One vertex per gene and one edge per interaction.
# Duplicated links and self-edges are discarded
G = SimpleGraph(length(allgenes))
for (geneA, geneB) in zip(hippie.Entrez1, hippie.Entrez2)
    add_edge!(G, gene_to_ID[geneA], gene_to_ID[geneB])
end

In [None]:
"""
    symbol_to_entrez_mygeneinfo(s::String)

Query mygene.info for the human gene whose symbol is `s` and return the `"_id"` field
of the first hit, or `NaN` when the query has no hit.

`NaN` is kept as the "not found" marker for backward compatibility with existing
callers. An HTTP failure is not caught and propagates to the caller.
"""
function symbol_to_entrez_mygeneinfo(s::String)
    prefix = "http://mygene.info/v3/query?q=symbol:"
    # NOTE(review): the requested field is "entrez" whereas the value read below is
    # "_id" — confirm that "_id" is the Entrez identifier for every hit.
    suffix = "&species=human&fields=entrez"
    # Percent-encode the symbol so that special characters cannot alter the query
    rq = prefix * HTTP.escapeuri(s) * suffix
    resRq = HTTP.get(rq)
    entrezRQ = JSON.Parser.parse(String(resRq.body))
    # Rely on the hit list itself rather than on the separate "total" counter
    hits = get(entrezRQ, "hits", [])
    return isempty(hits) ? NaN : first(hits)["_id"]
end

# Distinct targets over all annotated compounds, and their identifiers
alltargets = unique(collect(Iterators.flatten(values(targets))))
alltargets_entrez = symbol_to_entrez_mygeneinfo.(alltargets);

# The following sections need to be reworked:

# ! The gene string values are not symbols but UniProt names!

In [None]:
# Discard interactions lacking a name for either partner.
# NOTE(review): the table was given the columns Uniprot1/Entrez1/Uniprot2/Entrez2 when
# it was loaded; no `Symbol1`/`Symbol2` column is created in the visible cells, so
# this cell presumably predates that renaming — TODO confirm and rework.
hippie = hippie[.!(ismissing.(hippie.Symbol1)), :]
hippie = hippie[.!(ismissing.(hippie.Symbol2)), :]

In [None]:
@assert all([occursin("_HUMAN", x) for x in hippie.Symbol1])
@assert all([occursin("_HUMAN", x) for x in hippie.Symbol2])

In [None]:
hippie.Symbol1 = map(x -> replace(x, "_HUMAN" => s""), hippie.Symbol1)
hippie.Symbol2 = map(x -> replace(x, "_HUMAN" => s""), hippie.Symbol2);

In [None]:
allgenes = hippie.Symbol1 ∪ hippie.Symbol2
gene_to_ID = Dict(v => i  for (i,v) in enumerate(allgenes))
ID_to_gene = Dict(i => v  for (i,v) in enumerate(allgenes));

In [None]:
gene_to_ID["SRGN"]

In [None]:
gene_to_ID["CD44"]

In [None]:
(4, 6109) in edges(G)

To quantify closeness of drug modules, we use the $s_{AB}$ score from Menche et al. (Science, 2015) defined as follows:
$s_{AB} = \langle d_{AB} \rangle - \frac{\langle d_{AA} \rangle + \langle d_{BB} \rangle}{2}$

In [None]:
# Vertex indices of the targets of compound `x`; targets absent from the network
# (not a key of `gene_to_ID`) are skipped.
function IDlist(x)
    return [gene_to_ID[target] for target in targets[x] if haskey(gene_to_ID, target)]
end

IDlist("Fluoxetine hydrochloride")
IDlist("Flunarizine dihydrochloride")

In [None]:
[y in keys(gene_to_ID) for y in targets["Fluoxetine hydrochloride"]]

In [None]:
"HTR2B" in allgenes

In [None]:
gene_to_ID["ANO1"]

In [None]:
[ID_to_gene[x] for x in IDlist("Fluoxetine hydrochloride")]

In [None]:
targets["Fluoxetine hydrochloride"]

In [None]:
hippie

In [None]:
Edge.(Array(hippie[:, [:Symbol1, :Symbol2]]))

In [None]:
Array(hippie[:, [:Symbol1, :Symbol2]])

In [None]:
cpdTopMOA = Set([k for (k,v) in MOA for m in v if m in names(freqMOA)[1][freqMOA .> 3]]) 

## Top MOAs 

In [None]:
# Compounds annotated with at least one MOA occurring more than 3 times.
# The list of frequent MOAs is computed once instead of once per (compound, MOA) pair.
cpdTopMOA = let frequent = names(freqMOA)[1][freqMOA .> 3]
    Set([k for (k, v) in MOA if any(in(frequent), v)])
end

In [None]:
# Membership test used as the comparison operator of the `Filter` below
compare_in(x,y) = x in y
# When broadcast, compare each element of `x` against the whole collection `y`
Broadcast.broadcasted(::typeof(compare_in), x, y) = broadcast(in, x, Ref(y)) 
# We now only keep compounds having one of the most common MOA
# (`filter_entries!` is the name used by the BioProfiling calls earlier in the notebook)
filter_entries!(expTransformed, Filter(cpdTopMOA, :CompoundName, compare = compare_in));

## Visualization

In [None]:
# Fixed seed for a reproducible projection
Random.seed!(3895)
umTPM = umap(expTransformed)
# One row per entry; built with the constructor since `convert(DataFrame, matrix)`
# is deprecated
umTPM = DataFrame(umTPM', [:UMAP1, :UMAP2]);
# `selected_entries` is the field name used by the BioProfiling calls earlier in the notebook
umTPM.Compound = expTransformed.data.CompoundName[expTransformed.selected_entries]
# Warning: compounds with multiple MOAs are not yet properly handled
# (only the last annotated MOA is kept)
umTPM.MOA = CategoricalArray([MOA[x][end] for x in umTPM.Compound]);

In [None]:
ggplot(umTPM, aes(:UMAP1, :UMAP2)) + 
    geom_point(aes(color = :MOA), alpha = 0.8) +
    coord_fixed() + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) + 
    guides(color=guide_legend(nrow=3,byrow=true))

## Drug similarity network and embedding

In [None]:
expTransformed = deepcopy(expAgg)
logtransform!(expTransformed)
expTransformed.description = "Transformed values for aggregated FOV measurements"

In [None]:
# Normalize on matching DMSO wells median values

# Entries in both data frames are matching
@assert nrow(aggregatedData) == nrow(getdata(expTransformed))

# Copy data before correction
ndf = getdata(expTransformed)

# The DMSO mask does not depend on the entry: compute it once, outside of the loop
isDMSO = aggregatedData.CompoundName .== "DMSO"

for (i, (fx, fy)) in enumerate(eachrow(aggregatedData[:,[:Metadata_Row, :Metadata_Column]])) 
    # DMSO entries sharing the row or the column of entry i
    controls = isDMSO .& ((aggregatedData.Metadata_Row .== fx) .|
                          (aggregatedData.Metadata_Column .== fy))
    @assert any(controls)
    # `selected_features` is the field name used earlier in the notebook
    expTransformed.data[i:i, expTransformed.selected_features] .-= 
        mapcols(median, ndf[controls,:]) 
end

Alternatively, one can discard unannotated compounds:
```
# We focus on all hits with known MOA
selected_moa = names(freqHitMOA)[1]
selected_cpd = [k for (k,v) in hitMOA if any([x in v for x in selected_moa])];
```

In [None]:
selected_cpd = RMPV.Condition[RMPV.RMPV .< 0.1]

In [None]:
# Membership test used as the comparison operator of the `Filter` below
compare_in(x,y) = x in y
# When broadcast, compare each element of `x` against the whole collection `y`
Broadcast.broadcasted(::typeof(compare_in), x, y) = broadcast(in, x, Ref(y)) 
# We now only keep hit compounds
# (`filter_entries!` is the name used by the BioProfiling calls earlier in the notebook)
filter_entries!(expTransformed, Filter(selected_cpd, :CompoundName, compare = compare_in));

In [None]:
using Distances

In [None]:
using Distances
# Fixed seed for a reproducible projection
Random.seed!(3895)
umTPM = umap(expTransformed, 2, metric = CosineDist())
# One row per entry and one column per UMAP dimension; built with the constructor
# since `convert(DataFrame, matrix)` is deprecated
umTPM = DataFrame(umTPM', Symbol.("UMAP" .* string.(1:size(umTPM, 1))));
# `selected_entries` is the field name used by the BioProfiling calls earlier in the notebook
umTPM.Compound = expTransformed.data.CompoundName[expTransformed.selected_entries]

# Warning: compounds with multiple MOAs are not yet properly handled
# (only the last annotated MOA is kept; unannotated compounds are labelled "Other")
umTPM.MOA = CategoricalArray([x in keys(MOA) ? MOA[x][end] : "Other" for x in umTPM.Compound]);

# Highlight top 3 MOAs (3 compounds each)
selected_moa = ["Tubulin inhibitor", "Dopamine receptor antagonist", "Calcium channel blocker"]
umTPM[!, Symbol("Mechanism of action")] = [m in selected_moa ? String(m) : "Other" for m in umTPM.MOA]
umTPM[!, Symbol("Mechanism of action")] = CategoricalArray(
    umTPM[:, Symbol("Mechanism of action")], levels = [selected_moa..., "Other"]);

In [None]:
freqtable(umTPM[:, Symbol("Mechanism of action")])

In [None]:
ggplot(umTPM, aes(:UMAP1, :UMAP2)) + 
    geom_point(aes(color = Symbol("Mechanism of action")), alpha = 0.8) +
    coord_fixed() + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) + 
    guides(color=guide_legend(title = "", nrow=2,byrow=true)) +
    RObject(nothing)

In [None]:
# Fixed seed for a reproducible projection
Random.seed!(3895)
# 6-dimensional UMAP projection
umTPM = umap(expTransformed, 6)
# One row per entry and one column per UMAP dimension; built with the constructor
# since `convert(DataFrame, matrix)` is deprecated
umTPM = DataFrame(umTPM', Symbol.("UMAP" .* string.(1:size(umTPM, 1))));
# `selected_entries` is the field name used by the BioProfiling calls earlier in the notebook
umTPM.Compound = expTransformed.data.CompoundName[expTransformed.selected_entries]

# Warning: compounds with multiple MOAs are not yet properly handled
# (only the last annotated MOA is kept; unannotated compounds are labelled "Other")
umTPM.MOA = CategoricalArray([x in keys(MOA) ? MOA[x][end] : "Other" for x in umTPM.Compound]);

In [None]:
# First two UMAP dimensions coloured by MOA, without a colour legend
ggplot(umTPM, aes(:UMAP1, :UMAP2)) + 
    geom_point(aes(color = :MOA), alpha = 0.8) +
    coord_fixed() + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) + 
    # "none" removes the legend; passing a logical to guides() is deprecated in ggplot2
    guides(color="none")

In [None]:
expUMAP = Experiment(umTPM, description = "UMAP projection of profiling data")

In [None]:
# Same construction as for the earlier `expUMAP` filters: no `RMP` module is loaded
# in this notebook, the abstract type lives in `BioProfiling` and the filters are
# applied with `filter!`.
filters = Array{BioProfiling.AbstractReduce,1}()
# Remove (categorical) compound column from analysis
push!(filters, NameSelector(x -> x != "Compound"))
push!(filters, NameSelector(x -> x != "MOA"))
# Apply filters
filter!(expUMAP, filters)
expUMAP

In [None]:
pairwise_hellinger = DataFrame([(x, y) for x in levels(selected_cpd) for y in levels(selected_cpd) if x != y]);

In [None]:
# Compound of every selected entry, in the row order of the UMAP projection
# (`selected_entries` is the field name used earlier in the notebook)
exp_cpd = expTransformed.data[expTransformed.selected_entries, :CompoundName]
# Robust Hellinger distance between the entries of both compounds in UMAP space
pairwise_hellinger.distance = [distance_robust_hellinger(getdata(expUMAP), 
                                                         exp_cpd.==x, 
                                                         exp_cpd.==y) 
                               for (x,y) in eachrow(pairwise_hellinger)]

In [None]:
ggplot(pairwise_hellinger, aes(x = :distance, y = "All pairs")) +
    geom_violin() + ylab("")

In [None]:
# Keep every compound pair whose distance is defined and below the cut-off; the edge
# weight is the similarity 1 - distance.
# Both (x, y) and (y, x) are listed, but sorting the pair maps them to the same key,
# so no duplicate edge is stored.
# edgesDrugNet = Dict(sort([x,y]) => 1 - v for (x,y,v) in eachrow(pairwise_hellinger) if v < 0.8)
edgesDrugNet = Dict(sort([cpdA, cpdB]) => 1 - dist
                    for (cpdA, cpdB, dist) in eachrow(pairwise_hellinger)
                    if !ismissing(dist) && dist < 0.98)
length(edgesDrugNet)

In [None]:
# One vertex per selected compound, numbered by position in `selected_cpd`
g = SimpleWeightedGraph(length(selected_cpd))
drugToInt = Dict(zip(selected_cpd, eachindex(selected_cpd)))
# One weighted edge per retained compound pair
for ((cpdA, cpdB), weight) in edgesDrugNet
    add_edge!(g, drugToInt[cpdA], drugToInt[cpdB], weight)
end

In [None]:
# Network density
length(edgesDrugNet)/((length(vertices(g))*(length(vertices(g)) - 1))/2)

In [None]:
using GraphRecipes, Plots
graphplot(g, curves=true, nodeshape = :circle, nodesize = 0.25)

### Network embedding

In [None]:
import Pkg
Pkg.add(Pkg.PackageSpec(url = "https://github.com/ollin18/Node2Vec.jl")) 

In [None]:
using Node2Vec

In [None]:
walks = simulate_walks(g,5,50,2,2)

In [None]:
embmodel = learn_embeddings(walks)

In [None]:
# It is unclear why there is an extra first entry, but dropping it follows
# the example of the node2vec package.
# NOTE(review): presumably the first column belongs to a special token of the
# embedding vocabulary and the remaining columns follow `embmodel.vocab` order rather
# than vertex order — TODO confirm, since the next cells assume column i is vertex i.
embeddings = embmodel.vectors[:,2:end];

In [None]:
# Fixed seed for a reproducible projection
Random.seed!(3895)
umEmb = umap(embeddings)
# One row per embedded vertex; built with the constructor since
# `convert(DataFrame, matrix)` is deprecated
umEmb = DataFrame(umEmb', [:UMAP1, :UMAP2]);
intToDrug = Dict(v => k for (k,v) in drugToInt)
# NOTE(review): this assumes that row i of the projection is vertex i of the drug
# network — confirm against the ordering of the embedding columns.
umEmb.Compound = [intToDrug[x] for x in 1:length(selected_cpd)]

# Warning: compounds with multiple MOAs are not yet properly handled
# (only the last annotated MOA is kept; unannotated compounds are labelled "Other")
umEmb.MOA = CategoricalArray([x in keys(MOA) ? MOA[x][end] : "Other" for x in umEmb.Compound]);

In [None]:
# Highlight top 3 MOAs (3 compounds each)
selected_moa = ["Tubulin inhibitor", "Dopamine receptor antagonist", "Calcium channel blocker"]
umEmb[!, Symbol("Mechanism of action")] = [m in selected_moa ? String(m) : "Other" for m in umEmb.MOA]
umEmb[!, Symbol("Mechanism of action")] = CategoricalArray(
    umEmb[:, Symbol("Mechanism of action")], levels = [selected_moa..., "Other"]);

In [None]:
gp = ggplot(umEmb, aes(:UMAP1, :UMAP2)) + 
    geom_point(aes(color = Symbol("Mechanism of action")), alpha = 0.8) +
    coord_fixed() + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) +
    guides(color=guide_legend(title = "", nrow=2,byrow=true)) +
    RObject(nothing)
print(gp)
ggsave("fig/allhits_embedding_topMOA.pdf", gp)

In [None]:
gp = ggplot(umEmb, aes(:UMAP1, :UMAP2)) + 
    geom_point(aes(color = Symbol("Mechanism of action")), alpha = 0.8) +
    coord_fixed() + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) +
    guides(color=guide_legend(title = "", nrow=2,byrow=true)) +
    RObject(nothing)
print(gp)
ggsave("fig/allhits_embedding_topMOA.pdf", gp)

In [None]:
gp = ggplot(umEmb, aes(:UMAP1, :UMAP2)) + 
    geom_point(aes(color = Symbol("Mechanism of action")), alpha = 0.8) +
    coord_fixed() + 
    theme(var"legend.position"="bottom", var"legend.spacing.x" = unit(0.35, "cm"), 
    var"legend.spacing.y" = unit(0, "cm")) +
    guides(color=guide_legend(title = "", nrow=2,byrow=true)) +
    RObject(nothing)
print(gp)

## MOA similarity network

### Based on median cosine similarity at the image-level.

In [None]:
# We focus on MOAs with at least 2 compounds
selected_moa = names(freqHitMOA)[1][freqHitMOA .>= 2]
selected_cpd = [k for (k,v) in hitMOA if any([x in v for x in selected_moa])]

In [None]:
expTransformed = deepcopy(expAgg)
logtransform!(expTransformed)
expTransformed.description = "Transformed values for aggregated FOV measurements"

Here we apply a correction based on the specific details of the experimental design:
All rows and columns include DMSO (negative) controls and we normalize all values based on these matching controls (same row or same column).  

This exemplifies how to directly modify the data of an `Experiment` object.  

NB: One might want to check that more iterations are not needed (cf. Median-polish method).

In [None]:
# Normalize on matching DMSO wells median values

# Entries in both data frames are matching
@assert nrow(aggregatedData) == nrow(getdata(expTransformed))

# Copy data before correction
ndf = getdata(expTransformed)

# The DMSO mask does not depend on the entry: compute it once, outside of the loop
isDMSO = aggregatedData.CompoundName .== "DMSO"

for (i, (fx, fy)) in enumerate(eachrow(aggregatedData[:,[:Metadata_Row, :Metadata_Column]])) 
    # DMSO entries sharing the row or the column of entry i
    controls = isDMSO .& ((aggregatedData.Metadata_Row .== fx) .|
                          (aggregatedData.Metadata_Column .== fy))
    @assert any(controls)
    # `selected_features` is the field name used earlier in the notebook
    expTransformed.data[i:i, expTransformed.selected_features] .-= 
        mapcols(median, ndf[controls,:]) 
end

In [None]:
# Membership test used as the comparison operator of the `Filter` below
compare_in(x,y) = x in y
# When broadcast, compare each element of `x` against the whole collection `y`
Broadcast.broadcasted(::typeof(compare_in), x, y) = broadcast(in, x, Ref(y)) 
# We now only keep compounds having one of the most common MOA
# (`filter_entries!` is the name used by the BioProfiling calls earlier in the notebook)
filter_entries!(expTransformed, Filter(selected_cpd, :CompoundName, compare = compare_in));

In [None]:
using Distances

In [None]:
# Image cosine similarities: the data are transposed so that each column is one image,
# and `dims = 2` states explicitly that columns are compared (calling `pairwise`
# without `dims` is deprecated).
imgCosSim = 1 .- pairwise(CosineDist(), Array(getdata(expTransformed))', dims = 2);

In [None]:
# Store MOA cosine similarity
moaCosSim = Dict()
# `selected_entries` is the field name used by the BioProfiling calls earlier in the notebook
cmp_names = expTransformed.data[expTransformed.selected_entries, :CompoundName] 
# Image indices of each MOA, computed once per MOA rather than once per pair of MOAs
moaImages = Dict(moa => findall(in([k for (k,v) in hitMOA if moa in v]), cmp_names)
                 for moa in selected_moa)
# For all combinations (avoiding duplicates by keeping one ordering of each pair)
for (moa1, moa2) in Iterators.product(selected_moa, selected_moa)
    moa1 >= moa2 || continue
    # Take median value over all pairs of images
    moaCosSim[(moa1, moa2)] = median([imgCosSim[i, j]
                                      for i in moaImages[moa1], j in moaImages[moa2]])
end

In [None]:
# Display for MOA self-similarity
for moa in selected_moa
    println(moa, ": ", moaCosSim[(moa, moa)])
end

In [None]:
# Get all link with values higher than x
edgesMoaNet = Dict(k => v for (k,v) in moaCosSim if v > 0.35)
println(length(unique([y for x in keys(edgesMoaNet) for y in x ])))
length(edgesMoaNet)

In [None]:
# One vertex per selected MOA, numbered by position in `selected_moa`
moaToInt = Dict(x => i for (i, x) in enumerate(selected_moa))
# Size the graph from the data rather than from a hard-coded number of MOAs
g = SimpleWeightedGraph(length(selected_moa))
# NB: self-edges are kept
[add_edge!(g, moaToInt[k[1]], moaToInt[k[2]], v) for (k,v) in edgesMoaNet];

In [None]:
moaToInt

In [None]:
savegraph("fig/MOA_similarity_per_image.gml", g, "MOA_similarity", GraphIO.GML.GMLFormat())

### Based on cosine similarity of median profile at the MOA-level.

In [None]:
cmp_moas = [MOA[x] for x in cmp_names];

In [None]:
"""
    moaMedProfile(moa::String)

Return the per-feature median over all selected images whose compound is annotated
with `moa`.
"""
function moaMedProfile(moa::String)
    # Rows whose MOA annotation list contains `moa`
    rows = findall(annotation -> moa in annotation, cmp_moas)
    profiles = getdata(expTransformed)[rows, :]
    return [median(feature) for feature in eachcol(profiles)]
end

In [None]:
# One median profile per selected MOA, stored as the columns of `moaProf`
moaProfiles = map(moaMedProfile, selected_moa)
moaProf = hcat(moaProfiles...)
# Cosine similarity (1 - cosine distance) between the MOA-level profiles
moaCosDist = pairwise(CosineDist(), moaProf)
moaCosSim2 = 1 .- moaCosDist;

In [None]:
# Keep every MOA pair (x < y, so no self-links and no duplicates) whose
# similarity is above the cut-off. Bounds come from the similarity matrix
# itself rather than from a hard-coded MOA count.
nMoa = size(moaCosSim2, 1)
edgesMoaNet2 = Dict(
    (x, y) => moaCosSim2[x, y]
    for x in 1:nMoa for y in (x + 1):nMoa
    if moaCosSim2[x, y] > 0.74
)
# Number of distinct MOAs appearing in at least one retained link
println(length(unique([y for x in keys(edgesMoaNet2) for y in x ])))
# Number of retained links
length(edgesMoaNet2)

In [None]:
# One vertex per row of the MOA similarity matrix (not a hard-coded count)
g = SimpleWeightedGraph(size(moaCosSim2, 1))
# Keys of `edgesMoaNet2` are already integer vertex ids; values are the weights
for ((src, dst), w) in edgesMoaNet2
    add_edge!(g, src, dst, w)
end

We export to GML format "manually", as the GraphIO library does not export edge weights.

In [None]:
# GML header: opens the top-level `graph` record and sets its label
str_pre = """graph
[
label "MOA_similarity" """

# The handle is left open here; later cells write the nodes and edges to `io`
# and then close it.
io = open("Fig/MOA_similarity_per_moa.gml", "w")
write(io, str_pre);

In [None]:
# GML fragments placed before the id, before the MOA name and after it
str1 = "\n	node\n	[\n		id "
str2 = "\n		moa \""
str3 = "\"\n	]"
# Write one `node` record per MOA, identified by its integer vertex id
for (moaName, vertexId) in moaToInt
    print(io, str1, vertexId, str2, moaName, str3)
end

In [None]:
# GML fragments placed around the source, target and weight of an edge record
str1 = "\n	edge\n	[\n		source "
str2 = "\n		target "
str3 = "\n		weight "
str4 = "\n	]"

# Write one `edge` record per edge of `g`, including its weight
for e in edges(g)
    print(io, str1, e.src, str2, e.dst, str3, e.weight, str4)
end

In [None]:
# Close the top-level `graph` record opened by the header
write(io, "\n]\n");

In [None]:
# Close the file handle to finish the GML export
close(io);

## UMAP of 3 hits per top 3 hit MOAs

In [None]:
# MOAs retained for the UMAP
selected_moa = [
    "Tubulin inhibitor",
    "Dopamine receptor antagonist",
    "Calcium channel blocker",
]
# Compounds of `hitMOA` annotated with at least one of these MOAs
selected_cpd = [cpd for (cpd, moas) in hitMOA if any(in(moas), selected_moa)]

In [None]:
# Inspect the MOA annotation of each selected compound
[MOA[x] for x in selected_cpd]

In [None]:
# Inspect the names of the selected compounds
selected_cpd

In [None]:
# Work on a deep copy so that the in-place steps below leave `expAgg` untouched
expTransformed = deepcopy(expAgg)
# Mutating call: transforms `expTransformed` in place
logtransform!(expTransformed)
# Label the copy so it can be told apart from the aggregated experiment
expTransformed.description = "Transformed values for aggregated FOV measurements"

Here we apply a correction based on the specific details of the experimental design:
All rows and columns include DMSO (negative) controls and we normalize all values based on these matching controls (same row and column).  

This exemplifies how to directly modify the data of an `Experiment` object.  

NB: One might want to check that more iterations are not needed (cf. Median-polish method).

In [None]:
# Normalize on matching DMSO wells median values

# Entries in both data frames are matching
@assert nrow(aggregatedData) == nrow(getdata(expTransformed))

# Copy data before correction (the control medians below are taken from it)
ndf = getdata(expTransformed)

# DMSO membership does not depend on the entry, so it is computed once
isDMSO = aggregatedData.CompoundName .== "DMSO"
wellPositions = zip(aggregatedData.Metadata_Row, aggregatedData.Metadata_Column)

for (i, (wellRow, wellCol)) in enumerate(wellPositions)
    sameRow = aggregatedData.Metadata_Row .== wellRow
    sameCol = aggregatedData.Metadata_Column .== wellCol
    # DMSO controls sharing the row or the column of entry i
    matchedControls = isDMSO .& (sameRow .| sameCol)
    @assert sum(matchedControls) > 0
    # Subtract the per-feature median of the matched controls from entry i
    expTransformed.data[i:i, expTransformed.selectedFeatures] .-=
        mapcols(median, ndf[matchedControls, :])
end

In [None]:
# Membership predicate handed to `Filter` as its `compare` function
function compare_in(x, y)
    return x in y
end
# When broadcast, test every element of `x` against the whole collection `y`
# (`Ref` prevents `y` from being broadcast over element by element)
function Broadcast.broadcasted(::typeof(compare_in), x, y)
    return in.(x, Ref(y))
end
# We now only keep compounds having one of the most common MOA
cpdFilter = Filter(selected_cpd, :CompoundName, compare = compare_in)
filterEntriesExperiment!(expTransformed, cpdFilter);

In [None]:
# Seed the global RNG before computing the embedding
Random.seed!(3895)
umCoords = umap(expTransformed, 2, n_neighbors = 5, spread = 1, min_dist = 0.5)
# Transpose the embedding and name its two columns
# NOTE(review): `convert(DataFrame, ...)` and `names!` are deprecated in recent
# DataFrames releases — confirm the pinned version before upgrading.
umTPM = convert(DataFrame, umCoords')
names!(umTPM, [:UMAP1, :UMAP2]);
# Annotate each selected entry with its compound and that compound's MOAs
umTPM.Compound = expTransformed.data.CompoundName[expTransformed.selectedEntries]
umTPM.allMOAs = [MOA[cpd] for cpd in umTPM.Compound]
# Single MOA label: the second annotation when there are several, else the first
umTPM.MOA = [m[length(m) > 1 ? 2 : 1] for m in umTPM.allMOAs];

In [None]:
# Drop the hydrochloride suffixes from the compound names
for saltSuffix in (" dihydrochloride", " hydrochloride")
    umTPM.Compound = replace.(umTPM.Compound, saltSuffix => s"")
end

In [None]:
# Order levels by MOA and alphabetical order:
# three groups of three compounds, alphabetical within each group
cpd_lvl = vcat(
    ["Albendazole", "Nocodazole", "Vinblastine"],
    ["Domperidone", "Pimozide", "Triflupromazine"],
    ["Cilnidipine", "Flunarizine", "Nisoldipine"],
)
# Ordered categorical, so that compounds follow `cpd_lvl` rather than sort order
orderedCompounds = CategoricalArray(umTPM.Compound; levels = cpd_lvl, ordered = true)
umTPM.Compound = orderedCompounds;

In [None]:
# Highlight MOAs of each compound: nine colours, laid out as three rows of three
# Alternative palette (disabled):
# ["#5A463C","#8B726A","#BDA7A3","#463C5A","#7D768B","#B5B1BD","#3C5A46","#768B7D","#B1BDB5"]
moa_palette = [
    "#FF1998", "#FF5FB9", "#FFA3D7",
    "#1B9CFF", "#5FB9FF", "#A3D7FF",
    "#9CFF1B", "#B9FF5F", "#D7FFA3",
];

In [None]:
# Points of the UMAP embedding, coloured by compound
pointLayer = geom_point(aes(color = :Compound), alpha = 1)
# Legend placed at the bottom with custom key spacing
legendTheme = theme(var"legend.position"="bottom",
                    var"legend.spacing.x" = unit(0.35, "cm"),
                    var"legend.spacing.y" = unit(0, "cm"))
# Legend laid out on 3 rows, filled column by column
legendLayout = guides(color=guide_legend(nrow=3,byrow=false))
gp = ggplot(umTPM, aes(:UMAP1, :UMAP2)) + pointLayer + coord_fixed() +
    scale_color_manual(values = moa_palette) + legendTheme + legendLayout

In [None]:
# Save the UMAP plot `gp` to a PDF file
ggsave("Fig/topMOA_UMAP2.pdf", gp)