In [None]:
using CSV, StatsBase, Statistics, DataFrames, UMAP, RCall, FreqTables
using MultipleTesting, Random, MultivariateStats, Distributed
using BioProfiling

In [None]:
# Display the current date and time, so the output shows when the notebook was run
using Dates: now
now()

In [None]:
# This is the location where the images are stored
# (useful to highlight cells of a given morphology)
img_folder = "/images/"

In [None]:
# List the entries found in the image folder
readdir(img_folder)

## R Calls

In [None]:
@rlibrary ggplot2
@rlibrary extrafont
@rlibrary viridis
@rlibrary heatmaply
@rlibrary ggrepel

In [None]:
R"""
# Used later for MCD computation

library(robustbase)

# Customize ggplot appearance

library(ggplot2)
library(extrafont)


# Load extra fonts
# ttf_import("/tmp/.fonts")
# loadfonts()

# Change theme
# Base appearance of every figure: light theme without minor grid lines,
# dark-grey Arial text and lines, and a faint grey legend background.
customTheme <- theme_light() +
               theme(
                   panel.grid.minor = element_blank(),
                   text = element_text(size = 17, family = "Arial", colour = "#333333"),
                   line = element_line(colour = "#333333"),
                   legend.background = element_rect(fill = alpha("#CCCCCC", 0.1)),
                   legend.key = element_blank()
               )

# Change default colors
# Default continuous colour scale. `type` (taken from the
# "ggplot2.continuous.colour" option, "viridis" when unset) selects either a
# plain gradient or a viridis scale built from `option`, `begin`, `end` and
# `direction`; any other value raises an error.
scale_colour_continuous <- function(...,
                                    begin = 0.1,
                                    end = 0.9,
                                    direction = -1,
                                    option = "plasma",
                                    type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(
        type,
        gradient = scale_colour_gradient(...),
        viridis = scale_colour_viridis_c(
            option = option, begin = begin, end = end, direction = direction, ...
        ),
        stop("Unknown scale type", call. = FALSE)
    )
}
# Alias with the American spelling
scale_color_continuous <- scale_colour_continuous

# Default continuous fill scale, the fill counterpart of scale_colour_continuous.
# `type` selects either a plain gradient or a viridis scale built from `option`,
# `begin`, `end` and `direction`; any other value raises an error.
# The default `type` is read from the fill-specific option
# "ggplot2.continuous.fill" (it previously read the colour option, so fill
# scales could not be configured independently of colour scales).
scale_fill_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.fill", default = "viridis")) {
    switch(type, gradient = scale_fill_gradient(...), 
        viridis = scale_fill_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))
}

# Palette function interpolating between the seven colours listed below;
# shared by the discrete fill and colour scales.
cemm_pal <- colorRampPalette(c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00"))

# Default discrete fill scale: uses cemm_pal when type is "CeMM" (the default),
# otherwise a hue palette built from h, c, l, h.start and direction.
# hue_pal is called with its namespace (scales::), as in scale_color_discrete,
# so the fallback branch also works when the scales package is not attached.
scale_fill_discrete <- function (..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "fill") 
{
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else {
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, 
            direction), na.value = na.value, ...)
    }
}

# Default discrete colour scale: uses cemm_pal when type is "CeMM" (the
# default), otherwise a hue palette built from h, c, l, h.start and direction.
scale_color_discrete <- function(..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0,
                                 direction = 1, na.value = "grey50", aesthetics = "colour") {
    if (type == "CeMM") {
        scale_label <- "CeMM"
        scale_palette <- cemm_pal
    } else {
        scale_label <- "hue"
        scale_palette <- scales::hue_pal(h, c, l, h.start, direction)
    }
    discrete_scale(aesthetics, scale_label, scale_palette, na.value = na.value, ...)
}
# Alias with the British spelling
scale_colour_discrete <- scale_color_discrete

# Theme additions removing the major grid and the panel border, with thin axis
# lines and size-12 axis tick labels. Extra arguments are passed on to theme().
noGridTheme <- function(...) {
    theme(
        panel.grid.major = element_blank(),
        axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.line = element_line(color = "#333333", size = 0.2),
        panel.border = element_blank(),
        ...
    )
}

# Theme additions for a dark appearance: dark-grey panel and plot backgrounds
# with light-grey text, lines and axis lines.
# Extra arguments are now forwarded to theme(), as noGridTheme() does; they
# were previously accepted but silently ignored.
darkTheme <- function(...){
    theme(panel.background = element_rect(fill = '#333333'), plot.background = element_rect(fill = '#333333'), 
          axis.line=element_line(color="#CCCCCC", size = 0.2), 
          text=element_text(size=17, family="Arial", colour = "#CCCCCC"),
          line=element_line(colour = "#CCCCCC"), ...)
}

# Make customTheme the default theme of the ggplot2 figures drawn afterwards
theme_set(customTheme)

# Default plot width and height for the repr display machinery
# NOTE(review): confirm these options have an effect on plots shown through RCall
options(repr.plot.width=10, repr.plot.height=10)
"""

## Load measurements

### Load annotations

In [None]:
# Tab-separated transfer list; column names are read from its first line
annotations = CSV.read("data/transferList.txt", DataFrame; header = 1, delim = "\t")
# Plate numbers are identifiers, not quantities: store them as strings
annotations.DestinationPlate = string.(annotations.DestinationPlate)
# Wells in the transfer list without any compound name are filled with DMSO only
annotations[ismissing.(annotations.CompoundName), :CompoundName] .= "DMSO"
# A non-ASCII character (raw byte 0xB1) needs to be converted to "±"
annotations.CompoundName = replace.(annotations.CompoundName, "\xb1" => s"±");

In [None]:
annotations

### Load image data

In [None]:
# Per-image measurement table
image = CSV.read("data/compiled_PilotDrugScreen_Image_1500.csv", DataFrame)
# Preview the first eight column names
println(string.(names(image)[1:8]))
# Number of images (rows) and features (columns) available
println(size(image, 1))
println(size(image, 2))

### Load individual cell files

In [None]:
# Nucleus measurements; column names are read from the first line of the file
nucleus = CSV.read("data/compiled_PilotDrugScreen_Nucleus_1500.csv", DataFrame; header = 1);

In [None]:
# Remove duplicated columns (selected by position)
nucleus = select(nucleus, Not([1, 6, 8, 12, 14, 16]));

In [None]:
# CytoCM measurements; column names are read from the first line of the file
cytocm = CSV.read("data/compiled_PilotDrugScreen_CytoCM_1500.csv", DataFrame; header = 1);

In [None]:
# Remove duplicated columns (selected by position)
cytocm = select(cytocm, Not([1, 6, 8, 12, 14, 16]));

In [None]:
# Cytoplasm measurements; column names are read from the first line of the file
cytoplasm = CSV.read("data/compiled_PilotDrugScreen_Cytoplasm_1500.csv", DataFrame; header = 1);

In [None]:
# Remove duplicated columns (selected by position)
cytoplasm = select(cytoplasm, Not([1, 6, 8, 12, 14, 16]));

### Merge measurements

In [None]:
# Pair each cytoplasm object with its parent cytoCM object in the same image.
# makeunique = true: non-key columns of `cytocm` whose name already exists in
# `cytoplasm` get a numeric suffix (e.g. AreaShape_Area_1).
cell = innerjoin(
    cytoplasm, cytocm;
    on = [:ImageNumber => :ImageNumber, :Parent_CytoCM => :ObjectNumber],
    makeunique = true,
)

In [None]:
# Add the measurements of the parent nucleus of each cell (same image).
# makeunique = true: non-key columns of `nucleus` whose name already exists in
# `cell` get a numeric suffix (e.g. AreaShape_Area_2).
cell = innerjoin(
    cell, nucleus;
    on = [:ImageNumber => :ImageNumber, :Parent_Nucleus => :ObjectNumber],
    makeunique = true,
)

In [None]:
# X centre of the cytoplasm object vs. X centre of the joined cytoCM object (suffix _1)
# for the first 10000 rows.
# NB: columns with missing values should not be provided even if they are not used by ggplot
ggplot(
    cell[1:10000, [:AreaShape_Center_X, :AreaShape_Center_X_1]],
    aes(x = :AreaShape_Center_X, y = :AreaShape_Center_X_1)
) + geom_point()

In [None]:
# X centre of the joined cytoCM object (suffix _1) vs. X centre of the joined nucleus
# (suffix _2) for the first 10000 rows.
# NB: columns with missing values should not be provided even if they are not used by ggplot
ggplot(
    cell[1:10000, [:AreaShape_Center_X_1, :AreaShape_Center_X_2]],
    aes(x = :AreaShape_Center_X_1, y = :AreaShape_Center_X_2)
) + geom_point()

In [None]:
# X centre of the cytoplasm object vs. X centre of the joined nucleus (suffix _2)
# for the first 10000 rows.
# NB: columns with missing values should not be provided even if they are not used by ggplot
ggplot(
    cell[1:10000, [:AreaShape_Center_X, :AreaShape_Center_X_2]],
    aes(x = :AreaShape_Center_X, y = :AreaShape_Center_X_2)
) + geom_point()

In [None]:
# Free up memory: the per-compartment tables are no longer needed once merged into `cell`
nucleus = cytoplasm = cytocm = nothing

## Experiment filtering

In [None]:
# Path of the image file associated with each cell, rebuilt from the row, column
# and field metadata (each left-padded with zeros to two characters).
# The folder prefix is taken from `img_folder` rather than repeating the
# hard-coded "/images/" string, so the image location is defined in one place.
cell.URL = img_folder .* "r" .* lpad.(cell.Metadata_Row, 2, "0") .*
    "c" .* lpad.(cell.Metadata_Column, 2, "0") .*
    "f" .* lpad.(cell.Metadata_Field, 2, "0") .* "p01rc1-ch1sk1fk1fl1.tiff";

In [None]:
# Translate the plate label into the plate ID used in the transfer list:
# "1500CellsPerWell" maps to "1914001", any other label to "1914011"
cell.PlateNumber = ifelse.(cell.Metadata_Plate .== "1500CellsPerWell", "1914001", "1914011");

In [None]:
# Attach the compound name of each well, matching on well ID and plate number
cell = leftjoin(
    cell,
    select(annotations, :CompoundName, :DestWell, :DestinationPlate);
    on = [:Metadata_Well => :DestWell, :PlateNumber => :DestinationPlate],
)

# Wells absent from the transfer list do not have any compound in them
cell[ismissing.(cell.CompoundName), :CompoundName] .= "None";

```julia
# Export `cell` dataframe to avoid reprocessing if not needed
CSV.write("data/compiled_cell_1500.csv", cell)
```

```julia
# If data was already compiled and exported, load the `cell` dataframe.
cell = CSV.read("data/compiled_cell_1500.csv", DataFrame)
```

In [None]:
# Export the merged and annotated `cell` dataframe so it can be reloaded without reprocessing
CSV.write("data/compiled_cell_1500.csv", cell)

In [None]:
# Number of cells per compound, displayed in increasing order of count
ftb = freqtable(cell.CompoundName)
ftb[sortperm(ftb)]

In [None]:
# Wrap the per-cell table in a BioProfiling Experiment; the filters and selectors
# defined below are applied to this object
xp = Experiment(cell, description = "All measurements for plate @1500c/well.")

In [None]:
# Make sure to remove columns that are not morphological measurements:
# any column whose name contains one of these fragments is discarded
strToRemove = [
    "Parent", "ObjectNumber", "ImageNumber", "Experiment",
    "Object_Number", "Location", "Center", "Metadata", "Orientation",
    "PlateNumber", "CompoundName", "Texture", "Neighbors_AngleBetweenNeighbors_5",
    "Neighbors_SecondClosestDistance_5", "URL",
]
# NB: Textural features have missing values
# The list of filters starts with a selector keeping the column names that match no fragment
filters = BioProfiling.AbstractReduce[
    NameSelector(name -> all(fragment -> !occursin(fragment, String(name)), strToRemove)),
]

### Which cells to filter?

In [None]:
# Set the width and height RCall uses when displaying R graphics
RCall.rcall_p(:options, rcalljl_options=Dict(:width => 500, :height => 400))

In [None]:
# Cells with cytoCM much larger than nucleus
# Columns suffixed _1 come from the cytoCM table and columns suffixed _2 from the
# nucleus table (suffixes added when the tables were joined), so the y axis shows
# the cytoCM area and is labelled accordingly.
# The line of slope 4 corresponds to the ratio threshold used by the next filter.
ggplot(cell[1:10000,[:AreaShape_Area_2, :AreaShape_Area_1]], 
       aes(x = :AreaShape_Area_2, y = :AreaShape_Area_1)) + 
    geom_point() +
    geom_abline(intercept = 0, slope = 4) +
    xlab("Nucleus area") +
    ylab("CytoCM area") +
    RObject(nothing)

In [None]:
# Ratio between the cytoCM area (_1) and the nucleus area (_2) of each cell
cell.Metadata_Ratio1 = cell.AreaShape_Area_1 ./ cell.AreaShape_Area_2
# Filter comparing this ratio to 4 with `<`
push!(
    filters,
    Filter(4, :Metadata_Ratio1; compare = <, description = "Exclude cells with really large cytoCM"),
)
# Negated version of the filter just added, used to show examples of excluded cells
negf1 = negation(last(filters));

In [None]:
# Set the width and height RCall uses when displaying R graphics
RCall.rcall_p(:options, rcalljl_options=Dict(:width => 500, :height => 500))

In [None]:
# Cells with cytoplasm much larger than cytoCM
# x: cytoCM area (_1), y: cytoplasm area; the line of slope 10 corresponds to the
# ratio threshold used by the next filter
ggplot(
    cell[1:10000, [:AreaShape_Area, :AreaShape_Area_1]],
    aes(x = :AreaShape_Area_1, y = :AreaShape_Area)
) +
    geom_point() +
    geom_abline(intercept = 0, slope = 10)

In [None]:
# Ratio between the cytoplasm area and the cytoCM area (_1) of each cell
cell.Metadata_Ratio2 = cell.AreaShape_Area ./ cell.AreaShape_Area_1
# Filter comparing this ratio to 10 with `<`
push!(
    filters,
    Filter(10, :Metadata_Ratio2; compare = <, description = "Exclude cells with really large cytoplasm"),
)
# Negated version of the filter just added, used to show examples of excluded cells
negf2 = negation(last(filters));

In [None]:
# Cells with non-round nuclei (mostly segmentation mistakes)
# Nucleus area against nucleus form factor; the horizontal line marks the 0.5
# threshold used by the next filter
ggplot(
    cell[1:10000, [:AreaShape_Area_2, :AreaShape_FormFactor_2]],
    aes(x = :AreaShape_Area_2, y = :AreaShape_FormFactor_2)
) +
    geom_point() +
    geom_hline(yintercept = 0.5)

In [None]:
# Filter comparing the nucleus form factor to 0.5 with `>`
push!(
    filters,
    Filter(0.5, :AreaShape_FormFactor_2; compare = >, description = "Keep roundish nuclei"),
)
# Negated version of the filter just added, used to show examples of excluded cells
negf3 = negation(last(filters));

In [None]:
# Cells with CellMask clots
# Maximal against standard deviation of the CorrCM intensity (cytoCM object, _1);
# the vertical line marks the 0.1 threshold used by the next filter
ggplot(
    cell[1:10000, [:Intensity_MaxIntensity_CorrCM_1, :Intensity_StdIntensity_CorrCM_1]],
    aes(x = :Intensity_MaxIntensity_CorrCM_1, y = :Intensity_StdIntensity_CorrCM_1)
) +
    geom_point() +
    geom_vline(xintercept = 0.1)

In [None]:
# Filter comparing the maximal CorrCM intensity of the cytoCM object to 0.1 with `<`.
# The description now names what this filter targets (CellMask clots); it was
# previously a copy of the nuclei-roundness filter's description.
push!(filters, Filter(0.1, :Intensity_MaxIntensity_CorrCM_1, compare = <, description = "Exclude cells with CellMask clots"))
# Negated version of the filter just added, used to show examples of excluded cells
negf4 = negation(filters[end]);

### Example images
Here we highlight examples of cells which were filtered out and cells which were kept. 

In [None]:
# Convert paths to all channels: one list of path substitutions per displayed
# colour channel (the URL column points to the "rc1" file; an empty list keeps it)
# NOTE(review): presumably ordered red, green, blue — confirm against diagnostic_images
rgbrgx = [["rc1" => "rc3"], ["rc1" => "rc2"], []]

In [None]:
# Gather the first three results returned by diagnostic_path for each of the four
# negated filters (i.e. for cells excluded by each filter)
# NOTE(review): presumably image paths taken from the URL column — confirm
allImgToCopy = [diagnostic_path(xp, negf1, :URL, center = false)[1:3]...,
diagnostic_path(xp, negf2, :URL, center = false)[1:3]...,
diagnostic_path(xp, negf3, :URL, center = false)[1:3]...,
diagnostic_path(xp, negf4, :URL, center = false)[1:3]...]

In [None]:
# This is the description of the cells highlighted (i.e. they have an unusually large cytoCM)
println(negf1.description)
# Show (without saving) up to three example images for this negated filter
diagnostic_images(
    xp, negf1, :URL;
    saveimages = false, show = true,
    rgb = rgbrgx, showlimit = 3, center = true,
)

In [None]:
# Description of the cells highlighted, then up to three example images (shown, not saved)
println(negf2.description)
diagnostic_images(
    xp, negf2, :URL;
    saveimages = false, show = true,
    rgb = rgbrgx, showlimit = 3, center = true,
)

In [None]:
# Description of the cells highlighted, then up to three example images (shown, not saved)
println(negf3.description)
diagnostic_images(
    xp, negf3, :URL;
    saveimages = false, show = true,
    rgb = rgbrgx, showlimit = 3, center = true,
)

In [None]:
# Description of the cells highlighted, then up to three example images (shown, not saved)
println(negf4.description)
diagnostic_images(
    xp, negf4, :URL;
    saveimages = false, show = true,
    rgb = rgbrgx, showlimit = 3, center = true,
)

### Apply filters

In [None]:
# Apply every selector and filter collected in `filters` to the experiment
select!(xp, filters)

In [None]:
# No measurements are missing!
# Check it on the selected entries and features: the count of missing values
# summed over all columns must be zero
tdf = xp.data[xp.selected_entries, xp.selected_features]
@assert sum(col -> count(ismissing, col), eachcol(tdf)) == 0
# Free up memory
tdf = nothing

```julia
# Remove constant columns
push!(filters, Selector(x -> mad(x, normalize = true) != 0, description = "Remove constant features"));
push!(filters, Selector(x -> mad(x, normalize = true) != 0, 
                        subset = x -> x.CompoundName .== "DMSO", 
                        description = "Remove features constant for reference"));
```

In [None]:
xp

## Coarse-grain aggregation

In [None]:
# Aggregate the selected cells per well and field of view: keep the selected
# features plus the metadata columns used for grouping, then take the median of
# every feature within each group
gd = groupby(
    xp.data[xp.selected_entries,
            vcat(xp.selected_features,
                 [findfirst(==(colname), names(xp.data)) for colname in
                  ("Metadata_Well", "Metadata_Row", "Metadata_Column",
                   "CompoundName", "Metadata_Field")])],
    [:Metadata_Well, :CompoundName, :Metadata_Field, :Metadata_Row, :Metadata_Column],
)
aggregatedData = DataFrames.combine(gd, valuecols(gd) .=> median);

In [None]:
# Checkpoint to avoid recomputing the whole filtering:
# export the aggregated medians to a CSV file
CSV.write("data/aggregatedData_1500cells.csv", aggregatedData)

If the data was already exported:
```julia
aggregatedData = CSV.read("data/aggregatedData_1500cells.csv", DataFrame);
```

### Subset data to images with at least 3 cells

In [None]:
# Count the selected cells in each image
cell_per_image = freqtable(xp.data[xp.selected_entries,:ImageNumber])
# `dicts[1]` pairs every image number (k) with its position (v) in the count array;
# keep the image numbers whose count is at least 3
non_sparse_images = Set(k for (k,v) in cell_per_image.dicts[1] if cell_per_image.array[v] >= 3);

In [None]:
# Membership test used as the filter comparison: is `x` an element of collection `y`?
compare_in(x, y) = in(x, y)
# When `compare_in` is broadcast, wrap `y` in a Ref so that every element of `x`
# is tested against the whole collection instead of `y` being iterated over
Broadcast.broadcasted(::typeof(compare_in), x, y) = in.(x, Ref(y))
# Filter on ImageNumber using the set of images with at least 3 cells
sparcity_filter = Filter(non_sparse_images, :ImageNumber; compare = compare_in)

In [None]:
# Apply the sparsity filter to the entries of the experiment, in place
filter_entries!(xp, sparcity_filter);

In [None]:
# Recompute the per-well, per-field medians now that the sparsity filter has
# been applied: keep the selected features plus the metadata columns used for
# grouping, then take the median of every feature within each group
gd = groupby(
    xp.data[xp.selected_entries,
            vcat(xp.selected_features,
                 [findfirst(==(colname), names(xp.data)) for colname in
                  ("Metadata_Well", "Metadata_Row", "Metadata_Column",
                   "CompoundName", "Metadata_Field")])],
    [:Metadata_Well, :CompoundName, :Metadata_Field, :Metadata_Row, :Metadata_Column],
)
aggregatedData = DataFrames.combine(gd, valuecols(gd) .=> median);

In [None]:
# Checkpoint to avoid recomputing the whole filtering:
# export the medians computed after the sparsity filter to a separate CSV file
CSV.write("data/aggregatedData_1500cells_noSparse.csv", aggregatedData)