# Drug plates

In [None]:
library(ggplot2)
library(extrafont)
library(stringr)
library(heatmaply)
library(ggrepel)
library(reticulate)
library(gridExtra)
library(robustbase)
library(randomForest)
library(reshape2)

In [None]:
# Load external Python functions
source_python("reticulate_functions.py")
# Load extra fonts
ttf_import(paths = "/tmp/.fonts/")
loadfonts()
# Set this to true for faster compilation or false for more precise results and all outputs
TEST_MODE = F

In [None]:
# Run-size settings: each takes a small value in TEST_MODE and the full-size value otherwise
# Permutation count for the empirical p-value computations
PERM_NB_ITER <- if (TEST_MODE) 20 else 2000
# How many genes enter the hierarchical heatmap
CLUST_NB_GENES <- if (TEST_MODE) 30 else 300
# Tree count of the random forest classifier
RF_NB_TREES <- if (TEST_MODE) 50 else 1000

In [None]:
# Customize ggplot appearance

# Change theme
customTheme <- theme_light() + 
               theme(panel.grid.minor=element_blank(), text=element_text(size=17, family="Arial", colour = "#333333"),
                     line=element_line(colour = "#333333"), legend.background = element_rect(fill=alpha('#CCCCCC', 0.1)), legend.key = element_blank())

# Change default colors
# Replacement for the default continuous colour scale.
#   ...                           forwarded to the underlying ggplot2 scale constructor
#   begin, end, direction, option passed to scale_colour_viridis_c() (only used when type is "viridis")
#   type                          "viridis" or "gradient"; any other string raises "Unknown scale type"
scale_colour_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(
        type,
        gradient = scale_colour_gradient(...),
        viridis = scale_colour_viridis_c(
            option = option,
            begin = begin,
            end = end,
            direction = direction,
            ...
        ),
        stop("Unknown scale type", call. = FALSE)
    )
}
# American spelling points to the same function
scale_color_continuous <- scale_colour_continuous

# Replacement for the default continuous fill scale.
#   ...                           forwarded to the underlying ggplot2 scale constructor
#   begin, end, direction, option passed to scale_fill_viridis_c() (only used when type is "viridis")
#   type                          "viridis" or "gradient"; any other string raises "Unknown scale type"
# The default type is read from the fill option "ggplot2.continuous.fill". When that option is
# unset, the colour option consulted by the previous version of this function is used, so existing
# sessions keep the same default.
scale_fill_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.fill", 
                                                      default = getOption("ggplot2.continuous.colour", default = "viridis"))) {
    switch(type, gradient = scale_fill_gradient(...), 
        viridis = scale_fill_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))

}

cemm_pal = colorRampPalette(c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00"))
cust_pal = colorRampPalette(c("#008CAD", "#40B9D4", "#D4ECF2", "#F8B100", "#C00000", "#2D0000"))
yolla_pal = colorRampPalette(c('#FC7070', '#C00000', '#2D0000'))
# Replacement for the default discrete fill scale.
#   type        "CeMM", "Yolla" or "Cust" select the matching palette function defined in this file;
#               any other value falls back to a hue palette
#   h, c, l, h.start, direction   hue palette settings, only used by the fallback
#   na.value, aesthetics, ...     forwarded to discrete_scale()
scale_fill_discrete <- function (..., type = "Cust", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "fill") 
{
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else if (type == "Yolla"){
        discrete_scale(aesthetics, "Yolla", yolla_pal, na.value = na.value, ...)
    } else if (type == "Cust"){
        discrete_scale(aesthetics, "Cust", cust_pal, na.value = na.value, ...)
    } else {
        # hue_pal is namespace-qualified, as in scale_color_discrete: the scales package
        # is not attached by this file, so the bare name would not be found
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, 
            direction), na.value = na.value, ...)
    }
}

# Replacement for the default discrete colour scale.
#   type        "CeMM", "Cust" or "Yolla" select the matching palette function defined in this file;
#               any other value falls back to scales::hue_pal()
#   h, c, l, h.start, direction   hue palette settings, only used by the fallback
#   na.value, aesthetics, ...     forwarded to discrete_scale()
scale_color_discrete <- function (..., type = "Cust", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "colour") {
    # One guard clause per custom palette
    if (type == "CeMM"){
        return(discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...))
    }
    if (type == "Cust"){
        return(discrete_scale(aesthetics, "Cust", cust_pal, na.value = na.value, ...))
    }
    if (type == "Yolla"){
        return(discrete_scale(aesthetics, "Yolla", yolla_pal, na.value = na.value, ...))
    }
    # Anything else: evenly spaced hues
    huePalette = scales::hue_pal(h, c, l, h.start, direction)
    discrete_scale(aesthetics, "hue", huePalette, na.value = na.value, ...)
}
# British spelling points to the same function
scale_colour_discrete <- scale_color_discrete

# Theme fragment without major grid lines or panel border, with thin dark axis lines
# and size-12 tick labels. Extra theme settings can be supplied through `...`.
noGridTheme <- function(...){
    tickLabel = element_text(size=12)
    thinAxis = element_line(color="#333333", size = 0.2)
    theme(panel.grid.major = element_blank(),
          axis.text.x = tickLabel,
          axis.text.y = tickLabel,
          axis.line = thinAxis,
          panel.border = element_blank(),
          ...)
}

# Theme fragment for dark figures: dark grey panel and plot backgrounds with light grey
# text, lines and axis lines.
# Extra theme settings supplied through `...` are forwarded to theme(), as noGridTheme does
# (they were previously accepted but silently ignored).
darkTheme <- function(...){
    theme(panel.background = element_rect(fill = '#333333'), plot.background = element_rect(fill = '#333333'), 
          axis.line=element_line(color="#CCCCCC", size = 0.2), 
          text=element_text(size=17, family="Arial", colour = "#CCCCCC"),
          line=element_line(colour = "#CCCCCC"), ...)
}

theme_set(customTheme)

options(repr.plot.width=10, repr.plot.height=10)

## NK92

In [None]:
# Seed R's random number generator so that the stochastic steps run from R are reproducible.
# NOTE(review): the original comment referred to a t-SNE layout, but the embedding computed
# further down is a UMAP; umap() is presumably one of the Python helpers loaded with
# source_python() — confirm whether R's seed actually controls that layout.
set.seed(38)

In [None]:
NK = read.csv("Rsc/allImages_NK_Drugs.csv", header=T)

In [None]:
Layout = read.csv("Rsc/DrugPlateLayout.csv", header=T)
Layout$Well <- paste0(Layout$Row, str_pad(Layout$Column, 2, pad = "0"))

In [None]:
levels(NK$Drug)

In [None]:
# Annotate every image with the drug and concentration of its well, taken from the plate layout.
# match() returns one layout row per image (the first match, NA when the well is not in the layout),
# which replaces the per-image sapply() scan of the whole layout.
layoutRow = match(NK$Metadata_Well, Layout$Well)
if (anyNA(layoutRow)) {
    warning("Some images belong to wells that are missing from the plate layout", call. = FALSE)
}
# Explicit factor: the renaming below and later calls to levels(NK$Drug) need factor levels,
# whether read.csv() returned the layout column as character or as factor
NK$Drug = as.factor(Layout$Drug)[layoutRow]
# Harmonize drug names
levels(NK$Drug)[levels(NK$Drug)=="Y27"] <- "Y-27632"
levels(NK$Drug)[levels(NK$Drug)=="Jasplaknolide"] <- "Jasplakinolide"
NK$Concentration = Layout$Concentration[layoutRow]

In [None]:
gpNK = ggplot(NK[!is.na(NK$Count_FilteredNucleus),]) + geom_histogram(aes(Count_FilteredNucleus, fill = as.factor(Metadata_Row)), binwidth=2) +
       scale_fill_discrete(name="Row")
gpNK

if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_count.pdf", plot = gpNK)
}

In [None]:
gpNK = ggplot(NK[!is.na(NK$Mean_FilterCytoplasm_AreaShape_Area),]) + geom_histogram(aes(Mean_FilterCytoplasm_AreaShape_Area, 
                                                                                    fill = as.factor(Metadata_Row)), binwidth=100) +
       scale_fill_discrete(name="Row")
gpNK
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_area.pdf", plot = gpNK)
}

### Filtering

In [None]:
FILT_MAX_INT_DNA = 0.05 # Remove empty images and small DNA precipitations
FILT_MIN_CELLS = 4 # 8 seems safe from distribution and images, 3 seems in poor shape
FILT_NB_MAX_NA_IMAGE = 10
FILT_MAX_CORR = 0.6 # Keep uncorrelated variables
dimUMAP = 3

In [None]:
# Candidate features: indices of the columns of NK whose class is "numeric"
ftToKeep <- unname(which(sapply(NK, class) == "numeric"))
# Drop execution time, count and concentration columns
ftToKeep <- ftToKeep[!grepl("(Execution)|(Count)|(Concentration)", colnames(NK)[ftToKeep])]

In [None]:
# Store all remaining features before filtering for downstream comparison to selected set
preFiltFt = colnames(NK)[ftToKeep]

In [None]:
# Remove wells with low max DNA intensity
fieldToKeep <- which(NK$ImageQuality_MaxIntensity_DNA >= FILT_MAX_INT_DNA)
# Remove wells with low cell count
fieldToKeep <- fieldToKeep[NK[fieldToKeep,]$Count_FilteredNucleus >= FILT_MIN_CELLS]

In [None]:
# A few bad quality pictures generate a lot of missing values: images with
# FILT_NB_MAX_NA_IMAGE or more missing values among the candidate features are removed
fieldToKeep <- fieldToKeep[rowSums(is.na(NK[fieldToKeep,ftToKeep])) < FILT_NB_MAX_NA_IMAGE]
# We exclude drugs for which more than half of the images were filtered
# NOTE(review): the threshold of 180 remaining images is hard-coded — presumably half of the
# number of images acquired per drug; confirm against the plate layout.
# The logical mask from table() is applied to levels(NK$Drug); this assumes NK$Drug is a
# factor, so that both list the drugs in the same order.
drugToRemove <- levels(NK$Drug)[table(NK$Drug[fieldToKeep]) < 180]
fieldToKeep <- fieldToKeep[!NK$Drug[fieldToKeep] %in% drugToRemove]

In [None]:
# Remove remaining features with missing values
ftToKeep <- ftToKeep[colSums(is.na(NK[fieldToKeep,ftToKeep])) == 0] 
# Remove constant columns
indWT = NK[fieldToKeep,]$Drug == "WT"
ftToKeep <- ftToKeep[sapply(NK[fieldToKeep,ftToKeep], function(x) mad(x) != 0)]
ftToKeep <- ftToKeep[sapply(NK[fieldToKeep[indWT],ftToKeep], function(x) mad(x) != 0)]

In [None]:
dim(NK)
dim(NK[fieldToKeep, ftToKeep])

In [None]:
# Approximate a normal distribution: shift the values so that the minimum maps to 1,
# then take the natural logarithm (the minimum therefore maps to 0)
transfLog <- function (x){
    shifted = x + 1 - min(x)
    log(shifted)
}

# Robust z-score of x relative to a reference sample y (the control values):
# subtract the median of y and divide by the median absolute deviation of y
transfNorm <- function(x, y){
    refCenter = median(y)
    refSpread = mad(y)
    (x - refCenter)/refSpread
}

In [None]:
transformedNK = apply(NK[fieldToKeep, ftToKeep], 2, transfLog)
transformedNK = apply(transformedNK, 2, function(x) transfNorm(x, x[indWT]))

In [None]:
# Order features from biggest mad to smallest mad (mad computed over all kept images)
# Since features have mad(WT) = 1 after normalization, this ranks features by how much more
# variable they are under drug perturbations than for WT
orderFt = rev(order(apply(transformedNK, 2, mad)))

In [None]:
uncorrFt = uncorrelate(transformedNK, orderCol = orderFt-1, threshold = FILT_MAX_CORR)
uncorrFt = unlist(uncorrFt) + 1

In [None]:
transformedNK = transformedNK[,uncorrFt]

### Export subset of features

For NK cells, the following features are selected and explored separately for their biological interpretability:
```
* Actin intensity/cell (mean/well): NK$Intensity_MeanIntensity_CorrActin_FilterCytoplasm
* Cell area: NK$Mean_FilterCytoplasm_AreaShape_Area
* Cell roundness: NK$Mean_FilterCytoplasm_AreaShape_FormFactor
* Cell width: NK$Mean_FilterCytoplasm_AreaShape_MajorAxisLength 
* Cell length: NK$Mean_FilterCytoplasm_AreaShape_MinorAxisLength
* Cell length to width ratio: NK$Mean_FilterCytoplasm_AreaShape_MinorAxisLength / NK$Mean_FilterCytoplasm_AreaShape_MajorAxisLength
* Average number of perforin granules / cell: NK$Count_PerfGranules  / NK$Count_FilterCytoplasm
* Perforin area / cell area: (NK$Count_PerfGranules * NK$Mean_PerfGranules_AreaShape_Area)  / (NK$Count_FilterCytoplasm * NK$Mean_FilterCytoplasm_AreaShape_Area)
* Perforin intensity: NK$Intensity_MeanIntensity_CorrPerf_FilterCytoplasm
* Perforin area: NK$Mean_PerfGranules_AreaShape_Area
* Nucleus intensity: NK$Intensity_MeanIntensity_CorrDNA_FilteredNucleus
* Nucleus area: NK$Mean_FilteredNucleus_AreaShape_Area
* Nucleus roundness: NK$Mean_FilteredNucleus_AreaShape_FormFactor
* Nucleus width: NK$Mean_FilteredNucleus_AreaShape_MajorAxisLength 
* Nucleus length: NK$Mean_FilteredNucleus_AreaShape_MinorAxisLength
* Nucleus ratio: NK$Mean_FilteredNucleus_AreaShape_MinorAxisLength / NK$Mean_FilteredNucleus_AreaShape_MajorAxisLength
* Nucleus area / cell area: (NK$Count_FilteredNucleus * NK$Mean_FilteredNucleus_AreaShape_Area)  / (NK$Count_FilterCytoplasm * NK$Mean_FilterCytoplasm_AreaShape_Area)
```

NB (from CellProfiler docs): FormFactor = $4 \times \pi \times \mathrm{Area} / \mathrm{Perimeter}^2$. It equals 1 for a perfectly circular object.

In [None]:
# Hand-picked, interpretable measurements for every image kept after filtering.
# with() resolves the bare names below as columns of NK[fieldToKeep,].
# cbind() combines character and numeric vectors, so the result is a character matrix.
subsetNK = with(NK[fieldToKeep,], cbind(
    # Image identification
    Field = str_extract(as.character(URL_Actin), "r..c..f.."),
    Drug = as.character(Drug),
    Concentration = Concentration,
    # Actin and cell shape
    ActinIntensity = Intensity_MeanIntensity_CorrActin_FilterCytoplasm,
    CellArea = Mean_FilterCytoplasm_AreaShape_Area,
    CellRoundness = Mean_FilterCytoplasm_AreaShape_FormFactor,
    CellWidth = Mean_FilterCytoplasm_AreaShape_MajorAxisLength,
    CellLength = Mean_FilterCytoplasm_AreaShape_MinorAxisLength,
    CellLengthOverWidth = Mean_FilterCytoplasm_AreaShape_MinorAxisLength /
        Mean_FilterCytoplasm_AreaShape_MajorAxisLength,
    # Perforin granules
    PerforinGranulesPerCell = Count_PerfGranules / Count_FilterCytoplasm,
    PerforinAreaOverCellArea = (Count_PerfGranules * Mean_PerfGranules_AreaShape_Area) /
        (Count_FilterCytoplasm * Mean_FilterCytoplasm_AreaShape_Area),
    PerforinIntensity = Intensity_MeanIntensity_CorrPerf_FilterCytoplasm,
    PerforinArea = Mean_PerfGranules_AreaShape_Area,
    # Nucleus
    NucleusIntensity = Intensity_MeanIntensity_CorrDNA_FilteredNucleus,
    NucleusArea = Mean_FilteredNucleus_AreaShape_Area,
    NucleusRoundness = Mean_FilteredNucleus_AreaShape_FormFactor,
    NucleusWidth = Mean_FilteredNucleus_AreaShape_MajorAxisLength,
    NucleusLength = Mean_FilteredNucleus_AreaShape_MinorAxisLength,
    NucleusLengthOverWidth = Mean_FilteredNucleus_AreaShape_MinorAxisLength /
        Mean_FilteredNucleus_AreaShape_MajorAxisLength,
    NucleusAreaOverCellArea = (Count_FilteredNucleus * Mean_FilteredNucleus_AreaShape_Area) /
        (Count_FilterCytoplasm * Mean_FilterCytoplasm_AreaShape_Area)
))

In [None]:
# Export list of "interpretable" features
if(!TEST_MODE){
    write.csv(subsetNK, "Tab/NK_Drug_features.csv", row.names = F)
}

In [None]:
CountPerWell = aggregate(NK[fieldToKeep,]$Count_FilterCytoplasm, by = list(NK[fieldToKeep,]$Metadata_Well), FUN = function(x) sum(x, na.rm = T))
names(CountPerWell) <- c("Well", "Count")
CountPerDrug = aggregate(NK[fieldToKeep,]$Count_FilterCytoplasm, by = list(NK[fieldToKeep,]$Drug), FUN = function(x) sum(x, na.rm = T))
names(CountPerDrug) <- c("Drug", "Count")
CountPerConcentration = aggregate(NK[fieldToKeep,]$Count_FilterCytoplasm, by = list(NK[fieldToKeep,]$Drug, 
    NK[fieldToKeep,]$Concentration), FUN = function(x) sum(x, na.rm = T))
names(CountPerConcentration) <- c("Drug", "Concentration", "Count")                    

In [None]:
# Export cell counts
if(!TEST_MODE){
    write.csv(CountPerWell, "Tab/NK_Drug_count_well.csv", row.names = F)
    write.csv(CountPerDrug, "Tab/NK_Drug_count_drug.csv", row.names = F)
    write.csv(CountPerConcentration, "Tab/NK_Drug_count_concentration.csv", row.names = F)    
}

### Look at which types of features are kept

In [None]:
catChannel = c("CorrDNA", "CorrActin", "CorrPerf")
table(rowSums(sapply(catChannel, function(x) grepl(x, colnames(transformedNK)))))
dtCat = data.frame(CountIni = sort(colSums(sapply(catChannel, function(x) grepl(x, preFiltFt)))), 
                   Count = sort(colSums(sapply(catChannel, function(x) grepl(x, colnames(transformedNK))))))

dtCat$Order <- rank(dtCat$CountIni, ties.method = c("first"))
dtCat$Category <- factor(rownames(dtCat), levels=rownames(dtCat)[order(dtCat$CountIni)])
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100*dtCat$Ratio, 1), "%")
gp <- ggplot(dtCat) + geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) + ylim(c(0,675)) +
              geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) + 
              geom_text(aes(x = Order, y = CountIni + 2, label = Ratio), hjust = 0) + coord_flip()
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_SelecFt_Channel.pdf", plot = gp, width = 10)
}

In [None]:
catObjects = c("ActinGranules", "FilterCytoplasm", "ShrunkenCytoplasm", "FilteredNucleus", "PerfGranules")
table(rowSums(sapply(catObjects, function(x) grepl(x, colnames(transformedNK)))))
dtCat = data.frame(CountIni = colSums(sapply(catObjects, function(x) grepl(x, preFiltFt))), 
                   Count = colSums(sapply(catObjects, function(x) grepl(x, colnames(transformedNK)))))

dtCat$Order <- rank(dtCat$CountIni, ties.method = c("first"))
dtCat$Category <- factor(rownames(dtCat), levels=rownames(dtCat)[order(dtCat$CountIni)])
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100*dtCat$Ratio, 1), "%")
gp <- ggplot(dtCat) + geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) + ylim(c(0,510)) +
              geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) + 
              geom_text(aes(x = Order, y = CountIni + 2, label = Ratio), hjust = 0) + coord_flip()
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_SelecFt_Object.pdf", plot = gp, width = 10)
}

In [None]:
catType = c("Threshold", "Granularity", "ImageQuality", "Texture", "Distance", "AreaShape", "RadialDistribution", "Neighbors", 
            "Correlation", "Intensity", "Overlap", "Location")
which(rowSums(sapply(catType, function(x) grepl(x, colnames(transformedNK)))) == 0) # All features are covered

dtCat = data.frame(CountIni = colSums(sapply(catType, function(x) grepl(x, preFiltFt))), 
                   Count = colSums(sapply(catType, function(x) grepl(x, colnames(transformedNK)))))
dtCat$Order <- rank(dtCat$CountIni, ties.method = c("first"))
dtCat$Category <- factor(rownames(dtCat), levels=rownames(dtCat)[order(dtCat$CountIni)])
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100*dtCat$Ratio, 1), "%")
gp <- ggplot(dtCat) + geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) +
              geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) + 
              geom_text(aes(x = Order, y = CountIni + 5, label = Ratio), hjust = 0) + coord_flip()
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_SelecFt_Type_All.pdf", plot = gp, width = 10)
}

In [None]:
catType = c("Granularity", "Texture", "AreaShape", "RadialDistribution",
            "Correlation", "Intensity")
table(rowSums(sapply(catType, function(x) grepl(x, colnames(transformedNK)))))

dtCount = as.data.frame(sapply(catType, function(x) grepl(x, colnames(transformedNK))))
dtCount$Other = !apply(dtCount, 1, any)
dtCountIni = as.data.frame(sapply(catType, function(x) grepl(x, preFiltFt)))
dtCountIni$Other = !apply(dtCountIni, 1, any)
dtCat = data.frame(CountIni = colSums(dtCountIni), 
                   Count = colSums(dtCount))
dtCat$Order <- rank(dtCat$CountIni, ties.method = c("first"))
dtCat$Category <- factor(rownames(dtCat), levels=rownames(dtCat)[order(dtCat$CountIni)])
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100*dtCat$Ratio, 1), "%")
gp <- ggplot(dtCat) + geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) +
              geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) + 
              geom_text(aes(x = Order, y = CountIni + 5, label = Ratio), hjust = 0) + coord_flip()
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_SelecFt_Type_Short.pdf", plot = gp, width = 10)
}

In [None]:
# Export list of features kept
if(!TEST_MODE){
    write.table(colnames(transformedNK), file = "Tab/NK_Drug_list_features_kept.csv", col.names = F, row.names = F)
}

### Look at the morphological distribution of the fields of view

#### UMAP visualizations

In [None]:
# Embed the selected features in dimUMAP dimensions, then attach the plate annotation
# of every kept image to its coordinates
umTNK = umap(transformedNK, min_dist = 0.1, neighbors = 10, n = dimUMAP, metric = "euclidean")
umTNK = as.data.frame(umTNK)
# Name the coordinates UMAP1, UMAP2, ... after the number of columns actually returned,
# so that the names stay correct if dimUMAP is changed (the plots below use UMAP1 to UMAP3)
names(umTNK) = paste0("UMAP", seq_len(ncol(umTNK)))
umTNK$Row <- as.factor(NK$Metadata_Row[fieldToKeep])
umTNK$Col <- as.factor(NK$Metadata_Column[fieldToKeep])
umTNK$Drug <- droplevels(as.factor(NK$Drug[fieldToKeep]))
umTNK$Conc <- as.factor(NK$Concentration[fieldToKeep])
umTNK$URL <- as.factor(NK$URL_Actin[fieldToKeep])

In [None]:
# Order levels to separate DMSO and WT from drugs
umTNK$Drug <- factor(umTNK$Drug, levels = c("DMSO", 'Latrunculin B', 'Jasplakinolide', 'Blebbistatin',
                                            'Y-27632', 'CK869', 'Wiskostatin', 'SMIFH2', "WT"))

In [None]:
umTNK$Shape = ifelse(umTNK$Drug %in% c("DMSO", "WT"), "2", "1")
levels(umTNK$Drug)[levels(umTNK$Drug)=="WT"] <- "Untreated"

In [None]:
gp <- ggplot(umTNK) + geom_point(aes(UMAP1, UMAP2, color = Drug, shape = Shape)) + 
                      guides(color = guide_legend(reverse = TRUE, 
                      override.aes = list(shape = ifelse(levels(umTNK$Drug) %in% c("DMSO", "Untreated"), 1, 16))),
                      shape = F) + scale_shape_manual(values = c(16,1))
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_UMAP.pdf", plot = gp, width = 10)
}

In [None]:
umXlim = ggplot_build(gp)$layout$panel_scales_x[[1]]$range$range
umYlim = ggplot_build(gp)$layout$panel_scales_y[[1]]$range$range

In [None]:
gp <- ggplot(umTNK) + geom_point(aes(UMAP1, UMAP3, color = Drug, shape = Shape)) + 
                      guides(color = guide_legend(reverse = TRUE, 
                      override.aes = list(shape = ifelse(levels(umTNK$Drug) %in% c("DMSO", "Untreated"), 1, 16))),
                      shape = F) + scale_shape_manual(values = c(16,1))
gp

In [None]:
umTNK$DrugOrControl <- as.factor(ifelse(umTNK$Drug == "DMSO", "DMSO", 
                                        ifelse(umTNK$Drug == "Untreated", "Untreated", "Drug")))

gp <- ggplot(umTNK) + geom_point(aes(UMAP1, UMAP2, color = DrugOrControl, shape = Shape)) + 
            theme(legend.title = element_blank()) + guides(color = guide_legend(reverse = TRUE, 
            override.aes = list(shape = ifelse(levels(umTNK$DrugOrControl) %in% c("DMSO", "Untreated"), 1, 16))),
            shape = F) + scale_shape_manual(values = c(16,1))
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_UMAP_Control.pdf", plot = gp, width = 10)
}

In [None]:
# Visualize drug perturbations one by one
# UMAP1 vs UMAP2 scatter plot restricted to the plate row of one drug.
#   drugPert: position of the drug in levels(umTNK$Drug)
# Returns the ggplot object. Reads the globals umTNK, NK, fieldToKeep, cust_pal, umXlim and umYlim.
getDrugUmap <- function(drugPert){
    drugLevels = levels(umTNK$Drug)
    nbLevels = length(drugLevels)
    drug = drugLevels[drugPert]

    # Plate row of the first kept image treated with this drug
    keptDrug = NK$Drug[fieldToKeep]
    rowPert = NK$Metadata_Row[fieldToKeep[keptDrug == drug]][1]
    umDrug = umTNK[umTNK$Row == rowPert,]

    # Colours taken from the full-size palette: last entry for untreated images,
    # the entry at the drug's own position for the drug, first entry for DMSO
    cols = setNames(cust_pal(nbLevels)[c(nbLevels, drugPert, 1)],
                    c("Untreated", drug, "DMSO"))

    compactLegend = theme(legend.title = element_blank(), legend.position = "bottom",
                          legend.key.size = unit(0.08,"cm"),
                          legend.text = element_text(size = 10),
                          legend.spacing.x = unit(0.08,"cm"),
                          text=element_text(size=12),
                          plot.title = element_text(hjust = 0.5))

    # Axis limits are shared across panels through umXlim / umYlim
    ggplot(umDrug) +
        geom_point(aes(UMAP1, UMAP2, color = Drug, shape = Shape)) +
        scale_color_manual(values = cols) +
        ggtitle(drug) +
        compactLegend +
        xlim(umXlim) +
        ylim(umYlim) +
        scale_shape_manual(values = c(16,1)) +
        guides(color = guide_legend(override.aes = list(shape = c(1, 16, 1))), shape = FALSE)
}

In [None]:
if(!TEST_MODE){
    gpl = lapply(2:(length(levels(umTNK$Drug))-1), getDrugUmap)
    gp = do.call("grid.arrange", c(gpl, ncol=4))
    gp
    ggsave(filename = "Fig/NK_Drug_UMAP_by_drug.pdf", plot = gp, width = 10.8)
}

In [None]:
# Visualize drug perturbations one by one, coloured by concentration
# UMAP1 vs UMAP2 scatter plot restricted to the plate row of one drug, DMSO images excluded.
#   drugPert: position of the drug in levels(umTNK$Drug)
# Returns the ggplot object. Reads the globals umTNK, NK, fieldToKeep, yolla_pal, umXlim and umYlim.
getDrugConcUmap <- function(drugPert){
    drug = levels(umTNK$Drug)[drugPert]

    # Plate row of the first kept image treated with this drug
    keptDrug = NK$Drug[fieldToKeep]
    rowPert = NK$Metadata_Row[fieldToKeep[keptDrug == drug]][1]

    umDrug = umTNK[umTNK$Row == rowPert,]
    umDrug = umDrug[umDrug$Drug != "DMSO",]

    # Distinct concentrations among the images that are not untreated
    isUntreated = umDrug$Drug %in% c("Untreated")
    treatedConc = unique(as.character(umDrug$Conc[!isUntreated]))

    # One palette colour per concentration (labels sorted numerically), grey for untreated images
    cols = c(yolla_pal(length(treatedConc)), "#DDDDDD")
    names(cols) = c(as.character(sort(as.numeric(treatedConc))), "Untreated")

    # Relabel the concentration of untreated images; the level order follows the colour vector
    concLabel = ifelse(isUntreated, "Untreated", as.character(umDrug$Conc))
    umDrug$Conc = factor(concLabel, levels = names(cols))

    # Legend keys: filled circles for the concentrations, an open circle for the last (untreated) entry
    legendShapes = c(rep(16, length(unique(umDrug$Conc))-1), 1)

    compactLegend = theme(legend.title = element_blank(), legend.position = "bottom",
                          legend.key.size = unit(0.08,"cm"),
                          legend.text = element_text(size = 10),
                          legend.spacing.x = unit(0.08,"cm"),
                          text=element_text(size=12),
                          plot.title = element_text(hjust = 0.5))

    ggplot(umDrug) +
        geom_point(aes(UMAP1, UMAP2, color = Conc, shape = Shape)) +
        scale_color_manual(values = cols) +
        ggtitle(drug) +
        compactLegend +
        xlim(umXlim) +
        ylim(umYlim) +
        scale_shape_manual(values = c(16,1)) +
        guides(color = guide_legend(override.aes = list(shape = legendShapes)), shape = FALSE)
}

In [None]:
if(!TEST_MODE){
    gpl = lapply(2:(length(levels(umTNK$Drug))-1), getDrugConcUmap)
    gp = do.call("grid.arrange", c(gpl, ncol=4))
    gp
    ggsave(filename = "Fig/NK_Drug_UMAP_by_drug_concentration.pdf", plot = gp, width = 10.8)
}

### Prediction of drug based on morphology
Which features are useful to predict compound of origin?

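The images are split into six folds: five are used for 5-fold cross-validation (5CV) and the sixth is held out as a validation set.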

In [None]:
set.seed(38)

In [None]:
# Number of images per fold (5CV + validation set)
nbSampTestSet = round(nrow(umTNK)/6)

In [None]:
# Shuffle images
idImages = sample(length(fieldToKeep))

# Last fold will be used as validation set
validationSet = as.data.frame(transformedNK[-idImages[1:(nbSampTestSet*5)],])
validationSet$Drug = droplevels(NK$Drug[fieldToKeep][-idImages[1:(nbSampTestSet*5)]])

# All drugs are present
table(validationSet$Drug)

In [None]:
# One-vs-rest F1 score of the class curLvl.
#   preds:  predicted labels
#   obs:    observed labels
#   curLvl: label treated as the positive class
# Returns 2*TP / (2*TP + FN + FP); this is NaN when curLvl is neither predicted nor observed.
F1pred <- function(preds, obs, curLvl){
    isPred = preds == curLvl
    isObs = obs == curLvl
    TP = sum(isPred & isObs)
    FP = sum(isPred & !isObs)
    FN = sum(!isPred & isObs)
    TN = sum(!isPred & !isObs)
    # Sanity check: the four cells must account for every prediction and every observation
    stopifnot(length(preds) == TP+FP+TN+FN,
              length(obs) == TP+FP+TN+FN)
    (2*TP)/(2*TP+FN+FP)
}

In [None]:
# Perform random forest cross-validation on a given dataset, for several values of mtry.
#
#   dataset        data frame of features plus a column `Drug` holding the class labels.
#                  Folds are consecutive blocks of rows and no shuffling is done here.
#   folds          number of folds
#   nbSampTestSet  number of rows in each fold
#   idImages       not used by the function; kept so that existing calls remain valid
#   mtryRange      values of mtry to evaluate
#   nbTrees        number of trees per forest
#
# Returns a matrix with one row per mtry value (row names are the mtry values) and one
# column per fold, holding the macro F1 score obtained on the held-out fold.
# Progress and per-fold metrics are printed along the way.
crossValRF <- function(dataset, folds, nbSampTestSet, idImages, mtryRange, nbTrees = RF_NB_TREES){
    # Classes to predict, taken from the data instead of assuming a fixed number of drugs
    classes = levels(as.factor(dataset$Drug))

    # Fitness matrix (F1 score between 0 and 1, with 1 optimal)
    fitMat = matrix(ncol = folds, nrow = length(mtryRange))
    rownames(fitMat) <- as.character(mtryRange)
    for (mtryId in seq_along(mtryRange)) {
        curMtry = mtryRange[mtryId]
        print(paste("Mtry", curMtry))
        for (fold in seq_len(folds)) {
            print(paste("Fold", fold))
            foldInd = ((fold-1)*nbSampTestSet+1):(fold*nbSampTestSet)
            testSet = dataset[foldInd,]
            trainSet = dataset[-foldInd,]

            # All classes are present in train and test sets.
            # (Counting the entries of table() on a factor counts its levels, including
            # empty ones, so presence is tested on the labels themselves.)
            stopifnot(all(classes %in% testSet$Drug))
            stopifnot(all(classes %in% trainSet$Drug))

            rf = randomForest(Drug ~ ., data = trainSet, mtry = curMtry, ntree = nbTrees)

            # Per-class error column of the forest's confusion matrix, selected by name
            print(paste("Mean class error", mean(rf$confusion[, "class.error"])))
            # Prediction on test set
            preds <- predict(rf, testSet)
            # Accuracy
            print(paste("Accuracy", mean(preds == testSet$Drug)))
            # Macro F1 score: unweighted mean of the per-class F1 scores
            fitMat[mtryId, fold] = mean(vapply(classes, 
                        function(x) F1pred(preds, testSet$Drug, x), numeric(1)))
            flush.console()
        }
    }
    return(fitMat)
}

In [None]:
datasetCV = as.data.frame(transformedNK[idImages[1:(nbSampTestSet*5)],])
datasetCV$Drug = droplevels(NK$Drug[fieldToKeep][idImages[1:(nbSampTestSet*5)]])
CM = crossValRF(dataset = datasetCV, folds = 5, nbSampTestSet = nbSampTestSet, 
                idImages = idImages, mtryRange = c(20, 30, 40, 50, 60, 70, 80, 90))

In [None]:
# Cross-validation scores: one row per mtry value, one column per fold
CM
# Mean macro F1 score of each mtry value over the folds
rowMeans(CM)
# Keep a single mtry value: which.max() returns the first maximum, so ties between several
# mtry values no longer produce a vector, and missing scores are skipped
optiMtry = as.numeric(rownames(CM)[which.max(rowMeans(CM))])

In [None]:
optiMtry
fullRF = randomForest(Drug ~ ., data = datasetCV, mtry = optiMtry, ntree = RF_NB_TREES, localImp = T)

In [None]:
preds <- predict(fullRF, validationSet)
# Accuracy
print(paste("Validation accuracy", mean(preds == validationSet$Drug)))
# Macro F1-score
mean(sapply(levels(validationSet$Drug), 
            function(x) F1pred(preds, validationSet$Drug, x)))

In [None]:
fullRF

### Output Confusion matrix

In [None]:
confMat = data.frame(Observed = rep(levels(validationSet$Drug), each = length(levels(validationSet$Drug))),
                     Predicted = rep(levels(validationSet$Drug), length(levels(validationSet$Drug))),
                     Count = 0)
# Color: white on diagonal (for text readability)
confMat$Color = ifelse(confMat$Observed == confMat$Predicted, "#FFFFFF", "#333333")
# Change drug order
confMat$Observed = factor(confMat$Observed, levels = rev(c('Latrunculin B', 'Jasplakinolide', 'Blebbistatin', 
                                                     'Y-27632', 'CK869', 'Wiskostatin', 'SMIFH2', "DMSO", "WT")))
confMat$Predicted = factor(confMat$Predicted, levels = c('Latrunculin B', 'Jasplakinolide', 'Blebbistatin', 
                                                       'Y-27632', 'CK869', 'Wiskostatin', 'SMIFH2', "DMSO", "WT"))

In [None]:
# Count every validation image in the cell matching its observed and predicted drug.
# seq_len() makes the loop run zero times on an empty validation set (1:0 would iterate over 1 and 0).
for (i in seq_len(nrow(validationSet))){
    idMat = (confMat$Observed == validationSet$Drug[i]) & (confMat$Predicted == preds[i])
    confMat$Count[idMat] = confMat$Count[idMat] + 1
}
head(confMat)

In [None]:
totalDF = aggregate(confMat$Count, by = list(confMat$Observed), FUN = sum)
names(totalDF) <- c("Observed", "Count")
totalDF$Predicted = "Total"
totalDF$Color = "#FFFFFF"
confMat = rbind(confMat, totalDF)


In [None]:
f1DF = as.data.frame(round(sapply(levels(validationSet$Drug), 
            function(x) F1pred(preds, validationSet$Drug, x)), 2))
names(f1DF) <- "Count"
f1DF$Observed = rownames(f1DF)
f1DF$Predicted = "F1 score"
f1DF$Color = "#333333"
confMat = rbind(confMat, f1DF)

In [None]:
levels(confMat$Observed)[levels(confMat$Observed) == "WT"] <- "Untreated"
levels(confMat$Predicted)[levels(confMat$Predicted) == "WT"] <- "Untreated"

In [None]:
gp <- ggplot(confMat, aes(Predicted, Observed)) + geom_tile(aes(fill = Count)) +
    geom_text(aes(label = Count, color = Color), size = 8) + scale_color_identity() +
    guides(fill = F) + theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), 
                             axis.text = element_text(size = 24), axis.title = element_text(size = 24)) +
    geom_vline(xintercept = 9.5, color = "#FFFFFF", size = 1.5)
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_full_prediction.pdf", plot = gp, width = 10)
}

### Feature importance for prediction
#### Individual features

In [None]:
# Associate categories and colors to features
catType = c("Granularity", "Texture", "AreaShape", "RadialDistribution",
            "Intensity")
colType = cust_pal(length(catType)+1)

dtCount = as.data.frame(sapply(catType, function(x) grepl(x, colnames(transformedNK))))
dtCount$Other = !apply(dtCount, 1, any)
 
ftCat = as.factor(apply(dtCount, 1, function(x) colnames(dtCount)[which(x)]))

# Order levels so that "Other" is last
lvOther = which(levels(ftCat) == "Other")
ftCat = factor(ftCat, levels = c(levels(ftCat)[-lvOther], "Other"))
       
# Corresponding colors
ftCol = colType[ftCat]

In [None]:
stopifnot(all(colnames(transformedNK) == rownames(fullRF$importance)))

In [None]:
dfSMIFH2 = data.frame(Feature = rownames(fullRF$importance), 
                      Importance = fullRF$importance[,colnames(fullRF$importance) == "SMIFH2"],
                      Color = ftCol)
dfSMIFH2$Feature = factor(dfSMIFH2$Feature, levels = dfSMIFH2$Feature[order(dfSMIFH2$Importance, decreasing = T)])
dfSMIFH2 = dfSMIFH2[order(dfSMIFH2$Importance, decreasing = T)[1:12],]
dfSMIFH2$Feature = factor(dfSMIFH2$Feature, levels = dfSMIFH2$Feature[order(dfSMIFH2$Importance)])

In [None]:
gp <- ggplot(dfSMIFH2) + geom_col(aes(Feature, Importance, fill = Color)) + 
      xlab("") + scale_fill_identity() + coord_flip() + theme(axis.text.x = element_text(size = 11))
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_full_prediction_SMIFH2_top_features.pdf", plot = gp, height = 7, width = 10)
}

In [None]:
dfCK869 = data.frame(Feature = rownames(fullRF$importance), 
                      Importance = fullRF$importance[,colnames(fullRF$importance) == "CK869"],
                      Color = ftCol)
dfCK869$Feature = factor(dfCK869$Feature, levels = dfCK869$Feature[order(dfCK869$Importance, decreasing = T)])
dfCK869 = dfCK869[order(dfCK869$Importance, decreasing = T)[1:12],]
dfCK869$Feature = factor(dfCK869$Feature, levels = dfCK869$Feature[order(dfCK869$Importance)])

In [None]:
gp <- ggplot(dfCK869) + geom_col(aes(Feature, Importance, fill = Color)) + coord_flip() +
      xlab("") + scale_fill_identity() + theme(axis.text.x = element_text(size = 11))
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_full_prediction_CK869_top_features.pdf", plot = gp, height = 7, width = 10)
}

#### Combination of features by feature type and biological object

In [None]:
catObjects = c("ActinGranules", "FilterCytoplasm", "FilteredNucleus", "PerfGranules")
dtCount = as.data.frame(sapply(catObjects, function(x) grepl(x, colnames(transformedNK))))
dtCount$Other = apply(dtCount, 1, function(x) sum(x) != 1)
# Features matching several objects are classified as "Other"
dtCount[dtCount$Other,-5] <- F

In [None]:
# Name of the flagged column of each row of dtCount, i.e. the object
# (or "Other") attributed to each feature
ftCatObj <- as.factor(apply(dtCount, 1, function(x) colnames(dtCount)[which(x)]))

# Order levels so that "Other" is last.
# setdiff() is used instead of negative indexing: when no feature is "Other",
# `levels(x)[-integer(0)]` is empty and every value would silently become NA.
ftCatObj <- factor(ftCatObj, levels = c(setdiff(levels(ftCatObj), "Other"), "Other"))

# Combination of feature category and biological object described
combFtCat <- paste(ftCat, ftCatObj, sep = " / ")

In [None]:
# Importance of every feature for the class "SMIFH2"
impByFt <- fullRF$importance[, colnames(fullRF$importance) == "SMIFH2"]
# For each "type / object" combination: summed importance and number of features
sumByType <- aggregate(impByFt, by = list(combFtCat), FUN = sum)
nbByType <- aggregate(impByFt, by = list(combFtCat), FUN = length)
combImpDF <- cbind(sumByType, nbByType[, 2])
names(combImpDF) <- c("Type", "Count", "Cardinality")
# Levels sorted by increasing cumulated importance
typeOrder <- order(combImpDF$Count)
combImpDF$Type <- factor(combImpDF$Type, levels = combImpDF$Type[typeOrder])

In [None]:
# Cumulated importance per "feature type / object" combination; the number of
# features in each combination is written next to its bar.
# guides(fill = "none") replaces the deprecated guides(fill = F).
gp <- ggplot(combImpDF) + geom_col(aes(Type, Count, fill = Cardinality)) +
                    xlab("Feature type / object") + ylab("Cumulated importance") + coord_flip() +
                    guides(fill = "none") + geom_text(aes(x = Type, y = Count + 0.0015, label = Cardinality), size = 4)
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_full_prediction_SMIFH2_feature_types.pdf", plot = gp, height = 7, width = 10)
}

In [None]:
# Importance of every feature for the class "CK869"
impByFt <- fullRF$importance[, colnames(fullRF$importance) == "CK869"]
# For each "type / object" combination: summed importance and number of features
sumByType <- aggregate(impByFt, by = list(combFtCat), FUN = sum)
nbByType <- aggregate(impByFt, by = list(combFtCat), FUN = length)
combImpDF <- cbind(sumByType, nbByType[, 2])
names(combImpDF) <- c("Type", "Count", "Cardinality")
# Levels sorted by increasing cumulated importance
typeOrder <- order(combImpDF$Count)
combImpDF$Type <- factor(combImpDF$Type, levels = combImpDF$Type[typeOrder])

In [None]:
# Cumulated importance per "feature type / object" combination; the number of
# features in each combination is written next to its bar.
# guides(fill = "none") replaces the deprecated guides(fill = F).
gp <- ggplot(combImpDF) + geom_col(aes(Type, Count, fill = Cardinality)) +
                    xlab("Feature type / object") + ylab("Cumulated importance") + coord_flip() +
                    guides(fill = "none") + geom_text(aes(x = Type, y = Count + 0.0055, label = Cardinality), size = 4)
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_full_prediction_CK869_feature_types.pdf", plot = gp, height = 7, width = 10)
}

### Repeat classification based on 13 hand-picked features only

In [None]:
set.seed(38)

In [None]:
# nbSampTestSet and idImages are kept

# Images used for cross-validation: the first 5 * nbSampTestSet shuffled ids.
# Everything else (the last fold) is used as validation set.
cvRows <- idImages[1:(nbSampTestSet*5)]
droppedCols <- c(1,3,7,8,17,18)
validationSet <- as.data.frame(subsetNK[-cvRows, -droppedCols])
# All columns but the first are converted to numeric
# (the first one is presumably the Drug label -- TODO confirm)
validationSet[,-1] <- sapply(validationSet[,-1], function(x) as.numeric(as.character(x)))

# All drugs are present
table(validationSet$Drug)

In [None]:
# Cross-validation data: the first 5 * nbSampTestSet shuffled images,
# restricted to the same columns as the validation set
cvRows <- idImages[1:(nbSampTestSet*5)]
droppedCols <- c(1,3,7,8,17,18)
datasetCV <- as.data.frame(subsetNK[cvRows, -droppedCols])
# All columns but the first are converted to numeric
datasetCV[,-1] <- sapply(datasetCV[,-1], function(x) as.numeric(as.character(x)))
# 5-fold cross-validation over a grid of mtry values
CM <- crossValRF(dataset = datasetCV, folds = 5, nbSampTestSet = nbSampTestSet,
                 idImages = idImages, mtryRange = c(1, 4, 7, 10, 13))

In [None]:
# Cross-validation results; row names are the candidate mtry values
CM
meanCM <- rowMeans(CM)
meanCM
# which.max() always returns a single index (the first maximum). Comparing with
# `== max(...)` returns several mtry values in case of ties, which randomForest()
# cannot use as `mtry`.
optiMtry <- as.numeric(rownames(CM)[which.max(meanCM)])

In [None]:
optiMtry
fullRF = randomForest(Drug ~ ., data = datasetCV, mtry = optiMtry, ntree = RF_NB_TREES)

In [None]:
# Predicted class of every image of the validation set
preds <- predict(fullRF, validationSet)
# Accuracy
validationAcc <- mean(preds == validationSet$Drug)
print(paste("Validation accuracy", validationAcc))
# Macro F1-score: unweighted mean of the per-drug values returned by F1pred()
f1PerDrug <- sapply(levels(validationSet$Drug),
                    function(d) F1pred(preds, validationSet$Drug, d))
mean(f1PerDrug)

In [None]:
fullRF

### Output Confusion matrix

In [None]:
# One row per (observed, predicted) pair of drugs, counts initialised to zero
drugLv <- levels(validationSet$Drug)
nbDrugLv <- length(drugLv)
confMat <- data.frame(Observed = rep(drugLv, each = nbDrugLv),
                      Predicted = rep(drugLv, times = nbDrugLv),
                      Count = 0)
# Text colour: light grey on the diagonal, black elsewhere (for text readability)
onDiagonal <- confMat$Observed == confMat$Predicted
confMat$Color <- ifelse(onDiagonal, "#DDDDDD", "#000000")
# Change drug order (reversed for the observed drugs)
drugOrder <- c('Latrunculin B', 'Jasplakinolide', 'Blebbistatin',
               'Y-27632', 'CK869', 'Wiskostatin', 'SMIFH2', "DMSO", "WT")
confMat$Observed <- factor(confMat$Observed, levels = rev(drugOrder))
confMat$Predicted <- factor(confMat$Predicted, levels = drugOrder)

In [None]:
# Increment the cell (observed drug, predicted drug) once per validation image.
# seq_len() makes the loop a no-op on an empty validation set, whereas
# 1:nrow() would iterate over c(1, 0).
for (i in seq_len(nrow(validationSet))){
    idMat <- (confMat$Observed == validationSet$Drug[i]) & (confMat$Predicted == preds[i])
    confMat$Count[idMat] <- confMat$Count[idMat] + 1
}
head(confMat)

In [None]:
# Extra "Total" column: number of validation images per observed drug
totalDF <- setNames(aggregate(confMat$Count, by = list(confMat$Observed), FUN = sum),
                    c("Observed", "Count"))
totalDF$Predicted <- "Total"
# Text colour used for the totals
totalDF$Color <- "#FFFFFF"
confMat <- rbind(confMat, totalDF)


In [None]:
# Extra "F1 score" column: value returned by F1pred() for each drug, rounded to 2 digits
f1PerDrug <- sapply(levels(validationSet$Drug),
                    function(d) F1pred(preds, validationSet$Drug, d))
f1DF <- data.frame(Count = round(f1PerDrug, 2), row.names = names(f1PerDrug))
f1DF$Observed <- rownames(f1DF)
f1DF$Predicted <- "F1 score"
# Text colour used for the scores
f1DF$Color <- "#333333"
confMat <- rbind(confMat, f1DF)

In [None]:
# Display the "WT" class as "Untreated" on both axes
for (axisCol in c("Observed", "Predicted")){
    axisLv <- levels(confMat[[axisCol]])
    axisLv[axisLv == "WT"] <- "Untreated"
    levels(confMat[[axisCol]]) <- axisLv
}

In [None]:
# Confusion matrix as a heatmap, with counts written in each tile using the
# colour stored in `Color`. The white vertical line at x = 9.5 separates the
# 9 drug columns from the extra columns on their right.
# guides(fill = "none") replaces the deprecated guides(fill = F).
gp <- ggplot(confMat, aes(Predicted, Observed)) + geom_tile(aes(fill = Count)) +
    geom_text(aes(label = Count, color = Color), size = 8) + scale_color_identity() +
    guides(fill = "none") + theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
                                  axis.text = element_text(size = 24), axis.title = element_text(size = 24)) +
    geom_vline(xintercept = 9.5, color = "#FFFFFF", size = 1.5)
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_handpicked_prediction.pdf", plot = gp, width = 10)
}

The performance is significantly reduced with the smaller set of features: 
the additional data-driven features contain information that is useful for the image classification.
### Is Actin intensity still the most discriminative feature?

In [None]:
# Importance table of the forest, with features as a factor whose levels are
# sorted by decreasing MeanDecreaseGini
dfImpRF <- as.data.frame(fullRF$importance)
ftByGini <- rownames(dfImpRF)[order(dfImpRF$MeanDecreaseGini, decreasing = TRUE)]
dfImpRF$Feature <- factor(rownames(dfImpRF), levels = ftByGini)

In [None]:
# Bar chart of MeanDecreaseGini per feature, with tilted feature names
gp <- ggplot(dfImpRF, aes(x = Feature, y = MeanDecreaseGini)) +
    geom_col() +
    xlab("") +
    theme(axis.text.x = element_text(angle = 55, vjust = 1, hjust = 1))
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_handpicked_prediction_feature_importance.pdf", plot = gp, width = 10)
}

#### UMAP of morphological features

In [None]:
# The matrix is transposed so that each row given to UMAP is a feature
ftMat <- t(transformedNK)
# Fit a 2D UMAP on the features, then project the same features
umNKFTfit <- umap_fit(ftMat, min_dist = 0.05, neighbors = 8, n = 2, metric = "euclidean")
umNKFT <- as.data.frame(umap_transform(umNKFTfit, ftMat))
names(umNKFT) <- c("UMAP1", "UMAP2")

In [None]:
# Feature types looked for in the feature names
catType <- c("Granularity", "Texture", "AreaShape", "RadialDistribution",
             "Correlation", "Intensity")
ftNames <- colnames(transformedNK)
# One logical column per type: does the feature name contain the type?
typeHits <- as.data.frame(sapply(catType, function(x) grepl(x, ftNames)))
# Matching types are joined with "-"; features matching no type get NA
umNKFT$type <- apply(typeHits, 1, function(x) paste(catType[which(x)], collapse = '-'))
umNKFT$type[umNKFT$type == ""] <- NA

In [None]:
# Channels looked for in the feature names
catChannel <- c("CorrDNA", "CorrActin", "CorrPerf")
ftNames <- colnames(transformedNK)
# One logical column per channel: does the feature name contain the channel?
channelHits <- as.data.frame(sapply(catChannel, function(x) grepl(x, ftNames)))
# Matching channels are joined with "-"; features matching no channel get NA
umNKFT$channel <- apply(channelHits, 1, function(x) paste(catChannel[which(x)], collapse = '-'))
umNKFT$channel[umNKFT$channel == ""] <- NA

In [None]:
# Feature UMAP: colour encodes the feature type, shape encodes the channel
gp <- ggplot(umNKFT, aes(x = UMAP1, y = UMAP2, colour = type, shape = channel)) +
    geom_point(size = 3)
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_UMAP_Features.pdf", gp, width = 10)
}

In [None]:
# One row per potential confounding variable, one column per image.
# NOTE(review): rbind() coerces all rows to a common type, so this assumes every
# variable below (including Metadata_Well) is numeric -- confirm.
confoundingAll <- rbind(Count_Filtered_Nuclei = NK$Count_FilterNKNucleus,
                        Count_All_Nuclei = NK$Count_Nucleus,
                        Row = NK$Metadata_Row,
                        Column = NK$Metadata_Column,
                        Well = NK$Metadata_Well)
# Keep the images selected by fieldToKeep
confoundingDF <- confoundingAll[, fieldToKeep]
# Project the confounding variables in the UMAP fitted on the features
confoundingUMAP <- as.data.frame(umap_transform(umNKFTfit, confoundingDF))
names(confoundingUMAP) <- c("UMAP1", "UMAP2")
confoundingUMAP$type <- rownames(confoundingDF)
# Point size used for these variables in the plot
confoundingUMAP$size <- 4

In [None]:
confoundingAndKept = rbind(confoundingUMAP, cbind(umNKFT[,1:2], type = "Kept feature", size = 2))

In [None]:
# Feature UMAP with the confounding variables overlaid; point sizes are taken
# as-is from the `size` column (identity scale).
# guides(size = "none") replaces the deprecated guides(size = F).
gp <- ggplot(confoundingAndKept) + geom_point(aes(UMAP1, UMAP2, col = type, size = size)) +
      guides(size = "none") + scale_size_identity()
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_UMAP_Features_Confounding.pdf", gp, width = 10)
}

#### Distances to WT cells

In [None]:
drugPert <- "Blebbistatin"

# Find row containing this drug and split wells between WT and drug
rowPert <- NK[fieldToKeep[NK[fieldToKeep,]$Drug == drugPert],]$Metadata_Row[1]
setUmapDrug <- umTNK[NK[fieldToKeep,]$Drug == drugPert,1:dimUMAP]
setUmapWT <- umTNK[which(indWT)[NK[fieldToKeep[indWT],]$Metadata_Row == rowPert],1:dimUMAP]

# Ensure that we have enough points to compute distance.
# The distance is computed in the else branch: previously it was computed
# unconditionally after the check, which overwrote the NA.
if ((nrow(setUmapDrug) < 2*dimUMAP) || (nrow(setUmapWT) < 2*dimUMAP)){
    print(drugPert)
    RMD <- NA
} else {
    # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance
    mcdWT <- covMcd(setUmapWT)
    # mahalanobis() handles all rows at once (one covariance inversion in total)
    RMD <- median(mahalanobis(setUmapDrug, mcdWT$center, mcdWT$cov))
}

In [None]:
dim(setUmapWT)

In [None]:
# Compute the median Robust Mahalanobis Distance (RMD) between the images of a
# drug and the WT images located on the same plate row.
# Reads NK, fieldToKeep, umTNK, indWT and dimUMAP from the global environment.
#   drugPert: name of the drug, as found in NK$Drug
# Returns a single number, or NA (after printing the drug name) when either
# group has fewer than 2 * dimUMAP images.
drugRMD <- function(drugPert){
    # Find row containing this drug and split wells between WT and drug
    rowPert <- NK[fieldToKeep[NK[fieldToKeep,]$Drug == drugPert],]$Metadata_Row[1]
    setUmapDrug <- umTNK[NK[fieldToKeep,]$Drug == drugPert,1:dimUMAP]
    setUmapWT <- umTNK[which(indWT)[NK[fieldToKeep[indWT],]$Metadata_Row == rowPert],1:dimUMAP]

    # Ensure that we have enough points to compute distance
    if ((nrow(setUmapDrug) < 2*dimUMAP) || (nrow(setUmapWT) < 2*dimUMAP)){
        print(drugPert)
        return(NA)
    }

    # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance
    mcdWT <- covMcd(setUmapWT)
    # mahalanobis() is applied to all rows at once, so the covariance matrix is
    # inverted once instead of once per image
    RMD <- median(mahalanobis(setUmapDrug, mcdWT$center, mcdWT$cov))
    return(RMD)
}

In [None]:
# Null distribution of the median RMD for a drug: the drug / WT labels of the
# images are shuffled nbRep times and the median RMD is recomputed each time.
# Reads NK, fieldToKeep, umTNK, indWT and dimUMAP from the global environment.
#   drugPert: name of the drug, as found in NK$Drug
#   nbRep:    number of permutations
# Returns a numeric vector of length nbRep (all NA, after printing the drug name
# once, when either group has fewer than 2 * dimUMAP images).
shuffDrugRMD <- function(drugPert, nbRep = PERM_NB_ITER){
    # Find row containing this drug and shuffle wells between WT and drug
    rowPert <- NK[fieldToKeep[NK[fieldToKeep,]$Drug == drugPert],]$Metadata_Row[1]
    setUmapDrug <- umTNK[NK[fieldToKeep,]$Drug == drugPert,1:dimUMAP]
    setUmapWT <- umTNK[which(indWT)[NK[fieldToKeep[indWT],]$Metadata_Row == rowPert],1:dimUMAP]
    nbDrug <- nrow(setUmapDrug)
    nbWT <- nrow(setUmapWT)

    # Ensure that we have enough points to compute distance. Group sizes are the
    # same for every permutation, so the check is done once, before the loop.
    if ((nbDrug < 2*dimUMAP) || (nbWT < 2*dimUMAP)){
        print(drugPert)
        return(rep(NA_real_, nbRep))
    }

    # Pooled images, built once and reused by every permutation
    setUMAP <- rbind(setUmapWT, setUmapDrug)
    shuffleRMD <- function(notUsed){
        shuffSetUMAP <- setUMAP[sample(nrow(setUMAP)),]
        # Take random subsets of corresponding sizes
        shuffSetDrug <- shuffSetUMAP[seq_len(nbDrug),]
        shuffSetWT <- shuffSetUMAP[nbDrug + seq_len(nbWT),]

        # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance
        mcdWT <- covMcd(shuffSetWT)
        median(mahalanobis(shuffSetDrug, mcdWT$center, mcdWT$cov))
    }
    return(vapply(seq_len(nbRep), shuffleRMD, numeric(1)))
}

In [None]:
# Similar procedure for DMSO against WT: median Robust Mahalanobis Distance (RMD)
# between the DMSO images and the WT images of a given plate row.
# Reads NK, fieldToKeep, umTNK, indWT and dimUMAP from the global environment.
#   rowDMSO: plate row, compared with NK$Metadata_Row
# Returns a single number, or NA when either group has fewer than 2 * dimUMAP images.
dmsoRMD <- function(rowDMSO){
    setUmapDMSO <- umTNK[(NK[fieldToKeep,]$Drug == "DMSO")&(NK[fieldToKeep,]$Metadata_Row == rowDMSO),1:dimUMAP]
    setUmapWT <- umTNK[which(indWT)[NK[fieldToKeep[indWT],]$Metadata_Row == rowDMSO],1:dimUMAP]

    # Ensure that we have enough points to compute distance
    if ((nrow(setUmapDMSO) < 2*dimUMAP) || (nrow(setUmapWT) < 2*dimUMAP)){
        return(NA)
    }

    # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance
    mcdWT <- covMcd(setUmapWT)
    # mahalanobis() is applied to all rows at once, so the covariance matrix is
    # inverted once instead of once per image
    RMD <- median(mahalanobis(setUmapDMSO, mcdWT$center, mcdWT$cov))
    return(RMD)
}
# Null distribution of the median RMD for DMSO on a given plate row: the
# DMSO / WT labels of the images are shuffled nbRep times and the median RMD is
# recomputed each time.
# Reads NK, fieldToKeep, umTNK, indWT and dimUMAP from the global environment.
#   rowDMSO: plate row, compared with NK$Metadata_Row
#   nbRep:   number of permutations
# Returns a numeric vector of length nbRep (all NA when either group has fewer
# than 2 * dimUMAP images).
shuffDmsoRMD <- function(rowDMSO, nbRep = PERM_NB_ITER){
    # Select DMSO and WT images of this row
    setUmapDMSO <- umTNK[(NK[fieldToKeep,]$Drug == "DMSO")&(NK[fieldToKeep,]$Metadata_Row == rowDMSO),1:dimUMAP]
    setUmapWT <- umTNK[which(indWT)[NK[fieldToKeep[indWT],]$Metadata_Row == rowDMSO],1:dimUMAP]
    nbDMSO <- nrow(setUmapDMSO)
    nbWT <- nrow(setUmapWT)

    # Ensure that we have enough points to compute distance. Group sizes are the
    # same for every permutation, so the check is done once, before the loop.
    if ((nbDMSO < 2*dimUMAP) || (nbWT < 2*dimUMAP)){
        return(rep(NA_real_, nbRep))
    }

    # Pooled images, built once and reused by every permutation
    setUMAP <- rbind(setUmapWT, setUmapDMSO)
    shuffleRMD <- function(notUsed){
        shuffSetUMAP <- setUMAP[sample(nrow(setUMAP)),]
        # Take random subsets of corresponding sizes
        shuffSetDMSO <- shuffSetUMAP[seq_len(nbDMSO),]
        shuffSetWT <- shuffSetUMAP[nbDMSO + seq_len(nbWT),]

        # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance
        mcdWT <- covMcd(shuffSetWT)
        median(mahalanobis(shuffSetDMSO, mcdWT$center, mcdWT$cov))
    }
    return(vapply(seq_len(nbRep), shuffleRMD, numeric(1)))
}

In [None]:
drugRMD_NK = sapply(levels(droplevels(NK$Drug[fieldToKeep])), drugRMD)

In [None]:
shuffDrugRMD_NK = sapply(levels(droplevels(NK$Drug[fieldToKeep])), shuffDrugRMD)

In [None]:
# Empirical p-value of an observed distance against its permutation distribution.
#   x: numeric vector; x[1] is the observed value, x[-1] the permuted values
# Returns the fraction of permuted values strictly greater than the observed
# one, or NA when the observed value or all permuted values are missing
# (ecdf() stops with an error when it has no non-missing value).
# NOTE(review): this estimate can be exactly 0 when no permuted value exceeds
# the observed one.
getRMPV <- function(x){
    obsRMD <- x[1]
    permRMD <- x[-1]
    if (is.na(obsRMD) || all(is.na(permRMD))){
        return(NA_real_)
    }
    ecdfRMD <- ecdf(permRMD)
    return(1 - ecdfRMD(obsRMD))
}
dfRMPV = data.frame(RMPV = apply(rbind(drugRMD_NK, shuffDrugRMD_NK), 2, getRMPV))

In [None]:
# Attach the drug name and the observed median RMD ("Strength") to each p-value
dfRMPV$Drug <- rownames(dfRMPV)
dfRMPV$Strength <- drugRMD_NK
# Controls are removed before the multiple-testing correction
isControl <- dfRMPV$Drug %in% c("DMSO", "WT")
dfRMPV <- dfRMPV[!isControl,]
# FDR-adjusted p-values
dfRMPV$adjRMPV <- p.adjust(dfRMPV$RMPV, method = "fdr")

In [None]:
# Observed median RMD against adjusted p-value, one point per drug;
# the dashed line marks the 0.05 threshold
gp <- ggplot(dfRMPV, aes(x = adjRMPV, y = Strength, colour = Drug)) +
    geom_point() +
    geom_vline(xintercept = 0.05, color = "#CCCCCC", linetype = "dashed")
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_RMPV_Drugs.pdf", gp, width = 10)
}

In [None]:
dmsoRMD_NK = sapply(1:8, dmsoRMD)
shuffDmsoRMD_NK = sapply(1:8, shuffDmsoRMD)

In [None]:
# First row: observed median RMD per plate row; following rows: permuted values
dmsoObsAndPerm <- rbind(dmsoRMD_NK, shuffDmsoRMD_NK)
# Empirical p-value of each plate row
dmsoRMPV <- data.frame(RMPV = apply(dmsoObsAndPerm, 2, getRMPV))
dmsoRMPV$Row <- rownames(dmsoRMPV)
dmsoRMPV$Strength <- dmsoRMD_NK
# FDR-adjusted p-values
dmsoRMPV$adjRMPV <- p.adjust(dmsoRMPV$RMPV, method = "fdr")

In [None]:
# Observed median RMD against adjusted p-value, one point per plate row;
# the dashed line marks the 0.05 threshold
gp <- ggplot(dmsoRMPV, aes(x = adjRMPV, y = Strength, colour = Row)) +
    geom_point() +
    geom_vline(xintercept = 0.05, color = "#CCCCCC", linetype = "dashed")
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_RMPV_DMSO.pdf", plot = gp, width = 10)
}

### Concentration-dependency

In [None]:
# Compute the Robust Mahalanobis Distance (RMD) for all images of a given drug at a given concentration,
# relative to the WT images located on the same plate row as the drug.
# Reads NK, fieldToKeep, umTNK, indWT and dimUMAP from the global environment.
#   drugPert: name of the drug, compared with umTNK$Drug
#   concPert: concentration, compared with umTNK$Conc
# Returns one distance per image (named by the row names of umTNK), or a single
# NA (after printing drug and concentration) when either group has fewer than
# 2 * dimUMAP images.
drugConcRMD <- function(drugPert, concPert){
    # Find row containing this drug and split wells between WT and drug
    rowPert <- NK[fieldToKeep[NK[fieldToKeep,]$Drug == drugPert],]$Metadata_Row[1]
    pertInd <- (umTNK$Drug == drugPert)&(umTNK$Conc == concPert)
    setUmapDrug <- umTNK[pertInd,1:dimUMAP]
    setUmapWT <- umTNK[which(indWT)[NK[fieldToKeep[indWT],]$Metadata_Row == rowPert],1:dimUMAP]

    # Ensure that we have enough points to compute distance
    if ((nrow(setUmapDrug) < 2*dimUMAP) || (nrow(setUmapWT) < 2*dimUMAP)){
        print(c(drugPert, concPert))
        return(NA)
    }

    # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance
    mcdWT <- covMcd(setUmapWT)
    # mahalanobis() is applied to all rows at once, so the covariance matrix is
    # inverted once instead of once per image
    RMD <- mahalanobis(setUmapDrug, mcdWT$center, mcdWT$cov)
    return(RMD)
}

In [None]:
# Which drugs should be studied?
# which() drops drugs whose adjusted p-value is NA instead of returning an NA name
sigDrugs <- rownames(dfRMPV)[which(dfRMPV$adjRMPV < 0.05)]
# Classify concentration per drug as follows
concentrationLevels <- c("Low", "Mid", "High")

# Per-image RMD of a drug, split by concentration.
#   d:      name of the drug, compared with umTNK$Drug
#   rename: if TRUE, concentrations are renamed "Low", "Mid", "High" by
#           increasing rank; otherwise the original concentration labels are kept
# Returns a named list with one element (output of drugConcRMD) per concentration.
getAllDrugConcRMD <- function(d, rename = TRUE){
    dConc <- levels(droplevels(umTNK[umTNK$Drug == d,]$Conc))
    # Factor levels are sorted as text when the factor was built from strings
    # ("10" < "25" < "5"): when all labels are numeric, sort them by value so
    # that "Low" really is the lowest concentration
    concNum <- suppressWarnings(as.numeric(dConc))
    if (!anyNA(concNum)){
        dConc <- dConc[order(concNum)]
    }
    dList <- sapply(dConc, function(conc) drugConcRMD(d, conc), simplify = FALSE)
    if (rename){
        names(dList) <- concentrationLevels[seq_along(dConc)]
    }
    return(dList)
}
sigDrugConcRMD <- sapply(sigDrugs, getAllDrugConcRMD, simplify = FALSE)

In [None]:
# Extract the element named y from the list x.
#   x: named list
#   y: name to look for
# Returns the matching element, or NULL (invisibly) when x has no element named y.
getSublist <- function(x, y){
    pos <- which(names(x) == y)
    # No match: `pos` is a zero-length integer (neither NULL nor FALSE)
    if (length(pos) == 0){
        return(invisible(NULL))
    }
    x[[pos]]
}

# For each concentration level, pool the per-image distances of all significant drugs.
# lapply() / simplify = FALSE keep lists at both levels: sapply() would return a
# matrix whenever all groups happen to have the same number of images, which
# breaks the reshaping below.
concRMPV <- sapply(concentrationLevels,
                   function(y) unlist(lapply(sigDrugConcRMD, getSublist, y = y)),
                   simplify = FALSE)
# Levels without any value are dropped (they would add a malformed row)
concRMPV <- Filter(length, concRMPV)
# Long format: one row per image, with its distance and its concentration level.
# stringsAsFactors = FALSE keeps text columns as character whatever the R version,
# so that as.numeric() converts the values and not factor codes.
concRMPV <- as.data.frame(Reduce(rbind, Map(cbind, RMPV = concRMPV, Concentration = names(concRMPV))),
                          stringsAsFactors = FALSE)
concRMPV$RMPV <- as.numeric(concRMPV$RMPV)
concRMPV$Concentration <- factor(concRMPV$Concentration, levels = concentrationLevels)

In [None]:
# Distribution of the distances per concentration level (horizontal violins);
# the legend order is reversed
gp <- ggplot(concRMPV, aes(x = Concentration, y = RMPV, fill = Concentration)) +
    geom_violin() +
    coord_flip() +
    scale_fill_discrete(type = "Yolla") +
    guides(fill = guide_legend(reverse = TRUE)) +
    theme(legend.title = element_blank())
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/NK_Drug_ConcEffect_All.pdf", plot = gp, width = 10)
}

In [None]:
# Per-image distances of every significant drug, split by original concentration label
drugsConcRMPV <- sapply(sigDrugs, function(x) getAllDrugConcRMD(x, rename = FALSE), simplify = FALSE)

# seq_along() makes the loop a no-op when no drug is significant
for (i in seq_along(drugsConcRMPV)){
    # Retrieve drug name and RMPV values
    drugName <- names(drugsConcRMPV)[i]
    drugConcRMPV <- drugsConcRMPV[[i]]

    # Structure into long data frame (one row per image).
    # stringsAsFactors = FALSE keeps text columns as character whatever the R
    # version, so that as.numeric() converts the values and not factor codes.
    concRMPV <- as.data.frame(Reduce(rbind, Map(cbind, RMPV = drugConcRMPV, Concentration = names(drugConcRMPV))),
                              stringsAsFactors = FALSE)
    concRMPV$RMPV <- as.numeric(concRMPV$RMPV)
    # Reorder concentrations in increasing numerical order.
    # Levels are derived from the observed labels: levels() of a character
    # column is NULL, which would turn every concentration into NA.
    concLabels <- unique(concRMPV$Concentration)
    concRMPV$Concentration <- factor(concRMPV$Concentration,
                                     levels = concLabels[order(as.numeric(concLabels))])

    # Represent distribution per concentration for all significantly changed drug
    gp <- ggplot(concRMPV) + geom_violin(aes(Concentration, RMPV, fill = Concentration)) +
                             coord_flip() + theme(legend.title = element_blank()) + guides(fill = guide_legend(reverse = TRUE)) +
                             scale_fill_discrete(type = "Yolla")
    if(!TEST_MODE){
        # File name follows the "NK_Drug_<drug>_<figure>.pdf" pattern of the other
        # per-drug figures (previously a double underscore was produced)
        ggsave(filename = paste0("Fig/NK_Drug_", drugName, "_ConcEffect.pdf"), plot = gp, width = 10)
    }
}

### Morphological changes upon perturbation

In [None]:
# Associate categories and colors to features
catType <- c("Granularity", "Texture", "AreaShape", "RadialDistribution",
             "Intensity")
# One colour per category, plus one for "Other"
colType <- cust_pal(length(catType)+1)

# One logical column per category: does the feature name contain the category?
dtCount <- as.data.frame(sapply(catType, function(x) grepl(x, colnames(transformedNK))))
# Features matching no category are classified as "Other"
dtCount$Other <- !apply(dtCount, 1, any)

# Category of each feature. If a name matches several categories, the first one
# (in catType order) is used, so that the result is always one value per feature.
ftCat <- as.factor(apply(dtCount, 1, function(x) colnames(dtCount)[which(x)[1]]))

# Order levels so that "Other" is last.
# setdiff() is used instead of negative indexing: when no feature is "Other",
# `levels(x)[-integer(0)]` is empty and every value would silently become NA.
ftCat <- factor(ftCat, levels = c(setdiff(levels(ftCat), "Other"), "Other"))

# Corresponding colors (indexed by the integer code of each level)
ftCol <- colType[ftCat]

In [None]:
# Number of features per category (useful to get fill color legend)
catDF <- data.frame(Category = ftCat)
gp <- ggplot(catDF, aes(x = Category, fill = Category)) +
    geom_bar()
if (!TEST_MODE) {
    ggsave(filename = "Fig/NK_Drug_CatCount.pdf", plot = gp, width = 10)
}

In [None]:
# Plot the features whose median changes the most between the images of a drug
# and the WT images located on the same plate row. The figure is saved to
# "Fig/NK_Drug_<drug>_ChangeFt.pdf" unless TEST_MODE is set.
# Reads NK, fieldToKeep, transformedNK, indWT and ftCol from the global environment.
#   drugPert: name of the drug, as found in NK$Drug
changedFtDrug <- function(drugPert){
    # Plate row on which the drug is
    rowPert <- NK[fieldToKeep[NK[fieldToKeep,]$Drug == drugPert],]$Metadata_Row[1]

    # Morphological features for the drug and the WT control on the same row
    # (drop = FALSE keeps a two-dimensional object even for a single image)
    setDrug <- transformedNK[NK[fieldToKeep,]$Drug == drugPert, , drop = FALSE]
    setWT <- transformedNK[which(indWT)[NK[fieldToKeep[indWT],]$Metadata_Row == rowPert], , drop = FALSE]

    # Compute difference to WT
    medianDrug <- apply(setDrug, 2, median)
    medianWT <- apply(setWT, 2, median)
    medianChange <- medianDrug - medianWT

    # Keep the (at most) 12 features with the largest absolute change; head()
    # avoids the NA indices that `[1:12]` gives with fewer than 12 features
    ftToDisplay <- head(order(abs(medianChange), decreasing = TRUE), 12)
    dtChange <- data.frame(Difference = medianChange[ftToDisplay])
    dtChange$Feature <- rownames(dtChange)
    dtChange$Feature <- factor(dtChange$Feature, levels = dtChange$Feature[order(dtChange$Difference)])
    # Literal fill colour of each feature (used with scale_fill_identity)
    dtChange$Category <- ftCol[ftToDisplay]

    gp <- ggplot(dtChange) + geom_bar(aes(Feature, weight = Difference, fill = Category), color = "#CCCCCC") +
                   coord_flip() + ylab("Difference to untreated") + scale_fill_identity() +
                   theme(legend.position="none", plot.margin=unit(c(0,5,0,0), "mm"))

    if(!TEST_MODE){
        ggsave(filename = paste("Fig/NK_Drug", drugPert, "ChangeFt.pdf", sep = "_"), plot = gp, width = 10)
    }
}

In [None]:
sapply(sigDrugs, changedFtDrug)

## Does the concentration-dependent effect act on similar or distinct features?
Example of Y-27632

In [None]:
# Plot the features whose median changes the most between the images of
# Y-27632 at a given concentration and the WT images located on the same plate
# row. The figure is saved to "Fig/NK_Drug_Y-27632_<concentration>_ChangeFt.pdf"
# unless TEST_MODE is set.
# Reads NK, fieldToKeep, transformedNK, indWT and ftCol from the global environment.
#   concPert: concentration, compared with NK$Concentration
changedFtY27 <- function(concPert){
    drugPert <- "Y-27632"

    # Plate row on which the drug is
    rowPert <- NK[fieldToKeep[NK[fieldToKeep,]$Drug == drugPert],]$Metadata_Row[1]

    # Morphological features for the drug and the WT control on the same row
    # (drop = FALSE keeps a two-dimensional object even for a single image)
    setDrug <- transformedNK[(NK[fieldToKeep,]$Drug == drugPert)&(NK[fieldToKeep,]$Concentration == concPert), , drop = FALSE]
    setWT <- transformedNK[which(indWT)[NK[fieldToKeep[indWT],]$Metadata_Row == rowPert], , drop = FALSE]

    # Compute difference to WT
    medianDrug <- apply(setDrug, 2, median)
    medianWT <- apply(setWT, 2, median)
    medianChange <- medianDrug - medianWT

    # Keep the (at most) 12 features with the largest absolute change; head()
    # avoids the NA indices that `[1:12]` gives with fewer than 12 features
    ftToDisplay <- head(order(abs(medianChange), decreasing = TRUE), 12)
    dtChange <- data.frame(Difference = medianChange[ftToDisplay])
    dtChange$Feature <- rownames(dtChange)
    dtChange$Feature <- factor(dtChange$Feature, levels = dtChange$Feature[order(dtChange$Difference)])
    # Literal fill colour of each feature (used with scale_fill_identity)
    dtChange$Category <- ftCol[ftToDisplay]

    gp <- ggplot(dtChange) + geom_bar(aes(Feature, weight = Difference, fill = Category), color = "#CCCCCC") +
                   coord_flip() + ylab("Difference to WT") + scale_fill_identity() +
                   theme(legend.position="none", plot.margin=unit(c(0,5,0,0), "mm"))

    if(!TEST_MODE){
        ggsave(filename = paste("Fig/NK_Drug_Y-27632", concPert, "ChangeFt.pdf", sep = "_"), plot = gp, width = 10)
    }
}

In [None]:
sapply(c(5,10,25), changedFtY27)

In [None]:
sessionInfo()