# PLL plates

In [None]:
library(ggplot2)
library(extrafont)
library(stringr)
library(heatmaply)
library(ggrepel)
library(reticulate)
library(gridExtra)
library(robustbase)
library(ggpubr)

In [None]:
# Import the Python helper functions exposed to R through reticulate
source_python("reticulate_functions.py")
# Register the additional fonts
loadfonts()
# TRUE: faster run (fewer permutations, no file written);
# FALSE: more precise results and all outputs
TEST_MODE <- FALSE

In [None]:
# Number of permutations used for empirical p-value computations
# (kept small in test mode to shorten the run)
PERM_NB_ITER <- if (TEST_MODE) 20 else 2000

In [None]:
# Customize ggplot appearance

# Base theme: light theme without minor grid, dark-grey Arial text and lines,
# nearly transparent grey legend background and no legend key box
customTheme <- theme_light() +
    theme(
        panel.grid.minor = element_blank(),
        text = element_text(size = 17, family = "Arial", colour = "#333333"),
        line = element_line(colour = "#333333"),
        legend.background = element_rect(fill = alpha("#CCCCCC", 0.1)),
        legend.key = element_blank()
    )

# Change default colors
# Replacement for the default continuous colour scale.
#   type = "viridis"  -> viridis scale restricted to [begin, end] with the given option/direction
#   type = "gradient" -> plain gradient scale (begin/end/direction/option are not used)
# Any other type raises an error.
scale_colour_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(
        type,
        viridis = scale_colour_viridis_c(option = option, begin = begin, end = end,
                                         direction = direction, ...),
        gradient = scale_colour_gradient(...),
        stop("Unknown scale type", call. = FALSE)
    )
}
# American spelling alias
scale_color_continuous <- scale_colour_continuous

# Replacement for the default continuous fill scale.
#   type = "viridis"  -> viridis scale restricted to [begin, end] with the given option/direction
#   type = "gradient" -> plain gradient scale (begin/end/direction/option are not used)
# Any other type raises an error.
# The default type is read from the fill-specific option "ggplot2.continuous.fill";
# when that option is unset, the colour option is used as before, so existing
# configurations keep working.
scale_fill_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.fill",
                                                      default = getOption("ggplot2.continuous.colour", default = "viridis"))) {
    switch(type, gradient = scale_fill_gradient(...), 
        viridis = scale_fill_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))

}

# Palette generators: each one takes a number of colours n and returns n colours
# interpolated along the listed anchor colours
cemm_pal <- colorRampPalette(
    c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00")
)
cust_pal <- colorRampPalette(
    c("#008CAD", "#40B9D4", "#D4ECF2", "#F8B100", "#C00000", "#2D0000")
)
yolla_pal <- colorRampPalette(c("#FD0F91", "#C00000", "#2D0000"))
# Replacement for the default discrete fill scale.
#   type: "Cust" (default), "CeMM" or "Yolla" select the matching palette defined above;
#         any other value falls back to the hue palette built from h, c, l, h.start, direction.
#   na.value, aesthetics and ... are passed on to discrete_scale().
scale_fill_discrete <- function (..., type = "Cust", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "fill") 
{
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else if (type == "Yolla"){
        discrete_scale(aesthetics, "Yolla", yolla_pal, na.value = na.value, ...)
    } else if (type == "Cust"){
        discrete_scale(aesthetics, "Cust", cust_pal, na.value = na.value, ...)
    } else {
        # hue_pal lives in the scales package, which is not attached by this notebook:
        # qualify it, as scale_color_discrete does
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, 
            direction), na.value = na.value, ...)
    }
}

# Replacement for the default discrete colour scale.
#   type: "Cust" (default), "CeMM" or "Yolla" select the matching palette defined above;
#         any other value falls back to the hue palette built from h, c, l, h.start, direction.
#   na.value, aesthetics and ... are passed on to discrete_scale().
scale_color_discrete <- function (..., type = "Cust", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "colour") {
    if (type == "CeMM"){
        scaleName <- "CeMM"
        scalePal <- cemm_pal
    } else if (type == "Cust"){
        scaleName <- "Cust"
        scalePal <- cust_pal
    } else if (type == "Yolla"){
        scaleName <- "Yolla"
        scalePal <- yolla_pal
    } else {
        scaleName <- "hue"
        scalePal <- scales::hue_pal(h, c, l, h.start, direction)
    }
    discrete_scale(aesthetics, scaleName, scalePal, na.value = na.value, ...)
}
# British spelling alias
scale_colour_discrete <- scale_color_discrete

# Theme add-on: no major grid and no panel border, thin dark axis lines,
# smaller axis tick labels. Extra arguments are passed on to theme().
noGridTheme <- function(...){
    axisText <- element_text(size=12)
    theme(
        panel.grid.major = element_blank(),
        axis.text.x = axisText,
        axis.text.y = axisText,
        axis.line = element_line(color="#333333", size = 0.2),
        panel.border = element_blank(),
        ...
    )
}

# Theme add-on: dark grey panel and plot background with light grey text and lines.
# Extra arguments are passed on to theme() (they used to be accepted but ignored),
# which makes the function consistent with noGridTheme().
darkTheme <- function(...){
    theme(panel.background = element_rect(fill = '#333333'), plot.background = element_rect(fill = '#333333'), 
          axis.line=element_line(color="#CCCCCC", size = 0.2), 
          text=element_text(size=17, family="Arial", colour = "#CCCCCC"),
          line=element_line(colour = "#CCCCCC"),
          ...)
}

# Make the customized theme the default for all subsequent plots
theme_set(customTheme)

# Plot size options (repr.* options; presumably read by the notebook R kernel - confirm)
options(repr.plot.width=10, repr.plot.height=10)

## NK92

In [None]:
# Fix R random number generation (the original note also mentions a t-SNE layout)
set.seed(38)

In [None]:
# Load the measurement table of the NK PLL plates
# (rows are treated as fields of view in the rest of the analysis)
NK = read.csv("./allImages_NK_PLL.csv", header=T)

In [None]:
# Plate identifier: "Plate" plus the next character, taken from the actin image URL
NK$Plate <- as.factor(str_extract(NK$URL_Actin, "Plate."))
# All two-digit groups found in the URL, one matrix column per group
# NOTE(review): assumes every URL holds exactly three two-digit groups
# (row, column, field); otherwise the colnames assignment fails - confirm
Position <- str_extract_all(NK$URL_Actin, '\\d{2}', simplify = T)
colnames(Position) <- c("Row", "Column", "Field")
NK <- cbind(NK, Position)

In [None]:
# Well annotation table (tab-separated), strings kept as characters
wellAnnotation = read.csv("transferNK92.tsv", sep="\t", stringsAsFactors=F)

In [None]:
# Zero-pad single-digit column numbers, e.g. "A1" becomes "A01"
wellAnnotation$Well <- sub("([A-H])(\\d$)", "\\10\\2", wellAnnotation$Well)

In [None]:
# Gene targeted in a given well: second column of the matching wellAnnotation row(s)
getGene <- function(well){
    isWell <- wellAnnotation$Well == well
    wellAnnotation[isWell, 2]
}
NK$Gene <- as.factor(sapply(NK$Metadata_Well, getGene))

In [None]:
# Group of a given well: fourth column of the matching wellAnnotation row(s)
getGroup <- function(well){
    isWell <- wellAnnotation$Well == well
    wellAnnotation[isWell, 4]
}
NK$Group <- sapply(NK$Metadata_Well, getGroup)
# Wells without group annotation are labelled as controls
NK$Group[is.na(NK$Group)] <- "Control"
NK$Group <- as.factor(NK$Group)

In [None]:
# shRNA label of a given well: columns 2 and 3 of its annotation row joined with "_"
getSh <- function(well){
    annotRow <- wellAnnotation[wellAnnotation$Well == well, ]
    paste(annotRow[2], annotRow[3], sep = "_")
}
NK$shRNA <- as.factor(sapply(NK$Metadata_Well, getSh))

## Visualize full dataset

### Cell count

In [None]:
# Histogram of the filtered nucleus count (rows with a missing count are left out),
# filled by plate row
gpNK <- ggplot(NK[!is.na(NK$Count_FilteredNucleus), ]) +
    geom_histogram(
        aes(Count_FilteredNucleus, fill = as.factor(Metadata_Row)),
        binwidth = 2
    ) +
    scale_fill_discrete(name = "Row")
gpNK

# The figure is only written outside test mode
if (!TEST_MODE) {
    ggsave(filename = "Fig/NK_PLL_count.pdf", plot = gpNK)
}

In [None]:
# Histogram of the mean cytoplasm area (rows with a missing value are left out),
# filled by plate row
gpNK <- ggplot(NK[!is.na(NK$Mean_FilterCytoplasm_AreaShape_Area), ]) +
    geom_histogram(
        aes(Mean_FilterCytoplasm_AreaShape_Area, fill = as.factor(Metadata_Row)),
        binwidth = 100
    ) +
    scale_fill_discrete(name = "Row")
gpNK
# The figure is only written outside test mode
if (!TEST_MODE) {
    ggsave(filename = "Fig/NK_PLL_area.pdf", plot = gpNK)
}

In [None]:
# Jittered scatter plot of the filtered nucleus count against the raw nucleus count,
# coloured by plate row
ggplot(NK) + geom_point(aes(Count_Nucleus, Count_FilteredNucleus, color = as.factor(Metadata_Row)), position = "jitter") +
       scale_color_discrete(name="Row")

## Filtering

In [None]:
# Thresholds used by the filtering steps below
FILT_MAX_INT_DNA = 0.05 # Remove empty images and small DNA precipitations
FILT_MIN_CELLS = 4 # 8 seems safe from distribution and images, 3 seems in poor shape
FILT_NB_MAX_NA_IMAGE = 83 # 48 images generated between 130 and 1865 NAs/image, all others generate at most 82/image
FILT_MAX_CORR = 0.6 # Keep uncorrelated variables
# Strong clots on DNA channels are observed to be technical artifacts driving clustering and needs to be removed
FILT_MINMAX_INT_DNA = 0.6
# Number of UMAP dimensions used for the embedding of the fields of view
dimUMAP = 3

In [None]:
# Start from all column indices of NK
ftToKeep = 1:dim(NK)[2]
# Make sure that the fields are numeric
# (columns whose class is "integer" are therefore excluded as well)
ftToKeep <- ftToKeep[which(sapply(NK[,ftToKeep], class) == "numeric")]
# Remove execution time and count features
ftToKeep <- ftToKeep[grep("(Execution)|(Count)", colnames(NK)[ftToKeep], invert = T)]

In [None]:
# Store all remaining features before filtering for downstream comparison to selected set
preFiltFt = colnames(NK)[ftToKeep]

In [None]:
# Keep the fields of view whose max DNA intensity lies inside the accepted range:
#  - below FILT_MAX_INT_DNA: empty images and small DNA precipitations
#  - above FILT_MINMAX_INT_DNA: strong clots on the DNA channel (technical artifacts)
# Both bounds are applied in a single condition: two successive which() calls on the
# full table would let the second one overwrite (and discard) the first filter.
fieldToKeep <- which(NK$ImageQuality_MaxIntensity_DNA >= FILT_MAX_INT_DNA &
                     NK$ImageQuality_MaxIntensity_DNA <= FILT_MINMAX_INT_DNA)
# Remove fields with low cell count (which() also drops fields with a missing count
# instead of propagating NA indices)
fieldToKeep <- fieldToKeep[which(NK[fieldToKeep,]$Count_FilteredNucleus >= FILT_MIN_CELLS)]

In [None]:
# Few bad quality pictures are generating a lot of missing values and are removed
fieldToKeep <- fieldToKeep[rowSums(is.na(NK[fieldToKeep,ftToKeep])) < FILT_NB_MAX_NA_IMAGE]
# Remove remaining features with missing values
ftToKeep <- ftToKeep[colSums(is.na(NK[fieldToKeep,ftToKeep])) == 0] 
# Remove constant columns
# indWT flags, among the kept fields, those annotated as WT; it is reused further down
indWT = NK[fieldToKeep,]$Gene == "WT"
# A feature is dropped when its mad is 0 over all kept fields ...
ftToKeep <- ftToKeep[sapply(NK[fieldToKeep,ftToKeep], function(x) mad(x) != 0)]
# ... or over the WT fields only (mad(WT) is the scaling factor used for normalisation)
ftToKeep <- ftToKeep[sapply(NK[fieldToKeep[indWT],ftToKeep], function(x) mad(x) != 0)]

In [None]:
# Approximate normal distribution:
# shift the values so that the minimum maps to 1, then take the natural log
# (the minimum of x therefore becomes 0)
transfLog <- function (x){
    shifted <- x + 1 - min(x)
    log(shifted)
}
# Log-transform every kept feature (column-wise) over the kept fields
transformedNK = as.data.frame(apply(NK[fieldToKeep, ftToKeep], 2, transfLog))

In [None]:
# Center and scale on control values:
# x is centred on the median of the reference values y and divided by their mad
transfNorm <- function(x, y){
    refCenter <- median(y)
    refScale <- mad(y)
    (x - refCenter) / refScale
}
# Normalise each plate separately against the WT fields of that same plate
for (plate in levels(NK$Plate)){
    # Kept fields belonging to the current plate
    indPlateField = NK$Plate[fieldToKeep] == plate
    # Among those, the WT fields used as reference
    indWTinPlate <- NK[fieldToKeep[indPlateField],]$Gene == "WT"
    
    # Column-wise: centre on median(WT) and scale by mad(WT)
    transformedNK[indPlateField,] = apply(
        transformedNK[indPlateField,], 2, 
        function(x) transfNorm(x, x[indWTinPlate]))
}

In [None]:
# Remove columns with NA, i.e.
# features with mad == 0 for 1 plate or more
noNAFt = colSums(is.na(transformedNK)) == 0
# Keep the column index vector and the transformed table in sync
ftToKeep = ftToKeep[noNAFt]
transformedNK = transformedNK[,noNAFt]

In [None]:
# Order features from biggest mad to smallest mad
# Since features have mad(WT) = 1 on each plate, it means that we rank features by how more variable they are
# for drug perturbations than for WT
orderFt = rev(order(apply(transformedNK, 2, mad)))

In [None]:
# uncorrelate() is not defined in this file; presumably one of the Python helpers
# loaded with source_python - confirm
# NOTE(review): the -1 / +1 shifts look like a conversion between 1-based (R) and
# 0-based (Python) indices - confirm against the helper
uncorrFt = uncorrelate(transformedNK, orderCol = orderFt-1, threshold = FILT_MAX_CORR)
uncorrFt = unlist(uncorrFt) + 1

In [None]:
# Keep only the selected features
transformedNK = transformedNK[,uncorrFt]

In [None]:
# Number of kept fields (rows) and selected features (columns)
dim(transformedNK)

### Export subset of features

For NK cells, the following features are selected and explored separately for their biological interpretability:
```
* Actin intensity/cell (mean/well): NK$Intensity_MeanIntensity_CorrActin_FilterCytoplasm
* Cell area: NK$Mean_FilterCytoplasm_AreaShape_Area
* Cell roundness: NK$Mean_FilterCytoplasm_AreaShape_FormFactor
* Cell width: NK$Mean_FilterCytoplasm_AreaShape_MajorAxisLength 
* Cell length: NK$Mean_FilterCytoplasm_AreaShape_MinorAxisLength
* Cell length to width ratio: NK$Mean_FilterCytoplasm_AreaShape_MinorAxisLength / NK$Mean_FilterCytoplasm_AreaShape_MajorAxisLength
* Average number of perforin granules / cell: NK$Count_PerfGranules  / NK$Count_FilterCytoplasm
* Perforin area / cell area: (NK$Count_PerfGranules * NK$Mean_PerfGranules_AreaShape_Area)  / (NK$Count_FilterCytoplasm * NK$Mean_FilterCytoplasm_AreaShape_Area)
* Perforin intensity: NK$Intensity_MeanIntensity_CorrPerf_FilterCytoplasm
* Perforin area: NK$Mean_PerfGranules_AreaShape_Area
* Nucleus intensity: NK$Intensity_MeanIntensity_CorrDNA_FilteredNucleus
* Nucleus area: NK$Mean_FilteredNucleus_AreaShape_Area
* Nucleus roundness: NK$Mean_FilteredNucleus_AreaShape_FormFactor
* Nucleus width: NK$Mean_FilteredNucleus_AreaShape_MajorAxisLength 
* Nucleus length: NK$Mean_FilteredNucleus_AreaShape_MinorAxisLength
* Nucleus ratio: NK$Mean_FilteredNucleus_AreaShape_MinorAxisLength / NK$Mean_FilteredNucleus_AreaShape_MajorAxisLength
* Nucleus area / cell area: (NK$Count_FilteredNucleus * NK$Mean_FilteredNucleus_AreaShape_Area)  / (NK$Count_FilterCytoplasm * NK$Mean_FilterCytoplasm_AreaShape_Area)
```

NB (from CellProfiler docs): FormFactor = $4 \times \pi \times \mathrm{Area} / \mathrm{Perimeter}^2$. It equals 1 for a perfectly circular object.

In [None]:
# Table of "interpretable" features for the kept fields of view.
# cbind() mixes character and numeric columns, so the result is a character matrix.
# The kept rows are extracted once inside local() so that no extra variable is
# left in the workspace.
subsetNK <- local({
    kept <- NK[fieldToKeep,]
    # Total cell area per field, shared denominator of the two area ratios
    totalCellArea <- kept$Count_FilterCytoplasm * kept$Mean_FilterCytoplasm_AreaShape_Area
    cbind(
        Field = str_extract(as.character(kept$URL_Actin), "r..c..f.."),
        ShRNA = as.character(kept$shRNA),
        ActinIntensity = kept$Intensity_MeanIntensity_CorrActin_FilterCytoplasm,
        CellArea = kept$Mean_FilterCytoplasm_AreaShape_Area,
        CellRoundness = kept$Mean_FilterCytoplasm_AreaShape_FormFactor,
        CellWidth = kept$Mean_FilterCytoplasm_AreaShape_MajorAxisLength,
        CellLength = kept$Mean_FilterCytoplasm_AreaShape_MinorAxisLength,
        CellLengthOverWidth = kept$Mean_FilterCytoplasm_AreaShape_MinorAxisLength /
            kept$Mean_FilterCytoplasm_AreaShape_MajorAxisLength,
        PerforinGranulesPerCell = kept$Count_PerfGranules / kept$Count_FilterCytoplasm,
        PerforinAreaOverCellArea = (kept$Count_PerfGranules * kept$Mean_PerfGranules_AreaShape_Area) /
            totalCellArea,
        PerforinIntensity = kept$Intensity_MeanIntensity_CorrPerf_FilterCytoplasm,
        PerforinArea = kept$Mean_PerfGranules_AreaShape_Area,
        NucleusIntensity = kept$Intensity_MeanIntensity_CorrDNA_FilteredNucleus,
        NucleusArea = kept$Mean_FilteredNucleus_AreaShape_Area,
        NucleusRoundness = kept$Mean_FilteredNucleus_AreaShape_FormFactor,
        NucleusWidth = kept$Mean_FilteredNucleus_AreaShape_MajorAxisLength,
        NucleusLength = kept$Mean_FilteredNucleus_AreaShape_MinorAxisLength,
        NucleusLengthOverWidth = kept$Mean_FilteredNucleus_AreaShape_MinorAxisLength /
            kept$Mean_FilteredNucleus_AreaShape_MajorAxisLength,
        NucleusAreaOverCellArea = (kept$Count_FilteredNucleus * kept$Mean_FilteredNucleus_AreaShape_Area) /
            totalCellArea
    )
})

In [None]:
# Export list of "interpretable" features
if(!TEST_MODE){
    write.csv(subsetNK, "Tab/NK_PLL_features.csv", row.names = F)
}

In [None]:
# Sum of the cytoplasm counts of the kept fields (missing counts ignored),
# aggregated per well, per shRNA and per gene
CountPerWell = aggregate(NK[fieldToKeep,]$Count_FilterCytoplasm, by = list(NK[fieldToKeep,]$Metadata_Well), FUN = function(x) sum(x, na.rm = T))
names(CountPerWell) <- c("Well", "Count")
CountPerShRNA = aggregate(NK[fieldToKeep,]$Count_FilterCytoplasm, by = list(NK[fieldToKeep,]$shRNA), FUN = function(x) sum(x, na.rm = T))
names(CountPerShRNA) <- c("ShRNA", "Count")
CountPerGene = aggregate(NK[fieldToKeep,]$Count_FilterCytoplasm, by = list(NK[fieldToKeep,]$Gene), FUN = function(x) sum(x, na.rm = T))
names(CountPerGene) <- c("Gene", "Count")                    

In [None]:
# Export cell counts
if(!TEST_MODE){
    write.csv(CountPerWell, "Tab/NK_PLL_count_well.csv", row.names = F)
    write.csv(CountPerShRNA, "Tab/NK_PLL_count_shRNA.csv", row.names = F)
    write.csv(CountPerGene, "Tab/NK_PLL_count_gene.csv", row.names = F)
}

### Look at which types of features are kept

In [None]:
catChannel = c("CorrDNA", "CorrActin", "CorrPerf")
# Number of channel names matched by each selected feature
table(rowSums(sapply(catChannel, function(x) grepl(x, colnames(transformedNK)))))
# Number of features per channel before (CountIni) and after (Count) selection.
# Both vectors are reordered with the SAME permutation (increasing initial count):
# sorting them independently would attach the selected counts to the wrong channels
# whenever the two rankings differ.
countIni = colSums(sapply(catChannel, function(x) grepl(x, preFiltFt)))
countSel = colSums(sapply(catChannel, function(x) grepl(x, colnames(transformedNK))))
ordIni = order(countIni)
dtCat = data.frame(CountIni = countIni[ordIni], 
                   Count = countSel[ordIni])

# Position of each category on the (flipped) axis, used to place the ratio labels
dtCat$Order <- rank(dtCat$CountIni, ties.method = c("first"))
dtCat$Category <- factor(rownames(dtCat), levels=rownames(dtCat)[order(dtCat$CountIni)])
# Share of the initial features that were kept, as a percentage label
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100*dtCat$Ratio, 1), "%")
# Initial counts in the first colour, selected counts drawn over them in the second
gp <- ggplot(dtCat) + geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) + ylim(c(0,675)) +
              geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) + 
              geom_text(aes(x = Order, y = CountIni + 2, label = Ratio), hjust = 0) + coord_flip()
gp

In [None]:
# Save the per-channel figure (skipped in test mode)
if(!TEST_MODE){
    ggsave(filename = "Fig/PLL_NK_SelecFt_Channel.pdf", plot = gp, width = 10)
}

In [None]:
# Same summary per segmented object name found in the feature names
catObjects = c("ActinGranules", "FilterCytoplasm", "ShrunkenCytoplasm", "FilteredNucleus", "PerfGranules")
# Number of object names matched by each selected feature
table(rowSums(sapply(catObjects, function(x) grepl(x, colnames(transformedNK)))))
# Features per object before (CountIni) and after (Count) selection
dtCat = data.frame(CountIni = colSums(sapply(catObjects, function(x) grepl(x, preFiltFt))), 
                   Count = colSums(sapply(catObjects, function(x) grepl(x, colnames(transformedNK)))))

# Axis position, ordered category and percentage of kept features
dtCat$Order <- rank(dtCat$CountIni, ties.method = c("first"))
dtCat$Category <- factor(rownames(dtCat), levels=rownames(dtCat)[order(dtCat$CountIni)])
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100*dtCat$Ratio, 1), "%")
# Initial counts in the first colour, selected counts drawn over them in the second
gp <- ggplot(dtCat) + geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) + ylim(c(0,510)) +
              geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) + 
              geom_text(aes(x = Order, y = CountIni + 2, label = Ratio), hjust = 0) + coord_flip()
gp

In [None]:
# Save the per-object figure (skipped in test mode)
if(!TEST_MODE){
    ggsave(filename = "Fig/PLL_NK_SelecFt_Object.pdf", plot = gp, width = 10)
}

In [None]:
# Same summary per measurement type found in the feature names (full list)
catType = c("Threshold", "Granularity", "ImageQuality", "Texture", "Distance", "AreaShape", "RadialDistribution", "Neighbors", 
            "Correlation", "Intensity", "Overlap", "Location")
which(rowSums(sapply(catType, function(x) grepl(x, colnames(transformedNK)))) == 0) # All features are covered

# Features per type before (CountIni) and after (Count) selection
dtCat = data.frame(CountIni = colSums(sapply(catType, function(x) grepl(x, preFiltFt))), 
                   Count = colSums(sapply(catType, function(x) grepl(x, colnames(transformedNK)))))
# Axis position, ordered category and percentage of kept features
dtCat$Order <- rank(dtCat$CountIni, ties.method = c("first"))
dtCat$Category <- factor(rownames(dtCat), levels=rownames(dtCat)[order(dtCat$CountIni)])
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100*dtCat$Ratio, 1), "%")
# Initial counts in the first colour, selected counts drawn over them in the second
gp <- ggplot(dtCat) + geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) +
              geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) + 
              geom_text(aes(x = Order, y = CountIni + 5, label = Ratio), hjust = 0) + coord_flip()
gp

In [None]:
# Save the per-type figure, full list (skipped in test mode)
if(!TEST_MODE){
    ggsave(filename = "Fig/PLL_NK_SelecFt_Type_All.pdf", plot = gp, width = 10)
}

In [None]:
# Same summary with a shorter list of types; everything else is grouped as "Other"
catType = c("Granularity", "Texture", "AreaShape", "RadialDistribution",
            "Correlation", "Intensity")
# Number of type names matched by each selected feature
table(rowSums(sapply(catType, function(x) grepl(x, colnames(transformedNK)))))

# One logical column per type, plus "Other" for features matching none of them
dtCount = as.data.frame(sapply(catType, function(x) grepl(x, colnames(transformedNK))))
dtCount$Other = !apply(dtCount, 1, any)
dtCountIni = as.data.frame(sapply(catType, function(x) grepl(x, preFiltFt)))
dtCountIni$Other = !apply(dtCountIni, 1, any)
# Features per type before (CountIni) and after (Count) selection
dtCat = data.frame(CountIni = colSums(dtCountIni), 
                   Count = colSums(dtCount))
# Axis position, ordered category and percentage of kept features
dtCat$Order <- rank(dtCat$CountIni, ties.method = c("first"))
dtCat$Category <- factor(rownames(dtCat), levels=rownames(dtCat)[order(dtCat$CountIni)])
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100*dtCat$Ratio, 1), "%")
# Initial counts in the first colour, selected counts drawn over them in the second
gp <- ggplot(dtCat) + geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) +
              geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) + 
              geom_text(aes(x = Order, y = CountIni + 5, label = Ratio), hjust = 0) + coord_flip()
gp

In [None]:
# Save the per-type figure, short list (skipped in test mode)
if(!TEST_MODE){
    ggsave(filename = "Fig/PLL_NK_SelecFt_Type_Short.pdf", plot = gp, width = 10)
}

In [None]:
# Export list of features kept
if(!TEST_MODE){
    write.table(colnames(transformedNK), file = "Tab/PLL_NK_list_features.csv", col.names = F, row.names = F)
}

### Look at the morphological distribution of the fields of view

In [None]:
# Fix random number generation
# NOTE(review): set.seed() seeds the R generator; whether umap() below draws from it
# depends on its implementation, which is not visible here - confirm
set.seed(38)

In [None]:
# Embed the kept fields of view in dimUMAP dimensions
# (umap() is not defined in this file; presumably a helper loaded with source_python)
umTNK = umap(transformedNK, min_dist = 0.1, neighbors = 10, n = dimUMAP, metric = "euclidean")
umTNK = as.data.frame(umTNK)
names(umTNK) = paste0("UMAP", 1:dimUMAP)

In [None]:
# Attach the annotations of the kept fields to their UMAP coordinates
umTNK$Row <- as.factor(NK$Metadata_Row[fieldToKeep])
umTNK$Col <- as.factor(NK$Metadata_Column[fieldToKeep])
umTNK$URL <- as.factor(NK$URL_Actin[fieldToKeep])
umTNK$Gene <- as.factor(NK$Gene[fieldToKeep])
umTNK$shRNA <- as.factor(NK$shRNA[fieldToKeep])
umTNK$Group <- as.factor(NK$Group[fieldToKeep])

In [None]:
# Order levels so that "Control" is last
lvControl = which(levels(umTNK$Group) == "Control")
# setdiff() is used instead of a negative index: when "Control" is not among the
# levels, levels(...)[-integer(0)] is empty and every other level would be lost
umTNK$Group = factor(umTNK$Group, levels = c(setdiff(levels(umTNK$Group), "Control"), "Control"))

In [None]:
# First two UMAP dimensions of the fields of view, coloured by group
gp <- ggplot(umTNK) + geom_point(aes(UMAP1, UMAP2, color = Group))
gp

In [None]:
# Save the figure (skipped in test mode)
if(!TEST_MODE){
    ggsave(filename = "Fig/PLL_NK_UMAP_Plates.pdf", plot = gp, width = 10)
}

In [None]:
# Control fields keep their own gene name, all other fields are pooled as "shRNA"
umTNK$DrugOrControl <- as.factor(ifelse(umTNK$Group == "Control", as.character(umTNK$Gene), "shRNA"))
# Shape key: "2" for control fields, "1" for the others
umTNK$Shape = ifelse(umTNK$Group == "Control", "2", "1")

# Shape "1" is drawn with symbol 16 and shape "2" with symbol 1 (scale_shape_manual).
# The colour legend is reversed and its keys use symbol 1 for WT/NOTARGET and 16 otherwise.
# The shape legend is hidden with "none" (FALSE is deprecated in guides()).
gp <- ggplot(umTNK) + geom_point(aes(UMAP1, UMAP2, color = DrugOrControl, shape = Shape)) + 
    theme(legend.title = element_blank()) + guides(color = guide_legend(reverse = TRUE, 
                             override.aes = list(shape = ifelse(levels(umTNK$DrugOrControl) %in% c("WT", "NOTARGET"), 1, 16))),
                             shape = "none") + scale_shape_manual(values = c(16,1))
gp

In [None]:
# Save the figure (skipped in test mode)
if(!TEST_MODE){
    ggsave(filename = "Fig/PLL_NK_UMAP_Control.pdf", plot = gp, width = 10)
}

#### UMAP of morphological features

In [None]:
# Embed the features (transposed table: one row per feature) in 2 dimensions
umNKFT = umap(t(transformedNK), min_dist = 0.1, neighbors = 10, n = 2, metric = "euclidean")
umNKFT = as.data.frame(umNKFT)
names(umNKFT) = c("UMAP1", "UMAP2")

In [None]:
# Label each feature with the measurement type(s) found in its name;
# several matches are joined with "-", no match gives NA
catType = c("Granularity", "Texture", "AreaShape", "RadialDistribution",
            "Correlation", "Intensity")
umNKFT$type = apply(as.data.frame(sapply(catType, function(x) 
    grepl(x, colnames(transformedNK)))), 1, function(x) paste(catType[which(x)],collapse='-'))
umNKFT$type[umNKFT$type == ""] <- NA

In [None]:
# Same labelling with the channel name(s) found in the feature name
catChannel = c("CorrDNA", "CorrActin", "CorrPerf")
umNKFT$channel = apply(as.data.frame(sapply(catChannel, function(x) 
    grepl(x, colnames(transformedNK)))), 1, function(x) paste(catChannel[which(x)],collapse='-'))
umNKFT$channel[umNKFT$channel == ""] <- NA

In [None]:
# UMAP of the features, coloured by measurement type and shaped by channel
gp <- ggplot(umNKFT) + geom_point(aes(UMAP1, UMAP2, col = type, shape = channel), size = 3)
gp

In [None]:
# Save the figure (skipped in test mode)
if(!TEST_MODE){
    ggsave(filename = "Fig/PLL_NK_UMAP_Features.pdf", gp, width = 10)
}

### Distance to WT

In [None]:
# Median Robust Mahalanobis Distance (RMD) between the fields of one shRNA and the
# WT fields located on the same plate row, computed in the UMAP space.
#   shPert: shRNA label (a level of NK$shRNA)
# Returns NA (after printing the label) when either group holds fewer than
# 2*dimUMAP fields.
shRMD <- function(shPert){
    # Kept fields carrying this shRNA, and the plate row of the first of them
    isPert <- NK[fieldToKeep,]$shRNA == shPert
    rowPert <- NK[fieldToKeep[isPert],]$Metadata_Row[1]
    umapCols <- 1:dimUMAP
    setUmapSh <- umTNK[isPert, umapCols]
    setUmapWT <- umTNK[which(indWT)[NK[fieldToKeep[indWT],]$Metadata_Row == rowPert], umapCols]

    tooFew <- (nrow(setUmapSh) < 2*dimUMAP) || (nrow(setUmapWT) < 2*dimUMAP)
    if (tooFew){
        print(c(shPert))
        return(NA)
    }

    # Minimum Covariance Determinant fit of the WT fields, then distance of every
    # shRNA field to that fit
    wtFit <- covMcd(setUmapWT)
    distToWT <- apply(setUmapSh, 1, function(x) mahalanobis(x, wtFit$center, wtFit$cov))
    median(distToWT)
}

In [None]:
# Time the computation
s = Sys.time()
# All shRNA labels except those containing "WT"
levelSH = levels(NK$shRNA)[!grepl("WT", levels(NK$shRNA))]
# Observed distance to WT for every shRNA
shRMD_NK = sapply(levelSH, shRMD)
print(Sys.time() - s)

In [None]:
# Drop the elements of a list that contain only NA values.
#   y: a list
# Returns y without its all-NA elements; an empty list is returned unchanged
# (vapply guarantees a logical mask even when y is empty, whereas sapply
# returns list() and the negation then fails).
na.omit.list <- function(y) {
    onlyNA <- vapply(y, function(x) all(is.na(x)), logical(1))
    return(y[!onlyNA])
}

# Permutation distribution of the median Robust Mahalanobis Distance for one shRNA.
#   shPert: shRNA label (a level of NK$shRNA)
#   nbRep:  number of permutations
# Returns nbRep distances obtained after shuffling the shRNA/WT labels, or nbRep NA
# when either group holds fewer than 2*dimUMAP fields.
shuffShRMD <- function(shPert, nbRep = PERM_NB_ITER){
    # Find row containing this shRNA and split wells between WT and shRNA
    rowPert = NK[fieldToKeep[NK[fieldToKeep,]$shRNA == shPert],]$Metadata_Row[1]
    setUmapSh = umTNK[NK[fieldToKeep,]$shRNA == shPert,1:dimUMAP]
    setUmapWT = umTNK[which(indWT)[NK[fieldToKeep[indWT],]$Metadata_Row == rowPert],1:dimUMAP]
    # Pool both groups before shuffling
    setUMAP = rbind(setUmapSh, setUmapWT)
    
    if ((dim(setUmapSh)[1] < 2*dimUMAP)|(dim(setUmapWT)[1] < 2*dimUMAP)){
        return(rep(NA, nbRep))
    }
        

    # One permutation; the argument is only the iteration index (used in the debug print)
    shuffleRMD <- function(notUsed){
        shuffSetUMAP = setUMAP[sample(nrow(setUMAP)),]
        # Take random subsets of corresponding sizes
        shuffSetSh = shuffSetUMAP[1:nrow(setUmapSh),]
        shuffSetWT = shuffSetUMAP[(nrow(setUmapSh)+1):(nrow(setUmapSh)+nrow(setUmapWT)),]
        # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance
        mcdWT = covMcd(shuffSetWT)
        RMD = median(apply(shuffSetSh, 1, function(x) mahalanobis(x, mcdWT$center, mcdWT$cov)))
        # Sanity check: a distance should never be negative
        if(any(RMD < 0)){
            print("Negative distance! - Debugging needed.")
            print(c(shPert, notUsed, dim(shuffSetWT)))
            print(RMD)
        }
        return(RMD)
    }
    return(unlist(sapply(1:nbRep, shuffleRMD)))
}

In [None]:
# Time the permutations: one column of PERM_NB_ITER shuffled distances per shRNA
s = Sys.time()
shuff_shRMD_NK = sapply(levelSH, function(x) shuffShRMD(x))
print(Sys.time() - s)

In [None]:
# How many observed / shuffled distances could not be computed
table(is.na(shRMD_NK))
table(is.na(shuff_shRMD_NK))

In [None]:
# Empirical p-value of an observed statistic against its permutation values.
#   x: vector whose first element is the observed value and whose remaining
#      elements are the values obtained under permutation
# Returns the fraction of permutation values strictly greater than the observed
# one, or NA when the observed value is missing.
# NOTE(review): the result can be exactly 0; confirm this is acceptable downstream.
getRMPV <- function(x){
    observed <- x[1]
    if (is.na(observed)) {
        return(NA)
    }
    permCdf <- ecdf(x[-1])
    1 - permCdf(observed)
}
# Stack the observed distance (first row) on top of the shuffled ones and compute
# one empirical p-value per shRNA (column)
dfRMPV = data.frame(RMPV = apply(rbind(shRMD_NK, shuff_shRMD_NK), 2, getRMPV))

In [None]:
dfRMPV$shRNA = rownames(dfRMPV)
# Observed median distance to WT
dfRMPV$Strength = shRMD_NK
# FDR-adjusted p-values
dfRMPV$adjRMPV = p.adjust(dfRMPV$RMPV, method = "fdr")
# labelOut: name of the shRNAs with adjusted p-value above 0.05 or missing
dfRMPV$labelOut = ifelse((dfRMPV$adjRMPV > 0.05)|(is.na(dfRMPV$adjRMPV)), dfRMPV$shRNA, '')
# labelIn: name of the remaining shRNAs (adjusted p-value at most 0.05)
dfRMPV$labelIn = ifelse(!((dfRMPV$adjRMPV > 0.05)|(is.na(dfRMPV$adjRMPV))), dfRMPV$shRNA, '')

In [None]:
# Distance to WT against adjusted p-value, with a dashed line at 0.05;
# the labels drawn are those stored in labelOut
gp <- ggplot(dfRMPV) + geom_point(aes(adjRMPV, Strength, color = shRNA)) + geom_vline(xintercept = 0.05, color="#CCCCCC", linetype="dashed") +
                    theme(legend.position="none") + geom_label_repel(aes(adjRMPV, Strength, label = labelOut))
gp

In [None]:
# Save the figure (skipped in test mode)
if(!TEST_MODE){
    ggsave(filename = "Fig/PLL_NK_RMPV.pdf", gp, width = 10)
}

### Morphological changes upon perturbation

In [None]:
# Associate categories and colors to features
catType = c("Granularity", "Texture", "AreaShape", "RadialDistribution",
            "Intensity")
# One colour per category plus one for "Other"
colType = cemm_pal(length(catType)+1)

# One logical column per category, plus "Other" for features matching none of them
dtCount = as.data.frame(sapply(catType, function(x) grepl(x, colnames(transformedNK))))
dtCount$Other = !apply(dtCount, 1, any)
 
# Category name of every feature
# NOTE(review): assumes each feature matches exactly one category; with several
# matches apply() would return a list - confirm
ftCat = as.factor(apply(dtCount, 1, function(x) colnames(dtCount)[which(x)]))
# Indexing with a factor uses its integer codes, so colours follow the order of
# levels(ftCat), not the order of catType
ftCol = colType[ftCat]

In [None]:
# Plot number of features / category (useful to get fill color legend)
# The fill colours are set explicitly to the ones stored in ftCol (colType indexed by
# the level codes of ftCat); the default discrete fill scale uses another palette, so
# the legend would not match the bars of the per-shRNA figures.
catColors <- setNames(colType[seq_len(nlevels(ftCat))], levels(ftCat))
gp <- ggplot(data.frame(Category = ftCat), aes(fill = Category)) + geom_bar(aes(Category)) +
      scale_fill_manual(values = catColors)
if(!TEST_MODE){
    ggsave(filename = "Fig/PLL_NK_CatCount.pdf", plot = gp, width = 10)
}

In [None]:
# Which shRNAs should be studied? Those with a non-empty labelIn,
# i.e. an adjusted p-value of at most 0.05
sigSh = unique(dfRMPV$labelIn)
sigSh = sigSh[sigSh != ""]

In [None]:
# Bar plot of the features that change the most between one shRNA and the WT fields
# of the same plate row.
#   shPert: shRNA label (a level of NK$shRNA)
#   nbFt:   maximum number of features displayed (default 12, the former fixed value)
# The plot is printed and, outside test mode, saved to Fig/PLL_NK_<shPert>_ChangeFt.pdf.
changedFtDrug <- function(shPert, nbFt = 12){  
    # Plate row on which the shRNA is
    rowPert = NK[fieldToKeep[NK[fieldToKeep,]$shRNA == shPert],]$Metadata_Row[1]
    
    # Morphological features for the shRNA and the WT control on the same row
    setSh = transformedNK[NK[fieldToKeep,]$shRNA == shPert,]
    setWT = transformedNK[which(indWT)[NK[fieldToKeep[indWT],]$Metadata_Row == rowPert],]
    
    medianSh = apply(setSh, 2, median)
    medianWT = apply(setWT, 2, median)
    medianChange = medianSh - medianWT
    
    # Compute difference to WT and keep the largest absolute changes.
    # head() never returns more indices than there are features, whereas [1:12]
    # pads the selection with NA when fewer than 12 features are available.
    ftToDisplay = head(order(abs(medianChange), decreasing = TRUE), nbFt)
    dtChange = data.frame(Difference = medianChange[ftToDisplay])
    dtChange$Feature = rownames(dtChange)
    # Order the bars by signed difference
    dtChange$Feature = factor(dtChange$Feature, levels = dtChange$Feature[order(dtChange$Difference)]) 
    # Colours come from ftCol and are used as they are (scale_fill_identity)
    dtChange$Category = ftCol[ftToDisplay]

    gp <- ggplot(dtChange) + geom_bar(aes(Feature, weight = Difference, fill = Category), color = "#CCCCCC") + 
                   coord_flip() + ylab("Difference to WT") + scale_fill_identity() +
                   theme(legend.position="none", plot.margin=unit(c(0,5,0,0), "mm"))
    print(gp)
    if(!TEST_MODE){
        ggsave(filename = paste("Fig/PLL_NK", shPert, "ChangeFt.pdf", sep = "_"), plot = gp, width = 10)
    }
}

In [None]:
# Draw (and save outside test mode) the feature-change plot of every selected shRNA
sapply(sigSh, changedFtDrug)

## Comparisons to WT

In [None]:
# Feature table of the shRNA plates
# NOTE(review): presumably exported by a companion notebook; comparePLLStimulated
# relies on its column order - confirm
subsetNKsh = read.csv("Tab/NK_shRNA_features.csv")

In [None]:
# Violin plot comparing one feature of the WT fields between the PLL plate and the
# plates stored in subsetNKsh.
#   ft: name of a column of subsetNK (e.g. "CellArea")
# Returns the ggplot object. Stops when ft does not match exactly one column of
# subsetNK.
comparePLLStimulated <- function(ft){
    ftID = which(colnames(subsetNK) == ft)
    # An unknown feature name would otherwise give an empty column selection
    if (length(ftID) != 1){
        stop("Feature '", ft, "' must match exactly one column of subsetNK", call. = FALSE)
    }
    # Get choosen feature ft for WT on the PLL plate
    DFT = data.frame(Ft = subsetNK[grep("WT", subsetNK[,2]),c(ftID)])
    # Values are stored as levels/characters instead of numerical values
    DFT$Ft = as.numeric(as.character(DFT$Ft))
    DFT$Plate = "PLL"

    # Get choosen feature ft for WT on treated plates
    # NOTE(review): assumes subsetNKsh has the columns of subsetNK shifted by one,
    # with the plate in column 2 and the shRNA in column 3 - confirm
    DFTT = data.frame(subsetNKsh[grep("WT", subsetNKsh[,3]),c(ftID+1,2)])
    DFTT[,1] = as.numeric(as.character(DFTT[,1]))
    names(DFTT) = names(DFT)

    DFT = rbind(DFT, DFTT)

    # We compare all treated plates to PLL with an unpaired Wilcoxon test
    # (rank-sum test; no pairing is requested from stat_compare_means)
    # comp = compare_means(Ft ~ Plate, data = DFT, method = "wilcox.test", paired = FALSE, ref.group = "PLL",
    #                      p.adjust.method = "bonferroni")
    
    # NOTE(review): hard-coded; must equal the number of plates compared to PLL
    nbPlates = 3 # Number of comparisons
    # We do manually a p-value adjustment (Bonferroni correction) in the signficance level displayed
    symnum.args <- list(
        cutpoints = c(0, 0.0001/nbPlates, 0.001/nbPlates, 0.01/nbPlates, 0.05/nbPlates, 1),
        symbols = c("<0.0001", "<0.001", "<0.01", "<0.05", "ns")
    )
    
    # The fill legend is hidden with "none" (FALSE is deprecated in guides())
    gp <- ggplot(DFT, aes(x = Plate, y = Ft)) + geom_violin(aes(fill = Plate)) + ylab(ft) + guides(fill = "none") + 
          stat_compare_means(label = "p.signif", symnum.args = symnum.args, method = "wilcox.test", ref.group = "PLL")
  
    return(gp)
}

In [None]:
# Example: cell area of WT cells on PLL against the other plates
print(comparePLLStimulated("CellArea"))

In [None]:
# Save the comparison for a few features (skipped in test mode)
if(!TEST_MODE){
    ftToPlot = c("NucleusArea", "CellArea", "ActinIntensity")
    for (ft in ftToPlot){
        gp <- comparePLLStimulated(ft)
        ggsave(filename = paste("Fig/PLL_NK", ft, "TreatmentEffect.pdf", sep = "_"), plot = gp, width = 10)
    }
}

## Jurkat

In [None]:
# Fix R random number generation (the original note also mentions a t-SNE layout)
set.seed(38)

In [None]:
# Load the measurement table of the Jurkat PLL plates
# (rows are treated as fields of view in the rest of the analysis)
JK = read.csv("./allImages_JK_PLL.csv", header=T)

In [None]:
# All two-digit groups found in the actin image URL, one matrix column per group
# NOTE(review): assumes every URL holds exactly three two-digit groups
# (row, column, field) - confirm
# NOTE(review): unlike the NK section, no Plate column is derived here, although
# the normalisation step further down iterates over levels(JK$Plate)
Position <- str_extract_all(JK$URL_Actin, '\\d{2}', simplify = T)
colnames(Position) <- c("Row", "Column", "Field")
JK <- cbind(JK, Position)

In [None]:
# Plate layout: first column used as row names, strings kept as characters
wellAnnotation = read.csv("shRNA_Jurkat_platelayout.csv", stringsAsFactors=F, row.names = 1 )
# Drop empty last row
wellAnnotation = wellAnnotation[1:8,]

In [None]:
# Reshape the layout into one line per well; unlist() goes column by column,
# hence rows repeated as a block and columns repeated 8 times each
# NOTE(review): assumes the layout holds exactly 11 columns of 8 rows - confirm
wellAnnotation = data.frame(Gene = unlist(wellAnnotation),
                            Row = rep(rownames(wellAnnotation) , 11),
                            Col = rep(1:11, each = 8))

In [None]:
# Well name with a zero-padded column number, e.g. "A1" becomes "A01"
wellAnnotation$Well <- sub("([A-H])(\\d$)", "\\10\\2", paste0(wellAnnotation$Row, wellAnnotation$Col))

In [None]:
# Gene targeted in a given well: first column of the matching wellAnnotation row(s)
getGene <- function(well){
    isWell <- wellAnnotation$Well == well
    wellAnnotation[isWell, 1]
}
JK$Gene <- as.factor(sapply(JK$Metadata_Well, getGene))

## Visualize full dataset

### Cell count

In [None]:
# Histogram of the filtered nucleus count (rows with a missing count are left out),
# filled by plate row
gpJK <- ggplot(JK[!is.na(JK$Count_FilteredNucleus), ]) +
    geom_histogram(
        aes(Count_FilteredNucleus, fill = as.factor(Metadata_Row)),
        binwidth = 2
    ) +
    scale_fill_discrete(name = "Row")
gpJK

# The figure is only written outside test mode
if (!TEST_MODE) {
    ggsave(filename = "Fig/JK_PLL_count.pdf", plot = gpJK)
}

In [None]:
# Histogram of the mean cytoplasm area (rows with a missing value are left out),
# filled by plate row
gpJK <- ggplot(JK[!is.na(JK$Mean_FilterCytoplasm_AreaShape_Area), ]) +
    geom_histogram(
        aes(Mean_FilterCytoplasm_AreaShape_Area, fill = as.factor(Metadata_Row)),
        binwidth = 100
    ) +
    scale_fill_discrete(name = "Row")
gpJK
# The figure is only written outside test mode
if (!TEST_MODE) {
    ggsave(filename = "Fig/JK_PLL_area.pdf", plot = gpJK)
}

In [None]:
# Jittered scatter plot of the filtered nucleus count against the raw nucleus count,
# coloured by plate row
ggplot(JK) + geom_point(aes(Count_Nucleus, Count_FilteredNucleus, color = as.factor(Metadata_Row)), position = "jitter") +
       scale_color_discrete(name="Row")

## Filtering

In [None]:
# Thresholds used by the Jurkat filtering steps below
FILT_MAX_INT_DNA = 0.05 # Remove empty images and small DNA precipitations
FILT_MIN_CELLS = 2
FILT_NB_MAX_NA_IMAGE = 353 # Most images generate up to 352 missing values (no LFA-1 granules)
FILT_MAX_CORR = 0.6 # Keep uncorrelated variables
# Number of UMAP dimensions
dimUMAP = 3

In [None]:
# Start from all column indices of JK
ftToKeep = 1:dim(JK)[2]
# Make sure that the fields are numeric
# (columns whose class is "integer" are therefore excluded as well)
ftToKeep <- ftToKeep[which(sapply(JK[,ftToKeep], class) == "numeric")]
# Remove execution time and count features
ftToKeep <- ftToKeep[grep("(Execution)|(Count)", colnames(JK)[ftToKeep], invert = T)]

In [None]:
# Store all remaining features before filtering for downstream comparison to selected set
preFiltFt = colnames(JK)[ftToKeep]

In [None]:
# Remove fields of view with low max DNA intensity
fieldToKeep <- which(JK$ImageQuality_MaxIntensity_DNA >= FILT_MAX_INT_DNA)
# Remove fields of view with low cell count
fieldToKeep <- fieldToKeep[JK[fieldToKeep,]$Count_FilteredNucleus >= FILT_MIN_CELLS]

In [None]:
# Few bad quality pictures are generating a lot of missing values and are removed
fieldToKeep <- fieldToKeep[rowSums(is.na(JK[fieldToKeep,ftToKeep])) < FILT_NB_MAX_NA_IMAGE]
# Remove remaining features with missing values
ftToKeep <- ftToKeep[colSums(is.na(JK[fieldToKeep,ftToKeep])) == 0] 
# Remove constant columns
# indWT flags, among the kept fields, those annotated as WT
# (this replaces the indWT vector of the NK section)
indWT = JK[fieldToKeep,]$Gene == "WT"
# A feature is dropped when its mad is 0 over all kept fields ...
ftToKeep <- ftToKeep[sapply(JK[fieldToKeep,ftToKeep], function(x) mad(x) != 0)]
# ... or over the WT fields only
ftToKeep <- ftToKeep[sapply(JK[fieldToKeep[indWT],ftToKeep], function(x) mad(x) != 0)]

In [None]:
# Shifted log transform used to approximate a normal distribution:
# the smallest value of x is mapped to log(1) = 0.
transfLog <- function(x) {
    lowest <- min(x)
    log(x + 1 - lowest)
}
# Apply transfLog to every kept feature (column-wise) over the kept fields
transformedJK <- as.data.frame(
    apply(X = JK[fieldToKeep, ftToKeep], MARGIN = 2, FUN = transfLog)
)

In [None]:
# Center and scale x on reference (control) values y:
# subtract the median of y, then divide by the MAD of y.
transfNorm <- function(x, y) {
    centre <- median(y)
    spread <- mad(y)
    (x - centre) / spread
}
# Normalise every plate separately against its own WT fields.
# Plates are enumerated from the fields that were actually kept: this also works when
# JK$Plate is a character column (levels() would return NULL and the loop would
# silently do nothing) and never visits a plate left without any field.
plateOfField <- as.character(JK$Plate[fieldToKeep])
for (plate in unique(plateOfField[!is.na(plateOfField)])) {
    # Fields of the current plate (rows of transformedJK); fields without a plate are ignored
    indPlateField <- !is.na(plateOfField) & plateOfField == plate
    # WT fields among the fields of the current plate
    indWTinPlate <- JK$Gene[fieldToKeep[indPlateField]] == "WT"

    # drop = FALSE keeps a data frame even when a single feature is left
    transformedJK[indPlateField, ] <- apply(
        transformedJK[indPlateField, , drop = FALSE], 2,
        function(x) transfNorm(x, x[indWTinPlate]))
}

In [None]:
# Remove columns with NA, i.e.
# features with mad == 0 for 1 plate or more
noNAFt <- colSums(is.na(transformedJK)) == 0
ftToKeep <- ftToKeep[noNAFt]
# drop = FALSE: transformedJK must stay a data frame even if a single feature survives
transformedJK <- transformedJK[, noNAFt, drop = FALSE]

In [None]:
# Rank the features from the largest to the smallest MAD over all kept fields.
# Each feature was scaled to mad(WT) = 1 on every plate, so this ranking reflects how much
# more variable a feature is across all fields than across the WT fields.
featureMad <- apply(transformedJK, 2, mad)
orderFt <- rev(order(featureMad))

In [None]:
# uncorrelate() receives the feature ranking as 0-based positions (orderFt - 1);
# its result is flattened and shifted by 1 to obtain 1-based column indices.
# NOTE(review): uncorrelate() is presumably one of the Python helpers loaded with
# source_python() - confirm its exact selection rule there.
uncorrFt <- unlist(
    uncorrelate(transformedJK, orderCol = orderFt - 1, threshold = FILT_MAX_CORR)
) + 1

In [None]:
# Keep only the features returned by uncorrelate().
# drop = FALSE: stay a data frame even if a single feature was selected.
transformedJK <- transformedJK[, uncorrFt, drop = FALSE]

In [None]:
# Display the dimensions of transformedJK (kept fields x selected features)
dim(transformedJK)

### Export subset of features

For Jurkat cells, the following features are selected and explored separately for their biological interpretability:
```
* Actin intensity/cell (mean/well): JK$Intensity_MeanIntensity_CorrActin_FilterCytoplasm
* Cell area: JK$Mean_FilterCytoplasm_AreaShape_Area
* Cell roundness: JK$Mean_FilterCytoplasm_AreaShape_FormFactor
* Cell width: JK$Mean_FilterCytoplasm_AreaShape_MajorAxisLength 
* Cell length: JK$Mean_FilterCytoplasm_AreaShape_MinorAxisLength
* Cell length to width ratio: JK$Mean_FilterCytoplasm_AreaShape_MinorAxisLength / JK$Mean_FilterCytoplasm_AreaShape_MajorAxisLength
* Average number of LFA1 granules / cell: JK$Count_PerfGranules  / JK$Count_FilterCytoplasm
* LFA1 area / cell area: (JK$Count_PerfGranules * JK$Mean_PerfGranules_AreaShape_Area)  / (JK$Count_FilterCytoplasm * JK$Mean_FilterCytoplasm_AreaShape_Area)
* LFA1 intensity: JK$Intensity_MeanIntensity_CorrPerf_FilterCytoplasm
* LFA1 area: JK$Mean_PerfGranules_AreaShape_Area
* Nucleus intensity: JK$Intensity_MeanIntensity_CorrDNA_FilteredNucleus
* Nucleus area: JK$Mean_FilteredNucleus_AreaShape_Area
* Nucleus roundness: JK$Mean_FilteredNucleus_AreaShape_FormFactor
* Nucleus width: JK$Mean_FilteredNucleus_AreaShape_MajorAxisLength 
* Nucleus length: JK$Mean_FilteredNucleus_AreaShape_MinorAxisLength
* Nucleus ratio: JK$Mean_FilteredNucleus_AreaShape_MinorAxisLength / JK$Mean_FilteredNucleus_AreaShape_MajorAxisLength
* Nucleus area / cell area: (JK$Count_FilteredNucleus * JK$Mean_FilteredNucleus_AreaShape_Area)  / (JK$Count_FilterCytoplasm * JK$Mean_FilterCytoplasm_AreaShape_Area)
```

NB (from CellProfiler docs): FormFactor = $4 \times \pi \times \mathrm{Area} / \mathrm{Perimeter}^2$. It equals 1 for a perfectly circular object.

In [None]:
# Table of the hand-picked, interpretable features, one row per kept field.
# Built with data.frame() instead of cbind(): cbind() on a mix of character and
# numeric vectors returns a character matrix, which turned every measurement into
# a string (and into a quoted value in the exported CSV).
# with() resolves the column names inside the kept rows of JK, so the row subset
# is taken once instead of once per column, and a missing column is an error
# instead of being silently dropped.
# NOTE(review): "Width" is taken from MajorAxisLength and "Length" from
# MinorAxisLength, as in the feature list above; confirm this naming is intended.
subsetJK <- with(JK[fieldToKeep, ], data.frame(
    Field = str_extract(as.character(URL_Actin), "r..c..f.."),
    Gene = as.character(Gene),
    ActinIntensity = Intensity_MeanIntensity_CorrActin_FilterCytoplasm,
    CellArea = Mean_FilterCytoplasm_AreaShape_Area,
    CellRoundness = Mean_FilterCytoplasm_AreaShape_FormFactor,
    CellWidth = Mean_FilterCytoplasm_AreaShape_MajorAxisLength,
    CellLength = Mean_FilterCytoplasm_AreaShape_MinorAxisLength,
    CellLengthOverWidth = Mean_FilterCytoplasm_AreaShape_MinorAxisLength /
        Mean_FilterCytoplasm_AreaShape_MajorAxisLength,
    Lfa1GranulesPerCell = Count_PerfGranules / Count_FilterCytoplasm,
    Lfa1AreaOverCellArea = (Count_PerfGranules * Mean_PerfGranules_AreaShape_Area) /
        (Count_FilterCytoplasm * Mean_FilterCytoplasm_AreaShape_Area),
    Lfa1Intensity = Intensity_MeanIntensity_CorrPerf_FilterCytoplasm,
    Lfa1Area = Mean_PerfGranules_AreaShape_Area,
    NucleusIntensity = Intensity_MeanIntensity_CorrDNA_FilteredNucleus,
    NucleusArea = Mean_FilteredNucleus_AreaShape_Area,
    NucleusRoundness = Mean_FilteredNucleus_AreaShape_FormFactor,
    NucleusWidth = Mean_FilteredNucleus_AreaShape_MajorAxisLength,
    NucleusLength = Mean_FilteredNucleus_AreaShape_MinorAxisLength,
    NucleusLengthOverWidth = Mean_FilteredNucleus_AreaShape_MinorAxisLength /
        Mean_FilteredNucleus_AreaShape_MajorAxisLength,
    NucleusAreaOverCellArea = (Count_FilteredNucleus * Mean_FilteredNucleus_AreaShape_Area) /
        (Count_FilterCytoplasm * Mean_FilterCytoplasm_AreaShape_Area),
    stringsAsFactors = FALSE
))

In [None]:
# Write the table of "interpretable" features to disk (skipped in test mode)
if (!TEST_MODE) {
    write.csv(subsetJK, file = "Tab/JK_PLL_features.csv", row.names = FALSE)
}

In [None]:
# Total Count_FilterCytoplasm over the kept fields, per well and per gene
# (missing counts are ignored in the sums)
CountPerWell <- setNames(
    aggregate(JK$Count_FilterCytoplasm[fieldToKeep],
              by = list(JK$Metadata_Well[fieldToKeep]),
              FUN = sum, na.rm = TRUE),
    c("Well", "Count")
)
CountPerGene <- setNames(
    aggregate(JK$Count_FilterCytoplasm[fieldToKeep],
              by = list(JK$Gene[fieldToKeep]),
              FUN = sum, na.rm = TRUE),
    c("Gene", "Count")
)
# Reminder: Only a single shRNA per gene on the Jurkat PLL plate

In [None]:
# Write the per-well and per-gene cell counts to disk (skipped in test mode)
if (!TEST_MODE) {
    write.csv(CountPerWell, file = "Tab/JK_PLL_count_well.csv", row.names = FALSE)
    write.csv(CountPerGene, file = "Tab/JK_PLL_count_gene.csv", row.names = FALSE)
}

### Look at which types of features are kept

In [None]:
catChannel <- c("CorrDNA", "CorrActin", "CorrPerf")
# Number of channel keywords found in the name of each selected feature
table(rowSums(sapply(catChannel, function(x) grepl(x, colnames(transformedJK)))))
# Per-channel feature counts before (CountIni) and after (Count) selection.
# Both vectors are left in catChannel order: sorting each of them independently
# would pair a channel's initial count with another channel's selected count
# whenever the two rankings differ. The display order is handled below through
# Order and the levels of Category.
dtCat <- data.frame(
    CountIni = colSums(sapply(catChannel, function(x) grepl(x, preFiltFt))),
    Count = colSums(sapply(catChannel, function(x) grepl(x, colnames(transformedJK))))
)

# Position of each channel on the axis (increasing initial count)
dtCat$Order <- rank(dtCat$CountIni, ties.method = "first")
dtCat$Category <- factor(rownames(dtCat), levels = rownames(dtCat)[order(dtCat$CountIni)])
# Share of the initial features that were selected, as a percentage label
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100 * dtCat$Ratio, 1), "%")
# Initial counts drawn first, selected counts on top, ratio printed next to each bar
gp <- ggplot(dtCat) +
    geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) +
    ylim(c(0, 675)) +
    geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) +
    geom_text(aes(x = Order, y = CountIni + 2, label = Ratio), hjust = 0) +
    coord_flip()
gp

In [None]:
# Save the per-channel selection figure (skipped in test mode)
if (!TEST_MODE) {
    ggsave(filename = "Fig/PLL_JK_SelecFt_Channel.pdf",
           plot = gp,
           width = 10)
}

In [None]:
catObjects <- c("ActinGranules", "FilterCytoplasm", "ShrunkenCytoplasm", "FilteredNucleus", "PerfGranules")
# Feature x object matrices telling which object keyword appears in which feature name,
# for the selected features and for the initial feature list
objSelected <- sapply(catObjects, function(pattern) grepl(pattern, colnames(transformedJK)))
objInitial <- sapply(catObjects, function(pattern) grepl(pattern, preFiltFt))
# Number of object keywords found in the name of each selected feature
table(rowSums(objSelected))
dtCat <- data.frame(CountIni = colSums(objInitial),
                    Count = colSums(objSelected))

# Axis position (increasing initial count) and percentage of features kept per object
dtCat$Order <- rank(dtCat$CountIni, ties.method = "first")
dtCat$Category <- factor(rownames(dtCat), levels = rownames(dtCat)[order(dtCat$CountIni)])
dtCat$Ratio <- paste0(round(100 * (dtCat$Count / dtCat$CountIni), 1), "%")
# Initial counts drawn first, selected counts on top, ratio printed next to each bar
gp <- ggplot(dtCat) +
    geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) +
    ylim(c(0, 510)) +
    geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) +
    geom_text(aes(x = Order, y = CountIni + 2, label = Ratio), hjust = 0) +
    coord_flip()
gp

In [None]:
# Save the per-object selection figure (skipped in test mode)
if (!TEST_MODE) {
    ggsave(filename = "Fig/PLL_JK_SelecFt_Object.pdf",
           plot = gp,
           width = 10)
}

In [None]:
catType <- c("Threshold", "Granularity", "ImageQuality", "Texture", "Distance", "AreaShape",
             "RadialDistribution", "Neighbors", "Correlation", "Intensity", "Overlap", "Location")
# Feature x type matrices telling which type keyword appears in which feature name,
# for the selected features and for the initial feature list
typeSelected <- sapply(catType, function(pattern) grepl(pattern, colnames(transformedJK)))
typeInitial <- sapply(catType, function(pattern) grepl(pattern, preFiltFt))
which(rowSums(typeSelected) == 0) # All features are covered

dtCat <- data.frame(CountIni = colSums(typeInitial),
                    Count = colSums(typeSelected))
# Axis position (increasing initial count) and percentage of features kept per type
dtCat$Order <- rank(dtCat$CountIni, ties.method = "first")
dtCat$Category <- factor(rownames(dtCat), levels = rownames(dtCat)[order(dtCat$CountIni)])
dtCat$Ratio <- paste0(round(100 * (dtCat$Count / dtCat$CountIni), 1), "%")
# Initial counts drawn first, selected counts on top, ratio printed next to each bar
gp <- ggplot(dtCat) +
    geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) +
    geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) +
    geom_text(aes(x = Order, y = CountIni + 5, label = Ratio), hjust = 0) +
    coord_flip()
gp

In [None]:
# Save the figure covering all feature types (skipped in test mode)
if (!TEST_MODE) {
    ggsave(filename = "Fig/PLL_JK_SelecFt_Type_All.pdf",
           plot = gp,
           width = 10)
}

In [None]:
catType <- c("Granularity", "Texture", "AreaShape", "RadialDistribution", "Correlation", "Intensity")
# Number of type keywords found in the name of each selected feature
table(rowSums(sapply(catType, function(pattern) grepl(pattern, colnames(transformedJK)))))

# One logical column per type; "Other" flags the features matching none of them
dtCount <- as.data.frame(sapply(catType, function(pattern) grepl(pattern, colnames(transformedJK))))
dtCount$Other <- rowSums(dtCount) == 0
dtCountIni <- as.data.frame(sapply(catType, function(pattern) grepl(pattern, preFiltFt)))
dtCountIni$Other <- rowSums(dtCountIni) == 0
dtCat <- data.frame(CountIni = colSums(dtCountIni),
                    Count = colSums(dtCount))
# Axis position (increasing initial count) and percentage of features kept per type
dtCat$Order <- rank(dtCat$CountIni, ties.method = "first")
dtCat$Category <- factor(rownames(dtCat), levels = rownames(dtCat)[order(dtCat$CountIni)])
dtCat$Ratio <- paste0(round(100 * (dtCat$Count / dtCat$CountIni), 1), "%")
# Initial counts drawn first, selected counts on top, ratio printed next to each bar
gp <- ggplot(dtCat) +
    geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) +
    geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) +
    geom_text(aes(x = Order, y = CountIni + 5, label = Ratio), hjust = 0) +
    coord_flip()
gp

In [None]:
# Save the figure covering the main feature types plus "Other" (skipped in test mode)
if (!TEST_MODE) {
    ggsave(filename = "Fig/PLL_JK_SelecFt_Type_Short.pdf",
           plot = gp,
           width = 10)
}

In [None]:
# Write the names of the selected features, one per line, without header or row names
# (skipped in test mode)
if (!TEST_MODE) {
    write.table(colnames(transformedJK),
                file = "Tab/PLL_JK_list_features.csv",
                col.names = FALSE,
                row.names = FALSE)
}

### Look at the morphological distribution of the fields of view

In [None]:
# Fix random number generation
# NOTE(review): set.seed() seeds R's own generator only; if umap() below is one of the
# Python helpers loaded with source_python(), confirm that it is seeded as well
set.seed(38)

In [None]:
# Embed the fields of view (rows of transformedJK) in dimUMAP dimensions
# and name the resulting columns UMAP1, UMAP2, ...
umTJK <- as.data.frame(
    umap(transformedJK, min_dist = 0.1, neighbors = 10, n = dimUMAP, metric = "euclidean")
)
colnames(umTJK) <- paste0("UMAP", seq_len(dimUMAP))

In [None]:
# Attach plate position, image URL and gene of every kept field to its UMAP coordinates
keptMeta <- JK[fieldToKeep, ]
umTJK[["Row"]] <- as.factor(keptMeta$Metadata_Row)
umTJK[["Col"]] <- as.factor(keptMeta$Metadata_Column)
umTJK[["URL"]] <- as.factor(keptMeta$URL_Actin)
umTJK[["Gene"]] <- as.factor(keptMeta$Gene)

In [None]:
# First two UMAP dimensions of the fields of view, coloured by gene
gp <- ggplot(umTJK) +
    geom_point(aes(UMAP1, UMAP2, color = Gene))
gp

In [None]:
# Save the gene-coloured UMAP figure (skipped in test mode)
if (!TEST_MODE) {
    ggsave(filename = "Fig/PLL_JK_UMAP_Plates.pdf",
           plot = gp,
           width = 10)
}

In [None]:
# Two-level label: "WT" for control fields, "shRNA" for every other gene
controlLabel <- ifelse(umTJK$Gene == "WT", "WT", "shRNA")
umTJK$DrugOrControl <- as.factor(controlLabel)

# Same embedding, with colour and point shape both encoding WT vs. shRNA;
# the legend has no title and lists its entries in reverse order
gp <- ggplot(umTJK) +
    geom_point(aes(UMAP1, UMAP2, color = DrugOrControl, shape = DrugOrControl)) +
    theme(legend.title = element_blank()) +
    guides(color = guide_legend(reverse = TRUE),
           shape = guide_legend(reverse = TRUE)) +
    scale_shape_manual(values = c(16, 1))
gp

In [None]:
# Save the WT vs. shRNA UMAP figure (skipped in test mode)
if (!TEST_MODE) {
    ggsave(filename = "Fig/PLL_JK_UMAP_Control.pdf",
           plot = gp,
           width = 10)
}

#### UMAP of morphological features

In [None]:
# Embed the selected features (columns of transformedJK, hence the transpose) in 2 dimensions
umJKFT <- as.data.frame(
    umap(t(transformedJK), min_dist = 0.1, neighbors = 10, n = 2, metric = "euclidean")
)
colnames(umJKFT) <- c("UMAP1", "UMAP2")

In [None]:
catType <- c("Granularity", "Texture", "AreaShape", "RadialDistribution", "Correlation", "Intensity")
# One logical column per type keyword, one row per selected feature
typeHits <- as.data.frame(
    sapply(catType, function(pattern) grepl(pattern, colnames(transformedJK)))
)
# Label of a feature = its matching type keywords joined by "-"; NA when none matches
umJKFT$type <- apply(typeHits, 1, function(hit) paste(catType[which(hit)], collapse = "-"))
umJKFT$type[!nzchar(umJKFT$type)] <- NA

In [None]:
catChannel <- c("CorrDNA", "CorrActin", "CorrPerf")
# One logical column per channel keyword, one row per selected feature
channelHits <- as.data.frame(
    sapply(catChannel, function(pattern) grepl(pattern, colnames(transformedJK)))
)
# Label of a feature = its matching channel keywords joined by "-"; NA when none matches
umJKFT$channel <- apply(channelHits, 1, function(hit) paste(catChannel[which(hit)], collapse = "-"))
umJKFT$channel[!nzchar(umJKFT$channel)] <- NA

In [None]:
# Feature embedding: colour encodes the feature type, point shape the channel
gp <- ggplot(umJKFT) +
    geom_point(aes(UMAP1, UMAP2, col = type, shape = channel), size = 3)
gp

In [None]:
# Save the feature UMAP figure (skipped in test mode)
if (!TEST_MODE) {
    ggsave(filename = "Fig/PLL_JK_UMAP_Features.pdf",
           plot = gp,
           width = 10)
}

In [None]:
# Print the R version and the attached/loaded packages used for this run
sessionInfo()