# NK plates analysis

## Preliminary settings

In [None]:
library(ggplot2)
library(extrafont)
library(stringr)
library(heatmaply)
library(ggrepel)
library(reticulate)
library(gridExtra)
library(robustbase)

In [None]:
# Make the Python helpers from reticulate_functions.py callable from R
reticulate::source_python("reticulate_functions.py")
# Load the extra fonts (extrafont package)
extrafont::loadfonts()

In [None]:
# TRUE: faster compilation; FALSE: more precise results and all outputs
# (the ggsave() figure exports in this file only run when this is FALSE)
TEST_MODE <- FALSE

In [None]:
# Customize ggplot appearance

# Base theme: theme_light() with the minor grid removed, dark-grey Arial text
# and lines, a faint grey legend background and no legend key background
customTheme <- theme_light() +
    theme(
        panel.grid.minor = element_blank(),
        text = element_text(size = 17, family = "Arial", colour = "#333333"),
        line = element_line(colour = "#333333"),
        legend.background = element_rect(fill = alpha("#CCCCCC", 0.1)),
        legend.key = element_blank()
    )

# Change default colors

# Continuous colour scale, defined under the same name as ggplot2's function.
#   ...                   : forwarded to the underlying ggplot2 scale
#   begin, end, direction : forwarded to scale_colour_viridis_c()
#   option                : viridis palette name ("plasma" by default)
#   type                  : "viridis" or "gradient"; any other value is an
#                           error. Defaults to the "ggplot2.continuous.colour"
#                           option, or "viridis" when that option is unset.
scale_colour_continuous <- function(..., begin = 0.1, end = 0.9, direction = -1,
                                    option = "plasma",
                                    type = getOption("ggplot2.continuous.colour",
                                                     default = "viridis")) {
    switch(
        type,
        gradient = scale_colour_gradient(...),
        viridis = scale_colour_viridis_c(
            option = option, begin = begin, end = end, direction = direction, ...
        ),
        stop("Unknown scale type", call. = FALSE)
    )
}
# American-spelling alias
scale_color_continuous <- scale_colour_continuous

# Continuous fill scale, defined under the same name as ggplot2's function.
#   ...                   : forwarded to the underlying ggplot2 scale
#   begin, end, direction : forwarded to scale_fill_viridis_c()
#   option                : viridis palette name ("plasma" by default)
#   type                  : "viridis" or "gradient"; any other value is an
#                           error. The default is read from the
#                           "ggplot2.continuous.fill" option; when that is
#                           unset it falls back to "ggplot2.continuous.colour"
#                           (the only option consulted previously) and finally
#                           to "viridis".
scale_fill_continuous <- function(..., begin = 0.1, end = 0.9, direction = -1,
                                  option = "plasma",
                                  type = getOption(
                                      "ggplot2.continuous.fill",
                                      default = getOption("ggplot2.continuous.colour",
                                                          default = "viridis")
                                  )) {
    switch(
        type,
        gradient = scale_fill_gradient(...),
        viridis = scale_fill_viridis_c(
            option = option, begin = begin, end = end, direction = direction, ...
        ),
        stop("Unknown scale type", call. = FALSE)
    )
}


# CeMM palette: cemm_pal(n) returns n colours interpolated between the
# seven anchor colours below
cemm_pal <- colorRampPalette(c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00"))

# Discrete fill scale, defined under the same name as ggplot2's function.
#   type                         : "CeMM" (default) uses cemm_pal; any other
#                                  value uses the hue palette
#   h, c, l, h.start, direction  : hue palette parameters (non-CeMM branch only)
#   na.value, aesthetics, ...    : forwarded to discrete_scale()
scale_fill_discrete <- function(..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0,
                                direction = 1, na.value = "grey50", aesthetics = "fill") {
    if (type == "CeMM") {
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else {
        # hue_pal lives in the scales package and no library(scales) call is
        # visible in this file, so it is qualified explicitly (as in
        # scale_color_discrete)
        discrete_scale(aesthetics, "hue",
                       scales::hue_pal(h, c, l, h.start, direction),
                       na.value = na.value, ...)
    }
}

# Discrete colour scale, defined under the same name as ggplot2's function.
#   type                         : "CeMM" (default) uses cemm_pal; any other
#                                  value uses scales::hue_pal
#   h, c, l, h.start, direction  : hue palette parameters (non-CeMM branch only)
#   na.value, aesthetics, ...    : forwarded to discrete_scale()
scale_color_discrete <- function(..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0,
                                 direction = 1, na.value = "grey50", aesthetics = "colour") {
    useCemm <- type == "CeMM"
    scaleName <- if (useCemm) "CeMM" else "hue"
    scalePalette <- if (useCemm) {
        cemm_pal
    } else {
        scales::hue_pal(h, c, l, h.start, direction)
    }
    discrete_scale(aesthetics, scaleName, scalePalette, na.value = na.value, ...)
}
# British-spelling alias
scale_colour_discrete <- scale_color_discrete

# Theme add-on without major grid lines or panel border: draws thin
# dark-grey axis lines and size-12 axis tick labels.
#   ... : additional settings forwarded to theme()
noGridTheme <- function(...) {
    tickLabel <- element_text(size = 12)
    theme(
        panel.grid.major = element_blank(),
        axis.text.x = tickLabel,
        axis.text.y = tickLabel,
        axis.line = element_line(color = "#333333", size = 0.2),
        panel.border = element_blank(),
        ...
    )
}

# Theme add-on for dark figures: dark-grey panel and plot backgrounds with
# light-grey axis lines, text and lines.
#   ... : additional settings forwarded to theme(), as noGridTheme() does
#         (they used to be accepted but silently ignored)
darkTheme <- function(...) {
    theme(
        panel.background = element_rect(fill = "#333333"),
        plot.background = element_rect(fill = "#333333"),
        axis.line = element_line(color = "#CCCCCC", size = 0.2),
        text = element_text(size = 17, family = "Arial", colour = "#CCCCCC"),
        line = element_line(colour = "#CCCCCC"),
        ...
    )
}

# Use customTheme as the default theme for the plots that follow
theme_set(customTheme)

# Default width and height of the plots rendered by repr
options(repr.plot.width = 10, repr.plot.height = 10)

## Import data

In [None]:
# Per-image measurements; the first line of the file holds the column names
NK <- read.csv("./allImages.csv", header = TRUE)

In [None]:
# Same kind of table for the NKP3B images; merged into NK further down
NK_P3B <- read.csv("./allImages_NKP3B.csv", header = TRUE)

Data for measurement B of plate 3 were handled separately and some columns do not match, so the two tables need to be merged manually.

In [None]:
# Columns found in only one of the two tables: first those specific to NK,
# then those specific to NK_P3B
setdiff(names(NK), names(NK_P3B))
setdiff(names(NK_P3B), names(NK))

In [None]:
# Columns shared by both tables
commonNames <- intersect(names(NK_P3B), names(NK))
# Check that the shared columns appear in the same order in both tables
all(
    names(NK_P3B)[names(NK_P3B) %in% commonNames] ==
        names(NK)[names(NK) %in% commonNames]
)

In [None]:
# Field identifiers (URL suffix starting at "r" + two digits) present in NK_P3B
fields3B <- str_extract(NK_P3B$URL_Actin, "r\\d{2}.*")
# Rows of NK that belong to Plate3 and whose field also exists in NK_P3B.
# %in% is used rather than == so that a URL without a "Plate<digit>" token
# gives FALSE instead of NA; an NA in this mask would create all-NA rows
# when the mask is used for row subsetting.
fieldsDuplicate <- (str_extract(NK$URL_Actin, "Plate\\d") %in% "Plate3") &
    (str_extract(NK$URL_Actin, "r\\d{2}.*") %in% fields3B)

In [None]:
# Restrict both tables to the shared columns, drop the NK rows flagged in
# fieldsDuplicate and append all rows of NK_P3B
NK <- rbind(
    NK[!fieldsDuplicate, names(NK) %in% commonNames],
    NK_P3B[, names(NK_P3B) %in% commonNames]
)

In [None]:
# Plate label: "Plate" plus the character that follows it in the image URL
NK$Plate <- as.factor(str_extract(NK$URL_Actin, "Plate."))
# All two-digit runs of the URL, one matrix column per match; the naming
# below requires exactly three columns
Position <- str_extract_all(NK$URL_Actin, "\\d{2}", simplify = TRUE)
colnames(Position) <- c("Row", "Column", "Field")
NK <- cbind(NK, Position)

In [None]:
# Tab-separated well annotation; text columns are kept as character
wellAnnotation <- read.csv("transferNK92.tsv", sep = "\t", stringsAsFactors = FALSE)

In [None]:
# Zero-pad single-digit column numbers (e.g. "A1" -> "A01"); wells whose
# letter A-H is followed by more than one digit are left unchanged
wellAnnotation$Well <- sub(
    "([A-H])(\\d$)", "\\10\\2",
    wellAnnotation$Well
)

In [None]:
# Which gene is targeted in a given well?
#   well : well identifier(s), compared against wellAnnotation$Well
# Returns the value of the second column of wellAnnotation for the first
# matching row, so exactly one value is returned per well: NA when the well
# is not annotated, and the first entry when it is annotated several times.
# (Row filtering with == returned zero or several values in those cases.)
getGene <- function(well) {
    wellAnnotation[match(well, wellAnnotation$Well), 2]
}
# Annotate every row of NK with the gene targeted in its well.
# NOTE(review): sapply() only simplifies to a plain vector when getGene
# returns exactly one value for every well — confirm all wells are annotated.
NK$Gene <- sapply(NK$Metadata_Well, getGene)

## Visualize full dataset

### Cell count

In [None]:
# Histogram of the filtered-nucleus count (bins of width 2), filled by plate
ggplot(NK) +
    geom_histogram(
        aes(Count_FilteredNucleus, fill = Plate),
        binwidth = 2
    )

In [None]:
# Nucleus count before vs. after filtering, jittered and coloured by plate
ggplot(NK) +
    geom_point(
        aes(Count_Nucleus, Count_FilteredNucleus, color = Plate),
        position = "jitter"
    )

In [None]:
# Mean cytoplasm perimeter vs. filtered-nucleus count, jittered and
# coloured by plate
ggplot(NK) +
    geom_point(
        aes(Mean_FilterCytoplasm_AreaShape_Perimeter, Count_FilteredNucleus, color = Plate),
        position = "jitter"
    )

## Filtering

In [None]:
# Lower bound on ImageQuality_MaxIntensity_DNA: removes empty images and
# small DNA precipitations
FILT_MAX_INT_DNA <- 0.05
# Minimum number of filtered nuclei per field
# (8 seems safe from distribution and images, 3 seems in poor shape)
FILT_MIN_CELLS <- 4
# Fields with at least this many NA features are discarded
# (48 images generated between 10 and 385 NAs/image, all others at most 2/image)
FILT_NB_MAX_NA_IMAGE <- 10
# Correlation threshold passed to uncorrelate(): keep uncorrelated variables
FILT_MAX_CORR <- 0.6
# Number of UMAP dimensions to compute
dimUMAP <- 3

In [None]:
# Candidate feature columns, as column indices into NK.
# seq_len() stays empty for a table without columns, where 1:0 would not.
ftToKeep <- seq_len(ncol(NK))
# Make sure that the fields are numeric: keep columns whose class is exactly
# "numeric". vapply() always returns one logical per column, whereas comparing
# sapply(..., class) breaks as soon as one column carries several classes.
ftToKeep <- ftToKeep[vapply(
    NK[, ftToKeep, drop = FALSE],
    function(x) identical(class(x), "numeric"),
    logical(1)
)]
# Remove execution time and count features
ftToKeep <- ftToKeep[!grepl("(Execution)|(Count)", colnames(NK)[ftToKeep])]

In [None]:
# Names of the candidate features before any further filtering; used later
# as the reference set when counting which feature categories were selected
preFiltFt <- colnames(NK)[ftToKeep]

In [None]:
# Row indices of NK whose maximum DNA intensity reaches FILT_MAX_INT_DNA
fieldToKeep <- which(NK$ImageQuality_MaxIntensity_DNA >= FILT_MAX_INT_DNA)
# Among those, keep the rows with at least FILT_MIN_CELLS filtered nuclei
fieldToKeep <- fieldToKeep[
    NK$Count_FilteredNucleus[fieldToKeep] >= FILT_MIN_CELLS
]

In [None]:
# Few bad quality pictures are generating a lot of missing values and are removed
# (rows with FILT_NB_MAX_NA_IMAGE or more NA among the candidate features)
fieldToKeep <- fieldToKeep[rowSums(is.na(NK[fieldToKeep,ftToKeep])) < FILT_NB_MAX_NA_IMAGE]
# Remove remaining features with missing values
ftToKeep <- ftToKeep[colSums(is.na(NK[fieldToKeep,ftToKeep])) == 0] 
# Remove constant columns: a feature is dropped when its median absolute
# deviation (mad) is 0 over all kept rows, or over the kept WT rows only.
# indWT is computed after the row filtering above so that it is aligned
# with the current fieldToKeep.
indWT = NK[fieldToKeep,]$Gene == "WT"
ftToKeep <- ftToKeep[sapply(NK[fieldToKeep,ftToKeep], function(x) mad(x) != 0)]
ftToKeep <- ftToKeep[sapply(NK[fieldToKeep[indWT],ftToKeep], function(x) mad(x) != 0)]

In [None]:
# Approximate normal distribution
# Shifted log transform: the smallest value of x is mapped to log(1) = 0.
#   x : numeric vector
# Returns a numeric vector of the same length as x.
transfLog <- function(x) {
    lowest <- min(x)
    log(x + 1 - lowest)
}
# Apply transfLog to every kept feature (column-wise) over the kept rows
transformedNK <- as.data.frame(
    apply(NK[fieldToKeep, ftToKeep], 2, transfLog)
)

In [None]:
# Center and scale on control values
# Robust z-score of x relative to a reference sample y.
#   x : values to normalise
#   y : reference (control) values
# Returns (x - median(y)) / mad(y).
transfNorm <- function(x, y) {
    center <- median(y)
    spread <- mad(y)
    (x - center) / spread
}
# Normalise each plate separately against its own WT rows
for (plate in levels(NK$Plate)){
    # Rows of transformedNK (aligned with fieldToKeep) that belong to this plate
    indPlateField = NK$Plate[fieldToKeep] == plate
    # Among this plate's rows, which ones are WT
    indWTinPlate <- NK[fieldToKeep[indPlateField],]$Gene == "WT"
    
    # Column-wise: centre and scale every feature with the median and mad of
    # the plate's WT values, and write the result back into the plate's rows
    transformedNK[indPlateField,] = apply(
        transformedNK[indPlateField,], 2, 
        function(x) transfNorm(x, x[indWTinPlate]))
}

In [None]:
# Drop the features that contain NA after normalisation,
# i.e. features with mad == 0 for 1 plate or more
noNAFt <- colSums(is.na(transformedNK)) == 0
ftToKeep <- ftToKeep[noNAFt]
transformedNK <- transformedNK[, noNAFt]

In [None]:
# Rank the features from largest to smallest mad.
# Features have mad(WT) = 1 on each plate, so this ranks them by how much
# more variable they are across all kept rows than across WT.
orderFt <- rev(
    order(apply(transformedNK, 2, mad))
)

In [None]:
# uncorrelate() is not defined in this file (presumably one of the Python
# helpers loaded from reticulate_functions.py — TODO confirm). The -1 / +1
# shift the column indices, presumably between 1-based R and 0-based Python.
uncorrFt <- uncorrelate(
    transformedNK,
    orderCol = orderFt - 1,
    threshold = FILT_MAX_CORR
)
uncorrFt <- unlist(uncorrFt) + 1

In [None]:
# Keep only the features returned by uncorrelate()
transformedNK <- transformedNK[, uncorrFt]

In [None]:
# Size of the final table: kept rows x selected features
dim(transformedNK)

### Look at which types of features are kept

In [None]:
catChannel <- c("CorrDNA", "CorrActin", "CorrPerf")
# Number of channel patterns matched by each selected feature
table(rowSums(sapply(catChannel, function(x) grepl(x, colnames(transformedNK)))))
# Feature counts per channel before (CountIni) and after (Count) selection.
# Both columns are built in channel order and the rows are then sorted once
# by CountIni; sorting each column on its own could pair a Count with the
# wrong channel.
dtCat <- data.frame(
    CountIni = colSums(sapply(catChannel, function(x) grepl(x, preFiltFt))),
    Count = colSums(sapply(catChannel, function(x) grepl(x, colnames(transformedNK))))
)
dtCat <- dtCat[order(dtCat$CountIni), ]

# Order: position of each category on the (flipped) category axis
dtCat$Order <- rank(dtCat$CountIni, ties.method = "first")
dtCat$Category <- factor(rownames(dtCat), levels = rownames(dtCat)[order(dtCat$CountIni)])
# Percentage of the initial features that were selected, used as bar label
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100 * dtCat$Ratio, 1), "%")
# Initial counts drawn first, selected counts drawn on top
gp <- ggplot(dtCat) +
    geom_bar(aes(Category, weight = CountIni), fill = cemm_pal(2)[2]) +
    ylim(c(0, 675)) +
    geom_bar(aes(Category, weight = Count), fill = cemm_pal(2)[1]) +
    geom_text(aes(x = Order, y = CountIni + 2, label = Ratio), hjust = 0) +
    coord_flip()
gp

In [None]:
# Export the channel figure unless running in test mode
if (!TEST_MODE) {
    ggsave(
        filename = "Fig/SH_NK_SelecFt_Channel.pdf",
        plot = gp,
        width = 10
    )
}

In [None]:
catObjects <- c("ActinGranules", "FilterCytoplasm", "ShrunkenCytoplasm", "FilteredNucleus", "PerfGranules")
# Number of object patterns matched by each selected feature
table(rowSums(sapply(catObjects, function(x) grepl(x, colnames(transformedNK)))))
# Feature counts per object before (CountIni) and after (Count) selection
dtCat <- data.frame(
    CountIni = colSums(sapply(catObjects, function(x) grepl(x, preFiltFt))),
    Count = colSums(sapply(catObjects, function(x) grepl(x, colnames(transformedNK))))
)

# Order: position of each category on the (flipped) category axis
dtCat$Order <- rank(dtCat$CountIni, ties.method = "first")
dtCat$Category <- factor(rownames(dtCat), levels = rownames(dtCat)[order(dtCat$CountIni)])
# Percentage of the initial features that were selected, used as bar label
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100 * dtCat$Ratio, 1), "%")
# Initial counts drawn first, selected counts drawn on top
gp <- ggplot(dtCat) +
    geom_bar(aes(Category, weight = CountIni), fill = cemm_pal(2)[2]) +
    ylim(c(0, 510)) +
    geom_bar(aes(Category, weight = Count), fill = cemm_pal(2)[1]) +
    geom_text(aes(x = Order, y = CountIni + 2, label = Ratio), hjust = 0) +
    coord_flip()
gp

In [None]:
# Export the object figure unless running in test mode
if (!TEST_MODE) {
    ggsave(
        filename = "Fig/SH_NK_SelecFt_Object.pdf",
        plot = gp,
        width = 10
    )
}

In [None]:
catType <- c(
    "Threshold", "Granularity", "ImageQuality", "Texture", "Distance", "AreaShape",
    "RadialDistribution", "Neighbors", "Correlation", "Intensity", "Overlap", "Location"
)
# Selected features matching none of the type patterns
which(rowSums(sapply(catType, function(x) grepl(x, colnames(transformedNK)))) == 0) # All features are covered

# Feature counts per type before (CountIni) and after (Count) selection
dtCat <- data.frame(
    CountIni = colSums(sapply(catType, function(x) grepl(x, preFiltFt))),
    Count = colSums(sapply(catType, function(x) grepl(x, colnames(transformedNK))))
)
# Order: position of each category on the (flipped) category axis
dtCat$Order <- rank(dtCat$CountIni, ties.method = "first")
dtCat$Category <- factor(rownames(dtCat), levels = rownames(dtCat)[order(dtCat$CountIni)])
# Percentage of the initial features that were selected, used as bar label
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100 * dtCat$Ratio, 1), "%")
# Initial counts drawn first, selected counts drawn on top
gp <- ggplot(dtCat) +
    geom_bar(aes(Category, weight = CountIni), fill = cemm_pal(2)[2]) +
    geom_bar(aes(Category, weight = Count), fill = cemm_pal(2)[1]) +
    geom_text(aes(x = Order, y = CountIni + 5, label = Ratio), hjust = 0) +
    coord_flip()
gp

In [None]:
# Export the all-types figure unless running in test mode
if (!TEST_MODE) {
    ggsave(
        filename = "Fig/SH_NK_SelecFt_Type_All.pdf",
        plot = gp,
        width = 10
    )
}

In [None]:
catType <- c(
    "Granularity", "Texture", "AreaShape", "RadialDistribution",
    "Correlation", "Intensity"
)
# Number of type patterns matched by each selected feature
table(rowSums(sapply(catType, function(x) grepl(x, colnames(transformedNK)))))

# One logical column per type, plus "Other" for features matching no type:
# dtCount for the selected features, dtCountIni for the initial ones
dtCount <- as.data.frame(sapply(catType, function(x) grepl(x, colnames(transformedNK))))
dtCount$Other <- !apply(dtCount, 1, any)
dtCountIni <- as.data.frame(sapply(catType, function(x) grepl(x, preFiltFt)))
dtCountIni$Other <- !apply(dtCountIni, 1, any)
# Feature counts per type before (CountIni) and after (Count) selection
dtCat <- data.frame(
    CountIni = colSums(dtCountIni),
    Count = colSums(dtCount)
)
# Order: position of each category on the (flipped) category axis
dtCat$Order <- rank(dtCat$CountIni, ties.method = "first")
dtCat$Category <- factor(rownames(dtCat), levels = rownames(dtCat)[order(dtCat$CountIni)])
# Percentage of the initial features that were selected, used as bar label
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100 * dtCat$Ratio, 1), "%")
# Initial counts drawn first, selected counts drawn on top
gp <- ggplot(dtCat) +
    geom_bar(aes(Category, weight = CountIni), fill = cemm_pal(2)[2]) +
    geom_bar(aes(Category, weight = Count), fill = cemm_pal(2)[1]) +
    geom_text(aes(x = Order, y = CountIni + 5, label = Ratio), hjust = 0) +
    coord_flip()
gp

In [None]:
# Export the short-types figure unless running in test mode
if (!TEST_MODE) {
    ggsave(
        filename = "Fig/SH_NK_SelecFt_Type_Short.pdf",
        plot = gp,
        width = 10
    )
}

### Look at the morphological distribution of the fields of view

In [None]:
# Fix R's random number generation so that the UMAP layout computed next
# is reproducible.
# NOTE(review): umap() is not defined in this file; if it runs in Python
# through reticulate, R's set.seed() may not control its random state — confirm.
set.seed(38)

In [None]:
# Embed the normalised features in dimUMAP dimensions and store the
# coordinates as columns UMAP1, UMAP2, ...
umTNK <- as.data.frame(
    umap(transformedNK, min_dist = 0.1, neighbors = 10, n = dimUMAP, metric = "euclidean")
)
names(umTNK) <- paste0("UMAP", seq_len(dimUMAP))

In [None]:
# Attach the metadata of the kept rows (same order as the embedding rows)
umTNK$Row  <- as.factor(NK$Metadata_Row[fieldToKeep])
umTNK$Col  <- as.factor(NK$Metadata_Column[fieldToKeep])
umTNK$URL  <- as.factor(NK$URL_Actin[fieldToKeep])
umTNK$Gene <- as.factor(NK$Gene[fieldToKeep])

In [None]:
# First two UMAP dimensions, coloured by targeted gene
gp <- ggplot(umTNK) +
    geom_point(aes(UMAP1, UMAP2, color = Gene))
# Label the points with UMAP2 below -15 with the "r..c..f.." part of their URL
gp +
    geom_label_repel(
        aes(UMAP1, UMAP2),
        label = ifelse(
            (umTNK$UMAP2 < -15),
            str_extract(as.character(umTNK$URL), "r..c..f.."),
            NA
        ),
        size = 1.5,
        segment.alpha = 0.1
    )

In [None]:
# Space-separated list of path patterns (each ending in "*") for the points
# with UMAP2 below -15.
# NOTE(review): in the pattern, the trailing `h*` means "zero or more h" —
# confirm whether a literal "ch" was intended.
paste(
    paste0(
        str_extract(string = umTNK$URL[umTNK$UMAP2 < -15], pattern = "/scratch.*ch*"),
        "*"
    ),
    collapse = " "
)

In [None]:
# Print the R version and the loaded packages, for reproducibility
sessionInfo()