# NK plates analysis

## Preliminary settings

In [None]:
library(ggplot2)
library(extrafont)
library(stringr)
library(heatmaply)
library(ggrepel)

In [None]:
# Seed R's random number generator so that stochastic steps give the same
# result on every run (the original note mentions fixing the t-SNE layout;
# no t-SNE call is visible in this part of the notebook)
set.seed(38)
# Load the fonts handled by extrafont so that font families requested by the
# themes below (e.g. "Arial") can be resolved
loadfonts()

In [None]:
# Customize ggplot appearance

# Base theme used for every figure: theme_light() without minor grid lines,
# dark-grey Arial text and lines, and a nearly transparent grey legend box
# with no background behind the legend keys
customTheme <- theme_light() +
    theme(
        panel.grid.minor = element_blank(),
        text = element_text(size = 17, family = "Arial", colour = "#333333"),
        line = element_line(colour = "#333333"),
        legend.background = element_rect(fill = alpha("#CCCCCC", 0.1)),
        legend.key = element_blank()
    )

# Change default colors
# Replacement for ggplot2's default continuous colour scale.
#
# ...        forwarded to the underlying ggplot2 scale constructor
# begin/end  range of the viridis palette to use
# direction  order of the palette colours (-1 reverses it)
# option     name of the viridis colour map
# type       "viridis" or "gradient"; taken from the
#            "ggplot2.continuous.colour" option, falling back to "viridis"
#
# Stops with "Unknown scale type" for any other value of `type`.
scale_colour_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(
        type,
        gradient = scale_colour_gradient(...),
        viridis = scale_colour_viridis_c(
            ...,
            option = option,
            begin = begin,
            end = end,
            direction = direction
        ),
        stop("Unknown scale type", call. = FALSE)
    )
}
# Same function under the American spelling
scale_color_continuous <- scale_colour_continuous

# Replacement for ggplot2's default continuous fill scale.
#
# ...        forwarded to the underlying ggplot2 scale constructor
# begin/end  range of the viridis palette to use
# direction  order of the palette colours (-1 reverses it)
# option     name of the viridis colour map
# type       "viridis" or "gradient". Read from the "ggplot2.continuous.fill"
#            option (the option ggplot2 itself uses for fill scales); when
#            that is unset, the "ggplot2.continuous.colour" option is used as
#            before, and finally "viridis".
#
# Stops with "Unknown scale type" for any other value of `type`.
scale_fill_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.fill",
                                                      default = getOption("ggplot2.continuous.colour",
                                                                          default = "viridis"))) {
    switch(type, gradient = scale_fill_gradient(...), 
        viridis = scale_fill_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))
}


# Palette generator for the "CeMM" discrete scales: cemm_pal(n) returns n
# colours interpolated along the seven hex colours listed here
cemm_pal <- colorRampPalette(
    c(
        "#5A463C", "#008CAD", "#40B9D4", "#D4ECF2",
        "#D2323C", "#F8B100", "#DFDC00"
    )
)
# Replacement for ggplot2's default discrete fill scale.
#
# ...         forwarded to discrete_scale()
# type        "CeMM" (default) uses the cemm_pal palette; any other value
#             falls back to a hue palette
# h, c, l, h.start, direction
#             passed to scales::hue_pal() for the hue palette only
# na.value    colour used for missing values
# aesthetics  name of the aesthetic(s) the scale applies to
scale_fill_discrete <- function (..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "fill") 
{
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else {
        # hue_pal() is qualified with its namespace: scales is not among the
        # packages attached with library() in this notebook, so the bare name
        # would not be found (scale_color_discrete qualifies it the same way)
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, 
            direction), na.value = na.value, ...)
    }
}

# Replacement for ggplot2's default discrete colour scale.
#
# ...         forwarded to discrete_scale()
# type        "CeMM" (default) uses the cemm_pal palette; any other value
#             falls back to a hue palette
# h, c, l, h.start, direction
#             passed to scales::hue_pal() for the hue palette only
# na.value    colour used for missing values
# aesthetics  name of the aesthetic(s) the scale applies to
scale_color_discrete <- function (..., type = "CeMM", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "colour") {
    # Anything other than the CeMM palette gets the hue scale
    if (type != "CeMM") {
        huePalette <- scales::hue_pal(h, c, l, h.start, direction)
        return(discrete_scale(aesthetics, "hue", huePalette, na.value = na.value, ...))
    }
    discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
}
# Same function under the British spelling
scale_colour_discrete <- scale_color_discrete

# Theme add-on without major grid lines or panel border: draws thin dark-grey
# axis lines and size-12 axis tick labels instead.
#
# ...  further theme elements, forwarded to theme()
noGridTheme <- function(...){
    tickLabels <- element_text(size = 12)
    theme(
        panel.grid.major = element_blank(),
        axis.text.x = tickLabels,
        axis.text.y = tickLabels,
        axis.line = element_line(color = "#333333", size = 0.2),
        panel.border = element_blank(),
        ...
    )
}

# Theme add-on for dark figures: dark-grey panel and plot backgrounds with
# light-grey text, lines and axis lines.
#
# ...  further theme elements, forwarded to theme() in the same way as
#      noGridTheme() does (they were previously accepted but ignored)
darkTheme <- function(...){
    theme(panel.background = element_rect(fill = '#333333'), plot.background = element_rect(fill = '#333333'), 
          axis.line=element_line(color="#CCCCCC", size = 0.2), 
          text=element_text(size=17, family="Arial", colour = "#CCCCCC"),
          line=element_line(colour = "#CCCCCC"),
          ...)
}

# Make customTheme the default theme for the plots created after this point
theme_set(customTheme)

# Set the repr.plot.* options to 10 x 10
# (presumably the figure size used by the notebook's R kernel — TODO confirm)
options(repr.plot.width=10, repr.plot.height=10)

## Import data

In [None]:
# Read allImages.csv from the working directory; its first line holds the
# column names
NK <- read.csv("./allImages.csv", header = TRUE)

In [None]:
# Plate label: the text "Plate" plus the single character following it in the
# actin image URL
NK$Plate <- str_extract(string = NK$URL_Actin, pattern = "Plate.")
# All two-digit runs found in each URL, one matrix column per match
# (assumes every URL yields exactly three matches, in the order row, column,
# field — TODO confirm against the file naming scheme)
Position <- str_extract_all(
    string = NK$URL_Actin,
    pattern = "\\d{2}",
    simplify = TRUE
)
colnames(Position) <- c("Row", "Column", "Field")
# Append Row / Column / Field to the measurement table
NK <- cbind(NK, Position)

## Visualize full dataset

### Cell count

In [None]:
# Histogram of the filtered-nucleus count per image, filled by plate
ggplot(data = NK) +
    geom_histogram(
        mapping = aes(x = Count_FilteredNucleus, fill = Plate),
        binwidth = 2
    )

In [None]:
# Same histogram, restricted to the rows listed in fieldToKeep.
# NOTE(review): fieldToKeep is only assigned in the "Filtering" section
# further down, so this cell fails unless those cells have been run first.
ggplot(NK[fieldToKeep,]) + geom_histogram(aes(Count_FilteredNucleus, fill = Plate), binwidth=2)

In [None]:
# Nucleus count before vs. after filtering, one jittered point per image,
# coloured by plate
ggplot(data = NK) +
    geom_point(
        mapping = aes(x = Count_Nucleus, y = Count_FilteredNucleus, color = Plate),
        position = "jitter"
    )

In [None]:
# Median nucleus count for each "Row Column" combination
medianCount <- aggregate(
    NK$Count_Nucleus,
    by = list(paste(NK$Row, NK$Column)),
    FUN = median
)
# Drop the 4th group and lay the medians out as a matrix with 8 rows
# (presumably one matrix row per plate row — TODO confirm why group 4 is removed)
medianCount <- matrix(medianCount[-4, 2], nrow = 8)
# Column labels skip 4
colnames(medianCount) <- c(1:3, 5:11)

In [None]:
# Heatmap of medianCount without clustering dendrograms, using the plasma
# colour map, written to medianCountPerWell.pdf
heatmaply(
    medianCount,
    file = "medianCountPerWell.pdf",
    dendrogram = "none",
    colors = plasma
)

In [None]:
# First 50 column names of the measurement table
head(names(NK), n = 50)
# Print the whole mean cytoplasm perimeter column
NK$Mean_FilterCytoplasm_AreaShape_Perimeter

In [None]:
# Mean cytoplasm perimeter vs. filtered-nucleus count, one jittered point per
# image, coloured by plate
ggplot(data = NK) +
    geom_point(
        mapping = aes(
            x = Mean_FilterCytoplasm_AreaShape_Perimeter,
            y = Count_FilteredNucleus,
            color = Plate
        ),
        position = "jitter"
    )

In [None]:
# Variance of the per-image mean perimeter, grouped by filtered-nucleus count
dis <- aggregate(
    NK$Mean_FilterCytoplasm_AreaShape_Perimeter,
    list(NK$Count_FilteredNucleus),
    function(x) var(x, na.rm = TRUE)
)
# Reference curve: squared average per-image standard deviation divided by
# the count of each group
dis$Exp <- mean(NK$StDev_FilterCytoplasm_AreaShape_Perimeter, na.rm = TRUE)^2 / dis$Group.1
# Discard the first (lowest-count) group
dis <- dis[-1, ]
# Observed variances as points, reference curve as a line
ggplot(dis) +
    geom_point(aes(Group.1, x), col = "#DD8866") +
    geom_line(aes(Group.1, Exp), color = "#8899DD")

In [None]:
# Same variance-vs-count comparison, restricted to column "11" of "Plate1".
# The column is addressed by its full name (Column) rather than through `$`
# partial matching.
MK <- NK[(NK$Column == "11") & (NK$Plate == "Plate1"), ]
# Variance of the per-image mean perimeter, grouped by filtered-nucleus count
dis <- aggregate(
    MK$Mean_FilterCytoplasm_AreaShape_Perimeter,
    list(MK$Count_FilteredNucleus),
    function(x) var(x, na.rm = TRUE)
)
# Reference curve, computed as in the all-images version of this plot:
# squared average per-image standard deviation divided by the count of each
# group. The average is required here: the raw per-image vector has one value
# per row of MK, not one per row of dis.
dis$Exp <- mean(MK$StDev_FilterCytoplasm_AreaShape_Perimeter, na.rm = TRUE)^2 / dis$Group.1
# Discard the first (lowest-count) group
dis <- dis[-1, ]
ggplot(dis) + geom_point(aes(Group.1, x), col = "#DD8866") + geom_line(aes(Group.1, Exp), color = "#8899DD")

In [None]:
# Distribution of the mean cytoplasm perimeter per plate
ggplot(data = NK) +
    geom_violin(
        mapping = aes(
            x = Plate,
            y = Mean_FilterCytoplasm_AreaShape_Perimeter,
            fill = Plate
        )
    )

In [None]:
# Distribution of the mean filtered-nucleus area per plate
ggplot(data = NK) +
    geom_violin(
        mapping = aes(
            x = Plate,
            y = Mean_FilteredNucleus_AreaShape_Area,
            fill = Plate
        )
    )

In [None]:
# Median (ignoring NAs) of the mean cytoplasm perimeter for each
# "Row Column" combination; note that this reuses the name medianCount
medianCount <- aggregate(
    NK$Mean_FilterCytoplasm_AreaShape_Perimeter,
    by = list(paste(NK$Row, NK$Column)),
    FUN = function(x) median(x, na.rm = TRUE)
)
# Drop the 4th group and lay the medians out as a matrix with 8 rows
# (presumably one matrix row per plate row — TODO confirm why group 4 is removed)
medianCount <- matrix(medianCount[-4, 2], nrow = 8)
# Column labels skip 4
colnames(medianCount) <- c(1:3, 5:11)

In [None]:
# Heatmap of the per-well medians without clustering dendrograms, using the
# plasma colour map, written to a PDF file
heatmaply(
    medianCount,
    file = "medianMean_FilterCytoplasm_AreaShape_PerimeterPerWell.pdf",
    dendrogram = "none",
    colors = plasma
)

## Filtering

In [None]:
# Lowest ImageQuality_MaxIntensity_DNA an image may have and still be kept:
# removes empty images and small DNA precipitations
FILT_MAX_INT_DNA <- 0.05
# Lowest Count_FilteredNucleus an image may have and still be kept:
# 8 seems safe from distribution and images, 3 seems in poor shape
FILT_MIN_CELLS <- 4
# Images with this many or more missing feature values are dropped:
# 48 images generated between 10 and 385 NAs/image, all others generate at
# most 2/image
FILT_NB_MAX_NA_IMAGE <- 10

In [None]:
# Label ("Row.Column.Field.Plate") for images whose maximum DNA intensity is
# below the threshold; NA for every other image
labPts <- ifelse(
    test = NK$ImageQuality_MaxIntensity_DNA < FILT_MAX_INT_DNA,
    yes = paste(NK$Row, NK$Column, NK$Field, NK$Plate, sep = "."),
    no = NA
)

In [None]:
# Maximum vs. mean DNA intensity per image; the images selected in labPts
# carry a small text label
gp <- ggplot(data = NK) +
    geom_point(
        mapping = aes(
            x = ImageQuality_MaxIntensity_DNA,
            y = ImageQuality_MeanIntensity_DNA
        ),
        alpha = 0.3
    ) +
    geom_text_repel(
        mapping = aes(
            x = ImageQuality_MaxIntensity_DNA,
            y = ImageQuality_MeanIntensity_DNA,
            label = labPts
        ),
        segment.alpha = 0.02,
        color = "#AAAAEE",
        size = 0.5
    )
# Show the figure in the notebook, then write it to disk
gp
ggsave(filename = "urlIntenOutliers.pdf", plot = gp)

In [None]:
# Candidate features, stored as column indices into NK.
# seq_len() yields an empty vector (not c(1, 0)) for a table without columns.
ftToKeep <- seq_len(ncol(NK))
# Keep only the columns whose class is exactly "numeric" (integer, character
# and factor columns are excluded, as before). NK[ftToKeep] keeps the data
# frame shape even for a single column, and vapply() always returns one
# logical per column, including for columns carrying several classes.
ftToKeep <- ftToKeep[vapply(NK[ftToKeep], function(x) identical(class(x), "numeric"), logical(1))]
# Remove execution time and count features
ftToKeep <- ftToKeep[grep("(Execution)|(Count)", colnames(NK)[ftToKeep], invert = TRUE)]

In [None]:
# Row indices of the images whose maximum DNA intensity reaches the threshold
fieldToKeep <- which(NK$ImageQuality_MaxIntensity_DNA >= FILT_MAX_INT_DNA)
# Of those, keep the images with enough filtered nuclei
fieldToKeep <- fieldToKeep[NK$Count_FilteredNucleus[fieldToKeep] >= FILT_MIN_CELLS]

In [None]:
# A few bad-quality pictures generate a lot of missing values and are removed.
# drop = FALSE keeps the selection a data frame even when a single feature is
# left, so the row/column sums below always receive a 2-D object.
fieldToKeep <- fieldToKeep[rowSums(is.na(NK[fieldToKeep, ftToKeep, drop = FALSE])) < FILT_NB_MAX_NA_IMAGE]
# Remove remaining features with missing values
ftToKeep <- ftToKeep[colSums(is.na(NK[fieldToKeep, ftToKeep, drop = FALSE])) == 0]
# Remove constant columns. isTRUE() also drops a column whose standard
# deviation is undefined (fewer than two images left) instead of putting an
# NA into the index vector.
ftToKeep <- ftToKeep[vapply(NK[fieldToKeep, ftToKeep, drop = FALSE],
                            function(x) isTRUE(sd(x) != 0), logical(1))]

In [None]:
# Size of the full table (rows, columns)
dim(NK)
# Number of feature columns retained by the filtering above
length(ftToKeep)
# Number of images (rows) retained by the filtering above
length(fieldToKeep)

In [None]:
# Six randomly drawn actin image URLs among the retained images that contain
# exactly 8 filtered nuclei
head(
    sample(
        NK$URL_Actin[fieldToKeep[NK$Count_FilteredNucleus[fieldToKeep] == 8]],
        size = 6
    )
)

In [None]:
# Number of retained images for each filtered-nucleus count
table(NK$Count_FilteredNucleus[fieldToKeep])