# NK Primary cells

In [None]:
library(ggplot2)
library(extrafont)
library(stringr)
library(heatmaply)
library(ggrepel)
library(reticulate)
library(gridExtra)
library(robustbase)
library(randomForest)
library(reshape2)

In [None]:
# Import the helper functions defined in the Python companion script
# NOTE(review): `uncorrelate` and `umap` used further down presumably come from this file — confirm
source_python("reticulate_functions.py")

# Register the extra TrueType fonts
ttf_import(paths = "/tmp/.fonts/")
loadfonts()

# TRUE:  quick run with reduced iteration counts and no exported files
# FALSE: more precise results and all outputs
TEST_MODE <- FALSE

In [None]:
# Iteration count for the permutation-based empirical p-values
PERM_NB_ITER <- if (TEST_MODE) 20 else 2000
# How many genes enter the hierarchical heatmap
CLUST_NB_GENES <- if (TEST_MODE) 30 else 300
# Forest size of the random forest classifier
RF_NB_TREES <- if (TEST_MODE) 50 else 1000

In [None]:
# Customize ggplot appearance

# Base theme shared by all figures: theme_light() without minor grid lines,
# dark-grey Arial text and lines, a faint translucent legend background and no legend key box
customTheme <- theme_light() +
    theme(
        panel.grid.minor = element_blank(),
        text = element_text(size = 17, family = "Arial", colour = "#333333"),
        line = element_line(colour = "#333333"),
        legend.background = element_rect(fill = alpha('#CCCCCC', 0.1)),
        legend.key = element_blank()
    )

# Replacement for the default continuous colour scale.
#   ...        forwarded to the selected ggplot2 scale constructor
#   begin, end, direction, option
#              handed to scale_colour_viridis_c(); unused when type is "gradient"
#   type       "viridis" or "gradient"; defaults to the "ggplot2.continuous.colour"
#              option, or "viridis" when that option is unset. Any other value is an error.
scale_colour_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(
        type,
        viridis = scale_colour_viridis_c(option = option, begin = begin, end = end,
                                         direction = direction, ...),
        gradient = scale_colour_gradient(...),
        stop("Unknown scale type", call. = FALSE)
    )
}
# American-spelling alias
scale_color_continuous <- scale_colour_continuous

# Replacement for the default continuous fill scale.
#   ...        forwarded to the selected ggplot2 scale constructor
#   begin, end, direction, option
#              handed to scale_fill_viridis_c(); unused when type is "gradient"
#   type       "viridis" or "gradient". Read from the "ggplot2.continuous.fill" option;
#              when that option is unset, the "ggplot2.continuous.colour" option is used
#              (the only option consulted previously), and finally "viridis".
#              Any other value is an error.
scale_fill_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.fill",
                                                      default = getOption("ggplot2.continuous.colour",
                                                                          default = "viridis"))) {
    switch(type, gradient = scale_fill_gradient(...), 
        viridis = scale_fill_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))

}

# Colour ramps used by the custom discrete scales (each is a function n -> n colours)
cemm_pal <- colorRampPalette(
    c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00")
)
cust_pal <- colorRampPalette(
    c("#008CAD", "#40B9D4", "#D4ECF2", "#F8B100", "#C00000", "#2D0000")
)
yolla_pal <- colorRampPalette(
    c('#FC7070', '#C00000', '#2D0000')
)
# Replacement for the default discrete fill scale.
#   ...         forwarded to discrete_scale()
#   type        "CeMM", "Yolla" or "Cust" (default) select the matching colour ramp;
#               any other value falls back to a hue palette
#   h, c, l, h.start, direction
#               hue palette parameters, used only by the fallback branch
#   na.value    colour used for missing values
#   aesthetics  aesthetic(s) the scale applies to
scale_fill_discrete <- function (..., type = "Cust", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "fill") 
{
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else if (type == "Yolla"){
        discrete_scale(aesthetics, "Yolla", yolla_pal, na.value = na.value, ...)
    } else if (type == "Cust"){
        discrete_scale(aesthetics, "Cust", cust_pal, na.value = na.value, ...)
    } else {
        # hue_pal lives in the scales package, which this notebook never attaches:
        # qualify it, as scale_color_discrete does
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, 
            direction), na.value = na.value, ...)
    }
}

# Replacement for the default discrete colour scale.
#   ...         forwarded to discrete_scale()
#   type        "CeMM", "Cust" (default) or "Yolla" select the matching colour ramp;
#               any other value falls back to a hue palette
#   h, c, l, h.start, direction
#               hue palette parameters, used only by the fallback branch
#   na.value    colour used for missing values
#   aesthetics  aesthetic(s) the scale applies to
scale_color_discrete <- function (..., type = "Cust", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "colour") {
    # Custom ramps first, each returning immediately
    if (type == "CeMM") {
        return(discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...))
    }
    if (type == "Cust") {
        return(discrete_scale(aesthetics, "Cust", cust_pal, na.value = na.value, ...))
    }
    if (type == "Yolla") {
        return(discrete_scale(aesthetics, "Yolla", yolla_pal, na.value = na.value, ...))
    }
    # Anything else: hue palette
    huePalette <- scales::hue_pal(h, c, l, h.start, direction)
    discrete_scale(aesthetics, "hue", huePalette, na.value = na.value, ...)
}
# British-spelling alias
scale_colour_discrete <- scale_color_discrete

# Theme add-on without major grid lines and panel border, with thin dark axis
# lines and size-12 axis tick labels.
#   ...  extra theme elements, forwarded to theme()
noGridTheme <- function(...){
    tickLabel <- element_text(size=12)
    thinAxis <- element_line(color="#333333", size = 0.2)
    theme(
        panel.grid.major = element_blank(),
        axis.text.x = tickLabel,
        axis.text.y = tickLabel,
        axis.line = thinAxis,
        panel.border = element_blank(),
        ...
    )
}

# Theme add-on with a dark-grey panel and plot background and light-grey
# Arial text, lines and axis lines.
#   ...  extra theme elements, forwarded to theme() (they were previously
#        accepted but silently ignored, unlike in noGridTheme)
darkTheme <- function(...){
    theme(panel.background = element_rect(fill = '#333333'), plot.background = element_rect(fill = '#333333'), 
          axis.line=element_line(color="#CCCCCC", size = 0.2), 
          text=element_text(size=17, family="Arial", colour = "#CCCCCC"),
          line=element_line(colour = "#CCCCCC"),
          ...)
}

# Use the custom theme for every subsequent plot
theme_set(customTheme)

# Default width/height of the plots rendered through repr
options(repr.plot.height = 10, repr.plot.width = 10)

## Primary cells (from healthy donors)

In [None]:
# Fix R's random number generation for reproducibility
# NOTE(review): the original comment mentioned t-SNE, but this notebook only computes UMAP
# embeddings; whether this seed reaches the `umap` helper depends on its implementation — confirm
set.seed(38)

In [None]:
# Load the feature table (first line of the file holds the column names)
NK <- read.csv(file = "Rsc/allImages_NK_primary.csv", header = TRUE)

In [None]:
# Derive the experimental annotations from the plate coordinates.
# findInterval(..., left.open = TRUE) + 1 gives the index of the first
# upper bound that is >= the coordinate (last label when above all bounds).

# Donor by plate row: <= 5, <= 9, above 9
NK$Donor <- as.factor(
    c("Donor 1", "Donor 2", "Donor 3")[
        findInterval(NK$Metadata_Row, c(5, 9), left.open = TRUE) + 1]
)

# Treatment by plate column: <= 4, <= 6, <= 14, above 14 ...
NK$Drug <- c("PLL", "Icam", "CK869", "SMIFH2")[
    findInterval(NK$Metadata_Column, c(4, 6, 14), left.open = TRUE) + 1]
# ... except on rows 5, 9 and 13, which are labelled DMSO whatever the column
NK$Drug[NK$Metadata_Row %in% c(5, 9, 13)] <- "DMSO"

# Concentration by plate column, with a separate scale per treatment
# (stays NA for the treatments not listed here)
NK$Concentration <- NA
NK$Concentration[NK$Drug == "DMSO"] <- c(0.01, 0.02, 0.05, 0.1, 0.2)[
    findInterval(NK$Metadata_Column[NK$Drug == "DMSO"], c(8, 10, 12, 14), left.open = TRUE) + 1]
NK$Concentration[NK$Drug == "CK869"] <- c(5, 10, 25, 50)[
    findInterval(NK$Metadata_Column[NK$Drug == "CK869"], c(8, 10, 12), left.open = TRUE) + 1]
NK$Concentration[NK$Drug == "SMIFH2"] <- c(12.5, 25, 50, 100)[
    findInterval(NK$Metadata_Column[NK$Drug == "SMIFH2"], c(16, 18, 20), left.open = TRUE) + 1]

NK$Drug <- as.factor(NK$Drug)

In [None]:
# Histogram of Count_FilterNKNucleus (rows with a missing count excluded),
# filled by plate row
gpNK <- ggplot(NK[!is.na(NK$Count_FilterNKNucleus), ]) +
    geom_histogram(
        aes(Count_FilterNKNucleus, fill = as.factor(Metadata_Row)),
        binwidth = 2
    ) +
    scale_fill_discrete(name = "Row")
gpNK
# The figure is only written in a full run
if (!TEST_MODE) {
    ggsave(plot = gpNK, filename = "Fig/NK_Primary_count.pdf")
}

In [None]:
# Histogram of Mean_FilterNKCytoplasm_AreaShape_Area (rows with a missing
# value excluded), filled by plate row
gpNK <- ggplot(NK[!is.na(NK$Mean_FilterNKCytoplasm_AreaShape_Area), ]) +
    geom_histogram(
        aes(Mean_FilterNKCytoplasm_AreaShape_Area, fill = as.factor(Metadata_Row)),
        binwidth = 100
    ) +
    scale_fill_discrete(name = "Row")
gpNK
# The figure is only written in a full run
if (!TEST_MODE) {
    ggsave(plot = gpNK, filename = "Fig/NK_Primary_area.pdf")
}

### Filtering

In [None]:
# Minimal ImageQuality_MaxIntensity_DNA of a kept image
# (removes empty images and small DNA precipitations)
FILT_MAX_INT_DNA <- 0.005
# Minimal nucleus count of a kept image (most field of views have only 2 cells)
FILT_MIN_CELLS <- 2
# A kept image must have strictly fewer missing feature values than this
FILT_NB_MAX_NA_IMAGE <- 63
# Correlation threshold handed to `uncorrelate` (keep uncorrelated variables)
FILT_MAX_CORR <- 0.6
# Number of UMAP dimensions computed on the field-level data
dimUMAP <- 3

In [None]:
# Candidate features: start from every column index of NK
ftToKeep <- seq_len(ncol(NK))
# Keep the columns whose class is "numeric"
ftToKeep <- ftToKeep[which(sapply(NK[, ftToKeep], class) == "numeric")]
# Drop the columns whose name mentions Execution, Count, Concentration or ActinGranules
ftToKeep <- ftToKeep[!grepl("(Execution)|(Count)|(Concentration)|(ActinGranules)", colnames(NK)[ftToKeep])]
# ftToKeep <- ftToKeep[grep("(Execution)|(Count)|(Concentration)", colnames(NK)[ftToKeep], invert = T)]

In [None]:
# Remember the names of the candidate features before any data-driven filtering,
# for downstream comparison to the selected set
preFiltFt <- names(NK)[ftToKeep]

In [None]:
# Keep the rows (images) whose maximal DNA intensity reaches the threshold
fieldToKeep <- which(NK$ImageQuality_MaxIntensity_DNA >= FILT_MAX_INT_DNA)
# Among those, keep the rows with enough nuclei; na.omit() removes the rows with a missing count
fieldToKeep <- na.omit(fieldToKeep[NK$Count_FilterNKNucleus[fieldToKeep] >= FILT_MIN_CELLS])

In [None]:
# A few bad-quality pictures generate many missing values: keep only the images
# with fewer than FILT_NB_MAX_NA_IMAGE missing values among the candidate features
fieldToKeep <- fieldToKeep[
    rowSums(is.na(NK[fieldToKeep, ftToKeep])) < FILT_NB_MAX_NA_IMAGE
]

In [None]:
# Keep the features without any missing value over the retained images
ftToKeep <- ftToKeep[colSums(is.na(NK[fieldToKeep, ftToKeep])) == 0]
# Retained images treated with Icam (used as the reference condition)
indWT <- NK$Drug[fieldToKeep] == "Icam"
# Drop the features whose MAD is zero over all retained images ...
ftToKeep <- ftToKeep[sapply(NK[fieldToKeep, ftToKeep], function(values) mad(values) != 0)]
# ... and then those whose MAD is zero over the Icam images alone
ftToKeep <- ftToKeep[sapply(NK[fieldToKeep[indWT], ftToKeep], function(values) mad(values) != 0)]

In [None]:
# Shifted log transform, used to approximate a normal distribution:
# the vector is shifted so that its minimum becomes 1 before taking the natural log.
#   x  numeric vector
# Returns a vector of the same length whose smallest value is 0.
transfLog <- function (x){
    shifted <- x + 1 - min(x)
    log(shifted)
}

# Robust z-score of x relative to control values.
#   x  numeric vector to normalise
#   y  numeric vector of control values
# Returns x minus the median of y, divided by the MAD of y.
transfNorm <- function(x, y){
    centre <- median(y)
    spread <- mad(y)
    (x - centre) / spread
}

In [None]:
# Log-transform each retained feature (columns) over the retained images (rows)
transformedNK <- apply(NK[fieldToKeep, ftToKeep], 2, transfLog)
# Then centre and scale within each donor, using that donor's Icam images as controls
for (donor in levels(NK$Donor)){
    indDonorField <- NK$Donor[fieldToKeep] == donor
    indICAMinDonor <- NK$Drug[fieldToKeep][indDonorField] == "Icam"

    transformedNK[indDonorField,] <- apply(
        transformedNK[indDonorField,], 2,
        function(feature) transfNorm(feature, feature[indICAMinDonor]))
}
# Drop the features for which the normalisation produced missing values
# (NB: a feature whose Icam MAD is 0 for a given donor is one of them)
noNAFt <- colSums(is.na(transformedNK)) == 0
transformedNK <- transformedNK[, noNAFt]
ftToKeep <- ftToKeep[noNAFt]

    # NOTE(review): indented, so presumably a non-executed alternative kept for reference — confirm.
    # Same normalisation as the per-donor loop, except that every image is scaled
    # on the Icam images pooled over all donors.
    transformedNK = apply(NK[fieldToKeep, ftToKeep], 2, transfLog)

    indICAM <- NK[fieldToKeep,]$Drug == "Icam"
    transformedNK <- apply(transformedNK, 2, 
            function(x) transfNorm(x, x[indICAM]))
    # Drop the features for which the normalisation produced missing values
    noNAFt = colSums(is.na(transformedNK)) == 0
    ftToKeep = ftToKeep[noNAFt]
    transformedNK = transformedNK[,noNAFt]

In [None]:
# Column indices of transformedNK sorted from the largest to the smallest MAD.
# Features are scaled so that mad(WT) = 1, so this ranks them by how much more
# variable they are under drug perturbation than for WT.
orderFt <- rev(
    order(
        apply(transformedNK, MARGIN = 2, FUN = mad)
    )
)

In [None]:
# Select a subset of features with `uncorrelate`, visiting them in the order of orderFt.
# Indices are shifted by -1 on the way in and +1 on the way out
# (presumably because the helper is Python and 0-based — confirm).
uncorrFt <- unlist(
    uncorrelate(transformedNK, orderCol = orderFt - 1, threshold = FILT_MAX_CORR)
) + 1

In [None]:
# Restrict the normalised matrix to the selected features
# (ftToKeep is left untouched and still lists the features before this selection)
transformedNK <- transformedNK[, uncorrFt]

In [None]:
# Dimensions (rows, columns) at the three stages of the selection:
# raw table
dim(NK)
# retained images x features kept before the `uncorrelate` selection
dim(NK[fieldToKeep, ftToKeep])
# retained images x finally selected features
dim(transformedNK)

In [None]:
# Sanity check: per-feature medians over the Icam images of Donor 2,
# i.e. over the control group used to centre that donor (expected to be 0)
colMedians(
    transformedNK[(NK$Drug[fieldToKeep] == "Icam") &
                  (NK$Donor[fieldToKeep] == "Donor 2"), ]
)

### Export subset of features

For NK cells, the following features are selected and explored separately for their biological interpretability:
```
* Actin intensity/cell (mean/well): NK$Intensity_MeanIntensity_CorrActin_FilterNKCytoplasm
* Cell area: NK$Mean_FilterNKCytoplasm_AreaShape_Area
* Cell roundness: NK$Mean_FilterNKCytoplasm_AreaShape_FormFactor
* Cell width: NK$Mean_FilterNKCytoplasm_AreaShape_MajorAxisLength
* Cell length: NK$Mean_FilterNKCytoplasm_AreaShape_MinorAxisLength
* Cell length to width ratio: NK$Mean_FilterNKCytoplasm_AreaShape_MinorAxisLength / NK$Mean_FilterNKCytoplasm_AreaShape_MajorAxisLength
* Average number of perforin granules / cell: NK$Count_FilterNKPerfGranules / NK$Count_FilterNKCytoplasm
* Perforin area / cell area: (NK$Count_FilterNKPerfGranules * NK$Mean_FilterNKPerfGranules_AreaShape_Area) / (NK$Count_FilterNKCytoplasm * NK$Mean_FilterNKCytoplasm_AreaShape_Area)
* Perforin intensity: NK$Intensity_MeanIntensity_CorrPerf_FilterNKCytoplasm
* Perforin area: NK$Mean_FilterNKPerfGranules_AreaShape_Area
* Nucleus intensity: NK$Intensity_MeanIntensity_CorrDNA_FilterNKNucleus
* Nucleus area: NK$Mean_FilterNKNucleus_AreaShape_Area
* Nucleus roundness: NK$Mean_FilterNKNucleus_AreaShape_FormFactor
* Nucleus width: NK$Mean_FilterNKNucleus_AreaShape_MajorAxisLength
* Nucleus length: NK$Mean_FilterNKNucleus_AreaShape_MinorAxisLength
* Nucleus ratio: NK$Mean_FilterNKNucleus_AreaShape_MinorAxisLength / NK$Mean_FilterNKNucleus_AreaShape_MajorAxisLength
* Nucleus area / cell area: (NK$Count_FilterNKNucleus * NK$Mean_FilterNKNucleus_AreaShape_Area) / (NK$Count_FilterNKCytoplasm * NK$Mean_FilterNKCytoplasm_AreaShape_Area)
```

NB (from CellProfiler docs): FormFactor = $4 \times \pi \times \mathrm{Area} / \mathrm{Perimeter}^2$. It equals 1 for a perfectly circular object.

In [None]:
# Table of "interpretable" features, one row per retained image.
# cbind() mixes character and numeric vectors here, so the result is a character matrix.
# The first four columns identify the image (field id extracted from the actin
# image URL, treatment, donor, concentration); the others are raw or derived
# CellProfiler measurements.
subsetNK = cbind(Field = str_extract(as.character(NK[fieldToKeep,]$URL_Actin), "r..c..f.."),
                 Drug = as.character(NK[fieldToKeep,]$Drug),
                 Donor = as.character(NK[fieldToKeep,]$Donor),
                 Concentration = NK[fieldToKeep,]$Concentration,
                 ActinIntensity = NK[fieldToKeep,]$Intensity_MeanIntensity_CorrActin_FilterNKCytoplasm,
                 CellArea = NK[fieldToKeep,]$Mean_FilterNKCytoplasm_AreaShape_Area,
                 CellRoundness = NK[fieldToKeep,]$Mean_FilterNKCytoplasm_AreaShape_FormFactor,
                 CellWidth = NK[fieldToKeep,]$Mean_FilterNKCytoplasm_AreaShape_MajorAxisLength,
                 CellLength = NK[fieldToKeep,]$Mean_FilterNKCytoplasm_AreaShape_MinorAxisLength,
                 CellLengthOverWidth = NK[fieldToKeep,]$Mean_FilterNKCytoplasm_AreaShape_MinorAxisLength / 
                     NK[fieldToKeep,]$Mean_FilterNKCytoplasm_AreaShape_MajorAxisLength,
                 PerforinGranulesPerCell = NK[fieldToKeep,]$Count_FilterNKPerfGranules  / 
                     NK[fieldToKeep,]$Count_FilterNKCytoplasm,
                 # Total granule area (count x mean area) over total cell area (count x mean area).
                 # The granule area column was previously misspelled ("Mean_PFilterNK..."):
                 # `$` then returned NULL and cbind() silently dropped this whole column.
                 PerforinAreaOverCellArea = (NK[fieldToKeep,]$Count_FilterNKPerfGranules * 
                     NK[fieldToKeep,]$Mean_FilterNKPerfGranules_AreaShape_Area) / 
                     (NK[fieldToKeep,]$Count_FilterNKCytoplasm * 
                      NK[fieldToKeep,]$Mean_FilterNKCytoplasm_AreaShape_Area),
                 PerforinIntensity = NK[fieldToKeep,]$Intensity_MeanIntensity_CorrPerf_FilterNKCytoplasm,
                 PerforinArea = NK[fieldToKeep,]$Mean_FilterNKPerfGranules_AreaShape_Area,
                 NucleusIntensity = NK[fieldToKeep,]$Intensity_MeanIntensity_CorrDNA_FilterNKNucleus,
                 NucleusArea = NK[fieldToKeep,]$Mean_FilterNKNucleus_AreaShape_Area,
                 NucleusRoundness = NK[fieldToKeep,]$Mean_FilterNKNucleus_AreaShape_FormFactor,
                 NucleusWidth = NK[fieldToKeep,]$Mean_FilterNKNucleus_AreaShape_MajorAxisLength,
                 NucleusLength = NK[fieldToKeep,]$Mean_FilterNKNucleus_AreaShape_MinorAxisLength,
                 NucleusLengthOverWidth = NK[fieldToKeep,]$Mean_FilterNKNucleus_AreaShape_MinorAxisLength / 
                     NK[fieldToKeep,]$Mean_FilterNKNucleus_AreaShape_MajorAxisLength,
                 # Total nucleus area over total cell area
                 NucleusAreaOverCellArea = (NK[fieldToKeep,]$Count_FilterNKNucleus * 
                     NK[fieldToKeep,]$Mean_FilterNKNucleus_AreaShape_Area) / 
                     (NK[fieldToKeep,]$Count_FilterNKCytoplasm * 
                      NK[fieldToKeep,]$Mean_FilterNKCytoplasm_AreaShape_Area)
                )

In [None]:
# Write the table of "interpretable" features (full run only), without row names
if (!TEST_MODE) {
    write.csv(subsetNK, file = "Tab/NK_Primary_features.csv", row.names = FALSE)
}

In [None]:
# Sum of Count_FilterNKCytoplasm over the retained images (missing counts ignored),
# per well, per treatment, per treatment x concentration and per donor
CountPerWell <- setNames(
    aggregate(NK$Count_FilterNKCytoplasm[fieldToKeep],
              by = list(NK$Metadata_Well[fieldToKeep]),
              FUN = sum, na.rm = TRUE),
    c("Well", "Count"))
CountPerDrug <- setNames(
    aggregate(NK$Count_FilterNKCytoplasm[fieldToKeep],
              by = list(NK$Drug[fieldToKeep]),
              FUN = sum, na.rm = TRUE),
    c("Drug", "Count"))
CountPerConcentration <- setNames(
    aggregate(NK$Count_FilterNKCytoplasm[fieldToKeep],
              by = list(NK$Drug[fieldToKeep], NK$Concentration[fieldToKeep]),
              FUN = sum, na.rm = TRUE),
    c("Drug", "Concentration", "Count"))
CountPerDonor <- setNames(
    aggregate(NK$Count_FilterNKCytoplasm[fieldToKeep],
              by = list(NK$Donor[fieldToKeep]),
              FUN = sum, na.rm = TRUE),
    c("Donor", "Count"))

In [None]:
# Per well, over the retained images: total number of perforin granules ...
wellCountGranules <- aggregate(NK$Count_FilterNKPerfGranules[fieldToKeep],
                               by = list(NK$Metadata_Well[fieldToKeep]), FUN = sum)
# ... and total number of cells
wellCountCells <- aggregate(NK$Count_FilterNKCytoplasm[fieldToKeep],
                            by = list(NK$Metadata_Well[fieldToKeep]), FUN = sum)
# Both tables must list the same wells in the same order before dividing row by row
stopifnot(wellCountGranules$Group.1 == wellCountCells$Group.1)
granulePerCellPerWell <- data.frame(
    Well = wellCountGranules$Group.1,
    Average = wellCountGranules$x / wellCountCells$x
)

In [None]:
# Write the count tables (full run only), without row names
if (!TEST_MODE) {
    write.csv(CountPerWell, file = "Tab/NK_Primary_count_well.csv", row.names = FALSE)
    write.csv(CountPerDrug, file = "Tab/NK_Primary_count_drug.csv", row.names = FALSE)
    write.csv(CountPerDonor, file = "Tab/NK_Primary_count_donor.csv", row.names = FALSE)
    write.csv(CountPerConcentration, file = "Tab/NK_Primary_count_concentration.csv", row.names = FALSE)
    write.csv(granulePerCellPerWell, file = "Tab/NK_Primary_average_granule_count_per_cell.csv", row.names = FALSE)
}

### Look at the morphological distribution of the fields of view

In [None]:
# Fix R's random number generation again, right before the embeddings
# NOTE(review): whether this seed reaches the `umap` helper depends on its implementation — confirm
set.seed(38)

In [None]:
# Embed the retained images in dimUMAP dimensions and name the columns UMAP1, UMAP2, ...
umTNK <- as.data.frame(
    umap(transformedNK, min_dist = 0.1, neighbors = 10, n = dimUMAP, metric = "euclidean")
)
names(umTNK) <- paste0("UMAP", seq_len(dimUMAP))

In [None]:
# Attach the metadata of each retained image to its embedding, as factors
umTNK[["Row"]] <- as.factor(NK[["Metadata_Row"]][fieldToKeep])
umTNK[["Col"]] <- as.factor(NK[["Metadata_Column"]][fieldToKeep])
umTNK[["URL"]] <- as.factor(NK[["URL_Actin"]][fieldToKeep])
umTNK[["Drug"]] <- as.factor(NK[["Drug"]][fieldToKeep])
umTNK[["Donor"]] <- as.factor(NK[["Donor"]][fieldToKeep])

In [None]:
# First two UMAP dimensions, coloured by donor
gp <- ggplot(umTNK) +
    geom_point(mapping = aes(x = UMAP1, y = UMAP2, color = Donor))
gp

In [None]:
# First two UMAP dimensions, coloured by treatment
gp <- ggplot(umTNK) +
    geom_point(mapping = aes(x = UMAP1, y = UMAP2, color = Drug))
gp

In [None]:
# First two UMAP dimensions, coloured by the nucleus count of each image
umTNK[["Count"]] <- NK[["Count_FilterNKNucleus"]][fieldToKeep]
gp <- ggplot(umTNK) +
    geom_point(mapping = aes(x = UMAP1, y = UMAP2, color = Count))
gp

    # NOTE(review): indented, so presumably a non-executed exploration snippet kept for reference — confirm.
    # Labels the images with UMAP1 > 15 and UMAP2 < 14 with their field id
    # (extracted from the image URL) and colours the points by plate column.
    umTNK$Labs = ifelse((umTNK$UMAP1 > 15)&(umTNK$UMAP2 < 14), str_extract(umTNK$URL, "r..c..f.."), "")
    gp <- ggplot(umTNK) + geom_point(aes(UMAP1, UMAP2, color = Col)) +
          geom_label_repel(aes(UMAP1, UMAP2, label = Labs))
    gp

In [None]:
# Compute a UMAP embedding restricted to the retained images of one donor and
# save the first two dimensions, coloured by treatment, as a PDF in Fig/.
#   donor  donor label, as found in NK$Donor
# Returns the value of ggsave().
umapDonor <- function(donor){
    isDonor <- NK$Donor[fieldToKeep] == donor

    embedding <- as.data.frame(
        umap(transformedNK[isDonor,], min_dist = 0.1, neighbors = 10, n = dimUMAP, metric = "euclidean")
    )
    names(embedding) <- paste0("UMAP", 1:dimUMAP)
    embedding$Treatment <- as.factor(NK$Drug[fieldToKeep][isDonor])

    donorPlot <- ggplot(embedding) + geom_point(aes(UMAP1, UMAP2, color = Treatment))
    outFile <- paste("Fig/NK_Primary_UMAP", donor, "all.pdf", sep = "_")
    ggsave(filename = outFile, plot = donorPlot, width = 10)
}

In [None]:
# Full run only: save one all-treatment UMAP figure per donor
if(!TEST_MODE){
    sapply(levels(umTNK$Donor), umapDonor)
}

In [None]:
# Compute a UMAP embedding restricted to the retained images of one donor that
# were treated with CK869, SMIFH2 or DMSO, and save the first two dimensions,
# coloured by treatment, as a PDF in Fig/.
#   donor  donor label, as found in NK$Donor
# Returns the value of ggsave().
umapDrugDonor <- function(donor){
    selected <- (NK$Donor[fieldToKeep] == donor) &
                (NK$Drug[fieldToKeep] %in% c("CK869", "SMIFH2", "DMSO"))

    embedding <- as.data.frame(
        umap(transformedNK[selected,], min_dist = 0.1, neighbors = 10, n = dimUMAP, metric = "euclidean")
    )
    names(embedding) <- paste0("UMAP", 1:dimUMAP)
    embedding$Drug <- as.factor(NK$Drug[fieldToKeep][selected])

    drugPlot <- ggplot(embedding) + geom_point(aes(UMAP1, UMAP2, color = Drug))
    outFile <- paste("Fig/NK_Primary_UMAP", donor, "drugs.pdf", sep = "_")
    ggsave(filename = outFile, plot = drugPlot, width = 10)
}

In [None]:
# Full run only: save one drug-only (CK869, SMIFH2, DMSO) UMAP figure per donor
if(!TEST_MODE){
    sapply(levels(umTNK$Donor), umapDrugDonor)
}

## UMAP averaged per well

In [None]:
# Per-well UMAP for a single donor: every well is summarised by the per-feature
# median of its retained images, then embedded in 2 dimensions.
donor = "Donor 2"
indDonor = NK$Donor[fieldToKeep] == donor

# One row per well, one column per feature (aggregate() returns the wells sorted)
wellNK = apply(transformedNK[indDonor,], 2, function(x)
    aggregate(x, by = list(NK$Metadata_Well[fieldToKeep][indDonor]), FUN = median)$x)
               
# Well identifiers, in the same sorted order as the rows of wellNK.
# as.factor() keeps this working when read.csv() returns Metadata_Well as a
# character vector (the default since R 4.0): droplevels() has no method for
# character vectors and would otherwise stop with an error.
wells = levels(droplevels(as.factor(NK$Metadata_Well[fieldToKeep][indDonor])))

umDonor = umap(wellNK, min_dist = 1, neighbors = 3, n = 2, metric = "euclidean")    
umDonor = as.data.frame(umDonor)
names(umDonor) = paste0("UMAP", 1:2)
               
# Treatment of a well, read from the first row of NK that belongs to it
getDrug <- function(well){
    fieldInWell = which(NK$Metadata_Well == well)[1]
    return(NK$Drug[fieldInWell])
}
umDonor$Drug = sapply(wells, getDrug)
gp <- ggplot(umDonor) + geom_point(aes(UMAP1, UMAP2, color = Drug))
gp

In [None]:
# Save the per-well UMAP figure. Skipped in TEST_MODE, like every other
# figure and table export of this notebook.
if(!TEST_MODE){
    ggsave(filename = paste("Fig/NK_Primary_UMAP", donor, "all_per_well.pdf", sep = "_"), plot = gp, width = 10)
}

In [None]:
# Record the R version and the loaded packages used for this run
sessionInfo()