# Patient ARPC1B deficient cells

In [None]:
library(ggplot2)
library(extrafont)
library(stringr)
library(heatmaply)
library(ggrepel)
library(reticulate)
library(gridExtra)
library(robustbase)
library(randomForest)
library(reshape2)

In [None]:
# Load external Python functions (source_python exposes them in the R session)
source_python("reticulate_functions.py")
# Load extra fonts
ttf_import(paths = "/tmp/.fonts/")
loadfonts()
# Set this to TRUE for faster compilation or FALSE for more precise results and all outputs.
# Written as TRUE rather than T: `T` is an ordinary variable that can be reassigned.
TEST_MODE = TRUE

In [None]:
# TEST_MODE is a single flag, so a plain if/else is used instead of the
# vectorised ifelse(); a missing flag now fails loudly instead of yielding NA.

# Number of permutations used for empirical p-value computations
PERM_NB_ITER = if (TEST_MODE) 20 else 2000
# Number of trees used in random forest classifier
RF_NB_TREES = if (TEST_MODE) 50 else 1000

In [None]:
# Customize ggplot appearance

# Change theme: light base theme without minor grid lines, dark grey Arial
# text and lines, faint grey legend background and no legend key box
customTheme <- theme_light() +
    theme(
        panel.grid.minor = element_blank(),
        text = element_text(size = 17, family = "Arial", colour = "#333333"),
        line = element_line(colour = "#333333"),
        legend.background = element_rect(fill = alpha("#CCCCCC", 0.1)),
        legend.key = element_blank()
    )

# Change default colors
#
# Default continuous colour scale for the notebook.
#   ...                            forwarded to the underlying ggplot2 scale
#   begin, end, direction, option  viridis settings (used by the "viridis" type only)
#   type                           "viridis" or "gradient"; taken from the
#                                  ggplot2.continuous.colour option, "viridis" if unset
# Any other `type` stops with "Unknown scale type".
scale_colour_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma",
                                     type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(
        type,
        gradient = scale_colour_gradient(...),
        viridis = scale_colour_viridis_c(
            option = option,
            begin = begin,
            end = end,
            direction = direction,
            ...
        ),
        stop("Unknown scale type", call. = FALSE)
    )
}
# American spelling alias
scale_color_continuous <- scale_colour_continuous

# Default continuous fill scale for the notebook.
#   ...                            forwarded to the underlying ggplot2 scale
#   begin, end, direction, option  viridis settings (used by the "viridis" type only)
#   type                           "viridis" or "gradient". Read from the fill option
#                                  ggplot2.continuous.fill; when that is unset, the
#                                  previously used ggplot2.continuous.colour option is
#                                  consulted (backward compatibility), then "viridis".
# Any other `type` stops with "Unknown scale type".
scale_fill_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.fill",
                                                      default = getOption("ggplot2.continuous.colour", default = "viridis"))) {
    switch(type, gradient = scale_fill_gradient(...), 
        viridis = scale_fill_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))

}

# Palette generators: each one is a function of n returning n colours
# interpolated along the listed anchor colours.
cemm_pal <- colorRampPalette(
    c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00")
)
cust_pal <- colorRampPalette(
    c("#008CAD", "#40B9D4", "#D4ECF2", "#F8B100", "#C00000", "#2D0000")
)
yolla_pal <- colorRampPalette(
    c("#FC7070", "#C00000", "#2D0000")
)
# Default discrete fill scale for the notebook.
#   ...          forwarded to discrete_scale()
#   type         "CeMM", "Yolla" or "Cust" pick the matching custom palette;
#                any other value falls back to the hue palette
#   h, c, l, h.start, direction   hue palette settings (fallback only)
#   na.value     colour used for missing values
#   aesthetics   aesthetic the scale applies to
scale_fill_discrete <- function (..., type = "Cust", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "fill") 
{
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else if (type == "Yolla"){
        discrete_scale(aesthetics, "Yolla", yolla_pal, na.value = na.value, ...)
    } else if (type == "Cust"){
        discrete_scale(aesthetics, "Cust", cust_pal, na.value = na.value, ...)
    } else {
        # hue_pal is namespace-qualified, as in scale_color_discrete: the
        # scales package is not attached by any library() call of this notebook
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, 
            direction), na.value = na.value, ...)
    }
}

# Default discrete colour scale for the notebook.
#   ...          forwarded to discrete_scale()
#   type         "CeMM", "Cust" or "Yolla" pick the matching custom palette;
#                any other value falls back to the hue palette
#   h, c, l, h.start, direction   hue palette settings (fallback only)
#   na.value     colour used for missing values
#   aesthetics   aesthetic the scale applies to
scale_color_discrete <- function (..., type = "Cust", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "colour") {
    # First resolve the scale name and palette, then build the scale once
    if (type == "CeMM") {
        palName <- "CeMM"
        palFun <- cemm_pal
    } else if (type == "Cust") {
        palName <- "Cust"
        palFun <- cust_pal
    } else if (type == "Yolla") {
        palName <- "Yolla"
        palFun <- yolla_pal
    } else {
        palName <- "hue"
        palFun <- scales::hue_pal(h, c, l, h.start, direction)
    }
    discrete_scale(aesthetics, palName, palFun, na.value = na.value, ...)
}
# British spelling alias
scale_colour_discrete <- scale_color_discrete

# Theme add-on: no major grid, no panel border, thin dark axis lines and
# size-12 tick labels. Extra arguments are passed on to theme().
noGridTheme <- function(...){
    tickLabel <- element_text(size = 12)
    theme(
        panel.grid.major = element_blank(),
        axis.text.x = tickLabel,
        axis.text.y = tickLabel,
        axis.line = element_line(color = "#333333", size = 0.2),
        panel.border = element_blank(),
        ...
    )
}

# Theme add-on for dark figures: dark grey panel and plot backgrounds with
# light grey axis lines, text and lines.
# Extra arguments are passed on to theme(), as noGridTheme() does; they were
# previously accepted but silently ignored.
darkTheme <- function(...){
    theme(panel.background = element_rect(fill = '#333333'), plot.background = element_rect(fill = '#333333'), 
          axis.line=element_line(color="#CCCCCC", size = 0.2), 
          text=element_text(size=17, family="Arial", colour = "#CCCCCC"),
          line=element_line(colour = "#CCCCCC"),
          ...)
}

# Make the customised theme the default for the plots created afterwards
theme_set(customTheme)

# Default plot width and height (repr options; presumably read by the notebook kernel when rendering — confirm)
options(repr.plot.width=10, repr.plot.height=10)

## Patient cells (∆ARPC1B)

In [None]:
# Fix random number generation so that stochastic steps are reproducible
# (the embedding computed later in this notebook uses umap, not t-SNE)
set.seed(38)

### We load individual cell measurements

In [None]:
# Per-object cytoplasm measurements (the first CSV line holds the column names)
cytoplasm = read.csv("Rsc/compiled_LT_ARPC1B_GranulePosition_FilterNKCytoplasm.csv", header=TRUE)

In [None]:
# Per-object granule measurements (the first CSV line holds the column names)
granule = read.csv("Rsc/compiled_LT_ARPC1B_GranulePosition_FilterNKPerfGranules.csv", header=TRUE)

In [None]:
dim(cytoplasm)
dim(granule)

In [None]:
# Columns of `granule` that are aggregated (hard-coded positions 16 to 95 — TODO confirm against the CSV header)
ftGranule = 16:95
# Median of each selected column per (image number, parent cytoplasm object) pair;
# the grouping columns of the result are named Image and Cytoplasm
aggGranule = aggregate(granule[ftGranule], list(Image = granule$ImageNumber, Cytoplasm = granule$Parent_FilterNKCytoplasm), median)

In [None]:
# Join each cytoplasm object with the aggregated values of its granules.
# Keys: (ImageNumber, ObjectNumber) in cytoplasm against (Image, Cytoplasm) in aggGranule.
# Column names present in both tables get the merge() suffixes: .x (cytoplasm) and .y (granules).
cell = merge(cytoplasm, aggGranule, by.x = c("ImageNumber", "ObjectNumber"),
                                    by.y = c("Image", "Cytoplasm"))

### Integrate image information

In [None]:
# Per-image measurements and metadata (the first CSV line holds the column names)
LT = read.csv("Rsc/compiled_LT_ARPC1B_GranulePosition_Image.csv", header=TRUE)

In [None]:
# Plate layout. Only the first 90 rows are kept to avoid terminal empty lines.
# NOTE(review): the 90-row cut-off is hard-coded and must follow the layout file.
annotation = read.csv("Rsc/ARPC1BPlateLayout.csv", header=TRUE)[1:90,]

In [None]:
names(LT)

In [None]:
# Value of the first column of the plate layout (`annotation`) for the
# well(s) located at the given row and column.
getDonor <- function(row,column){
    inWell <- (annotation$Row == row) & (annotation$Column == column)
    annotation[inWell, 1]
}
# Value of the second column of the plate layout (`annotation`) for the
# well(s) located at the given row and column.
getCoating <- function(row,column){
    inWell <- (annotation$Row == row) & (annotation$Column == column)
    annotation[inWell, 2]
}
# Annotate every image with the donor and coating of its plate position
LT$Donor = mapply(getDonor, LT$Metadata_Row, LT$Metadata_Column)
LT$Coating = mapply(getCoating, LT$Metadata_Row, LT$Metadata_Column)
# Is normal donor? TRUE when the donor label contains "ND"
# (single grepl() instead of initialising with F and overwriting by index)
LT$ND = grepl("ND", LT$Donor)
# Which patient? All normal donors are pooled under "ND"
LT$Patient = ifelse(LT$ND, "ND", as.character(LT$Donor))

We discard the data from the PLL-coated plates (not studied in this analysis).

In [None]:
# Keep only the images whose coating is not "PLL"
LT = LT[LT$Coating != "PLL",]

In [None]:
# Histogram of the nucleus count per image (images without a count are left out),
# bars coloured by plate row
gpLT = ggplot(LT[!is.na(LT$Count_FilterNKNucleus),]) +
    geom_histogram(
        aes(Count_FilterNKNucleus, fill = as.factor(Metadata_Row)),
        binwidth = 2
    ) +
    scale_fill_discrete(name = "Row")
gpLT

In [None]:
# Histogram of the mean cytoplasm area per image (images without a value are left out),
# bars coloured by donor. The legend is titled "Donor" to match the fill variable
# (it was labelled "Row", copied from the previous plot).
gpLT = ggplot(LT[!is.na(LT$Mean_FilterNKCytoplasm_AreaShape_Area),]) + geom_histogram(aes(Mean_FilterNKCytoplasm_AreaShape_Area, 
                                                                                    fill = as.factor(Donor)), binwidth=100) +
       scale_fill_discrete(name="Donor")
gpLT

### Filtering images

In [None]:
# Thresholds used by the filtering steps below
FILT_MAX_INT_DNA = 0.01 # Lower bound on the maximal DNA intensity of an image: removes empty images and small DNA precipitations
FILT_MIN_CELLS = 5 # Minimal nucleus count per image (most fields of view have only 2 cells)
FILT_NB_MAX_NA_IMAGE = 1 # An image is kept only if it has strictly fewer missing feature values than this, i.e. none
FILT_MAX_CORR = 0.6 # Correlation threshold passed to uncorrelate(): keep uncorrelated variables

In [None]:
# Candidate image-level features: all columns of LT (seq_len is safe for an empty table)
ftToKeep = seq_len(ncol(LT))
# Make sure that the fields are numeric: the class must be exactly "numeric", so
# integer columns are still left out. vapply always returns one logical per
# column, even for columns carrying several classes.
ftToKeep <- ftToKeep[vapply(LT[, ftToKeep, drop = FALSE],
                            function(col) identical(class(col), "numeric"), logical(1))]
# Remove execution time, count, concentration and ActinGranules features
ftToKeep <- ftToKeep[grep("(Execution)|(Count)|(Concentration)|(ActinGranules)", colnames(LT)[ftToKeep], invert = TRUE)]

In [None]:
# Remove images (rows of LT) with low max DNA intensity; which() also drops rows where the comparison is NA
fieldToKeep <- which(LT$ImageQuality_MaxIntensity_DNA >= FILT_MAX_INT_DNA)
# Remove images with low cell count; a missing count produces an NA index, which na.omit() removes
fieldToKeep <- na.omit(fieldToKeep[LT[fieldToKeep,]$Count_FilterNKNucleus >= FILT_MIN_CELLS])

In [None]:
# A few bad quality pictures generate a lot of missing values and are removed:
# an image is kept only if its number of NA among the selected features is below FILT_NB_MAX_NA_IMAGE
fieldToKeep <- fieldToKeep[rowSums(is.na(LT[fieldToKeep,ftToKeep])) < FILT_NB_MAX_NA_IMAGE]

In [None]:
# Image numbers of the images that passed every filter
imgToKeep = LT$ImageNumber[fieldToKeep]

### Filter corresponding cells

In [None]:
# Row indices of the cells to keep: keep a cell only if its image is kept.
# which() yields the same indices as subsetting 1:nrow(cell), and an empty
# vector (instead of c(1, 0)) when `cell` has no rows.
cellToKeep = which(cell$ImageNumber %in% imgToKeep)

In [None]:
# Candidate cell-level features: all columns of `cell` (seq_len is safe for an empty table)
ftToKeep = seq_len(ncol(cell))
# Make sure that the fields are numeric: the class must be exactly "numeric", so
# integer columns are still left out. vapply always returns one logical per column.
ftToKeep <- ftToKeep[vapply(cell[, ftToKeep, drop = FALSE],
                            function(col) identical(class(col), "numeric"), logical(1))]
# Remove unwanted features
ftToKeep <- ftToKeep[grep("(ObjectNumber)|(Metadata)|(Location)|(Object_Number)|(Center)", 
                          colnames(cell)[ftToKeep], invert = TRUE)]
# Remove features with NA among the kept cells
# (drop = FALSE keeps a data frame even when a single feature is left)
ftToKeep <- ftToKeep[colSums(is.na(cell[cellToKeep, ftToKeep, drop = FALSE])) == 0]

In [None]:
# Granule and cytoplasm centers are close to each other (as expected).
# The .x column comes from the cytoplasm table and the .y column from the aggregated granules (merge suffixes).
ggplot(cell[cellToKeep,], aes(x = Location_Center_X.x, y = Location_Center_X.y)) + geom_point()

In [None]:
# Scatter plot of the granule-to-cytoplasm minimum distance feature against the mean CorrActin intensity, one point per kept cell
ggplot(cell[cellToKeep,], aes(x = Mean_FilterNKPerfGranules_Distance_Minimum_FilterNKCytoplasm, 
                 y = Intensity_MeanIntensity_CorrActin)) + geom_point()

In [None]:
# Normal-donor flag of the image with the given image number
isNDcell <- function(imageN){
    return(LT$ND[LT$ImageNumber == imageN])
}
# Normal-donor flag of every kept cell, looked up through its image number.
# A single vectorised match() replaces one scan of LT per cell and always
# returns a logical vector (the per-cell sapply degraded to a list whenever an
# image number was absent from, or duplicated in, LT).
cellND = LT$ND[match(cell$ImageNumber[cellToKeep], LT$ImageNumber)]

In [None]:
# Remove constant columns: features whose median absolute deviation is 0 over all kept cells...
ftToKeep <- ftToKeep[sapply(cell[cellToKeep,ftToKeep], function(x) mad(x) != 0)]
# ...and over the normal-donor cells only (their mad is the denominator used by transfNorm)
ftToKeep <- ftToKeep[sapply(cell[cellToKeep[cellND],ftToKeep], function(x) mad(x) != 0)]

In [None]:
dim(cell)
dim(cell[cellToKeep,ftToKeep])

In [None]:
# Approximate normal distribution
# Shifts x so that its minimum becomes 1, then takes the natural logarithm
# (the smallest value of x is therefore mapped to 0).
transfLog <- function (x){
    shifted <- x + 1 - min(x)
    log(shifted)
}

# Center and scale on control values
# x: values to normalise; y: reference (control) values.
# Returns x minus the median of y, divided by the mad of y.
transfNorm <- function(x, y){
    centre <- median(y)
    spread <- mad(y)
    (x - centre) / spread
}

In [None]:
# Try centering on all healthy donors
# Step 1: log-transform every kept feature column (apply returns a matrix, one column per feature)
transCell = apply(cell[cellToKeep, ftToKeep], 2, transfLog)
# Step 2: center and scale every column on the values of the normal-donor cells (cellND)
transCell = apply(transCell, 2, function(x) transfNorm(x, x[cellND]))

In [None]:
# Order features from biggest mad to smallest mad
# Since features have mad(ND) = 1, it means that we rank features by how much more variable they are
# for patients than for ND
# (orderFt holds column indices of transCell, most variable feature first)
orderFt = rev(order(apply(transCell, 2, mad)))

In [None]:
# uncorrelate() is not defined in this file (presumably one of the Python helpers loaded with source_python — confirm).
# The column order is shifted by -1 before the call and the result by +1 afterwards,
# presumably to convert between R's 1-based and Python's 0-based indices.
# NOTE(review): the result is used below as the column indices of transCell to keep; the exact selection rule lives in the helper.
uncorrFt = uncorrelate(transCell, orderCol = orderFt-1, threshold = FILT_MAX_CORR)
uncorrFt = unlist(uncorrFt) + 1

In [None]:
# Keep only the feature columns returned by the decorrelation step
transCell = transCell[,uncorrFt]

In [None]:
dim(cell)
dim(cell[cellToKeep, ftToKeep])
dim(transCell)

### Look at the morphological distribution of the fields of view

In [None]:
# Fix random number generation
set.seed(38)

In [None]:
# umap() is not defined in this file and no loaded R package is named after it
# (presumably a Python helper loaded with source_python — confirm).
# The result is converted to a data frame whose two columns are named UMAP1 and UMAP2.
umCell = umap(transCell, min_dist = 1, neighbors = 20, n = 2, metric = "euclidean")
umCell = as.data.frame(umCell)
names(umCell) = paste0("UMAP", 1:2)

In [None]:
# Donor of the image with the given image number
getCellDonor <- function(imageN){
    return(LT$Donor[LT$ImageNumber == imageN])
}
# Donor of every kept cell, looked up through its image number.
# A single vectorised match() replaces one scan of LT per cell and cannot
# degrade to a list when an image number is absent from, or duplicated in, LT.
cellDonor = LT$Donor[match(cell$ImageNumber[cellToKeep], LT$ImageNumber)]

In [None]:
# Attach the normal-donor flag and the donor of every cell to the embedding, as factors
umCell$ND <- as.factor(cellND)
umCell$Donor <- as.factor(cellDonor)

In [None]:
# Embedding of the cells, one point per cell, coloured by donor
gp <- ggplot(umCell) + geom_point(aes(UMAP1, UMAP2, color = Donor))
gp

In [None]:
# Hexagonal density map of the embedding (20 bins), one panel per donor
gp3 <- ggplot(umCell, aes(UMAP1, UMAP2)) + geom_hex(aes(fill = ..density..), bins = 20) + facet_wrap("Donor")
print(gp3)

## Regression on actin and cytoplasm features (linear models)

In [None]:
set.seed(38)

In [None]:
# We select actin and cytoplasm related features to explain other variables:
# kept features whose name matches "\\.x" or that are columns of `cytoplasm`.
# predictiveFtInd always holds positions within names(cell)[ftToKeep].
predictiveFtInd = unique(c(grep("\\.x", names(cell)[ftToKeep]), which(names(cell)[ftToKeep] %in% names(cytoplasm))))
predictiveFt = names(cell)[ftToKeep][predictiveFtInd]
# Remove granule related features.
# grep() returns positions within predictiveFt, so they are mapped back onto
# predictiveFtInd before indexing names(cell)[ftToKeep]; using them directly on
# names(cell)[ftToKeep] selected unrelated features.
predictiveFtInd = predictiveFtInd[grep("Granule", predictiveFt, invert = TRUE)]
predictiveFt = names(cell)[ftToKeep][predictiveFtInd]

In [None]:
# Linear model of the granule-to-cytoplasm minimum distance feature on all predictive features
form1 = as.formula(
    paste(
        "Mean_FilterNKPerfGranules_Distance_Minimum_FilterNKCytoplasm ~ ",
        paste(predictiveFt, collapse = " + ")
    )
)
model1 = lm(formula = form1, data = cell[cellToKeep, ftToKeep])

In [None]:
summary(model1)

In [None]:
# Linear model of the integrated DistNuc intensity on all predictive features
form2 = as.formula(
    paste(
        "Intensity_IntegratedIntensity_DistNuc ~ ",
        paste(predictiveFt, collapse = " + ")
    )
)
model2 = lm(formula = form2, data = cell[cellToKeep, ftToKeep])

In [None]:
summary(model2)

In [None]:
# Linear model of the mean DistNuc intensity on all predictive features
form3 = as.formula(
    paste(
        "Intensity_MeanIntensity_DistNuc ~ ",
        paste(predictiveFt, collapse = " + ")
    )
)
model3 = lm(formula = form3, data = cell[cellToKeep, ftToKeep])

In [None]:
summary(model3)

In [None]:
# Linear model of the median DistNuc intensity on all predictive features
form4 = as.formula(
    paste(
        "Intensity_MedianIntensity_DistNuc ~ ",
        paste(predictiveFt, collapse = " + ")
    )
)
model4 = lm(formula = form4, data = cell[cellToKeep, ftToKeep])

In [None]:
summary(model4)

Then, we explore how much can be predicted from easily interpretable actin and shape features alone.

In [None]:
# Hand-picked predictors: five shape features carrying the .x merge suffix (cytoplasm table)
# and three CorrActin features (mean intensity and two radial distribution fractions)
interpretableFt = c('AreaShape_Perimeter.x', 'AreaShape_MaximumRadius.x', 'AreaShape_MinorAxisLength.x',
                    'AreaShape_MeanRadius.x', 'AreaShape_FormFactor.x', 'Intensity_MeanIntensity_CorrActin',
                    'RadialDistribution_FracAtD_CorrActin_1of3', 'RadialDistribution_FracAtD_CorrActin_2of3')

In [None]:
# Linear model of the granule-to-cytoplasm minimum distance feature on the interpretable features only
form5 = as.formula(
    paste(
        "Mean_FilterNKPerfGranules_Distance_Minimum_FilterNKCytoplasm ~ ",
        paste(interpretableFt, collapse = " + ")
    )
)
model5 = lm(formula = form5, data = cell[cellToKeep, ftToKeep])

In [None]:
summary(model5)

In [None]:
# This model seems more interesting than the previous ones
# Information criteria (AIC and BIC) of the interpretable-feature model
AIC(model5)
BIC(model5)

In [None]:
# Observed against fitted (model5) minimum distance, one point per kept cell, coloured by donor
linModDF = data.frame(
    MeanMinimumDistance = cell[cellToKeep,]$Mean_FilterNKPerfGranules_Distance_Minimum_FilterNKCytoplasm,
    PredictedDistance = model5$fitted.values,
    Donor = as.factor(cellDonor)
)
ggplot(
    linModDF,
    aes(x = MeanMinimumDistance, y = PredictedDistance, color = Donor)
) +
    geom_point()

In [None]:
sessionInfo()