# Patient ARPC1B deficient cells

In [None]:
library(ggplot2)
library(extrafont)
library(stringr)
library(heatmaply)
library(ggrepel)
library(reticulate)
library(gridExtra)
library(robustbase)
library(randomForest)
library(reshape2)

In [None]:
# Load external Python functions
source_python("reticulate_functions.py")
# Load extra fonts
ttf_import(paths = "/tmp/.fonts/")
loadfonts()
# Set this to true for faster compilation or false for more precise results and all outputs
TEST_MODE = F

In [None]:
# Number of permutations used for empirical p-value computations
# (reduced in test mode to speed up execution)
PERM_NB_ITER <- if (TEST_MODE) 20 else 2000
# Number of trees grown by the random forest classifier
# (reduced in test mode to speed up execution)
RF_NB_TREES <- if (TEST_MODE) 50 else 1000

In [None]:
# Customize ggplot appearance

# Change theme
customTheme <- theme_light() + 
               theme(panel.grid.minor=element_blank(), text=element_text(size=17, family="Arial", colour = "#333333"),
                     line=element_line(colour = "#333333"), legend.background = element_rect(fill=alpha('#CCCCCC', 0.1)), legend.key = element_blank())

# Change default colors
# Continuous colour scale defined in the notebook under the same name as ggplot2's one.
#   ...                           : forwarded to the underlying scale constructor
#   begin, end, direction, option : passed to scale_colour_viridis_c (only used when type is "viridis")
#   type                          : "viridis" or "gradient"; read from option ggplot2.continuous.colour,
#                                   falling back to "viridis" when that option is unset
# Any other type stops with "Unknown scale type".
scale_colour_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                     type = getOption("ggplot2.continuous.colour", default = "viridis")) {
    switch(type, gradient = scale_colour_gradient(...), 
        viridis = scale_colour_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))
}
# American-spelling alias of the same function
scale_color_continuous <- scale_colour_continuous

# Continuous fill scale defined in the notebook under the same name as ggplot2's one.
#   ...                           : forwarded to the underlying scale constructor
#   begin, end, direction, option : passed to scale_fill_viridis_c (only used when type is "viridis")
#   type                          : "viridis" or "gradient". The fill-specific option
#                                   ggplot2.continuous.fill is honoured first; the colour option is
#                                   kept as a fallback so existing setups behave as before.
# Any other type stops with "Unknown scale type".
scale_fill_continuous <- function (..., begin = 0.1, end = 0.9, direction = -1, option = "plasma", 
                                   type = getOption("ggplot2.continuous.fill", 
                                                    default = getOption("ggplot2.continuous.colour", default = "viridis"))) {
    switch(type, gradient = scale_fill_gradient(...), 
        viridis = scale_fill_viridis_c(option = option, begin = begin, end = end, direction = direction, ...), 
        stop("Unknown scale type", call. = FALSE))
}

cemm_pal = colorRampPalette(c("#5A463C", "#008CAD", "#40B9D4", "#D4ECF2", "#D2323C", "#F8B100", "#DFDC00"))
cust_pal = colorRampPalette(c("#008CAD", "#40B9D4", "#D4ECF2", "#F8B100", "#C00000", "#2D0000"))
yolla_pal = colorRampPalette(c('#FC7070', '#C00000', '#2D0000'))
# Discrete fill scale built on the custom palettes defined above.
#   type : "CeMM", "Yolla" or "Cust" select the corresponding palette function;
#          any other value falls back to a hue palette built from h, c, l, h.start and direction
#   ...  : forwarded to discrete_scale (e.g. name = "Row")
scale_fill_discrete <- function (..., type = "Cust", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "fill") 
{
    if (type == "CeMM"){
        discrete_scale(aesthetics, "CeMM", cemm_pal, na.value = na.value, ...)
    } else if (type == "Yolla"){
        discrete_scale(aesthetics, "Yolla", yolla_pal, na.value = na.value, ...)
    } else if (type == "Cust"){
        discrete_scale(aesthetics, "Cust", cust_pal, na.value = na.value, ...)
    } else {
        # The scales package is not attached by this notebook: hue_pal must be
        # namespace-qualified (as in scale_color_discrete) or this branch fails
        discrete_scale(aesthetics, "hue", scales::hue_pal(h, c, l, h.start, 
            direction), na.value = na.value, ...)
    }
}

# Discrete colour scale built on the custom palettes defined above.
#   type : "CeMM", "Cust" or "Yolla" select the corresponding palette function;
#          any other value falls back to a hue palette built from h, c, l, h.start and direction
#   ...  : forwarded to discrete_scale
scale_color_discrete <- function (..., type = "Cust", h = c(0, 360) + 15, c = 100, l = 65, h.start = 0, 
    direction = 1, na.value = "grey50", aesthetics = "colour") {
    # First pick the scale name and palette, then build the scale once
    if (type == "CeMM"){
        scaleName <- "CeMM"
        pal <- cemm_pal
    } else if (type == "Cust"){
        scaleName <- "Cust"
        pal <- cust_pal
    } else if (type == "Yolla"){
        scaleName <- "Yolla"
        pal <- yolla_pal
    } else {
        scaleName <- "hue"
        pal <- scales::hue_pal(h, c, l, h.start, direction)
    }
    discrete_scale(aesthetics, scaleName, pal, na.value = na.value, ...)
}
# British-spelling alias of the same function
scale_colour_discrete <- scale_color_discrete

# Theme add-on without major grid lines nor panel border, with thin axis lines
# and 12pt axis tick labels. Extra arguments are forwarded to theme().
noGridTheme <- function(...){
    tickText <- element_text(size=12)
    thinAxis <- element_line(color="#333333", size = 0.2)
    theme(panel.grid.major = element_blank(),
          axis.text.x = tickText,
          axis.text.y = tickText,
          axis.line = thinAxis,
          panel.border = element_blank(),
          ...)
}

# Theme add-on for dark figures: dark grey (#333333) panel and plot backgrounds,
# light grey (#CCCCCC) text, lines and axes.
# Extra arguments are forwarded to theme(), like in noGridTheme (they used to be
# accepted but silently ignored).
darkTheme <- function(...){
    theme(panel.background = element_rect(fill = '#333333'), plot.background = element_rect(fill = '#333333'), 
          axis.line=element_line(color="#CCCCCC", size = 0.2), 
          text=element_text(size=17, family="Arial", colour = "#CCCCCC"),
          line=element_line(colour = "#CCCCCC"), ...)
}

theme_set(customTheme)

options(repr.plot.width=10, repr.plot.height=10)

## Patient cells (∆ARPC1B)

In [None]:
# Fix t-SNE layout and random number generation
set.seed(38)

In [None]:
LT = read.csv("Rsc/allImages_LT_ARPC1B.csv", header=T)

In [None]:
annotation = read.csv("Rsc/ARPC1BPlateLayout.csv", header=T)[1:90,] # Avoid terminal empty lines

In [None]:
# Donor of a well: first column of the plate layout, for the given plate row and column
getDonor <- function(row,column){
    isWell <- (annotation$Row == row) & (annotation$Column == column)
    annotation[isWell, 1]
}
# Coating of a well: second column of the plate layout, for the given plate row and column
getCoating <- function(row,column){
    isWell <- (annotation$Row == row) & (annotation$Column == column)
    annotation[isWell, 2]
}
# Annotate every field of view with the donor and coating of its well
LT$Donor <- mapply(getDonor, LT$Metadata_Row, LT$Metadata_Column)
LT$Coating <- mapply(getCoating, LT$Metadata_Row, LT$Metadata_Column)
# Is normal donor? (donor label contains "ND")
LT$ND <- grepl("ND", LT$Donor)
# Which patient? (all normal donors are pooled under the label "ND")
LT$Patient <- ifelse(LT$ND, "ND", as.character(LT$Donor))

We discard the data from the PLL coated plates (not studied in this analysis).

In [None]:
LT = LT[LT$Coating != "PLL",]

In [None]:
gpLT = ggplot(LT[!is.na(LT$Count_FilterNKNucleus),]) + geom_histogram(aes(Count_FilterNKNucleus, fill = as.factor(Metadata_Row)), binwidth=2) +
       scale_fill_discrete(name="Row")
gpLT
if(!TEST_MODE){
    ggsave(filename = "Fig/LT_ARPC1B_count.pdf", plot = gpLT)
}

In [None]:
# Histogram of the mean cytoplasm area per field of view (fields with a missing
# area are left out), filled by donor. The legend title now matches the mapped
# variable (it used to read "Row" although the fill is Donor).
gpLT = ggplot(LT[!is.na(LT$Mean_FilterNKCytoplasm_AreaShape_Area),]) + geom_histogram(aes(Mean_FilterNKCytoplasm_AreaShape_Area, 
                                                                                    fill = as.factor(Donor)), binwidth=100) +
       scale_fill_discrete(name="Donor")
gpLT
# Figures are only exported outside of test mode
if(!TEST_MODE){
    ggsave(filename = "Fig/LT_ARPC1B_area.pdf", plot = gpLT)
}

### Filtering

In [None]:
FILT_MAX_INT_DNA = 0.01 # Remove empty images and small DNA precipitations
FILT_MIN_CELLS = 5 # Most field of views have only 2 cells
FILT_NB_MAX_NA_IMAGE = 1
FILT_MAX_CORR = 0.6 # Keep uncorrelated variables
dimUMAP = 10

In [None]:
# Candidate features: start from the indices of all columns of LT
ftToKeep = 1:dim(LT)[2]
# Make sure that the fields are numeric
# (only columns whose class is exactly "numeric" are kept; e.g. integer columns are dropped)
ftToKeep <- ftToKeep[which(sapply(LT[,ftToKeep], class) == "numeric")]
# Remove features whose name contains "Execution", "Count", "Concentration" or "ActinGranules"
ftToKeep <- ftToKeep[grep("(Execution)|(Count)|(Concentration)|(ActinGranules)", colnames(LT)[ftToKeep], invert = T)]

In [None]:
# Store all remaining features before filtering for downstream comparison to selected set
preFiltFt = colnames(LT)[ftToKeep]

In [None]:
# Remove wells with low max DNA intensity
fieldToKeep <- which(LT$ImageQuality_MaxIntensity_DNA >= FILT_MAX_INT_DNA)
# Remove wells with low cell count
fieldToKeep <- na.omit(fieldToKeep[LT[fieldToKeep,]$Count_FilterNKNucleus >= FILT_MIN_CELLS])

In [None]:
# A few bad-quality images generate many missing values and are removed
fieldToKeep <- fieldToKeep[rowSums(is.na(LT[fieldToKeep,ftToKeep])) < FILT_NB_MAX_NA_IMAGE]

In [None]:
# Remove remaining features with missing values
# (a feature is kept only if it has no NA in any of the kept fields of view)
ftToKeep <- ftToKeep[colSums(is.na(LT[fieldToKeep,ftToKeep])) == 0] 
# Remove constant columns
# indWT flags, among the kept fields of view, those coming from normal donors (ND)
indWT = LT[fieldToKeep,]$ND
# Drop features with a null MAD over all kept fields of view...
ftToKeep <- ftToKeep[sapply(LT[fieldToKeep,ftToKeep], function(x) mad(x) != 0)]
# ... and over the normal donor fields only (transfNorm later divides by the MAD of the ND values)
ftToKeep <- ftToKeep[sapply(LT[fieldToKeep[indWT],ftToKeep], function(x) mad(x) != 0)]

In [None]:
# Approximate normal distribution
# Shifted log transform: the smallest value of x is mapped to log(1) = 0
transfLog <- function (x){
    lowest <- min(x)
    log((x + 1) - lowest)
}

# Center and scale on control values
# x is centered on the median of y and divided by the MAD of y
transfNorm <- function(x, y){
    center <- median(y)
    spread <- mad(y)
    (x - center) / spread
}

In [None]:
# Try centering on all healthy donors
transformedLT = apply(LT[fieldToKeep, ftToKeep], 2, transfLog)
transformedLT = apply(transformedLT, 2, function(x) transfNorm(x, x[indWT]))

In [None]:
# Order features from largest MAD to smallest MAD.
# Since every feature has mad(ND) = 1 after normalization, this ranks features by how much
# more variable they are across all fields of view (patients included) than across ND fields
orderFt = rev(order(apply(transformedLT, 2, mad)))

In [None]:
uncorrFt = uncorrelate(transformedLT, orderCol = orderFt-1, threshold = FILT_MAX_CORR)
uncorrFt = unlist(uncorrFt) + 1

In [None]:
transformedLT = transformedLT[,uncorrFt]

In [None]:
dim(LT)
dim(LT[fieldToKeep, ftToKeep])
dim(transformedLT)

### Export subset of features

For LT cells, the following features are selected and explored separately for their biological interpretability:
```
* Actin intensity/cell (mean/well): LT$Intensity_MeanIntensity_CorrActin_FilterCytoplasm
* Cell area: LT$Mean_FilterCytoplasm_AreaShape_Area
* Cell roundness: LT$Mean_FilterCytoplasm_AreaShape_FormFactor
* Cell width: LT$Mean_FilterCytoplasm_AreaShape_MajorAxisLength 
* Cell length: LT$Mean_FilterCytoplasm_AreaShape_MinorAxisLength
* Cell length to width ratio: LT$Mean_FilterCytoplasm_AreaShape_MinorAxisLength / LT$Mean_FilterCytoplasm_AreaShape_MajorAxisLength
* Average number of perforin granules / cell: LT$Count_PerfGranules  / LT$Count_FilterCytoplasm
* Perforin area / cell area: (LT$Count_PerfGranules * LT$Mean_PerfGranules_AreaShape_Area)  / (LT$Count_FilterCytoplasm * LT$Mean_FilterCytoplasm_AreaShape_Area)
* Perforin intensity: LT$Intensity_MeanIntensity_CorrPerf_FilterCytoplasm
* Perforin area: LT$Mean_PerfGranules_AreaShape_Area
* Nucleus intensity: LT$Intensity_MeanIntensity_CorrDNA_FilteredNucleus
* Nucleus area: LT$Mean_FilteredNucleus_AreaShape_Area
* Nucleus roundness: LT$Mean_FilteredNucleus_AreaShape_FormFactor
* Nucleus width: LT$Mean_FilteredNucleus_AreaShape_MajorAxisLength 
* Nucleus length: LT$Mean_FilteredNucleus_AreaShape_MinorAxisLength
* Nucleus ratio: LT$Mean_FilteredNucleus_AreaShape_MinorAxisLength / LT$Mean_FilteredNucleus_AreaShape_MajorAxisLength
* Nucleus area / cell area: (LT$Count_FilteredNucleus * LT$Mean_FilteredNucleus_AreaShape_Area)  / (LT$Count_FilterCytoplasm * LT$Mean_FilterCytoplasm_AreaShape_Area)
* LFA intensity: LT$Intensity_MeanIntensity_CorrLFA_FilterNKLFA
* LFA-Actin correlation: LT[fieldToKeep,]$Mean_FilterNKCytoplasm_Correlation_Correlation_CorrLFA_CorrActin
* LFA area: LT$Mean_FilterNKLFA_AreaShape_Area
* LFA area / cell area: LT$Mean_FilterNKLFA_AreaShape_Area / (LT$Count_FilterCytoplasm * LT$Mean_FilterCytoplasm_AreaShape_Area)
```

NB (from CellProfiler docs): FormFactor = $4 \times \pi \times \mathrm{Area} / \mathrm{Perimeter}^2$. It equals 1 for a perfectly circular object.

In [None]:
# Table of "interpretable" features, one row per kept field of view.
# NB: cbind() mixes character and numeric vectors, so the result is a character matrix.
# Fix: PerforinAreaOverCellArea used a misspelled column name
# ("Mean_PFilterNKPerfGranules_AreaShape_Area"); `$` then returns NULL and cbind()
# silently drops the zero-length column from the table.
keptLT = LT[fieldToKeep,]
subsetLT = cbind(Field = str_extract(as.character(keptLT$URL_Actin), "r..c..f.."),
                 Coating = as.character(keptLT$Coating),
                 Donor = as.character(keptLT$Donor),
                 ActinIntensity = keptLT$Intensity_MeanIntensity_CorrActin_FilterNKCytoplasm,
                 CellArea = keptLT$Mean_FilterNKCytoplasm_AreaShape_Area,
                 CellRoundness = keptLT$Mean_FilterNKCytoplasm_AreaShape_FormFactor,
                 CellWidth = keptLT$Mean_FilterNKCytoplasm_AreaShape_MajorAxisLength,
                 CellLength = keptLT$Mean_FilterNKCytoplasm_AreaShape_MinorAxisLength,
                 CellLengthOverWidth = keptLT$Mean_FilterNKCytoplasm_AreaShape_MinorAxisLength / 
                     keptLT$Mean_FilterNKCytoplasm_AreaShape_MajorAxisLength,
                 PerforinGranulesPerCell = keptLT$Count_FilterNKPerfGranules / 
                     keptLT$Count_FilterNKCytoplasm,
                 # Total granule area (count x mean area) over total cell area (count x mean area)
                 PerforinAreaOverCellArea = (keptLT$Count_FilterNKPerfGranules * 
                     keptLT$Mean_FilterNKPerfGranules_AreaShape_Area) / 
                     (keptLT$Count_FilterNKCytoplasm * 
                      keptLT$Mean_FilterNKCytoplasm_AreaShape_Area),
                 PerforinIntensity = keptLT$Intensity_MeanIntensity_CorrPerf_FilterNKCytoplasm,
                 PerforinArea = keptLT$Mean_FilterNKPerfGranules_AreaShape_Area,
                 NucleusIntensity = keptLT$Intensity_MeanIntensity_CorrDNA_FilterNKNucleus,
                 NucleusArea = keptLT$Mean_FilterNKNucleus_AreaShape_Area,
                 NucleusRoundness = keptLT$Mean_FilterNKNucleus_AreaShape_FormFactor,
                 NucleusWidth = keptLT$Mean_FilterNKNucleus_AreaShape_MajorAxisLength,
                 NucleusLength = keptLT$Mean_FilterNKNucleus_AreaShape_MinorAxisLength,
                 NucleusLengthOverWidth = keptLT$Mean_FilterNKNucleus_AreaShape_MinorAxisLength / 
                     keptLT$Mean_FilterNKNucleus_AreaShape_MajorAxisLength,
                 # Total nucleus area (count x mean area) over total cell area (count x mean area)
                 NucleusAreaOverCellArea = (keptLT$Count_FilterNKNucleus * 
                     keptLT$Mean_FilterNKNucleus_AreaShape_Area) / 
                     (keptLT$Count_FilterNKCytoplasm * 
                      keptLT$Mean_FilterNKCytoplasm_AreaShape_Area),
                 LfaIntensity = keptLT$Intensity_MeanIntensity_CorrLFA_FilterNKLFA,
                 LfaActinCorrelation = keptLT$Mean_FilterNKCytoplasm_Correlation_Correlation_CorrLFA_CorrActin,
                 LfaArea = keptLT$Mean_FilterNKLFA_AreaShape_Area,
                 LfaAreaOverCellArea = keptLT$Mean_FilterNKLFA_AreaShape_Area / 
                     keptLT$Mean_FilterNKCytoplasm_AreaShape_Area
                )

In [None]:
# Export list of "interpretable" features
if(!TEST_MODE){
    write.csv(subsetLT, "Tab/LT_ARPC1B_features.csv", row.names = F)
}

In [None]:
# Sum of Count_FilterNKCytoplasm over the kept fields of view (missing counts ignored),
# grouped by well, by coating and by donor
keptCellCount = LT[fieldToKeep,]$Count_FilterNKCytoplasm
CountPerWell = setNames(aggregate(keptCellCount, by = list(LT[fieldToKeep,]$Metadata_Well), 
                                  FUN = sum, na.rm = T),
                        c("Well", "Count"))
CountPerCoating = setNames(aggregate(keptCellCount, by = list(LT[fieldToKeep,]$Coating), 
                                     FUN = sum, na.rm = T),
                           c("Coating", "Count"))
CountPerDonor = setNames(aggregate(keptCellCount, by = list(LT[fieldToKeep,]$Donor), 
                                   FUN = sum, na.rm = T),
                         c("Donor", "Count"))

In [None]:
wellCountGranules = aggregate(LT[fieldToKeep,]$Count_FilterNKPerfGranules, 
                              by = list(LT[fieldToKeep,]$Metadata_Well), FUN = sum)
wellCountCells = aggregate(LT[fieldToKeep,]$Count_FilterNKCytoplasm, 
                           by = list(LT[fieldToKeep,]$Metadata_Well), FUN = sum)
stopifnot(wellCountGranules$Group.1 == wellCountCells$Group.1)
granulePerCellPerWell = data.frame(Well = wellCountGranules$Group.1, Average = wellCountGranules$x / wellCountCells$x)

In [None]:
# Export cell counts
if(!TEST_MODE){
    write.csv(CountPerWell, "Tab/LT_ARPC1B_count_well.csv", row.names = F)
    write.csv(CountPerCoating, "Tab/LT_ARPC1B_count_coating.csv", row.names = F)
    write.csv(CountPerDonor, "Tab/LT_ARPC1B_count_donor.csv", row.names = F)
    write.csv(granulePerCellPerWell, "Tab/LT_ARPC1B_average_granule_count_per_cell.csv", row.names = F)
}

### Look at which types of features are kept

In [None]:
# Number of features referring to each fluorescence channel, before (CountIni)
# and after (Count) feature selection
catChannel = c("CorrDNA", "CorrActin", "CorrPerf", "CorrLFA")
# Number of channels matched by each selected feature
table(rowSums(sapply(catChannel, function(x) grepl(x, colnames(transformedLT)))))
# Both count vectors are kept in the order of catChannel: sorting them independently
# (as done previously) could pair the count of one channel with the name of another.
# The display order is handled below through the factor levels and the rank.
dtCat = data.frame(CountIni = colSums(sapply(catChannel, function(x) grepl(x, preFiltFt))), 
                   Count = colSums(sapply(catChannel, function(x) grepl(x, colnames(transformedLT)))))

# Position of each category on the (flipped) axis, by increasing initial count
dtCat$Order <- rank(dtCat$CountIni, ties.method = c("first"))
dtCat$Category <- factor(rownames(dtCat), levels=rownames(dtCat)[order(dtCat$CountIni)])
# Percentage of features kept in each category
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100*dtCat$Ratio, 1), "%")
gp <- ggplot(dtCat) + geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) + ylim(c(0,700)) +
              geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) + 
              geom_text(aes(x = Order, y = CountIni + 2, label = Ratio), hjust = 0) + coord_flip()
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/LT_ARPC1B_SelecFt_Channel.pdf", plot = gp, width = 10)
}

In [None]:
# Number of features referring to each segmented object, before (CountIni)
# and after (Count) feature selection
catObjects = c("FilterNKCytoplasm", "FilterNKLFA", "ShrunkenCytoplasm", "FilterNKNucleus", "FilterNKPerfGranules")
# Number of objects matched by each selected feature
table(rowSums(sapply(catObjects, function(x) grepl(x, colnames(transformedLT)))))
dtCat = data.frame(CountIni = colSums(sapply(catObjects, function(x) grepl(x, preFiltFt))), 
                   Count = colSums(sapply(catObjects, function(x) grepl(x, colnames(transformedLT)))))

# Position of each category on the (flipped) axis, by increasing initial count
dtCat$Order <- rank(dtCat$CountIni, ties.method = c("first"))
dtCat$Category <- factor(rownames(dtCat), levels=rownames(dtCat)[order(dtCat$CountIni)])
# Percentage of features kept in each category
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100*dtCat$Ratio, 1), "%")
# Two overlaid bars per category: initial count, then count after selection; the label is the ratio
gp <- ggplot(dtCat) + geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) + ylim(c(0,650)) +
              geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) + 
              geom_text(aes(x = Order, y = CountIni + 2, label = Ratio), hjust = 0) + coord_flip()
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/LT_ARPC1B_SelecFt_Object.pdf", plot = gp, width = 10)
}

In [None]:
# Number of features of each measurement type, before (CountIni) and after (Count)
# feature selection, using the full list of types
catType = c("Threshold", "Granularity", "ImageQuality", "Texture", "Distance", "AreaShape", "RadialDistribution", "Neighbors", 
            "Correlation", "Intensity", "Overlap", "Location")
# Indices of the selected features matching none of the types (expected to be empty)
which(rowSums(sapply(catType, function(x) grepl(x, colnames(transformedLT)))) == 0) # All features are covered

dtCat = data.frame(CountIni = colSums(sapply(catType, function(x) grepl(x, preFiltFt))), 
                   Count = colSums(sapply(catType, function(x) grepl(x, colnames(transformedLT)))))
# Position of each category on the (flipped) axis, by increasing initial count
dtCat$Order <- rank(dtCat$CountIni, ties.method = c("first"))
dtCat$Category <- factor(rownames(dtCat), levels=rownames(dtCat)[order(dtCat$CountIni)])
# Percentage of features kept in each category
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100*dtCat$Ratio, 1), "%")
# Two overlaid bars per category: initial count, then count after selection; the label is the ratio
gp <- ggplot(dtCat) + geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) +
              geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) + 
              geom_text(aes(x = Order, y = CountIni + 5, label = Ratio), hjust = 0) + coord_flip()
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/LT_ARPC1B_SelecFt_Type_All.pdf", plot = gp, width = 10)
}

In [None]:
# Same comparison with a shorter list of measurement types; features matching
# none of them are counted in an additional "Other" category
catType = c("Granularity", "Texture", "AreaShape", "RadialDistribution",
            "Correlation", "Intensity")
# Number of listed types matched by each selected feature
table(rowSums(sapply(catType, function(x) grepl(x, colnames(transformedLT)))))

# One logical column per type, one row per selected feature
dtCount = as.data.frame(sapply(catType, function(x) grepl(x, colnames(transformedLT))))
dtCount$Other = !apply(dtCount, 1, any)
# Same table for the features available before selection
dtCountIni = as.data.frame(sapply(catType, function(x) grepl(x, preFiltFt)))
dtCountIni$Other = !apply(dtCountIni, 1, any)
dtCat = data.frame(CountIni = colSums(dtCountIni), 
                   Count = colSums(dtCount))
# Position of each category on the (flipped) axis, by increasing initial count
dtCat$Order <- rank(dtCat$CountIni, ties.method = c("first"))
dtCat$Category <- factor(rownames(dtCat), levels=rownames(dtCat)[order(dtCat$CountIni)])
# Percentage of features kept in each category
dtCat$Ratio <- dtCat$Count / dtCat$CountIni
dtCat$Ratio <- paste0(round(100*dtCat$Ratio, 1), "%")
# Two overlaid bars per category: initial count, then count after selection; the label is the ratio
gp <- ggplot(dtCat) + geom_bar(aes(Category, weight = CountIni), fill = cust_pal(2)[1]) +
              geom_bar(aes(Category, weight = Count), fill = cust_pal(2)[2]) + 
              geom_text(aes(x = Order, y = CountIni + 5, label = Ratio), hjust = 0) + coord_flip()
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/LT_ARPC1B_SelecFt_Type_Short.pdf", plot = gp, width = 10)
}

In [None]:
# Export list of features kept
if(!TEST_MODE){
    write.table(colnames(transformedLT), file = "Tab/LT_ARPC1B_list_features_kept.csv", col.names = F, row.names = F)
}

### Look at the morphological distribution of the fields of view

In [None]:
# Fix random number generation
set.seed(38)

In [None]:
umTLT = umap(transformedLT, min_dist = 0.1, neighbors = 10, n = 2, metric = "euclidean")
umTLT = as.data.frame(umTLT)
names(umTLT) = paste0("UMAP", 1:2)

In [None]:
umTLT$Row <- as.factor(LT$Metadata_Row[fieldToKeep])
umTLT$Col <- as.factor(LT$Metadata_Column[fieldToKeep])
umTLT$URL <- as.factor(LT$URL_Actin[fieldToKeep])
umTLT$Coating <- as.factor(LT$Coating[fieldToKeep])
umTLT$Donor <- as.factor(LT$Donor[fieldToKeep])
umTLT$Well <- as.factor(LT$Metadata_Well[fieldToKeep])

In [None]:
gp <- ggplot(umTLT) + geom_point(aes(UMAP1, UMAP2, color = Donor))
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/LT_ARPC1B_UMAP_all_donors.pdf", plot = gp, width = 10)
}

In [None]:
gp <- ggplot(umTLT) + geom_point(aes(UMAP1, UMAP2, color = Coating))
gp

In [None]:
umTLT$Count <- LT$Count_FilterNKNucleus[fieldToKeep]
gp <- ggplot(umTLT) + geom_point(aes(UMAP1, UMAP2, color = Count))
gp

    umTLT$Labs = ifelse((umTLT$UMAP2 > 20), str_extract(umTLT$URL, "r..c..f.."), "")
    gp <- ggplot(umTLT) + geom_point(aes(UMAP1, UMAP2, color = Donor))
    gp + geom_text_repel(aes(UMAP1, UMAP2, label = Labs))

In [None]:
umXlim = ggplot_build(gp)$layout$panel_scales_x[[1]]$range$range
umYlim = ggplot_build(gp)$layout$panel_scales_y[[1]]$range$range

In [None]:
# Plot the fields of view of a single coating on the UMAP embedding computed for
# all coatings. Axis limits are shared (umXlim / umYlim) so that the plots of the
# different coatings are directly comparable. The plot is printed and saved as
# Fig/LT_ARPC1B_UMAP_<coating>_coating.pdf.
sharedUmapCoating <- function(coating){
    isCoating = LT$Coating[fieldToKeep] == coating
    outFile = paste("Fig/LT_ARPC1B_UMAP", coating, "coating.pdf", sep = "_")

    gp <- ggplot(umTLT[isCoating,]) +
          geom_point(aes(UMAP1, UMAP2, color = Donor)) +
          xlim(umXlim) +
          ylim(umYlim)
    print(gp)
    ggsave(filename = outFile, plot = gp, width = 10)
}

In [None]:
# Per-coating UMAP figures are part of the full outputs: like every other export
# of this notebook they are produced outside of test mode (the condition used to
# be inverted, so the figures were only written in test mode)
if(!TEST_MODE){
    sapply(levels(droplevels(umTLT$Coating)), sharedUmapCoating)
}

#### Distances to ND cells

In [None]:
umTLT = umap(transformedLT, min_dist = 0.1, neighbors = 10, n = dimUMAP, metric = "euclidean")
umTLT = as.data.frame(umTLT)
names(umTLT) = paste0("UMAP", 1:dimUMAP)

In [None]:
# Compute the median Robust Mahalanobis Distance (RMD) between the fields of view
# of a donor and the normal donor (ND) fields of view, for a given coating.
#   donor, coating : values matched against LT$Donor and LT$Coating (kept fields only)
# Returns the median distance, or NA when one of the two sets has fewer than
# 2*dimUMAP points.
condRMD <- function(donor, coating){
    isCoating = LT[fieldToKeep,]$Coating == coating
    # which() discards fields with a missing annotation instead of selecting NA rows
    setUmapDonor = umTLT[which((LT[fieldToKeep,]$Donor == donor) & isCoating),1:dimUMAP]
    setUmapWT = umTLT[which((LT[fieldToKeep,]$ND) & isCoating),1:dimUMAP]
    
    # Ensure that we have enough points to compute distance
    if ((nrow(setUmapDonor) < 2*dimUMAP) || (nrow(setUmapWT) < 2*dimUMAP)){
            # print(donor, coating) would pass coating as the `digits` argument and fail
            print(paste(donor, coating))
            return(NA)
    }
    
    # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance
    mcdWT = covMcd(setUmapWT)
    RMD = median(apply(setUmapDonor, 1, function(x) mahalanobis(x, mcdWT$center, mcdWT$cov)))
    return(RMD)
}

In [None]:
# Null distribution of the median Robust Mahalanobis Distance for a donor and a coating:
# donor and normal donor (ND) fields of view are pooled, shuffled and split again
# into two sets of the original sizes, nbRep times.
#   donor, coating : values matched against LT$Donor and LT$Coating (kept fields only)
#   nbRep          : number of permutations
# Returns a vector of nbRep distances (all NA when one of the two sets has fewer
# than 2*dimUMAP points).
shuffCondRMD <- function(donor, coating, nbRep = PERM_NB_ITER){
    isCoating = LT[fieldToKeep,]$Coating == coating
    # which() discards fields with a missing annotation instead of selecting NA rows
    setUmapDonor = umTLT[which((LT[fieldToKeep,]$Donor == donor) & isCoating),1:dimUMAP]
    setUmapWT = umTLT[which((LT[fieldToKeep,]$ND) & isCoating),1:dimUMAP]
    nbDonor = nrow(setUmapDonor)
    nbWT = nrow(setUmapWT)

    # Ensure that we have enough points to compute distance.
    # Set sizes do not change across permutations: check (and report) only once.
    if ((nbDonor < 2*dimUMAP) || (nbWT < 2*dimUMAP)){
            # print(donor, coating) would pass coating as the `digits` argument and fail
            print(paste(donor, coating))
            return(rep(NA, nbRep))
    }

    setUMAP = rbind(setUmapWT,setUmapDonor)
    shuffleRMD <- function(notUsed){
        shuffSetUMAP = setUMAP[sample(nrow(setUMAP)),]
        # Take random subsets of corresponding sizes
        shuffSetDonor = shuffSetUMAP[1:nbDonor,]
        shuffSetWT = shuffSetUMAP[(nbDonor+1):(nbDonor+nbWT),]
        
        # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance
        mcdWT = covMcd(shuffSetWT)
        RMD = median(apply(shuffSetDonor, 1, function(x) mahalanobis(x, mcdWT$center, mcdWT$cov)))
        return(RMD)
    }
    return(sapply(seq_len(nbRep), shuffleRMD))
}

In [None]:
# Empirical p-value of an observed statistic against its permutation distribution.
#   x : numeric vector; x[1] is the observed value, x[-1] the permuted values
# Returns the proportion of permuted values strictly greater than the observed one,
# or NA when the observed value is missing or no permuted value is available
# (ecdf() fails on a vector without any non-missing value).
getRMPV <- function(x){
    observed = x[1]
    shuffled = x[-1]
    shuffled = shuffled[!is.na(shuffled)]
    if (is.na(observed) || (length(shuffled) == 0)){
        return(NA_real_)
    }
    ecdfRMD = ecdf(shuffled)
    return(1 - ecdfRMD(observed))
}

In [None]:
# For each coating: observed distance to normal donors and permutation-based
# p-value (FDR-adjusted over donors) of every donor.
# as.factor() keeps the level extraction working when the annotation columns are
# plain character vectors (droplevels() has no method for them).
donorLevels = levels(droplevels(as.factor(LT$Donor[fieldToKeep])))
for (coating in levels(droplevels(as.factor(LT$Coating[fieldToKeep])))){
    # Observed distance (one value per donor) and permuted distances (one column per donor)
    RMD_LT = sapply(donorLevels, function(x) condRMD(x, coating))
    shuffRMD_LT = sapply(donorLevels, function(x) shuffCondRMD(x, coating))
    # First row: observed value, following rows: permuted values
    dfRMPV = data.frame(RMPV = apply(rbind(RMD_LT, shuffRMD_LT), 2, getRMPV))
    dfRMPV$Donor = rownames(dfRMPV)
    dfRMPV$Strength = RMD_LT
    dfRMPV$adjRMPV = p.adjust(dfRMPV$RMPV, method = "fdr") 
    gp <- ggplot(dfRMPV) + geom_point(aes(adjRMPV, Strength, color = Donor)) + 
        geom_vline(xintercept = 0.05, color="#CCCCCC", linetype="dashed")
    print(gp)
    if(!TEST_MODE){
        ggsave(filename = paste("Fig/LT_ARPC1B_RMPV", coating, "Donors.pdf", sep = "_"), gp, width = 10)
    }
}

### Morphological changes in patients

In [None]:
# Associate categories and colors to features
catType = c("Granularity", "Texture", "AreaShape", "RadialDistribution",
            "Intensity")
# One color per category, plus one for "Other"
colType = cust_pal(length(catType)+1)

# One logical column per category, one row per selected feature
dtCount = as.data.frame(sapply(catType, function(x) grepl(x, colnames(transformedLT))))
dtCount$Other = !apply(dtCount, 1, any)
 
# Category of each feature. Only the first matching category is kept, so that a
# feature name matching several categories cannot turn the result into a list.
ftCat = as.factor(apply(dtCount, 1, function(x) colnames(dtCount)[which(x)[1]]))

# Order levels so that "Other" is last
# (setdiff also works when no feature is in "Other"; a negative index built from
# an empty which() would drop every level instead)
ftCat = factor(ftCat, levels = c(setdiff(levels(ftCat), "Other"), "Other"))
       
# Corresponding colors
ftCol = colType[ftCat]

In [None]:
# Plot the number of selected features per category (ftCat holds one entry per feature);
# useful to get the fill color legend
gp <- ggplot(data.frame(Category = ftCat), aes(fill = Category)) + geom_bar(aes(Category))
# Figures are only exported outside of test mode
if(!TEST_MODE){
    ggsave(filename = "Fig/LT_ARPC1B_CatCount.pdf", plot = gp, width = 10)
}

## Prediction of CD3 concentration based on morphology

5-fold cross-validation (5CV) plus one held-out validation fold


In [None]:
set.seed(38)

In [None]:
indIcamDonor = (LT$Coating[fieldToKeep] != "PLL")&LT$ND[fieldToKeep]
icamDonorLT = transformedLT[indIcamDonor,]

In [None]:
# Number of images per fold (5CV + validation set)
nbSampTestSet = round(nrow(icamDonorLT)/6)

In [None]:
# Shuffle images
idImages = sample(nrow(icamDonorLT))

# Last fold will be used as validation set
validationSet = as.data.frame(icamDonorLT[-idImages[1:(nbSampTestSet*5)],])
validationSet$Coating = droplevels(LT$Coating[fieldToKeep][indIcamDonor][-idImages[1:(nbSampTestSet*5)]])

table(validationSet$Coating)

In [None]:
# F1 score for a given level
#   preds  : predicted classes
#   obs    : observed classes (same length as preds)
#   curLvl : class treated as the positive one
# Returns 2TP / (2TP + FN + FP).
F1pred <- function(preds, obs, curLvl){
    predPos <- preds == curLvl
    obsPos <- obs == curLvl
    nbTP <- sum(predPos & obsPos)
    nbFP <- sum(predPos & !obsPos)
    nbTN <- sum(!predPos & !obsPos)
    nbFN <- sum(!predPos & obsPos)
    # Sanity check: every observation falls in exactly one of the four cells
    nbTotal <- nbTP + nbFP + nbTN + nbFN
    stopifnot(length(preds) == nbTotal)
    stopifnot(length(obs) == nbTotal)
    (2 * nbTP) / (2 * nbTP + nbFN + nbFP)
}

In [None]:
# Perform random forest cross-validation on a given dataset
#   dataset       : data frame of features with a factor column `Coating` (the response);
#                   rows are assumed to be already shuffled
#   folds         : number of folds
#   nbSampTestSet : number of rows per fold; fold k uses rows (k-1)*nbSampTestSet+1 to k*nbSampTestSet
#   idImages      : not used (kept so that existing calls remain valid)
#   mtryRange     : values of mtry to evaluate
#   nbTrees       : number of trees per forest
# Returns a length(mtryRange) x folds matrix of macro F1 scores (rows named by mtry value).
crossValRF <- function(dataset, folds, nbSampTestSet, idImages, mtryRange, nbTrees = RF_NB_TREES){
    # Number of classes actually observed in the dataset
    nbClasses = length(unique(dataset$Coating))
    stopifnot(nbClasses >= 2)
    # Fitness matrix (F1 score between 0 and 1, with 1 optimal)
    fitMat = matrix(ncol = folds, nrow = length(mtryRange))
    rownames(fitMat) <- as.character(mtryRange)
    for (mtryId in seq_along(mtryRange)) {
        curMtry = mtryRange[mtryId]
        print(paste("Mtry", curMtry))
        for (fold in seq_len(folds)) {
            print(paste("Fold", fold))
            foldInd = ((fold-1)*nbSampTestSet+1):(fold*nbSampTestSet)
            testSet = dataset[foldInd,]
            trainSet = dataset[-foldInd,]

            # All classes are present in train and test sets
            # (table() on a factor also lists empty levels, so its length cannot be
            # used to detect a class missing from a fold)
            stopifnot(length(unique(testSet$Coating)) == nbClasses)
            stopifnot(length(unique(trainSet$Coating)) == nbClasses)

            rf = randomForest(Coating ~ ., data = trainSet, mtry = curMtry, ntree = nbTrees)

            # Out-of-bag error, averaged over classes
            print(paste("Mean class error", mean(rf$confusion[,"class.error"])))
            # Prediction on test set
            preds <- predict(rf, testSet)
            # Accuracy
            print(paste("Accuracy", mean(preds == testSet$Coating)))
            # Macro F1 score
            fitMat[mtryId, fold] = mean(sapply(levels(testSet$Coating), 
                        function(x) F1pred(preds, testSet$Coating, x)))
            flush.console()
        }
    }
    return(fitMat)
}

In [None]:
datasetCV = as.data.frame(icamDonorLT[idImages[1:(nbSampTestSet*5)],])
datasetCV$Coating = droplevels(LT$Coating[fieldToKeep][indIcamDonor][idImages[1:(nbSampTestSet*5)]])
CM = crossValRF(dataset = datasetCV, folds = 5, nbSampTestSet = nbSampTestSet, 
                idImages = idImages, mtryRange = c(20, 30, 40, 50, 60, 70, 80, 90))

In [None]:
# Macro F1 score per mtry value (rows) and fold (columns), and its average over folds
CM
rowMeans(CM)
# Best mtry: which.max() returns a single index (the first maximum, missing values
# ignored), so ties can no longer hand several mtry values to randomForest
optiMtry = as.numeric(rownames(CM)[which.max(rowMeans(CM))])

In [None]:
optiMtry
fullRF = randomForest(Coating ~ ., data = datasetCV, mtry = optiMtry, ntree = RF_NB_TREES, localImp = T)

In [None]:
preds <- predict(fullRF, validationSet)
# Accuracy
print(paste("Validation accuracy", mean(preds == validationSet$Coating)))
# Macro F1-score
mean(sapply(levels(validationSet$Coating), 
            function(x) F1pred(preds, validationSet$Coating, x)))

In [None]:
fullRF

### Output confusion matrix

In [None]:
confMat = data.frame(Observed = rep(levels(validationSet$Coating), each = length(levels(validationSet$Coating))),
                     Predicted = rep(levels(validationSet$Coating), length(levels(validationSet$Coating))),
                     Count = 0)
# Color: white on diagonal (for text readability)
confMat$Color = ifelse(confMat$Observed == confMat$Predicted, "#FFFFFF", "#333333")

In [None]:
# Fill the confusion matrix: each validation image increments the cell matching
# its observed and predicted classes.
# seq_len() makes the loop a no-op on an empty validation set (1:0 would iterate on 1 and 0)
for (i in seq_len(nrow(validationSet))){
    idMat = (confMat$Observed == validationSet$Coating[i]) & (confMat$Predicted == preds[i])
    confMat$Count[idMat] = confMat$Count[idMat] + 1
}

head(confMat)

In [None]:
totalDF = aggregate(confMat$Count, by = list(confMat$Observed), FUN = sum)
names(totalDF) <- c("Observed", "Count")
totalDF$Predicted = "Total"
totalDF$Color = "#FFFFFF"
confMat = rbind(confMat, totalDF)

In [None]:
f1DF = as.data.frame(round(sapply(levels(validationSet$Coating), 
            function(x) F1pred(preds, validationSet$Coating, x)), 2))
names(f1DF) <- "Count"
f1DF$Observed = rownames(f1DF)
f1DF$Predicted = "F1 score"
f1DF$Color = "#333333"
confMat = rbind(confMat, f1DF)

In [None]:
# Change coating labels
levels(confMat$Observed) = c("1µg/ml", "10µg/ml")
confMat$Observed = factor(confMat$Observed, 
                          levels = rev(levels(confMat$Observed)))
levels(confMat$Predicted) = c("1µg/ml", "10µg/ml", 'Total', 'F1 score') 

In [None]:
gp <- ggplot(confMat, aes(Predicted, Observed)) + geom_tile(aes(fill = Count)) +
    geom_text(aes(label = Count, color = Color), size = 8) + scale_color_identity() +
    guides(fill = F) + theme(axis.text = element_text(size = 24), axis.title = element_text(size = 24)) +
    geom_vline(xintercept = 2.5, color = "#FFFFFF", size = 1.5)
gp

In [None]:
if(!TEST_MODE){
    ggsave(filename = "Fig/LT_ARPC1B_CD3_concentration_prediction.pdf", plot = gp, width = 10)
}

### Combination of features by feature type and biological object

In [None]:
# One logical column per segmented object, one row per selected feature
catObjects = c("FilterNKCytoplasm", "FilterNKLFA", "ShrunkenCytoplasm", "FilterNKNucleus", "FilterNKPerfGranules")
dtCount = as.data.frame(sapply(catObjects, function(x) grepl(x, colnames(transformedLT))))
# Features matching no object or several objects are classified as "Other"
dtCount$Other = apply(dtCount, 1, function(x) sum(x) != 1)
# Reset their object columns so that every feature has exactly one category.
# Columns are addressed by name rather than by a hard-coded position of "Other".
dtCount[dtCount$Other, catObjects] <- F

In [None]:
# Object category of each selected feature (name of its TRUE column in dtCount)
ftCatObj = as.factor(apply(dtCount, 1, function(x) colnames(dtCount)[which(x)]))

# Rename to useful names
levels(ftCatObj) <- str_remove(string = levels(ftCatObj), pattern = "FilterNK")
# LFA and shrunken cytoplasm features are merged into the "Cytoplasm" category
levels(ftCatObj)[levels(ftCatObj) %in% c("LFA", "ShrunkenCytoplasm")] <- "Cytoplasm"
                           
# Order levels so that "Other" is last
# (setdiff also works when no feature is in "Other"; a negative index built from
# an empty which() would drop every level instead)
ftCatObj = factor(ftCatObj, levels = c(setdiff(levels(ftCatObj), "Other"), "Other"))
     
# Combination of feature category and biological object described
combFtCat = paste(ftCat, ftCatObj, sep = " / ")

In [None]:
# Per feature group: summed importance ("Count") and number of features
# ("Cardinality"), using the MeanDecreaseAccuracy column of the forest
combImpDF <- cbind(
    aggregate(fullRF$importance[, "MeanDecreaseAccuracy"],
              by = list(combFtCat), FUN = sum),
    aggregate(fullRF$importance[, "MeanDecreaseAccuracy"],
              by = list(combFtCat), FUN = length)[, 2],
    "Cumulated"
)
colnames(combImpDF) <- c("Type", "Count", "Cardinality", "CountType")
# Order the groups by increasing summed importance
combImpDF$Type <- factor(combImpDF$Type, levels = combImpDF$Type[order(combImpDF$Count)])

In [None]:
# Add the importance averaged over the features of each group

# Visual correction: the average is multiplied by this factor, so a group with
# this many features gets cumulated and average bars of equal length.
# Values are negated so that the average bars sit on the negative side.
visNormFactor <- 5
relDF <- data.frame(
    Type = combImpDF$Type,
    Count = -visNormFactor * combImpDF$Count / combImpDF$Cardinality,
    Cardinality = NA,
    CountType = "Average"
)
combImpDF <- rbind(combImpDF, relDF)

In [None]:
# Horizontal bars per feature group: cumulated importance on the positive side,
# scaled average importance on the negative side
gp <- ggplot(combImpDF) +
    geom_col(aes(x = Type, y = Count, fill = Cardinality)) +
    labs(x = "Feature type / object", y = "Importance") +
    coord_flip() +
    guides(fill = FALSE) +
    # Number of features of the group, written just past the bar end
    geom_text(aes(x = Type, y = Count + 0.002, label = Cardinality), size = 4) +
    # Negative tick labels are divided by visNormFactor to undo the scaling
    scale_y_continuous(
        breaks = seq(-0.05, 0.07, 0.01),
        labels = c(abs(seq(-0.05, -0.001, 0.01) / visNormFactor),
                   seq(0, 0.07, 0.01))
    ) +
    geom_hline(yintercept = 0, color = "#333333") +
    annotate("text", x = 1, y = 0.05, label = "Cumulated") +
    annotate("text", x = 1, y = -0.025, label = "Average")
gp

In [None]:
# Export the figure unless running in test mode
if (!TEST_MODE) {
    ggsave(
        filename = "Fig/LT_ARPC1B_CD3_concentration_prediction_features.pdf",
        plot = gp,
        height = 5,
        width = 10
    )
}

## Prediction of ARPC1B status based on morphology

5-fold cross-validation (5CV) plus one held-out fold used as validation set

In [None]:
# Fix the random seed so that the image shuffling below is reproducible
set.seed(38)

In [None]:
# Keep only the rows whose coating is "Icam 10"
indIcam10 <- LT$Coating[fieldToKeep] == "Icam 10"
icamLT <- transformedLT[indIcam10, ]

In [None]:
# Number of images per fold: the data are split in 6 parts
# (5 cross-validation folds + 1 validation set)
nbSampTestSet <- round(nrow(icamLT) / 6)

In [None]:
# Random permutation of the image (row) indices
idImages <- sample(nrow(icamLT))

# The first 5 * nbSampTestSet shuffled images are reserved for cross-validation;
# every remaining image goes to the validation set
validationSet <- as.data.frame(icamLT[-idImages[1:(nbSampTestSet * 5)], ])
# Class to predict, taken from the Patient column of the same images
validationSet$ND <- as.factor(
    LT$Patient[fieldToKeep][indIcam10][-idImages[1:(nbSampTestSet * 5)]]
)

# Class balance of the validation set
table(validationSet$ND)

In [None]:
# F1 score of one class (one-vs-rest)
#
# preds:  predicted classes
# obs:    observed classes, same length as preds
# curLvl: class for which the score is computed
#
# Returns 2*TP / (2*TP + FN + FP); this is NaN when the class is neither
# predicted nor observed (0 / 0).
F1pred <- function(preds, obs, curLvl){
    predIsLvl <- preds == curLvl
    obsIsLvl <- obs == curLvl

    # Cells of the one-vs-rest 2x2 contingency table
    nTruePos <- sum(predIsLvl & obsIsLvl)
    nFalsePos <- sum(predIsLvl & !obsIsLvl)
    nTrueNeg <- sum(!predIsLvl & !obsIsLvl)
    nFalseNeg <- sum(!predIsLvl & obsIsLvl)

    # Sanity check: the four cells must account for every sample
    nCounted <- nTruePos + nFalsePos + nTrueNeg + nFalseNeg
    stopifnot(length(preds) == nCounted)
    stopifnot(length(obs) == nCounted)

    (2 * nTruePos) / (2 * nTruePos + nFalseNeg + nFalsePos)
}

In [None]:
# Perform random forest cross-validation on a given dataset
#
# For each candidate mtry and each fold, a forest is trained on the rows outside
# the fold and scored on the rows of the fold.
#
# dataset:       data frame with the predictors and a factor column ND (class)
# folds:         number of folds
# nbSampTestSet: number of rows per fold; fold k holds rows
#                ((k-1)*nbSampTestSet+1):(k*nbSampTestSet) of dataset
# idImages:      not used by the function; kept so existing calls still work
# mtryRange:     candidate values for the mtry parameter of randomForest
# nbTrees:       number of trees per forest
#
# Returns a length(mtryRange) x folds matrix of macro F1 scores, with row names
# set to the mtry values.
crossValRF <- function(dataset, folds, nbSampTestSet, idImages, mtryRange, nbTrees = RF_NB_TREES){
    # Fitness matrix (F1 score between 0 and 1, with 1 optimal)
    fitMat = matrix(ncol = folds, nrow = length(mtryRange))
    rownames(fitMat) <- as.character(mtryRange)
    for (mtryId in seq_along(mtryRange)) {
        curMtry = mtryRange[mtryId]
        print(paste("Mtry", curMtry))
        for (fold in seq_len(folds)) {
            print(paste("Fold", fold))
            foldInd = ((fold-1)*nbSampTestSet+1):(fold*nbSampTestSet)
            testSet = dataset[foldInd,]
            trainSet = dataset[-foldInd,]

            # All classes are present in train and test sets.
            # table() on a factor also lists empty levels, so the counts
            # themselves are tested rather than the length of the table.
            stopifnot(all(table(testSet$ND) > 0))
            stopifnot(all(table(trainSet$ND) > 0))

            rf = randomForest(ND ~ ., data = trainSet, mtry = curMtry, ntree = nbTrees)

            # The error column is selected by name: its position depends on the
            # number of classes (with 3 classes, column 3 holds class counts)
            print(paste("Mean class error", mean(rf$confusion[, "class.error"])))
            # Prediction on test set
            preds <- predict(rf, testSet)
            # Accuracy
            print(paste("Accuracy", mean(preds == testSet$ND)))
            # Macro F1 score
            fitMat[mtryId, fold] = mean(sapply(levels(testSet$ND), 
                        function(x) F1pred(preds, testSet$ND, x)))
            flush.console()
        }
    }
    return(fitMat)
}

In [None]:
# Cross-validation data: the first 5 * nbSampTestSet shuffled images
datasetCV <- as.data.frame(icamLT[idImages[1:(nbSampTestSet * 5)], ])
datasetCV$ND <- as.factor(
    LT$Patient[fieldToKeep][indIcam10][idImages[1:(nbSampTestSet * 5)]]
)
# Macro F1 score for each candidate mtry (rows) and fold (columns)
CM <- crossValRF(
    dataset = datasetCV,
    folds = 5,
    nbSampTestSet = nbSampTestSet,
    idImages = idImages,
    mtryRange = seq(20, 90, by = 10)
)

In [None]:
# Scores per mtry and fold, then averaged over folds
CM
rowMeans(CM)
# First mtry value reaching the best mean score
optiMtry <- as.numeric(rownames(CM)[rowMeans(CM) == max(rowMeans(CM))][1])

In [None]:
optiMtry
# Final forest, trained on the whole cross-validation set with the selected mtry
fullRF <- randomForest(
    ND ~ .,
    data = datasetCV,
    mtry = optiMtry,
    ntree = RF_NB_TREES,
    localImp = TRUE
)

In [None]:
# Predictions of the final forest on the held-out validation set
preds <- predict(fullRF, validationSet)
# Accuracy
print(paste("Validation accuracy", mean(preds == validationSet$ND)))
# Macro F1-score: unweighted mean of the per-class F1 scores
mean(vapply(
    levels(validationSet$ND),
    function(lvl) F1pred(preds, validationSet$ND, lvl),
    numeric(1)
))

In [None]:
# Display the fitted forest
fullRF

### Output confusion matrix

In [None]:
# One row per (Observed, Predicted) pair of classes, counts initialised to zero
confMat <- data.frame(
    Observed = rep(levels(validationSet$ND), each = nlevels(validationSet$ND)),
    Predicted = rep(levels(validationSet$ND), times = nlevels(validationSet$ND)),
    Count = 0
)
# Color: white on diagonal (for text readability)
confMat$Color <- ifelse(confMat$Observed == confMat$Predicted, "#FFFFFF", "#333333")

In [None]:
# Fill the confusion matrix: each validation image adds one to the cell matching
# its observed class and its predicted class.
# seq_len() is used instead of 1:nrow(): with an empty validation set, 1:0
# would iterate over c(1, 0) instead of skipping the loop.
for (i in seq_len(nrow(validationSet))){
    idMat = (confMat$Observed == validationSet$ND[i]) & (confMat$Predicted == preds[i])
    confMat$Count[idMat] = confMat$Count[idMat] + 1
}

head(confMat)

In [None]:
# Extra "Total" column: number of validation images per observed class
totalDF <- setNames(
    aggregate(confMat$Count, by = list(confMat$Observed), FUN = sum),
    c("Observed", "Count")
)
totalDF$Predicted <- "Total"
totalDF$Color <- "#FFFFFF"
confMat <- rbind(confMat, totalDF)

In [None]:
# Extra "F1 score" column: per-class F1 score rounded to 2 decimals
# (stored in Count so that it is printed like the other cells)
f1DF <- as.data.frame(round(vapply(
    levels(validationSet$ND),
    function(lvl) F1pred(preds, validationSet$ND, lvl),
    numeric(1)
), 2))
colnames(f1DF) <- "Count"
# Row names carry the class names
f1DF$Observed <- rownames(f1DF)
f1DF$Predicted <- "F1 score"
f1DF$Color <- "#333333"
confMat <- rbind(confMat, f1DF)

In [None]:
# Change patient status labels
# The labels are stored once, in the original level order, so that Observed and
# Predicted are relabelled identically. Reading levels(confMat$Observed) after
# it has been reversed would swap the first and last classes on the Predicted axis.
statusLabels = c("Normal donor", "Patient 1", "Patient 2")
levels(confMat$Observed) = statusLabels
# Reverse the order of the observed classes (y axis of the plot)
confMat$Observed = factor(confMat$Observed, 
                          levels = rev(statusLabels))
levels(confMat$Predicted) = c(statusLabels, 'Total', 'F1 score')

In [None]:
# Confusion matrix as a heat map: one tile per (Predicted, Observed) pair,
# with the value written on the tile in the colour stored in `Color`
gp <- ggplot(confMat, aes(x = Predicted, y = Observed)) +
    geom_tile(aes(fill = Count)) +
    geom_text(aes(label = Count, color = Color), size = 8) +
    scale_color_identity() +
    guides(fill = FALSE) +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        axis.text = element_text(size = 24),
        axis.title = element_text(size = 24)
    ) +
    # White vertical line drawn between the 3rd and 4th Predicted columns
    geom_vline(xintercept = 3.5, color = "#FFFFFF", size = 1.5)
gp

In [None]:
# Export the figure unless running in test mode
if (!TEST_MODE) {
    ggsave(
        filename = "Fig/LT_ARPC1B_status_prediction.pdf",
        plot = gp,
        width = 10
    )
}

### Combination of features by feature type and biological object

#### Features important for overall prediction

In [None]:
# Patterns searched in the feature names, one per object
catObjects <- c("FilterNKCytoplasm", "FilterNKLFA", "ShrunkenCytoplasm", "FilterNKNucleus", "FilterNKPerfGranules")
# One logical column per pattern: TRUE when the feature name matches it
dtCount <- as.data.frame(sapply(catObjects, function(objPattern) {
    grepl(objPattern, colnames(transformedLT))
}))
# Features matching no object or several objects are classified as "Other"
dtCount$Other <- rowSums(dtCount) != 1
# For "Other" features, clear the per-object flags so each row has a single TRUE
dtCount[dtCount$Other, catObjects] <- FALSE

In [None]:
# Object category of each feature: name of the dtCount column flagged TRUE
# (each row of dtCount is expected to hold exactly one TRUE)
ftCatObj = as.factor(apply(dtCount, 1, function(x) colnames(dtCount)[which(x)]))

# Rename to useful names
levels(ftCatObj) <- str_remove(string = levels(ftCatObj), pattern = "FilterNK")
# "LFA" and "ShrunkenCytoplasm" are merged into the "Cytoplasm" level
levels(ftCatObj)[levels(ftCatObj) %in% c("LFA", "ShrunkenCytoplasm")] <- "Cytoplasm"

# Order levels so that "Other" is last.
# setdiff() is used instead of negative indexing with which(): when no feature
# is "Other", which() returns integer(0) and levels[-integer(0)] would drop
# every level, turning all categories into NA.
lvNotOther = setdiff(levels(ftCatObj), "Other")
ftCatObj = factor(ftCatObj, levels = c(lvNotOther, intersect(levels(ftCatObj), "Other")))

# Combination of feature category and biological object described
combFtCat = paste(ftCat, ftCatObj, sep = " / ")

In [None]:
# Per feature group: summed importance ("Count") and number of features
# ("Cardinality"), using the MeanDecreaseAccuracy column of the forest
combImpDF <- cbind(
    aggregate(fullRF$importance[, "MeanDecreaseAccuracy"],
              by = list(combFtCat), FUN = sum),
    aggregate(fullRF$importance[, "MeanDecreaseAccuracy"],
              by = list(combFtCat), FUN = length)[, 2],
    "Cumulated"
)
colnames(combImpDF) <- c("Type", "Count", "Cardinality", "CountType")
# Order the groups by increasing summed importance
combImpDF$Type <- factor(combImpDF$Type, levels = combImpDF$Type[order(combImpDF$Count)])

In [None]:
# Add the importance averaged over the features of each group

# Visual correction: the average is multiplied by this factor, so a group with
# this many features gets cumulated and average bars of equal length.
# Values are negated so that the average bars sit on the negative side.
visNormFactor <- 20
relDF <- data.frame(
    Type = combImpDF$Type,
    Count = -visNormFactor * combImpDF$Count / combImpDF$Cardinality,
    Cardinality = NA,
    CountType = "Average"
)
combImpDF <- rbind(combImpDF, relDF)

In [None]:
# Horizontal bars per feature group: cumulated importance on the positive side,
# scaled average importance on the negative side
gp <- ggplot(combImpDF) +
    geom_col(aes(x = Type, y = Count, fill = Cardinality)) +
    labs(x = "Feature type / object", y = "Importance") +
    coord_flip() +
    guides(fill = FALSE) +
    # Number of features of the group, written just past the bar end
    geom_text(aes(x = Type, y = Count + 0.003, label = Cardinality), size = 4) +
    # Negative tick labels are divided by visNormFactor to undo the scaling
    scale_y_continuous(
        breaks = seq(-0.5, 0.5, 0.05),
        labels = c(abs(seq(-0.5, -0.001, 0.05) / visNormFactor),
                   seq(0, 0.5, 0.05))
    ) +
    geom_hline(yintercept = 0, color = "#333333") +
    annotate("text", x = 1, y = 0.1, label = "Cumulated") +
    annotate("text", x = 1, y = -0.05, label = "Average")
gp

In [None]:
# Export the figure unless running in test mode
if (!TEST_MODE) {
    ggsave(
        filename = "Fig/LT_ARPC1B_status_prediction_features.pdf",
        plot = gp,
        height = 5,
        width = 10
    )
}

#### Features important for normal donor prediction
By contrast, this means that these features are important for describing the healthy phenotype

In [None]:
# Per feature group: summed importance ("Count") and number of features
# ("Cardinality"), using the "ND" column of the forest importance matrix
combImpDF <- cbind(
    aggregate(fullRF$importance[, "ND"],
              by = list(combFtCat), FUN = sum),
    aggregate(fullRF$importance[, "ND"],
              by = list(combFtCat), FUN = length)[, 2],
    "Cumulated"
)
colnames(combImpDF) <- c("Type", "Count", "Cardinality", "CountType")
# Order the groups by increasing summed importance
combImpDF$Type <- factor(combImpDF$Type, levels = combImpDF$Type[order(combImpDF$Count)])

In [None]:
# Add the importance averaged over the features of each group

# Visual correction: the average is multiplied by this factor, so a group with
# this many features gets cumulated and average bars of equal length.
# Values are negated so that the average bars sit on the negative side.
visNormFactor <- 20
relDF <- data.frame(
    Type = combImpDF$Type,
    Count = -visNormFactor * combImpDF$Count / combImpDF$Cardinality,
    Cardinality = NA,
    CountType = "Average"
)
combImpDF <- rbind(combImpDF, relDF)

In [None]:
# Horizontal bars per feature group: cumulated importance on the positive side,
# scaled average importance on the negative side
gp <- ggplot(combImpDF) +
    geom_col(aes(x = Type, y = Count, fill = Cardinality)) +
    labs(x = "Feature type / object", y = "Importance") +
    coord_flip() +
    guides(fill = FALSE) +
    # Number of features of the group, written just past the bar end
    geom_text(aes(x = Type, y = Count + 0.003, label = Cardinality), size = 4) +
    # Negative tick labels are divided by visNormFactor to undo the scaling
    scale_y_continuous(
        breaks = seq(-0.5, 0.5, 0.05),
        labels = c(abs(seq(-0.5, -0.001, 0.05) / visNormFactor),
                   seq(0, 0.5, 0.05))
    ) +
    geom_hline(yintercept = 0, color = "#333333") +
    annotate("text", x = 1, y = 0.1, label = "Cumulated") +
    annotate("text", x = 1, y = -0.05, label = "Average")
gp

In [None]:
# Export the figure unless running in test mode.
# The file name previously contained a stray "}" ("normalDonors_}features").
if(!TEST_MODE){
    ggsave(filename = "Fig/LT_ARPC1B_status_prediction_normalDonors_features.pdf", plot = gp, height = 5, width = 10)
}

In [None]:
# Print the R version and the attached packages, for reproducibility
sessionInfo()