# Check distributions of phenotypes

We will make qq plots and histograms of all traits before and after transformation, to assess violations of normality assumptions.

<div class="alert alert-block alert-info"> In this version of the notebook, the functions have been split up, cleaned, and finalized. We apply a Box-Cox transformation, but first provide options to threshold values, remove duplicates, and remove outliers. We also write summary statistics out to a table. </div>


## Load libraries and phenotype data

### Libraries

In [None]:
library(curl)
library(foreach)
library(data.table)
library(ggplot2)
library(ggpubr) # For combining  multiple ggplot objects into a figure
library(tools)
library(dplyr)
library(RNOmni)

### Prepare lists of files

In [None]:
# Gather every headerless phenotype file below pheno_files/ (recursive search),
# then drop any path containing one of the tags below (case-sensitive regex
# match anywhere in the path). These tags appear in the names of files this
# notebook itself writes, or in inputs that are not meant to be transformed.
# "PC" is deliberately NOT an exclusion tag: PC traits stay in files_in.
files_in <- local({
  found <- list.files("pheno_files",
                      pattern = "noheader.pheno",
                      full.names = TRUE,
                      recursive = TRUE)

  unwanted_tags <- c("trans", "Covariates", "copies", "boxcox",
                     "epicormic", "time", "binarized", "threshold")

  for (tag in unwanted_tags) {
    found <- found[!grepl(tag, found)]
  }

  found
})

# Subset of the retained files whose path contains "PC".
files_in_PC_only <- grep("PC", files_in, value = TRUE)

### Provide functions

#### Function for making plots

In [None]:
# Draw a normal Q-Q plot next to a density histogram for the trait column V3.
#
# Arguments:
#   this_file_in  table with the trait values in column V3
#   subtitle      text placed above the two panels
# Returns the combined ggpubr figure (Q-Q plot on the left, histogram on the
# right). Both panels are overlaid with the normal distribution that has the
# mean and standard deviation of the non-missing V3 values.
plot_histogram_qq <- function(this_file_in, subtitle){
  # Rows with any missing column are excluded from the plotted data, while the
  # reference mean/sd are taken from the non-missing V3 values.
  plotted_rows <- na.omit(this_file_in)
  observed <- na.omit(this_file_in$V3)
  trait_mean <- mean(observed)
  trait_sd <- sd(observed)

  qq_panel <- ggplot(plotted_rows, aes(sample = V3)) +
    stat_qq() +
    geom_abline(intercept = trait_mean, slope = trait_sd) +
    ylab("Trait values") +
    xlab("Theoretical quantiles")

  histogram_panel <- ggplot(data = plotted_rows) +
    geom_histogram(mapping = aes(x = V3, y = ..density..),
                   fill = "steelblue", colour = "black") +
    xlab("Trait values") +
    ylab("Density (PDF)") +
    stat_function(fun = dnorm,
                  args = list(mean = trait_mean, sd = trait_sd))

  annotate_figure(ggarrange(qq_panel, histogram_panel),
                  top = text_grob(subtitle, color = "darkblue"))
}

#### Functions for "pre-treatments" of duplicates/outlier removal, threshold-based filtering

In [None]:
# Blank out repeated trait values and record which observations were repeats.
#
# For PC phenotypes we want only unique values. Those with 0 for all root have
# the same very low value, which will skew distributions even after Box-Cox.
#
# Arguments:
#   this_file_in         table with the trait values in column V3 (third column)
#   phenotype_file_path  path of the ".noheader" phenotype file; used only to
#                        derive the output paths of the binarized files
# Side effects:
#   Writes a binarized copy of the table (V3 = 0 for a repeated value, 1 for a
#   first occurrence, NA for a missing value) without a header, and a second
#   copy with a header row.
# Returns a list with:
#   this_file_in     the input with every repeated V3 value replaced by NA
#   stats_to_append  named vector: n_duplicates, pval_SW_sans_duplicates,
#                    Pearson_CC_sans_duplicates, duplicate_binarized_out_path
remove_duplicates <- function(this_file_in, phenotype_file_path){
  is_missing <- is.na(this_file_in$V3)
  is_repeat <- duplicated(this_file_in$V3)

  # Prepare a file that says which are duplicates (0) and which are not (1)
  this_file_in_binarized <- this_file_in
  this_file_in_binarized[which(is_repeat), 3] <- 0
  this_file_in_binarized[which(!is_repeat), 3] <- 1
  this_file_in_binarized[which(is_missing), 3] <- NA

  binarized_out_path <- gsub("\\.noheader", "_duplicates_binarized\\.binary.noheader", phenotype_file_path)

  # Write out before omitting NAs so we keep all genotypes, in desired order.
  fwrite(this_file_in_binarized, binarized_out_path,
         col.names = FALSE, row.names = FALSE,
         sep="\t", quote = FALSE, na="NA")

  # Write version with header
  binarized_out_path_wheader <- gsub("noheader", "header", binarized_out_path)
  fwrite(this_file_in_binarized, binarized_out_path_wheader,
         col.names = TRUE, row.names = FALSE,
         sep="\t", quote = FALSE, na="NA")

  # Count the repeats BEFORE blanking them: once they are NA the remaining
  # values are all unique and the count would always be 0. Repeated NAs are
  # missing data, not duplicated measurements, so they are not counted.
  n_duplicates <- sum(is_repeat & !is_missing)

  # Remove duplicates and calculate statistics afterwards
  this_file_in$V3[is_repeat] <- NA

  print("Finding S-W test for trait after duplicates dropped is...")
  print(paste0("Sample size is: ", length(na.omit(this_file_in$V3))))
  pSW_sans_duplicates <- shapiro.test(this_file_in$V3)$p.value
  Pearson_CC_sans_duplicates <- find_pearson_cc(this_file_in)

  stats_to_append <- c(n_duplicates, pSW_sans_duplicates, Pearson_CC_sans_duplicates, binarized_out_path)
  names(stats_to_append) <- c("n_duplicates", "pval_SW_sans_duplicates", "Pearson_CC_sans_duplicates", "duplicate_binarized_out_path")

  list("this_file_in" = this_file_in,
       "stats_to_append" = stats_to_append)
}

In [None]:
# Locate the "elbow" of the trait's density curve: the x position at which the
# numerically estimated second derivative of the kernel density is largest.
#
# Arguments:
#   file_in  table with the trait values in column V3 (NAs are ignored)
# Returns a single numeric x value taken from the density grid.
find_elbow <- function(file_in){
    trait_density <- density(na.omit(file_in$V3))

    # Finite-difference first and second derivatives along the density grid.
    deriv <- diff(trait_density$y) / diff(trait_density$x)
    deriv2 <- diff(deriv) / diff(trait_density$x[-1])

    # which.max() returns exactly one index (the first maximum), so the result
    # is always a scalar even if several grid points tie for the maximum.
    # Need +2 because the second derivative is computed only for points other
    # than the first two points.
    trait_density$x[which.max(deriv2) + 2]
}

In [None]:
# Split a trait at a threshold: values below it are blanked in the continuous
# trait, and a binarized companion trait records which side each value was on.
#
# Arguments:
#   this_file_in         table with the trait values in column V3
#   threshold            numeric cut-off; values < threshold count as "below"
#   phenotype_file_path  path of the ".noheader" phenotype file; used only to
#                        derive the output paths of the binarized files
# Side effects:
#   Writes the binarized table (V3 = 0 below the threshold, 1 at or above it,
#   missing values left as NA) without a header, and a second copy with a
#   header row.
# Returns a list with:
#   this_file_in     the input with every V3 value below the threshold set to NA
#   stats_to_append  named vector: threshold, n_values_below_threshold,
#                    pSW_thresholded, PearsonCC_thresholded, binarized_out_path
threshold <- function(this_file_in, threshold, phenotype_file_path){
    # Both index sets are taken from the untouched input values.
    below <- which(this_file_in$V3 < threshold)
    at_or_above <- which(this_file_in$V3 >= threshold)

    n_values_below_threshold <- length(below)

    this_file_in_binarized <- this_file_in
    this_file_in_binarized$V3[below] <- 0
    this_file_in_binarized$V3[at_or_above] <- 1

    this_file_in$V3[below] <- NA # so we can achieve normality...
    # these ones won't be included in our transformed file, only the binarized one.

    pSW_thresholded <- shapiro.test(this_file_in$V3)$p.value
    PearsonCC_thresholded <- find_pearson_cc(this_file_in)

    binarized_out_path <- gsub("\\.noheader",
                               paste0("_threshold", threshold, "\\.binary.noheader"),
                               phenotype_file_path)

    fwrite(this_file_in_binarized, binarized_out_path,
           col.names = FALSE, row.names = FALSE,
           sep="\t", quote = FALSE, na="NA")

    # Write version with header. col.names must be TRUE here, otherwise the
    # "header" file is just a second headerless copy.
    binarized_out_path_wheader <- gsub("noheader", "header", binarized_out_path)

    fwrite(this_file_in_binarized, binarized_out_path_wheader,
           col.names = TRUE, row.names = FALSE,
           sep="\t", quote = FALSE, na="NA")

    stats_to_append <- c(threshold, n_values_below_threshold,
                         pSW_thresholded, PearsonCC_thresholded, binarized_out_path)
    names(stats_to_append) <- c("threshold", "n_values_below_threshold",
                                "pSW_thresholded", "PearsonCC_thresholded", "binarized_out_path")

    list("this_file_in" = this_file_in,
         "stats_to_append" = stats_to_append)
}

In [None]:
# Blank out outliers of the trait column V3 using the 1.5 * IQR rule.
#
# Arguments:
#   this_file_in  table with the trait values in column V3
# Returns a list with:
#   this_file_in     the input with every V3 value above Q3 + 1.5 * IQR or
#                    below Q1 - 1.5 * IQR replaced by NA
#   stats_to_append  named vector: n_outliers, pval_SW_sans_outliers,
#                    PearsonCC_sans_outliers
remove_outliers <- function(this_file_in){
  quartiles <- quantile(this_file_in$V3, probs = c(.25, .75), na.rm = TRUE)
  fence_width <- 1.5 * IQR(this_file_in$V3, na.rm = TRUE)

  upper_fence <- quartiles[2] + fence_width
  lower_fence <- quartiles[1] - fence_width

  # Positions of the values falling outside either fence (NAs never match).
  too_high <- which(this_file_in$V3 > upper_fence)
  too_low <- which(this_file_in$V3 < lower_fence)

  # How many outliers are there? Let's keep track of this
  n_outliers <- length(too_high) + length(too_low)

  this_file_in$V3[too_high] <- NA
  this_file_in$V3[too_low] <- NA

  print("Finding S-W test for trait after duplicates dropped and outliers removed is...")
  print(paste0("Sample size is: ", length(na.omit(this_file_in$V3))))

  pval_SW_sans_outliers <- shapiro.test(this_file_in$V3)$p.value
  PearsonCC_sans_outliers <- find_pearson_cc(this_file_in)

  stats_to_append <- c("n_outliers" = n_outliers,
                       "pval_SW_sans_outliers" = pval_SW_sans_outliers,
                       "PearsonCC_sans_outliers" = PearsonCC_sans_outliers)

  list("this_file_in" = this_file_in,
       "stats_to_append" = stats_to_append)
}

#### Function to produce Pearson CC of trait against normal distribution

In [None]:
# Pearson correlation between the trait's sample quantiles and the matching
# theoretical normal quantiles (the two axes of a normal Q-Q plot).
#
# Arguments:
#   this_file_in  table with the trait values in column V3 (NAs are dropped)
# Returns a single numeric correlation coefficient.
find_pearson_cc <- function(this_file_in){
  # Go through character first, then coerce to numeric.
  observed <- na.omit(this_file_in$V3)
  observed <- as.numeric(as.character(observed))

  qq_points <- qqnorm(observed, plot.it = FALSE)

  theoretical_q <- na.omit(qq_points$x)
  sample_q <- na.omit(qq_points$y)

  cor(theoretical_q, sample_q)
}

#### Function for Box-Cox transformation, with option for scaling to bring minimum value > 0

In [None]:
# Box-Cox transform the trait column V3, first making sure every value that
# enters the transformation is strictly positive.
#
# Arguments:
#   this_file_in         table with the trait values in column V3
#   phenotype_file_path  path of the phenotype file; only matched against
#                        exclude_negative_if
#   exclude_negative_if  regular expression; when the path matches it, negative
#                        values are discarded (set to NA) instead of shifted
# Returns a list with:
#   this_file_in  the input with V3 replaced by its transformed values
#   lambda        rounded power estimated by car::powerTransform()
#   alpha         constant added to V3 before transforming (0 if none)
boxcox <- function(this_file_in, phenotype_file_path, exclude_negative_if = "growth"){
  minimum <- min(na.omit(this_file_in$V3))
  print(paste("Minimum for this phenotype is", minimum))

  # alpha stays 0 unless the trait has to be shifted above zero.
  alpha <- 0

  if (minimum < 0){
    if (grepl(exclude_negative_if, phenotype_file_path)){
      # For growth phenotypes, we will simply discard observations with negative "growth"
      this_file_in$V3[which(this_file_in$V3 < 0)] <- NA
    } else {
      # Shift by a constant so that the smallest value sits just above zero.
      alpha <- (-1 * minimum) + 1e-8
      this_file_in$V3 <- this_file_in$V3 + alpha
      print(paste("Minimum after scaling: ", min(na.omit(this_file_in$V3))))
    }
  }

  # Since we wish to write this out later, and want to keep all genotypes in
  # order, exact zeros become NA rather than having their rows dropped.
  this_file_in$V3[which(this_file_in$V3 == 0)] <- NA

  lambda <- car::powerTransform(this_file_in$V3, family = "bcPower")$roundlam

  print(paste("Lambda is: ", lambda))

  # Box-Cox: log(y) when lambda is 0, (y^lambda - 1) / lambda otherwise.
  this_file_in$V3 <- if (lambda == 0) {
    log(this_file_in$V3)
  } else {
    ((this_file_in$V3 ^ lambda) - 1) / lambda
  }

  list("this_file_in" = this_file_in,
       "lambda" = lambda,
       "alpha" = alpha)
}

#### Function with workflow for pre-treatments

In [None]:
# Run the optional pre-treatments on a trait, in this order: thresholding,
# duplicate removal, outlier removal. Each step (applied or skipped) appends a
# tag to the output path, a line to the plot subtitle, and its statistics (or
# placeholder strings) to the statistics vector.
#
# Arguments:
#   this_file_in  table with the trait values in column V3
#   trait_path    path of the ".noheader" phenotype file; the output path is
#                 derived from it
#   duplicate_rm  TRUE to blank out repeated values via remove_duplicates()
#   outlier_rm    TRUE to blank out outliers via remove_outliers()
#   threshold     TRUE to blank out values below the density elbow found by
#                 find_elbow(), via the threshold() function
# Returns a list with:
#   this_file_in     the pre-treated table
#   out_path         trait_path with one tag per pre-treatment step inserted
#                    before ".noheader"
#   subtitle_pt2     multi-line description of the steps taken, for plots
#   stats_to_append  named character vector of raw and per-step statistics
preprocess_trait <- function(this_file_in, trait_path,
                             duplicate_rm = FALSE, outlier_rm = FALSE, threshold = FALSE){
  pSW_raw <- shapiro.test(this_file_in$V3)$p.value
  pearson_cc_raw <- find_pearson_cc(this_file_in)
  n_non_NA <- nrow(na.omit(this_file_in))
  # Explicit row subsetting (note the trailing comma inside the brackets), so
  # rows are selected whether the table is a data.table or a plain data.frame.
  n_gt_0 <- nrow(na.omit(this_file_in[which(this_file_in$V3 > 0), ]))
  stats <- c(trait_path, pSW_raw, pearson_cc_raw, n_non_NA, n_gt_0)
  names(stats) <- c("trait_raw", "pval_SW_raw",
                    "PearsonCC_raw",
                    "n_non_NA", "n_gt_0")

  # --- Step 1: thresholding -------------------------------------------------
  if (threshold == TRUE){
      elbow <- find_elbow(this_file_in)
      # The logical argument `threshold` does not mask the function of the
      # same name here: R skips non-function bindings when resolving a call.
      out <- threshold(this_file_in,
                       threshold = elbow,
                       phenotype_file_path = trait_path)
      this_file_in <- out$this_file_in
      stats_to_append <- out$stats_to_append

      trait_path <- gsub("\\.noheader", paste0("_threshold",
                                               elbow,
                                               "_\\.noheader"), trait_path)

      subtitle_pt2 <- paste0("\nRemoved ", stats_to_append["n_values_below_threshold"],
                             " values below threshold ", round(elbow, digits = 3))
  } else {
      stats_to_append <- rep("no_threshold", 5)
      names(stats_to_append) <- c("threshold", "n_values_below_threshold",
                                  "pval_SW_thresholded", "PearsonCC_thresholded",
                                  "binarized_out_path")

      trait_path <- gsub("\\.noheader", "_nothreshold\\.noheader", trait_path)

      subtitle_pt2 <- "\nNo thresholding"
  }

  stats <- c(stats, stats_to_append)

  # --- Step 2: duplicate removal --------------------------------------------
  if (duplicate_rm == TRUE){
      out <- remove_duplicates(this_file_in, phenotype_file_path = trait_path)

      this_file_in <- out$this_file_in
      stats_to_append <- out$stats_to_append

      trait_path <- gsub("\\.noheader", "_unique\\.noheader", trait_path)

      subtitle_pt3 <- paste0("\n", stats_to_append["n_duplicates"], " duplicates removed")
  } else {
      stats_to_append <- rep("dup_not_removed", 4)
      names(stats_to_append) <- c("n_duplicates", "pval_SW_sans_duplicates", "PearsonCC_sans_duplicates", "duplicate_binarized_out_path")

      trait_path <- gsub("\\.noheader", "_nodupfilter\\.noheader", trait_path)

      subtitle_pt3 <- "\nDuplicates not removed"
  }

  stats <- c(stats, stats_to_append)

  # --- Step 3: outlier removal ----------------------------------------------
  # Remove outliers by IQR method (https://www.r-bloggers.com/2020/01/how-to-remove-outliers-in-r/)
  if (outlier_rm == TRUE){
      out <- remove_outliers(this_file_in)

      this_file_in <- out$this_file_in
      stats_to_append <- out$stats_to_append

      trait_path <- gsub("\\.noheader", "_rmoutliers_boxcox\\.noheader", trait_path)

      subtitle_pt4 <- paste0("\n", stats_to_append["n_outliers"], " outliers removed")
  } else {
      stats_to_append <- rep("outliers_not_removed", 3)
      names(stats_to_append) <- c("n_outliers", "pval_SW_sans_outliers", "PearsonCC_sans_outliers")

      trait_path <- gsub("\\.noheader", "_keepoutliers_boxcox\\.noheader", trait_path)

      subtitle_pt4 <- "\nOutliers not removed"
  }

  subtitle_pt2 <- paste0(subtitle_pt2, subtitle_pt3, subtitle_pt4)

  stats <- c(stats, stats_to_append)

  list("this_file_in" = this_file_in,
       "out_path" = trait_path,
       "subtitle_pt2" = subtitle_pt2,
       "stats_to_append" = stats)
}

#### Function with complete workflow encompassing pre-treatments and Box-Cox transformation

In [None]:
# Complete workflow for one phenotype file: read it, apply the requested
# pre-treatments, transform the trait, write the transformed phenotype (with
# and without header), plot before/after diagnostics, and collect statistics.
#
# Arguments:
#   phenotype_file_path  path of a headerless, three-column phenotype file
#                        (trait values in the third column, read as V3)
#   duplicate_rm, outlier_rm, threshold
#                        logical switches forwarded to preprocess_trait()
#   transformation       "boxcox" (default) or "rbinv" (rank-based inverse
#                        normal transformation via RNOmni::RankNorm)
# Side effects:
#   Writes the transformed table to the path returned by preprocess_trait()
#   (with "boxcox" replaced by "rbinv" for the rank-based transformation), a
#   copy with a header row, and a PNG figure under the same relative path with
#   "pheno_files" replaced by "pheno_plots". The figure is also displayed.
# Returns a named character vector: the statistics from preprocess_trait()
# followed by PearsonCC_before_transform, PearsonCC_after_transform,
# pSW_transformed, lambda, alpha, out_sansheader and plot.
transform_trait <- function(phenotype_file_path, duplicate_rm, outlier_rm,
                            threshold, transformation = "boxcox"){
  # Fail before any file is written instead of stopping late with
  # "object 'lambda' not found".
  if (!(length(transformation) == 1 && transformation %in% c("boxcox", "rbinv"))){
    stop("transformation must be either \"boxcox\" or \"rbinv\"", call. = FALSE)
  }

  print(paste("Reading file", basename(phenotype_file_path)))
  this_file_in <- fread(phenotype_file_path, header=FALSE)

  out <- preprocess_trait(this_file_in = this_file_in,
                          threshold = threshold,
                          duplicate_rm = duplicate_rm,
                          outlier_rm = outlier_rm,
                          trait_path = phenotype_file_path)
  this_file_in <- out$this_file_in
  out_path <- out$out_path
  subtitle_pt2 <- out$subtitle_pt2
  my_stats <- out$stats_to_append

  title <- basename(file_path_sans_ext(phenotype_file_path))
  title <- gsub(".noheader", "", title, fixed = TRUE)
  title <- gsub("trans", "", title)
  title <- paste("Phenotype file:", title, "\n",
                 "Number of genotypes: ", my_stats["n_non_NA"], "\n",
                 "Number of genotypes with trait value > 0: ", my_stats["n_gt_0"])

  pearson_cc_pre_boxcox <- find_pearson_cc(this_file_in)

  subtitle <- paste("Without transformation",
                    subtitle_pt2,
                    "\nPearson CC: ",
                    round(pearson_cc_pre_boxcox, digits=3))

  combo_plot_1 <- plot_histogram_qq(this_file_in = this_file_in,
                                    subtitle = subtitle)

  if(transformation == "boxcox"){
    out <- boxcox(this_file_in = this_file_in,
            phenotype_file_path = phenotype_file_path,
            exclude_negative_if = "growth")
    this_file_in <- out$this_file_in
    lambda <- out$lambda
    alpha <- out$alpha
  }
  if(transformation == "rbinv"){
      out_path <- gsub("boxcox", "rbinv", out_path)

      # Only complete rows are handed to RankNorm; every other row gets NA.
      # The result is written back in place so that all genotypes stay in
      # their original order: merging the transformed rows back in would
      # re-sort the table by its key columns.
      complete_rows <- stats::complete.cases(this_file_in)
      transformed <- rep(NA_real_, nrow(this_file_in))
      transformed[complete_rows] <- RNOmni::RankNorm(this_file_in$V3[complete_rows])
      this_file_in$V3 <- transformed

      lambda <- "no_box_cox"
      alpha <- "no_box_cox"
  }

  pSW_boxcox <- shapiro.test(this_file_in$V3)$p.value

  #Write out before omitting NAs so we keep all genotypes, in desired order.
  fwrite(this_file_in, out_path, col.names = FALSE, row.names = FALSE, sep="\t", quote = FALSE, na="NA")

  # Write version with header
  out_path_wheader <- gsub("noheader", "header", out_path)
  fwrite(this_file_in, out_path_wheader, col.names = TRUE, row.names = FALSE, sep="\t", quote = FALSE, na="NA")

  plot_path <- gsub(".header.pheno", ".png", out_path_wheader, fixed = TRUE)
  plot_path <- gsub("pheno_files", "pheno_plots", plot_path)

  # The transformed table is already on disk; from here on only complete rows
  # are needed, for the post-transformation statistics and plots.
  this_file_in <- na.omit(this_file_in)

  pearson_cc_post_boxcox <- find_pearson_cc(this_file_in)

  if(transformation == "boxcox"){
    subtitle <- paste0("After Box-Cox transformation \n",
                "(\u03BB = ", lambda, "; \u03B1 = ", alpha, ")\n",
                "Pearson CC: ", round(pearson_cc_post_boxcox, digits = 3))
  }
  if(transformation == "rbinv"){
    subtitle <- paste0("After rank-based inverse normal transformation\n",
                 "Pearson CC: ", round(pearson_cc_post_boxcox, digits = 3))
  }

  combo_plot_2 <- plot_histogram_qq(this_file_in = this_file_in,
                                    subtitle = subtitle)

  combo_plot_all <- ggarrange(combo_plot_1, combo_plot_2, nrow = 2)
  combo_plot_all <- annotate_figure(combo_plot_all, top = text_grob(title, face = "bold"))
  plot(combo_plot_all)

  # Make sure the mirrored plot directory exists so that the figure is not
  # lost after all the work above.
  dir.create(dirname(plot_path), recursive = TRUE, showWarnings = FALSE)
  ggsave(plot_path, combo_plot_all)

  stats_to_append <- c(pearson_cc_pre_boxcox, pearson_cc_post_boxcox,
                       pSW_boxcox,  lambda, alpha,
                       out_path, plot_path)
  names(stats_to_append) <- c("PearsonCC_before_transform", "PearsonCC_after_transform",
                              "pSW_transformed", "lambda", "alpha",
                              "out_sansheader", "plot")

  c(my_stats, stats_to_append)
}

## Run data

### Run all traits first without dropping duplicates or removing outliers

In [None]:
# Baseline: transform every trait with no pre-treatment at all. Each iteration
# yields one named statistics vector; .combine = rbind stacks them row-wise.
stat_table <- foreach(file = files_in, .combine = rbind) %do% {
  my_stats <- transform_trait(file,
                              duplicate_rm = FALSE,
                              outlier_rm = FALSE,
                              threshold = FALSE)
  my_stats
}

### Run all PC traits, while removing duplicates 

In [None]:
# PC traits only: blank out repeated values before transforming.
# One statistics row per file, stacked with rbind.
stat_table2 <- foreach(file = files_in_PC_only, .combine = rbind) %do% {
  my_stats <- transform_trait(file,
                              duplicate_rm = TRUE,
                              outlier_rm = FALSE,
                              threshold = FALSE)
  my_stats
}

### Run all PC traits, while removing duplicates and outliers

In [None]:
# PC traits only: blank out repeated values and IQR outliers before
# transforming. One statistics row per file, stacked with rbind.
stat_table3 <- foreach(file = files_in_PC_only, .combine = rbind) %do% {
  my_stats <- transform_trait(file,
                              duplicate_rm = TRUE,
                              outlier_rm = TRUE,
                              threshold = FALSE)
  my_stats
}

In [None]:
# Show the working directory: the relative paths used in this notebook
# ("pheno_files", "spreadsheets/...") are resolved against it.
getwd()

In [None]:
#dir.create("spreadsheets")

### Run all traits while removing duplicates

In [None]:
# All traits: blank out repeated values before transforming.
# One statistics row per file, stacked with rbind.
stat_table4 <- foreach(file = files_in, .combine = rbind) %do% {
  my_stats <- transform_trait(file,
                              duplicate_rm = TRUE,
                              outlier_rm = FALSE,
                              threshold = FALSE)
  my_stats
}

### Run all traits while removing duplicates and outliers

In [None]:
# All traits: blank out repeated values and IQR outliers before transforming.
# One statistics row per file, stacked with rbind.
stat_table5 <- foreach(file = files_in, .combine = rbind) %do% {
  my_stats <- transform_trait(file,
                              duplicate_rm = TRUE,
                              outlier_rm = TRUE,
                              threshold = FALSE)
  my_stats
}

### Threshold at position where the second derivative of the density curve is maximum

#### Functionalize

#### Build list of files we wish to employ the threshold approach for

In [None]:
# Phenotype files selected for the threshold approach (root and stem
# regeneration PC traits).
to_threshold <- c(
  "pheno_files/root/LRL_PC2.noheader.pheno",
  "pheno_files/root/root_area_PC2.noheader.pheno",
  "pheno_files/stem_regen/CallusShoot_PC2.noheader.pheno",
  "pheno_files/stem_regen/Shoot_PC2.noheader.pheno",
  "pheno_files/stem_regen/Shoot_PC3.noheader.pheno",
  "pheno_files/stem_regen/Shoot_PC4.noheader.pheno"
)

In [None]:
# Selected traits: threshold at the density elbow, then blank out repeated
# values and IQR outliers before transforming. One statistics row per file.
stat_table6 <- foreach(file = to_threshold, .combine = rbind) %do% {
  my_stats <- transform_trait(file,
                              duplicate_rm = TRUE,
                              outlier_rm = TRUE,
                              threshold = TRUE)
  my_stats
}

### Try running everything through rank-based inverse normal transformation

In [None]:
library(RNOmni)

In [None]:
# All traits: blank out repeated values, then apply the rank-based inverse
# normal transformation instead of Box-Cox. One statistics row per file.
stat_table7 <- foreach(file = files_in, .combine = rbind) %do% {
  my_stats <- transform_trait(file,
                              duplicate_rm = TRUE,
                              outlier_rm = FALSE,
                              threshold = FALSE,
                              transformation = "rbinv")
  my_stats
}

In [None]:
# Preview the first rows of the rank-based inverse normal statistics table.
head(stat_table7)

### Save spreadsheets of results

In [None]:
# Save one CSV of statistics per run configuration under spreadsheets/.
# Create the output directory first so the writes do not fail on a fresh
# checkout; showWarnings = FALSE makes this a no-op if it already exists.
dir.create("spreadsheets", showWarnings = FALSE)

fwrite(stat_table, "spreadsheets/all_traits_no_preprocessing_just_boxcox.csv")
fwrite(stat_table2, "spreadsheets/PC_traits_remove_duplicates_before_boxcox.csv")
fwrite(stat_table3, "spreadsheets/PC_traits_remove_duplicates_outliers_before_boxcox.csv")
fwrite(stat_table4, "spreadsheets/all_traits_remove_duplicates_before_boxcox.csv")
fwrite(stat_table5, "spreadsheets/all_traits_remove_duplicates_outliers_before_boxcox.csv")
# NOTE(review): "thresdhold" in the file name below looks like a typo; it is
# kept as-is in case anything downstream refers to the file by this name.
fwrite(stat_table6, "spreadsheets/select_traits_remove_duplicates_outliers_then_thresdhold_before_boxcox.csv")
fwrite(stat_table7, "spreadsheets/all_traits_remove_duplicates_before_rbinv.csv")