16S_analysis_ASVs_differential_adundance_script_v23.RMD

---
title: "Azteca-Cecropia 16S rRNA gene manuscript - ALDEx2"
subtitle: "Differential abundance analysis based on sequence table"
author: "Maximilian Nepel"
date: "last updated August 2023"
output:
  rmarkdown::html_document:
    toc: true
    toc_float: true
---

```{r, warning=FALSE, message=FALSE}
source("0_v01_common.R", chdir = TRUE) # runs 0_v01_common.R with the working directory temporarily set to that file's folder (chdir = TRUE); presumably defines RD_DIR, RESULTS_DIR, PLOTS_DIR and `%!in%` used further down - TODO confirm
```

# libraries
```{r libraries, include=F}
  library(knitr) # A General-Purpose Package for Dynamic Report Generation in R
  library(kableExtra) # Construct Complex Table with 'kable' and Pipe Syntax
  library(rmarkdown) # Dynamic Documents for R
  library(extrafont) # for extra figure fonts
  library(tidyverse) # for dplyr forcats ggplot2 readr tibble
  library(ggrepel) # Repulsive Text and Label Geoms for 'ggplot2'
  library(magrittr) # pipes
  library(scales) # Generic plot scaling methods
  library(svglite) # for svg files
  library(DESeq2) # Differential gene expression analysis
  library(phyloseq) # Handling and analysis of high-throughput phylogenetic sequence data
  library(ALDEx2) # Analysis Of Differential Abundance Taking Sample Variation Into Account
```

# loaded libraries
```{r}
# Report the version of every attached non-base package.
# sessionInfo() is queried once and reused instead of once per package.
session <- sessionInfo()

# names of all attached packages (search path entries of the form "package:<name>")
all.pkgs.loaded <- sub("^package:", "", grep("^package:", search(), value = TRUE))
session$basePkgs
pkgs.loaded <- all.pkgs.loaded[!all.pkgs.loaded %in% session$basePkgs]

# one row per package (row name = package name), single column "Version"
pkgs <- data.frame(
  Version = vapply(
    pkgs.loaded,
    function(pkg) session$otherPkgs[[pkg]]$Version,
    character(1)
  ),
  row.names = pkgs.loaded,
  stringsAsFactors = FALSE
)
print(pkgs)
```

# all subsetted phyloseq datasets are saved as .RDS files and need to be loaded here
```{r}
## examples
# physeq.Fp.abs <- readRDS(file.path(RD_DIR, "ASVs.physeq.Fp.abs.rds"))
# physeq.YCp.abs <- readRDS(file.path(RD_DIR, "ASVs.physeq.YCp.abs.rds"))
# physeq.FYp.abs <- readRDS(file.path(RD_DIR, "ASVs.physeq.FYp.abs.rds"))
# physeq.Ep.mean.abs <- readRDS(file.path(RD_DIR, "ASVs.physeq.Ep.mean.abs.rds"))
#   physeq.Ep.alf.abs <- subset_samples(physeq.Ep.mean.abs, Ant == "alf")
#   physeq.Ep.con.abs <- subset_samples(physeq.Ep.mean.abs, Ant == "con")
```

# functions for ALDEx2
```{r functions, include=F}
# Keep only taxa that are detected in enough samples AND are abundant enough.
#
# Arguments:
#   Ps_obj     phyloseq object with an otu_table, a tax_table containing a
#              "Class" rank, and sample_data. Defaults to the global
#              `physeq.ansys_pairwise`.
#   abundance  minimum fraction of samples (0-1) in which a taxon must have a
#              count > 0. NOTE: despite its name this is a prevalence cut-off.
#   preval     minimum mean relative abundance in percent (mean over all
#              samples). NOTE: despite its name this is an abundance cut-off.
#
# Returns the pruned phyloseq object. Its sample_data gains a `Lib.size`
# column holding the per-sample count total after pruning.
# Side effect: prints the input object and the pruned object.
DropRareSpecies <- function(Ps_obj = physeq.ansys_pairwise, abundance = 0.5,  preval = 0.1) {
  # number of samples in which each taxon is detected
  n_detected <- apply(
    X = otu_table(Ps_obj),
    MARGIN = if (taxa_are_rows(Ps_obj)) 1 else 2,
    FUN = function(x) sum(x > 0)
  )

  # Mean relative abundance (%) per taxon. taxa_sums() / nsamples() give the
  # same answer for either orientation of the OTU table; the previous
  # colSums() / nrow() was only correct when taxa were stored as columns.
  Ps_obj.rel <- transform_sample_counts(Ps_obj, function(x) x / sum(x) * 100)
  meanRelAb <- taxa_sums(Ps_obj.rel) / nsamples(Ps_obj.rel)

  # per-taxon summary with taxonomy attached
  prevdf <- data.frame(
    Abundance = n_detected,
    meanRelAb = meanRelAb,
    TotalAbundance = taxa_sums(Ps_obj),
    tax_table(Ps_obj)
  )

  # minimum number of samples a taxon has to occur in
  min_samples <- abundance * nsamples(Ps_obj)

  # Hook for excluding whole classes; with the current condition every class
  # present in the object is retained.
  prevdf <- subset(prevdf, Class %in% get_taxa_unique(Ps_obj, "Class"))

  # which() drops taxa whose mean relative abundance is NA/NaN
  # (e.g. when a sample has zero total counts) instead of yielding NA names
  keep <- prevdf$Abundance >= min_samples & prevdf$meanRelAb >= preval
  keepTaxa <- row.names(prevdf)[which(keep)]
  if (length(keepTaxa) == 0L) {
    stop("DropRareSpecies: no taxon passes the filters (abundance = ", abundance,
         ", preval = ", preval, ")", call. = FALSE)
  }

  Ps_obj_small <- prune_taxa(keepTaxa, Ps_obj)
  # per-sample totals, independent of table orientation
  sample_data(Ps_obj_small)$Lib.size <- sample_sums(Ps_obj_small)

  print(Ps_obj)
  print(Ps_obj_small)
  Ps_obj_small
}

# Run ALDEx2 for a two-group comparison and return a plotting-ready table.
#
# Arguments:
#   physeq_obj  phyloseq object with counts; defaults to the global
#               `physeq.ansys_pairwise_s`.
#   vars2test   name of the sample_data column holding the two groups.
#   Rare_tax    character vector of classes to relabel as "Rare". When omitted,
#               the `Rare_tax` object visible from where this function was
#               defined is used.
#   sig_level   cut-off applied to the BH-corrected Wilcoxon p-value (wi.eBH);
#               defaults to the global `significance`.
#   LFC         minimum absolute ALDEx2 effect size required for "Pass".
#   ...         currently unused.
#
# Also reads `Taxa_rank` (class ordering for plots) from the enclosing
# environment; it is not an argument of this function.
#
# Returns the data frame built by PrepAlDExData.CLass().
CalcALDEx.Class <- function(physeq_obj = physeq.ansys_pairwise_s, vars2test = "Type", Rare_tax = Rare_tax, sig_level = significance, LFC = 0.0, ...) {
  # The default `Rare_tax = Rare_tax` refers to itself and cannot be evaluated,
  # so an omitted argument is resolved by lexical lookup instead.
  if (missing(Rare_tax)) {
    rare_classes <- get("Rare_tax", envir = parent.env(environment()))
  } else {
    rare_classes <- Rare_tax
  }

  # drop taxa without any counts
  physeq_obj <- filter_taxa(physeq_obj, function(x) sum(x) > 0, TRUE)

  # aldex.clr() expects features in rows and samples in columns; orient the
  # table explicitly instead of assuming taxa are stored as columns
  counts <- as(otu_table(physeq_obj), "matrix")
  if (!taxa_are_rows(physeq_obj)) {
    counts <- t(counts)
  }

  comparison <- as.character(get_variable(physeq_obj, vars2test))
  # fail early (before the Monte-Carlo sampling) if this is not a two-group design
  if (length(unique(comparison)) != 2L) {
    stop("CalcALDEx.Class: '", vars2test, "' must have exactly two groups, found ",
         length(unique(comparison)), call. = FALSE)
  }

  ALDEx <- aldex.clr(
    counts,
    comparison,
    mc.samples = 128,
    denom = "iqlr", # iqlr for slight asymmetry in composition
    verbose = TRUE,
    useMC = TRUE
  )
  ALDEx_tt <- aldex.ttest(ALDEx, paired.test = FALSE) # for two conditions
  ALDEx_effect <- aldex.effect(
    ALDEx,
    include.sample.summary = TRUE,
    verbose = TRUE,
    useMC = TRUE
  ) # estimate effect sizes

  PrepAlDExData.CLass(
    ALDEx_tt,
    ALDEx_effect,
    physeq_obj = physeq_obj,
    sig_level = sig_level,
    LFC = LFC,
    Taxa_rank = Taxa_rank,
    Class = rare_classes
  )
}

# Combine ALDEx2 test and effect tables with mean abundance, taxonomy and a
# significance flag.
#
# Arguments:
#   ALDEx_tt, ALDEx_effect  outputs of aldex.ttest() and aldex.effect(); their
#               row names are the taxon names.
#   physeq_obj  phyloseq object the ALDEx2 run was based on.
#   sig_level   cut-off for wi.eBH.
#   LFC         minimum absolute effect size.
#   Taxa_rank   data frame whose `Class` column gives the class order for plots.
#   Class       character vector of classes to relabel as "Rare". When omitted,
#               the `Rare_tax` object visible from where this function was
#               defined is used (the previous behaviour in every case).
#   ...         currently unused.
#
# Returns a data frame with one row per tested taxon: the ALDEx2 columns, `OTU`,
# `baseMean` (mean relative abundance in %), the taxonomy columns,
# `Significance` (factor "Fail"/"Pass") and `Class` as an ordered-level factor.
PrepAlDExData.CLass <- function(ALDEx_tt, ALDEx_effect, physeq_obj = Ps_obj_filt_subset, sig_level, LFC, Taxa_rank, Class, ...) {
  if (missing(Class)) {
    rare_classes <- get("Rare_tax", envir = parent.env(environment()))
  } else {
    rare_classes <- Class
  }

  ALDEx2plot <- data.frame(ALDEx_tt, ALDEx_effect) # merge results
  ALDEx2plot$OTU <- rownames(ALDEx2plot)

  # mean relative abundance (%) of each taxon across all samples
  physeq_rel <- transform_sample_counts(physeq_obj, function(x) x / sum(x) * 100)
  base_mean <- taxa_sums(physeq_rel) / nsamples(physeq_rel)
  ALDEx2plot$baseMean <- unname(base_mean[ALDEx2plot$OTU])

  # add taxonomy, matched by taxon name rather than by row position
  taxonomy <- as(tax_table(physeq_obj), "matrix")[ALDEx2plot$OTU, , drop = FALSE]
  ALDEx2plot <- cbind(
    ALDEx2plot,
    as.data.frame(taxonomy, stringsAsFactors = FALSE)
  )

  # Relabel rare classes. Vector-level assignment is a no-op when nothing
  # matches, whereas `df[rows, ]$Class <- "Rare"` errors on zero rows.
  ALDEx2plot$Class[ALDEx2plot$Class %in% rare_classes] <- "Rare"

  # "Pass" = corrected p-value below sig_level and effect size above LFC;
  # NA in either quantity counts as "Fail"
  is_sig <- !is.na(ALDEx2plot$wi.eBH) &
    ALDEx2plot$wi.eBH < sig_level &
    abs(ALDEx2plot$effect) > LFC
  ALDEx2plot$Significance <- factor(
    ifelse(is_sig %in% TRUE, "Pass", "Fail"),
    levels = c("Fail", "Pass")
  )

  # Order classes as in Taxa_rank, with "Rare" (if present) moved to the end
  class_levels <- as.character(Taxa_rank$Class)
  if ("Rare" %in% class_levels) {
    class_levels <- c(setdiff(class_levels, "Rare"), "Rare")
  }
  ALDEx2plot$Class <- factor(ALDEx2plot$Class, levels = class_levels)

  ALDEx2plot
}

# Jittered dot plot of ALDEx2 effect sizes per taxonomic group.
#
# Arguments:
#   ALDEx2plot  results table (needs columns `Significance`, `baseMean`, `OTU`
#               plus the columns named by `Taxa` and `Y_val`); defaults to the
#               global `ALDEx2plot_pairwise`.
#   OTU_labels  if TRUE, label the points whose Significance is "Pass" with
#               their OTU name (a "Seq_<number>" name is shortened to the number).
#   Taxa        column name mapped to the x axis.
#   Y_val       column name mapped to the y axis.
#   sig_level   only used in the legend title.
#
# Returns a ggplot object.
GGPlotALDExTax.Class <- function(ALDEx2plot=ALDEx2plot_pairwise, OTU_labels = FALSE, Taxa = "Class", Y_val = "effect", sig_level = 0.05) {
  # fixed seed so that points and labels can receive identical jitter
  pos <- position_jitter(width = 0.3, seed = 1)

  p <- ggplot(ALDEx2plot) +
    geom_point(
      aes(
        x = .data[[Taxa]],
        y = .data[[Y_val]],
        colour = .data[["Significance"]],
        size = .data[["baseMean"]]
      ),
      position = pos,
      alpha = 2 / 3,
      stroke = 0
    ) +
    xlab("") +
    ylab(expression(paste("Effect size (lo", g[2], " fold change)"))) +
    labs(colour = paste("Significance at \n p <", sig_level), size = "Mean count (%)") +
    theme_grey(base_size = 18) +
    theme(axis.text.x = element_text(angle = 45.0, vjust = 1, hjust = 1)) +
    guides(colour = guide_legend(override.aes = list(size = 5))) +
    scale_size_continuous(range = c(1, 5), breaks = c(1, 2.5, 5, 10))

  if (OTU_labels) {
    # The label layer uses the FULL data set with empty strings for unlabelled
    # rows. Jittering a subset with the same seed would assign different random
    # offsets than the point layer, detaching labels from their points;
    # ggrepel does not draw empty labels.
    label_data <- ALDEx2plot
    label_data$.otu_label <- ifelse(
      label_data$Significance %in% "Pass",
      sub("Seq_([0-9]+)", "\\1", label_data$OTU),
      ""
    )
    p <- p + geom_label_repel(
      aes(
        x = .data[[Taxa]],
        y = .data[[Y_val]],
        label = .data[[".otu_label"]]
      ),
      data = label_data,
      size = 2,
      position = pos,
      colour = "#4a4a4a",
      label.size = NA,
      alpha = 0.75,
      box.padding = 0.80,
      max.overlaps = Inf,
      point.padding = 0.5
    )
  }
  p
}

# Jittered dot plot of ALDEx2 effect sizes per taxonomic group, coloured by the
# three-level `SigPlot` column.
#
# Arguments:
#   ALDEx2plot  results table (needs columns `SigPlot`, `baseMean`, `OTU` plus
#               the columns named by `Taxa` and `Y_val`); defaults to the global
#               `ALDEx2plot_pairwise.prev`.
#   OTU_labels  if TRUE, label the points whose SigPlot is "Pass" with their
#               OTU name (a "Seq_<number>" name is shortened to the number).
#   Taxa        column name mapped to the x axis.
#   Y_val       column name mapped to the y axis.
#   sig_level   only used in the legend title.
#   prev        mean relative abundance threshold (%), only used in the legend
#               title; the split itself is already encoded in `SigPlot`.
#
# The y axis is zoomed to [-2, 2]; points outside that range are not visible.
# Returns a ggplot object.
GGPlotALDExTax.Class.prev <- function(ALDEx2plot=ALDEx2plot_pairwise.prev, OTU_labels = FALSE, Taxa = "Class", Y_val = "effect", sig_level = 0.05, prev = 0.5) {
  # fixed seed so that points and labels can receive identical jitter
  pos <- position_jitter(width = 0.3, seed = 1)

  p <- ggplot(ALDEx2plot) +
    geom_point(
      aes(
        x = .data[[Taxa]],
        y = .data[[Y_val]],
        colour = .data[["SigPlot"]],
        size = .data[["baseMean"]]
      ),
      position = pos,
      alpha = 2 / 3,
      stroke = 0
    ) +
    coord_cartesian(ylim = c(-2, 2)) +
    xlab("") +
    ylab(expression(paste("Effect size (lo", g[2], " fold change)"))) +
    labs(colour = paste("Significance at \n p <", sig_level, "\n meanRelAb >", prev, "%"), size = "Mean count (%)") +
    theme_grey(base_size = 18) +
    theme(axis.text.x = element_text(angle = 45.0, vjust = 1, hjust = 1)) +
    guides(colour = guide_legend(override.aes = list(size = 5))) +
    scale_size_continuous(range = c(1, 5), breaks = c(1, 2.5, 5, 10))

  if (OTU_labels) {
    # Labels are chosen from `SigPlot` alone. The previous version selected the
    # label text via the global `prevalence` and `Significance` but the label
    # rows via `SigPlot`, which could disagree in length.
    # The label layer uses the FULL data set with empty strings for unlabelled
    # rows so that the seeded jitter matches the point layer; ggrepel does not
    # draw empty labels.
    label_data <- ALDEx2plot
    label_data$.otu_label <- ifelse(
      label_data$SigPlot %in% "Pass",
      sub("Seq_([0-9]+)", "\\1", label_data$OTU),
      ""
    )
    p <- p + geom_label_repel(
      aes(
        x = .data[[Taxa]],
        y = .data[[Y_val]],
        label = .data[[".otu_label"]]
      ),
      data = label_data,
      size = 2,
      position = pos,
      colour = "#4a4a4a",
      label.size = NA,
      alpha = 0.75,
      box.padding = 0.80,
      max.overlaps = Inf,
      point.padding = 0.5
    )
  }
  p
}
```

# define and prep data set for ALDEx2

## define dataset for ALDEx2
```{r}
# Select the dataset that the rest of the document is run on.
## EP = patches of established ant colonies (mean EP community for multiple sampled ant colonies)
## FYP = patches of initial and young ant colonies
## YEP = patches of young and established ant colonies (mean EP community for multiple sampled ant colonies)
## YEP.alf | YEP.con = only A. alfari, or A. constructor YEPs
## FEP = patches of initial and established ant colonies (mean EP community for multiple sampled ant colonies)

dset <- "FYP" # define here

# Map the key to its phyloseq object. Merged datasets are built on demand and
# also stored under their own name. An unknown key stops the run instead of
# silently leaving `target.physeq` undefined or stale.
target.physeq <- switch(
  dset,
  EP = physeq.Ep.mean.abs,
  FYP = physeq.FYp.abs,
  YEP = (physeq.YEp.abs <- merge_phyloseq(physeq.YCp.abs, physeq.Ep.mean.abs)),
  YEP.alf = (physeq.YEp.alf.abs <- merge_phyloseq(physeq.YCp.abs, physeq.Ep.alf.abs)),
  YEP.con = (physeq.YEp.con.abs <- merge_phyloseq(physeq.YCp.abs, physeq.Ep.con.abs)),
  FEP = (physeq.FEp.abs <- merge_phyloseq(physeq.Fp.abs, physeq.Ep.mean.abs)),
  stop("Unknown dataset '", dset,
       "'; expected one of EP, FYP, YEP, YEP.alf, YEP.con, FEP", call. = FALSE)
)
print(target.physeq)
```

## agglomerate taxa with tax_glom() at a different taxonomic level
```{r}
# Order level: merge all taxa sharing the same Order; taxa without an Order
# assignment are removed (NArm = TRUE)
(target.physeq_glom.order <- tax_glom(target.physeq,
                             "Order",
                             NArm = TRUE))

# Keep only the ranks from the first one down to Order. Selecting by rank name
# instead of the fixed columns 1:4 stays correct if the taxonomy table has a
# different number of ranks above Order.
ranks_to_order <- rank_names(target.physeq_glom.order)
ranks_to_order <- ranks_to_order[seq_len(match("Order", ranks_to_order))]
tax_table(target.physeq_glom.order) <- tax_table(target.physeq_glom.order)[, ranks_to_order]
```

## define physeq for ALDEx2 analyses
```{r}
# phyloseq object used by all ALDEx2 chunks below
physeq.ansys <- target.physeq_glom.order
# Label of the taxonomic level of `physeq.ansys`. The ALDEx2 run chunk only
# handles "order" and "otu" (it selects the exported taxonomy columns), and the
# value is also pasted into the output file names and the plot title.
tax.level <-  "order"
```

## define "rare" classes for export & plotting
```{r tag rare tax group, cache=T}
# Agglomerate to class level and convert counts to per-sample proportions (0-1)
physeq.ansys_glom <- tax_glom(physeq.ansys,
                             "Class",
                             NArm = TRUE)
physeq.ansys_glom_rel <- transform_sample_counts(physeq.ansys_glom, function(x) x / sum(x))
physeq.ansys_glom_rel_DF <- psmelt(physeq.ansys_glom_rel)
physeq.ansys_glom_rel_DF$Class %<>% as.character()

# median proportion of every class across samples
medians <- aggregate(
  physeq.ansys_glom_rel_DF$Abundance,
  by = list(Category = physeq.ansys_glom_rel_DF$Class),
  FUN = median
)
colnames(medians) <- c("Class", "median")

# Classes whose median proportion is <= 0.01 (= 1 %) are tagged as rare.
# NOTE(review): the original comment read "0.004 for EP 0.01 for EP"; presumably
# 0.004 was meant for the EP dataset - confirm before switching datasets.
Rare_tax <- medians$Class[which(medians$median <= 0.01)]

is_rare <- physeq.ansys_glom_rel_DF$Class %in% Rare_tax
head(unique(physeq.ansys_glom_rel_DF$Class[is_rare]))
unique(physeq.ansys_glom_rel_DF$Class[!is_rare])

# Change their name to "Rare". Vector-level assignment is a no-op when no class
# is rare, whereas `df[rows, ]$Class <- "Rare"` errors on zero rows.
physeq.ansys_glom_rel_DF$Class[is_rare] <- "Rare"

physeq.ansys_glom_rel_DF_byClass <- group_by(physeq.ansys_glom_rel_DF, Class)

# Rank classes (rare ones pooled as "Rare") by their summed proportion,
# most abundant first; used to order the x axis of the ALDEx2 plots
Taxa_rank <- aggregate(
  physeq.ansys_glom_rel_DF$Abundance,
  by = list(Category = physeq.ansys_glom_rel_DF$Class),
  FUN = sum
)
colnames(Taxa_rank) <- c("Class", "Abundance")
Taxa_rank <- Taxa_rank[order(Taxa_rank$Abundance, decreasing = TRUE), ]
```

# run and plot ALDEx2

## run ALDEx2 - FYP / YEP / EPmean Ant - plot relevant classes
```{r run ALDEx2, cache=T, results = 'asis'}
significance <- 0.05
prevalence <- 0.5 # threshold mean relative abundance (%) ONLY for plotting

# Sample variable whose two groups are compared: "Ant" for EP (EPmean),
# "Type" for every other dataset. Unknown keys stop the run instead of
# silently reusing results from an earlier run.
group_var <- switch(
  dset,
  EP = "Ant",
  FYP = ,
  YEP = ,
  YEP.alf = ,
  YEP.con = ,
  FEP = "Type",
  stop("Unknown dataset '", dset,
       "'; expected one of EP, FYP, YEP, YEP.alf, YEP.con, FEP", call. = FALSE)
)

# group levels and per-sample group membership of the tested variable
ALDEx_comparisons <- list()
ALDEx_comparisons[[group_var]] <- levels(get_variable(physeq.ansys, group_var))
ALDEx_comparisons$Comparisons <- as.character(get_variable(physeq.ansys, group_var))

physeq.ansys_pairwise <- physeq.ansys

# Filter thresholds passed to DropRareSpecies():
abundance <- 0.5 # taxon must be detected in at least this fraction of samples
preval <- 0.05 # taxon must reach this mean relative abundance (%) over all samples
physeq.ansys_pairwise_s <- DropRareSpecies(
  Ps_obj = physeq.ansys_pairwise,
  abundance = abundance,
  preval = preval
)

# diagnostics: total counts, per-taxon counts (ascending), retained phyla
sum(taxa_sums(physeq.ansys_pairwise_s))
taxa_sums(physeq.ansys_pairwise_s)[order(taxa_sums(physeq.ansys_pairwise_s), decreasing = FALSE)]
unique(tax_table(physeq.ansys_pairwise_s)[, "Phylum"])
length(unique(tax_table(physeq.ansys_pairwise_s)[, "Phylum"]))

# Joint.sample.name = "<Type>.<Ant>" for EP and "<Type>.<Type>" otherwise.
# NOTE(review): "<Type>.<Type>" is kept exactly as before, but it looks like a
# copy-paste leftover - confirm which second component was intended.
sample_data(physeq.ansys_pairwise_s)$Joint.sample.name <- paste0(
  as.character(get_variable(physeq.ansys_pairwise_s, "Type")),
  ".",
  as.character(get_variable(physeq.ansys_pairwise_s, group_var))
)

ALDEx2plot_pairwise <- CalcALDEx.Class(
  physeq.ansys_pairwise_s,
  group_var,
  Rare_tax,
  significance,
  0
)

# Plotting column: significant taxa are split by mean relative abundance.
# "Pass" = significant and baseMean >= prevalence; "SigOnly" = significant but
# baseMean < prevalence; "Fail" = not significant.
ALDEx2plot_pairwise.prev <- ALDEx2plot_pairwise
ALDEx2plot_pairwise.prev$SigPlot <- ALDEx2plot_pairwise.prev$Significance
levels(ALDEx2plot_pairwise.prev$SigPlot) <- c(levels(ALDEx2plot_pairwise.prev$SigPlot), "SigOnly")
sig_but_low <- ALDEx2plot_pairwise.prev$Significance == "Pass" &
  ALDEx2plot_pairwise.prev$baseMean < prevalence
ALDEx2plot_pairwise.prev$SigPlot[which(sig_but_low)] <- "SigOnly"

# Plot annotation, in this order: Pass with effect > 0, Pass with effect < 0,
# SigOnly with effect > 0, SigOnly with effect < 0, (number of tested taxa)
n_pass_up <- sum(ALDEx2plot_pairwise.prev$effect > 0 & ALDEx2plot_pairwise.prev$SigPlot == "Pass")
n_pass_down <- sum(ALDEx2plot_pairwise.prev$effect < 0 & ALDEx2plot_pairwise.prev$SigPlot == "Pass")
n_sigonly_up <- sum(ALDEx2plot_pairwise.prev$effect > 0 & ALDEx2plot_pairwise.prev$SigPlot == "SigOnly")
n_sigonly_down <- sum(ALDEx2plot_pairwise.prev$effect < 0 & ALDEx2plot_pairwise.prev$SigPlot == "SigOnly")
ALDEX_summary.prev <- tibble(Label = paste0(
  " ", n_pass_up,
  " ", n_pass_down,
  " ", n_sigonly_up,
  " ", n_sigonly_down,
  " (", nrow(ALDEx2plot_pairwise.prev), ")"
))

# all significant taxa, regardless of mean relative abundance
ALDEx2plot_pairwise_pass <- ALDEx2plot_pairwise.prev %>%
  filter(Significance == "Pass")

# exported columns depend on the taxonomic level of the analysed object
result_cols <- switch(
  tax.level,
  order = c("OTU", "baseMean", "effect", "wi.eBH", "Phylum", "Class", "Order"),
  otu = c("OTU", "baseMean", "effect", "wi.eBH", "Phylum", "Class", "Order", "Family", "Genus"),
  stop("Unknown tax.level '", tax.level, "'; expected \"order\" or \"otu\"", call. = FALSE)
)
ALDEx2plot_pairwise_results <- ALDEx2plot_pairwise_pass[, result_cols] %>%
  arrange(desc(abs(effect)))

nrow(ALDEx2plot_pairwise_results)

print(ALDEx2plot_pairwise_results %>%
  kable(., digits = c(2), caption = "Significantly different taxa:") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE
  ))

# tab-separated export; the file name encodes dataset, level, filter and date
write.table(
  ALDEx2plot_pairwise_results,
  file.path(
    RESULTS_DIR,
    paste0("ALDEx2.lap.", dset, ".", tax.level, ".ab", abundance, # ".prev", prevalence,
           ".class.", Sys.Date(), ".txt")
  ),
  sep = "\t",
  row.names = FALSE
)
```

## Plot ALDEx2 results
```{r plot ALDEx2, cache=T, results = 'asis', fig.width=8, fig.height=7}
  # Effect-size plot with the count summary printed in the top-right corner
  p1 <- GGPlotALDExTax.Class.prev(ALDEx2plot_pairwise.prev, OTU_labels = FALSE, sig_level = significance, prev = prevalence) +
    ggtitle(paste0("ASV.lap.", dset, ".", tax.level, ".ab", abundance, ".prev", prevalence)) +
    geom_text(
      data    = ALDEX_summary.prev, # ALDEX_summary if no SigOnly
      mapping = aes(x = Inf, y = Inf, label = Label),
      hjust   = 1.1,
      vjust   = 1.6
    )

  print(p1)

  # Shared file stem: dataset, level, filter thresholds and date
  plot_stem <- file.path(
    PLOTS_DIR,
    paste0("ALDEx2.lap.", dset, ".", tax.level,
           ".ab", abundance, ".prev", prevalence, ".", Sys.Date())
  )

  # Pass the plot explicitly instead of relying on last_plot()
  ggsave(paste0(plot_stem, ".png"),
         plot = p1,
         width = 8, height = 7, dpi = 300)
  ggsave(paste0(plot_stem, ".pdf"),
         plot = p1,
         useDingbats = FALSE,
         width = 8, height = 7, dpi = 300)
```