## Exploration of the Estonian - Latvian Dataset

In [None]:
# General Imports

library(ggh4x)
library(writexl)
library(marginaleffects)
library(readxl)
library(vegan)
library(ggplot2)
library(SRS)
library(tidyverse)
library(divermeta)
library(kableExtra)
library(FSA)
library(latex2exp)
library(cowplot)
library(patchwork)
library(gridExtra)
library(pheatmap)
library(multcompView)
library(permuco)
library(rlang)

source("utils/general_functions.R")

## Distance Based Multiplicity

In [None]:
# Serialized object holding the vegetation metadata (`metaveg`) and the
# taxonomy table (`taxveg`, used further down).
data <- qs::qread(file = "EST-LAT/vegetation.qs")

In [None]:
# Per-sample vegetation metadata
df_vegetation <- data[["metaveg"]]
head(df_vegetation)

In [None]:
# Reads
# Abundance table: ID_Clustered, ID_Denoised and one column per sample.
abundance_file <- "EST-LAT/EstLat_Veg-DenoisedTable.txt"
df_abbundance <- readr::read_tsv(abundance_file, col_types = readr::cols())

head(df_abbundance)

In [None]:
# Generates the files
diss_file <- "EST-LAT/EstLat_SeqDistances/unified_distances.csv"
diss_clust_file <- "EST-LAT/EstLat_SeqDistances/unified_distances_clusters.csv"


# Writes the empty headers: the table has no rows, so only the column names
# end up in each file.
base_tibble <- tibble(ID1 = character(), ID2 = character(), Distance = numeric())
for (header_target in c(diss_file, diss_clust_file)) {
  write.table(base_tibble, file = header_target, sep = ",", row.names = FALSE, col.names = TRUE)
}

In [None]:
# Unifies distances
#
# For every base cluster, collects the pairwise distances between the denoised
# sequences assigned to it, stacks them into `dfs_distances` and appends the
# result to `diss_file`.

base_clusters <- df_abbundance %>%
  select(ID_Clustered, ID_Denoised)

clusts_ids <- unique(base_clusters$ID_Clustered)

# A missing cluster id has no distance file to read from; fail with a clear
# message instead of trying to open "NA.txt".
if (anyNA(clusts_ids)) {
  stop("`ID_Clustered` contains missing values", call. = FALSE)
}

# Distinct denoised ids of every cluster, grouped in a single pass instead of
# re-filtering `base_clusters` once per cluster. Fixing the factor levels to
# `clusts_ids` keeps the clusters in order of first appearance.
ids_by_cluster <- lapply(
  split(
    base_clusters$ID_Denoised,
    factor(base_clusters$ID_Clustered, levels = clusts_ids)
  ),
  unique
)

split_distances_dir <- "EST-LAT/EstLat_SeqDistances/Split_OTUs_Distances/"

# Returns the ID1 / ID2 / Distance table of one cluster.
cluster_distances <- function(denoised_ids, cluster_id) {
  # A cluster with a single sequence only has the zero self-distance
  if (length(denoised_ids) == 1) {
    return(data.frame(ID1 = denoised_ids, ID2 = denoised_ids, Distance = 0))
  }

  # Loads the distance file
  dist_file <- paste0(split_distances_dir, cluster_id, ".txt")
  if (!file.exists(dist_file)) {
    stop("Distance file not found for cluster ", cluster_id, ": ", dist_file,
         call. = FALSE)
  }

  # Keeps only the pairs where both ids belong to this cluster
  read_tsv(dist_file, col_types = cols(), col_names = c("ID1", "ID2", "Distance")) %>%
    filter(ID1 %in% denoised_ids & ID2 %in% denoised_ids)
}

# Saves the inner distances (list named by cluster id)
dfs <- Map(cluster_distances, ids_by_cluster, names(ids_by_cluster))

dfs_distances <- bind_rows(dfs)

#  Saves
write_csv(dfs_distances, diss_file, append = TRUE, col_names = FALSE)


In [None]:
# Largest within-cluster distance, shown as a sanity check
dfs_distances %>% pull(Distance) %>% max()

In [None]:
#  Selects the relevant columns for the taxa

taxa_level <- "Genus"
print(taxa_level)

cols <- c("QueryName", "Order", "Family", "Genus", "Species")

# Taxonomy per cluster; QueryName is the cluster id used in the abundance table
df_taxa <- data$taxveg %>%
  select(all_of(cols)) %>%
  dplyr::rename(ID_Clustered = QueryName)

# Number of clusters per group at the chosen level, most frequent first
totals <- table(df_taxa[[taxa_level]])
df_order <- tibble(Taxa = names(totals), Count = as.vector(totals)) %>%
  arrange(desc(Count))

print(paste("Total Groups: ", nrow(df_order)))

# Groups kept for the analysis
selected_genus <- c(
  "Cenococcum",
  "Cortinarius",
  "Russula",
  "Sebacina",
  "Hebeloma",
  "Tuber"
)

df_order <- df_order %>%
  filter(Taxa %in% selected_genus)

print(paste("Total Groups Selected Groups: ", nrow(df_order)))


df_order


In [None]:
# Vegetation types kept in the analysis
included_vegetation <- c("forest", "park", "energy plantation")

# Vegetation types present in the metadata, and how many samples each has
unique(df_vegetation[["vegetation_type"]])
table(df_vegetation[["vegetation_type"]])

In [None]:

# For every sample whose vegetation type is in `included_vegetation`, computes
# the distance-based multiplicity (`multiplicity.distance.by_blocks`) and the
# Hill diversity of order 1 (`renyi(..., hill = TRUE, scales = 1)`) at every
# clustering distance in `clust_distances`, once for all sequences ("All") and
# once per selected taxonomic group. The result is the long table `df_result`.

clust_distances <- seq(from = 0.002, to = 0.046, by = 0.002)


sample_ids <- colnames(df_abbundance %>% select(-ID_Denoised, -ID_Clustered))

taxa_groups <- df_order %>% pull(Taxa)

# Cluster ids of each taxonomic group. This lookup does not depend on the
# sample, so it is done once here instead of inside the sample loop.
cluster_ids_by_taxa <- lapply(
  setNames(taxa_groups, taxa_groups),
  function(txg) {
    df_taxa %>%
      filter(.data[[taxa_level]] == txg) %>%
      pull(ID_Clustered)
  }
)

# Builds the result rows (one per sigma) of one sample / group combination.
#
# df_group:    rows of the sample's abundance table belonging to the group
#              (columns ID_Clustered, ID_Denoised, Abundance).
# sample_id:   sample identifier stored in the row.
# group_label: "All" or the name of the taxonomic group.
# veg_type:    vegetation type of the sample.
# sigmas:      clustering distances to evaluate.
# distances:   table of pairwise distances (ID1, ID2, Distance).
#
# Returns a list of character vectors
# (sample, group, sigma, delta_M, D, vegetation).
# NOTE: c() turns the numeric fields into text here; they are parsed back when
# `df_result` is assembled, and the later `sigma == sig` filter compares those
# parsed values, so the representation is kept on purpose.
sample_group_rows <- function(df_group, sample_id, group_label, veg_type,
                              sigmas, distances) {
  ids <- df_group$ID_Denoised
  ab <- df_group$Abundance
  clust <- df_group$ID_Clustered

  make_row <- function(sigma, m, d) {
    c(sample_id, group_label, sigma, m, d, veg_type)
  }

  # Group absent from the sample: both measures are recorded as 0
  if (length(ids) == 0) {
    return(lapply(sigmas, make_row, m = 0, d = 0))
  }

  # A single sequence: both measures are recorded as 1
  if (length(ids) == 1) {
    return(lapply(sigmas, make_row, m = 1, d = 1))
  }

  diss <- distances %>% filter(ID1 %in% ids, ID2 %in% ids)

  # The diversity only depends on the per-cluster abundances, not on sigma
  ab_clust <- tapply(ab, clust, sum)
  d <- renyi(ab_clust, hill = TRUE, scales = 1)[[1]]

  lapply(sigmas, function(sigma) {
    m <- multiplicity.distance.by_blocks(ids, ab, diss, clust, sigma)
    make_row(sigma, m, d)
  })
}

rows_by_sample <- vector("list", length(sample_ids))

for (i in seq_along(sample_ids)) {
  sampleID <- sample_ids[[i]]

  print(paste(sampleID, i, "of", length(sample_ids), sep = " "))
  flush.console()

  # as.character(): a factor combined with strings through c() would be stored
  # by its integer code instead of its label
  veg_type <- df_vegetation %>%
    filter(SampleID == sampleID) %>%
    pull(vegetation_type) %>%
    as.character()

  # `if` needs exactly one value; a sample with no (or several) metadata rows
  # cannot be assigned a vegetation type and is skipped
  if (length(veg_type) != 1) {
    warning("Sample ", sampleID, " has ", length(veg_type),
            " vegetation entries; skipped", call. = FALSE)
    next
  }

  if (!(veg_type %in% included_vegetation)) {
    next
  }

  df_abbundance_sample <- df_abbundance %>%
    select(ID_Clustered, ID_Denoised, all_of(sampleID)) %>%
    rename(Abundance = all_of(sampleID)) %>%
    filter(Abundance > 0)

  # First all taxa together, then each selected group
  groups <- c(
    list(All = df_abbundance_sample),
    lapply(cluster_ids_by_taxa, function(cluster_ids) {
      df_abbundance_sample %>% filter(ID_Clustered %in% cluster_ids)
    })
  )

  sample_rows <- Map(
    function(df_group, group_label) {
      sample_group_rows(df_group, sampleID, group_label, veg_type,
                        sigmas = clust_distances, distances = dfs_distances)
    },
    groups,
    names(groups)
  )

  rows_by_sample[[i]] <- unlist(sample_rows, recursive = FALSE, use.names = FALSE)
}

# Flattens to one list of rows; skipped samples (NULL entries) disappear
rows <- unlist(rows_by_sample, recursive = FALSE, use.names = FALSE)

if (length(rows) == 0) {
  stop("No sample belongs to the included vegetation types", call. = FALSE)
}

df_result <- do.call(rbind, rows)
colnames(df_result) <- c("SampleID", "Taxa", "sigma", "delta_M", "D", "Veg")

df_result <- as_tibble(df_result) %>%
  mutate(
    delta_M = as.numeric(delta_M),
    D = as.numeric(D),
    sigma = as.numeric(sigma),
    Taxa = as.factor(Taxa),
    Veg = as.factor(Veg)
  )

df_result


In [None]:
 
 
# Prints a permutation ANOVA table as LaTeX.
#
# perm_model: fitted object whose `$table` has the columns `SS`, `df`, `F` and
#             `resampled P(>F)`.
# caption:    caption passed to kable().
# label:      suffix of the LaTeX label; "tab:" is prepended.
#
# The table is written to the console with cat(); the label is inserted
# right after the closing `tabular` environment.
printLatexPermanovaResults <- function(perm_model, caption, label)
{
    # Four-decimal p-value followed by the usual significance stars
    with_stars <- function(p) {
        shown <- sprintf("%.4f", p)
        case_when(
            p < 0.001 ~ paste0(shown, " ***"),
            p < 0.01  ~ paste0(shown, " ** "),
            p < 0.05  ~ paste0(shown, " *  "),
            TRUE      ~ shown
        )
    }

    # Every value is rounded first and then formatted as text;
    # eta is each term's share of the total sum of squares, in percent.
    anova_latex <- as.data.frame(perm_model$table) %>%
        mutate(
            eta_txt = sprintf("%.2f", round(SS / sum(SS) * 100, 2)),
            ss_txt = sprintf("%.2f", SS),
            f_txt = sprintf("%.2f", round(F, 2)),
            p_txt = with_stars(round(`resampled P(>F)`, 4))
        ) %>%
        select(
            `Degrees of Freedom` = df,
            `$\\eta^2$` = eta_txt,
            `Sum Sq` = ss_txt,
            `F value` = f_txt,
            `p-value (Resampled)` = p_txt
        )

    tab <- kable(
        anova_latex,
        format = "latex",
        booktabs = TRUE,
        caption = caption,
        align = c("r", "r", "r", "r", "l"),
        escape = FALSE
    )
    tab <- kable_styling(tab, latex_options = c("hold_position"))

    # Adds the label right after the tabular environment
    label_line <- paste0("end{tabular}\n\\label{", "tab:", label, "}")
    cat(gsub("end{tabular}", label_line, tab, fixed = TRUE))
    cat("\n\n")
}



# Permutation ANOVA (Taxa * Veg) and marginal vegetation contrasts for the
# distance-based multiplicity and the diversity at clustering distance `sig`.

# Fixes seed
set.seed(161)
np <- 10000

sig <- 0.02

# near() rather than `==`: sigma comes from a floating-point sequence.
# droplevels() removes the "All" level that is left empty by the filter.
df_analysis <- df_result %>%
  filter(near(sigma, sig), Taxa != "All") %>%
  mutate(Taxa = droplevels(as.factor(Taxa)), Veg = as.factor(Veg))

# One row per taxonomic group for the marginal contrasts. Veg has to be an
# existing level of the factor (a value outside the levels becomes NA); the
# first level, i.e. the reference of the contrasts, is used.
grid <- data.frame(
  Taxa = unique(df_analysis$Taxa),
  Veg = factor(levels(df_analysis$Veg)[1], levels = levels(df_analysis$Veg))
)

# ANOVA table of a permutation model with eta (share of the total sum of
# squares, in percent) and the term names as an `id` column.
perm_anova_table <- function(perm_model) {
  tab <- as.data.frame(perm_model$table) %>%
    mutate(eta = round(100 * SS / sum(.data$SS), 2))
  cbind(id = rownames(tab), tab)
}

# Vegetation contrasts of a fitted model evaluated on `grid`; `contrast.var`
# keeps the left-hand side of the "a - b" contrast label.
veg_contrasts <- function(model) {
  comparisons(model, variables = "Veg", newdata = grid) %>%
    mutate(contrast.var = str_split_fixed(contrast, " - ", 2)[, 1])
}

# Distance Based Multiplicity
print("Distance Based Multiplicity")

# Run permutation ANOVA
model_m <- aov(delta_M ~ Taxa * Veg, data = df_analysis)
perm_model_m <- aovperm(delta_M ~ Taxa * Veg,
                        data = df_analysis,
                        np = np, # number of permutations
                        progress = FALSE)

# Extract ANOVA table and compute eta²
df_anova_m <- perm_anova_table(perm_model_m)
df_anova_m

# The subscript is taken from `sig` so that the caption matches the model
printLatexPermanovaResults(
  perm_model_m,
  caption = paste0(
    "Permutation ANOVA results for the $\\delta M_{", sig,
    "}$ model for taxonomic fungi samples\nfrom northern Baltic region"
  ),
  label = "anova_distance_multiplicity_tax_balt"
)

df_marginal_m <- veg_contrasts(model_m)


# Diversity

print("Diversity")

# Run permutation ANOVA
model_d <- aov(D ~ Taxa * Veg, data = df_analysis)
perm_model_d <- aovperm(D ~ Taxa * Veg,
                        data = df_analysis,
                        np = np, # number of permutations
                        progress = FALSE)

# Extract ANOVA table and compute eta²
df_anova_d <- perm_anova_table(perm_model_d)
df_anova_d

printLatexPermanovaResults(
  perm_model_d,
  caption = "Permutation ANOVA results for the $^1 D$ model for taxonomic fungi samples
from northern Baltic region",
  label = "anova_diversity_tax_balt"
)

df_marginal_d <- veg_contrasts(model_d)



In [None]:
# One worksheet per result table; the names become the sheet names
sheets <- setNames(
  list(
    df_result,
    df_analysis,
    df_anova_m,
    df_marginal_m,
    df_anova_d,
    df_marginal_d
  ),
  c(
    "All Results",
    "Analysis Results (sigma 0.02)",
    "Multiplicity ANOVA Perm Results",
    "Multiplicity Marginal Results",
    "Diversity ANOVA Perm Results",
    "Diversity Marginal Results"
  )
)

# Workbook named after the taxonomic level
results_path <- paste0("EST-LAT/results/", "results_", tolower(taxa_level), ".xlsx")
write_xlsx(sheets, path = results_path)



### Significance Analysis

In [None]:
# Multiplicity
set.seed(1)


# Marginal

# Chart of contrast estimates with their confidence intervals, one panel per
# taxonomic group.
#
# df:    table with the columns `estimate`, `conf.low`, `conf.high`, `p.value`,
#        `contrast.var` and `Taxa`.
# title: plot title.
# ncols: number of panel columns.
#
# Points are dark green when p < 0.05 and red otherwise; significance stars
# are written to the right of the upper confidence limit.
# Returns the ggplot object invisibly.
make_chart <- function(df, title, ncols = 4)
{
  # Significance stars for a vector of p-values
  stars_for <- function(p) {
    case_when(
      p < 0.001 ~ "***",
      p < 0.01  ~ "**",
      p < 0.05  ~ "*",
      TRUE      ~ ""
    )
  }

  flagged <- mutate(
    df,
    signif = stars_for(p.value),
    is_signif = p.value < 0.05  # logical flag
  )

  point_colours <- c(`TRUE` = "darkgreen", `FALSE` = "red")

  text_theme <- theme(
    plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
    strip.text = element_text(size = 18, face = "bold"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 16),
    plot.margin = margin(t = 20, r = 40, b = 20, l = 40),
    panel.spacing = unit(1.5, "lines")
  )

  # Plot
  chart <- ggplot(flagged, aes(y = contrast.var)) +
    geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.3, color = "grey50") +
    geom_point(aes(x = estimate, color = is_signif), size = 3.5) +
    geom_text(aes(x = conf.high + 0.05, label = signif), size = 6, hjust = 0) +
    scale_color_manual(values = point_colours, guide = "none") +
    facet_wrap2(~ Taxa, scales = "fixed", ncol = ncols, axes = "all") +
    theme_minimal(base_size = 16) +
    labs(
      title = title,
      x = "Estimate (with 95% CI)",
      y = "Contrast Vegetation"
    ) +
    text_theme

  invisible(chart)
}

options(repr.plot.width = 14, repr.plot.height = 8)

# Title shared by both charts; `measure` is the LaTeX symbol of the measure
contrast_title <- function(measure) {
  TeX(paste0(
    measure,
    " Contrasts against 'Energy Plantation' with 95% Confidence Intervals by ",
    taxa_level,
    " (All)"
  ))
}

# File of a chart inside the plots folder, prefixed with the taxonomic level
contrast_plot_path <- function(suffix) {
  paste0("EST-LAT/results/plots/", tolower(taxa_level), suffix)
}

# Multiplicity
df <- df_marginal_m
p_all <- make_chart(df, ncols = 3, title = contrast_title("$\\delta M_{0.02}$"))
print(p_all)

ggsave(
  contrast_plot_path("_multiplicity_all.png"),
  plot = p_all, width = 14, height = 8, dpi = 400, bg = "white"
)

# Diversity
df <- df_marginal_d
p_all <- make_chart(df, ncols = 3, title = contrast_title("$^1D$"))
print(p_all)

ggsave(
  contrast_plot_path("_diversity_all.png"),
  plot = p_all, width = 14, height = 8, dpi = 400, bg = "white"
)



## Diversity vs Multiplicity Plots

In [None]:
# Runs the Kruskal-Wallis / Dunn tests per taxonomic group (comparing the
# vegetation types), prints the resulting tables and collects the compact
# letter displays of both measures in `rows`.
# `kw_and_dunn_test` and `compute_cld` are not defined in this notebook
# (presumably they come from utils/general_functions.R).

# TRUE prints the raw test data frames instead of the formatted tables
DEBUG <- FALSE

set.seed(1)

# Settings shared by every taxonomic group (set once, outside the loop)
multiplicity_col <- "delta_M"
diversity_col <- "D"
group_col <- "Veg"
diversity_name <- "Diversity $^1D$"
group_name <- "Vegetation Type"
multiplicity_name <- paste0("Distance-based Multiplicity $\\delta \\, M_{", sig, "}$")

label_order <- c("forest", "park", "energy plantation")

short_label_map <- c(
  "forest" = "F",
  "park" = "P",
  "energy plantation" = "EP"
)
short_label_order <- as.vector(short_label_map[label_order])

# One tibble of letters per taxonomic group
rows <- vector("list", length(taxa_groups))

for (k in seq_along(taxa_groups)) {
  txg <- taxa_groups[[k]]

  cat(paste0("% --------- ", txg, " ----------\n"))
  flush.console()

  df_plot <- df_analysis %>% filter(Taxa == txg)

  # Captions and labels of the LaTeX tables; the clustering distance and the
  # taxonomic level are taken from `sig` and `taxa_level`
  kw_caption <- paste0(
    "Kruskal-Wallis test results for diversity ($^1D$) and distance-based multiplicity ($\\delta M_{",
    sig, "}$) comparing the different vegetation types for ",
    tolower(taxa_level), " \\textit{", txg, "}"
  )
  dun_caption <- paste0(
    "Dunn's Test Results for Diversity ($^1D$) and distance-based multiplicity ($\\delta M_{",
    sig, "}$) comparing different vegetation types for ",
    tolower(taxa_level), " \\textit{", txg, "}"
  )

  kw_label <- paste0("tab:kw_results_balt_", str_to_lower(txg))
  dunn_label <- paste0("tab:dunn_results_balt_", str_to_lower(txg))

  # Tests
  tests_resp <- kw_and_dunn_test(
    df_test = df_plot,
    order = label_order,
    labels = short_label_order,
    multiplicity_col = multiplicity_col,
    diversity_col = diversity_col,
    group_col = group_col,
    diversity_name = diversity_name,
    group_name = group_name,
    multiplicity_name = multiplicity_name,
    kw_caption = kw_caption,
    dun_caption = dun_caption,
    kw_label = kw_label,
    dunn_label = dunn_label
  )

  # Prints
  if (DEBUG) {
    print(tests_resp$kw_df)
    cat("\n\n")
    print(tests_resp$dunn_df)
    flush.console()
  } else {
    cat(tests_resp$kw_table)
    cat("\n\n")
    cat(tests_resp$dunn_table)
    flush.console()
  }

  # Compact Letter Display
  cld_div <- compute_cld(
    dunn_result = tests_resp$dunn_div,
    short_label_order = short_label_order,
    label_order = label_order
  )
  cld_mul <- compute_cld(
    dunn_result = tests_resp$dunn_mul,
    short_label_order = short_label_order,
    label_order = label_order
  )

  rows[[k]] <- tibble(
    Taxa = txg,
    Veg = label_order,
    D = cld_div$Letters[label_order],
    delta_M = cld_mul$Letters[label_order]
  )

  cat("\n\n\n\n")
  flush.console()
}

      



In [None]:
# Compact letter displays of every taxonomic group in one table
df_cld <- bind_rows(rows)

df <- df_analysis %>% filter(Taxa != "All")

# Display settings
label_order <- c("forest", "park", "energy plantation")

custom_colors <- c("#7D917E", "#DAC5AA", "#BE5E58")

multiplicity_col <- "delta_M"
diversity_col <- "D"
group_col <- "Veg"

diversity_name <- "Diversity $^1D$"
group_name <- "Habitat Type"
multiplicity_name <- paste0("Distance-based Multiplicity $\\delta \\, M_{", sig, "}$")

label_map <- c(
  "forest" = "Forest (F)",
  "park" = "Park (P)",
  "energy plantation" = "Energy Plantation (EP)"
)

# Boxplot panel of one measure; both panels share every other argument of
# plot_taxa_boxplot().
taxa_panel <- function(value_col, value_name, taxa_name) {
  plot_taxa_boxplot(
    df = df,
    taxa_col = "Taxa",
    taxa_name = taxa_name,
    group_col = group_col,
    value_col = value_col,
    df_cld = df_cld,
    label_order = label_order,
    custom_colors = custom_colors,
    value_name = value_name,
    group_name = group_name,
    label_map = label_map
  )
}

p_div <- taxa_panel(diversity_col, diversity_name, taxa_name = NULL)
p_mult <- taxa_panel(multiplicity_col, multiplicity_name, taxa_name = "Genus")

# Set plot size
options(repr.plot.width = 14, repr.plot.height = 10)

# Remove facet strip labels on bottom plot
p_mult <- p_mult + theme(strip.text.x = element_blank())

# Diversity on top, multiplicity below, with one shared legend at the bottom
shared_theme <- theme(
  legend.position = "bottom",
  legend.text = element_text(size = 14),
  legend.title = element_text(size = 16),
  legend.key.size = unit(1.2, "cm"),
  plot.title = element_text(size = 20, hjust = 0.5)
)

combined <- (p_div / p_mult) +
  plot_layout(guides = "collect") +
  plot_annotation(
    title = "Diversity and Distance-based Multiplicity by Vegetation Type for selected Genus"
  ) &
  shared_theme

combined

ggsave(
  paste0("EST-LAT/results/plots/", tolower(taxa_level), "_diversity_multiplicity_selected_baltics.png"),
  plot = combined, width = 14, height = 10, dpi = 400, bg = "white"
)
