In [3]:
# importing all required packages & notebook extensions at the start of the notebook
import pandas as pd
import biom
from scipy.stats import spearmanr               
from statsmodels.stats.multitest import multipletests
from pathlib import Path

%matplotlib inline

In [None]:
Path("Data/aroma").mkdir(parents=True, exist_ok=True)

In [2]:
#needed variables
Data_raw='Data/raw'
Data_classified='Data/classified'
Data_aroma='Data/aroma'

<div style="background-color: skyblue; padding: 10px;">
    Titles
    </div>
<div style="background-color: aliceblue; padding: 10px;">
    Results

# Prepare Data
<div style="background-color: skyblue; padding: 10px;">


Export all necessary files, so that they can be read in as a data frame

In [3]:
!qiime tools export \
  --input-path $Data_classified/taxonomy.qza \
  --output-path $Data_aroma

  import pkg_resources
[32mExported Data/classified/taxonomy.qza as TSVTaxonomyDirectoryFormat to directory Data/aroma[0m
[0m[?25h

In [5]:
taxonomy = pd.read_csv("Data/aroma/taxonomy.tsv", sep="\t")

In [5]:
!qiime tools export \
  --input-path $Data_classified/table-filtered-sourdough_only.qza \
  --output-path $Data_aroma/table-filtered-sourdough_only

  import pkg_resources
[32mExported Data/classified/table-filtered-sourdough_only.qza as BIOMV210DirFmt to directory Data/aroma/table-filtered-sourdough_only[0m
[0m[?25h

filtered table needed to be transposed so that it matches with analysis

In [6]:
table = biom.load_table('Data/aroma/table-filtered-sourdough_only/feature-table.biom')

table_filtered_sourdough = table.to_dataframe()

table_filtered_sourdough = table_filtered_sourdough.T

Load metadata and check which are the columns with the aroma information and set index to sample ID so no issues with merging later on

In [7]:
metadata = pd.read_csv("Data/raw/merged_output_usable.tsv", sep="\t") #using merged_output_usable because the spaces already substitued with _
metadata = metadata.set_index('sample ID')
aroma_columns = metadata.columns[-222:]

print(aroma_columns)

#so last 222 columns are aroma associated

Index(['ALCOHOLIC_D7', 'ANIMAL_FEED_D7', 'ANIMAL_STABLE_D7', 'APPLE_D7',
       'BANANA_D7', 'BEER_D7', 'BERRIES_D7', 'BREAD_D7', 'BUTTER_MILK_D7',
       'BUTYRIC_ACID_D7',
       ...
       'animal_score_D28', 'chemical_score_D28', 'body_odour_score_D28',
       'fruity_score_D28', 'maillard_score_D28', 'sour_score_D28',
       'ocean_score_D28', 'earthy_score_D28', 'fermented_dairy_score_D28',
       'nutty_score_D28'],
      dtype='object', length=222)


take only sourdough data

In [8]:
metadata_sd = metadata[metadata['sample_type'] == 'sourdough'].copy()

Check if aromas have missing values or if there are sourdough samples with no aroma analysis at all (if this would be the case, the samples with no aroma analysis would be taken out)

In [9]:
missing_aromas = metadata_sd[aroma_columns].isnull().sum()
if missing_aromas.any():
    print(f"\n⚠ Warning: Missing aroma values detected:")
    print(missing_aromas[missing_aromas > 0])


PORRIDGE_D28                 21
WHOLE_GRAIN_D28              21
HAY_D28                      21
BREAD_D28                    21
CORN_D28                     21
                             ..
sour_score_D28               21
ocean_score_D28              21
earthy_score_D28             21
fermented_dairy_score_D28    21
nutty_score_D28              21
Length: 87, dtype: int64


In [10]:
metadata_sd[aroma_columns].isnull().sum(axis=1)

sample ID
366291_386-LP4-ITS-0386     0
366291_387-LP4-ITS-0387     0
366291_388-LP4-ITS-0388     0
366291_389-LP4-ITS-0389     0
366291_390-LP4-ITS-0390    87
                           ..
366292_105-LP4-ITS-0605     0
366292_106-LP4-ITS-0606     0
366292_107-LP4-ITS-0607     0
366292_108-LP4-ITS-0608    87
366292_110-LP4-ITS-0610     0
Length: 125, dtype: int64

In [11]:
print("Before:", metadata_sd.shape)
metadata_sd = metadata_sd.dropna(subset=aroma_columns, how='all')
print("After:", metadata_sd.shape)

Before: (125, 303)
After: (125, 303)


<div style="background-color: aliceblue; padding: 10px;">

so there are no samples that miss all aroma analyses

# Define functions
<div style="background-color: skyblue; padding: 10px;">


Define function to collaps on taxonomic level

In [12]:
def collapse_to_taxonomic_level(feature_table, taxonomy_df, level='Family'):
    """
    Collapse ASV table to specified taxonomic level
    
    Parameters:
    -----------
    feature_table : pd.DataFrame
        ASV abundance table (samples × features)
    taxonomy_df : pd.DataFrame
        Taxonomy table with 'Feature ID' and 'Taxon' columns
    level : str
        Taxonomic level ('Genus' or 'Family')
    
    Returns:
    --------
    pd.DataFrame : Collapsed abundance table
    """
    # Extract taxonomic level
    level_prefix = {'Genus': 'g__', 'Family': 'f__'}[level]
    
    taxonomy_df = taxonomy_df.copy()
    if 'Feature ID' in taxonomy_df.columns:
        taxonomy_df = taxonomy_df.set_index('Feature ID')
    
    taxonomy_df[level] = taxonomy_df['Taxon'].str.extract(f'{level_prefix}([^;]+)')
    taxonomy_df[level] = taxonomy_df[level].fillna('Unassigned')
    
    # Map features to taxonomy
    feature_to_taxon = taxonomy_df[level].to_dict()
    collapsed = feature_table.copy()
    collapsed.columns = [feature_to_taxon.get(col, 'Unknown') for col in collapsed.columns]
    
    # Sum by taxonomic group (suppress FutureWarning)
    collapsed = collapsed.T.groupby(level=0).sum().T
    
    print(f"  ✓ Collapsed to {level} level: {collapsed.shape[1]} taxa")
    
    # Show dominant taxa
    relative = collapsed.div(collapsed.sum(axis=1), axis=0)
    top5 = relative.mean().sort_values(ascending=False).head(5)
    print(f"\n  Top 5 {level.lower()}s by mean relative abundance:")
    for taxon, abund in top5.items():
        print(f"    {taxon}: {abund*100:.2f}%")
    
    return collapsed

Define function for correlation calculation

In [27]:
def calculate_correlations(data, fungal_columns, aroma_columns, min_samples=10):
    """Calculate Spearman correlations between fungi and aromas"""
    
    results = []
    
    for fungus in fungal_columns:
        for aroma in aroma_columns:
            # Remove samples where fungus is absent OR aroma is missing
            mask = (data[fungus] > 0) & (data[aroma].notna())
            n_samples = mask.sum()
            
            if n_samples < min_samples:
                continue
            
            x = data.loc[mask, fungus]
            y = data.loc[mask, aroma]
            
            # Skip if either variable is constant
            if x.nunique() < 2 or y.nunique() < 2:
                continue
            
            # Calculate Spearman correlation
            rho, p_value = spearmanr(x, y)
            
            results.append({
                'Fungus': fungus,
                'Aroma': aroma,
                'Spearman_rho': rho,
                'P_value': p_value,
                'N_samples': n_samples
            })
    
    results_df = pd.DataFrame(results)
    
    # Multiple testing correction (FDR)
    if len(results_df) > 0:
        results_df['FDR'] = multipletests(results_df['P_value'], method='fdr_bh')[1]
    
    return results_df


Define function for summarizing results

In [33]:
def summarize_results(
    cor_results: pd.DataFrame,
    rho_threshold: float = 0.1,
    fdr_threshold: float = 0.2,
    taxon_col: str = "Fungus",      # change to "Taxon" if that's your column name
    aroma_col: str = "Aroma",
    save_path: str | Path | None = "significant_correlations.csv",
    top_n: int = 10,
) -> pd.DataFrame:
    """
    Summarize correlation results with FDR and |rho| thresholds.
    
    Parameters
    ----------
    cor_results : pd.DataFrame
        DataFrame with at least: 'Spearman_rho', 'P_value', 'FDR', taxon_col, aroma_col, 'N_samples'
    rho_threshold : float
        Absolute correlation cutoff (|rho| > rho_threshold).
    fdr_threshold : float
        FDR cutoff (FDR < fdr_threshold).
    taxon_col : str
        Column name for the taxon (e.g. 'Fungus' or 'Taxon').
    aroma_col : str
        Column name for the aroma variable.
    save_path : str or Path or None
        If not None, path to save significant correlations as CSV.
    top_n : int
        Number of top associations to display.
    
    Returns
    -------
    pd.DataFrame
        DataFrame of significant correlations (possibly empty).
    """
    print("\n" + "="*72)
    print("CORRELATION ANALYSIS SUMMARY")
    print("="*72)

    # Handle empty input
    if cor_results is None or len(cor_results) == 0:
        print("\n⚠ cor_results is empty – no correlations to summarize.")
        return pd.DataFrame()

    # Basic count
    print(f"\nTotal correlations tested: {len(cor_results):,}")

    # Check required columns
    required = {"Spearman_rho", "P_value", "FDR", taxon_col, aroma_col}
    missing = required - set(cor_results.columns)
    if missing:
        print(f"\n⚠ Missing required columns: {missing}")
        print("   Available columns:", list(cor_results.columns))
        return pd.DataFrame()

    # Distribution of rho / p / FDR
    print("\nDistribution of correlation coefficients (Spearman_rho):")
    print(cor_results["Spearman_rho"].describe().to_string())

    print("\nDistribution of p-values:")
    print(cor_results["P_value"].describe().to_string())

    print("\nDistribution of FDR:")
    print(cor_results["FDR"].describe().to_string())

    # Select significant correlations
    sig_cors = cor_results[
        (cor_results["FDR"] < fdr_threshold) &
        (cor_results["Spearman_rho"].abs() > rho_threshold)
    ].copy()

    sig_cors = sig_cors.sort_values("Spearman_rho", key=abs, ascending=False)

    print("\n" + "-"*72)
    print(f"Significant correlations (FDR < {fdr_threshold}, |ρ| > {rho_threshold}): {len(sig_cors)}")
    print("-"*72)

    if len(sig_cors) > 0:
        print(f"\nTop {min(top_n, len(sig_cors))} strongest significant correlations:")
        display_cols = [taxon_col, aroma_col, "Spearman_rho", "FDR", "N_samples"]
        display_cols = [c for c in display_cols if c in sig_cors.columns]
        print(sig_cors.head(top_n)[display_cols].to_string(index=False))

        # Save if requested
        if save_path is not None:
            save_path = Path(save_path)
            sig_cors.to_csv(save_path, index=False)
            print(f"\n✓ Saved significant correlations to: {save_path}")
    else:
        print("\n⚠ No associations pass significance thresholds after FDR correction.")
        print("  Consider lowering thresholds or checking data quality.")

    # Show top correlations overall (regardless of significance)
    print("\nTop correlations by |Spearman_rho| (all results):")
    top_all = cor_results.sort_values("Spearman_rho", key=abs, ascending=False)
    display_cols = [taxon_col, aroma_col, "Spearman_rho", "P_value", "FDR", "N_samples"]
    display_cols = [c for c in display_cols if c in top_all.columns]
    print(top_all.head(top_n)[display_cols].to_string(index=False))

    return sig_cors


# Collapsing & merging table with metadata
<div style="background-color: skyblue; padding: 10px;">

Collapsing feature table to Family & Genus level, to reduce number of statistical tests

In [15]:
family_table = collapse_to_taxonomic_level(table_filtered_sourdough, taxonomy, level='Family')
family_rel = family_table.div(family_table.sum(axis=1), axis=0)

  ✓ Collapsed to Family level: 128 taxa

  Top 5 familys by mean relative abundance:
    Saccharomycetaceae: 70.61%
    Pleosporaceae: 17.10%
    Didymellaceae: 5.56%
    Pichiaceae: 1.39%
    Saccotheciaceae: 1.25%


In [16]:
genus_table = collapse_to_taxonomic_level(table_filtered_sourdough, taxonomy, level='Genus')
genus_rel = genus_table.div(genus_table.sum(axis=1), axis=0)

  ✓ Collapsed to Genus level: 238 taxa

  Top 5 genuss by mean relative abundance:
    Saccharomyces: 70.61%
    Alternaria: 14.70%
    Unassigned: 5.44%
    Pyrenophora: 1.77%
    Pichia: 1.39%


Merging collapsed feature tables with metadata

In [17]:
merged_family = family_rel.join(metadata_sd, how='inner')
merged_genus = genus_rel.join(metadata_sd, how='inner')
    
family_columns = family_rel.columns.tolist()
genus_columns = genus_rel.columns.tolist()

# Spearman correlations
<div style="background-color: skyblue; padding: 10px;">


Family level

In [34]:
cor_results_family = calculate_correlations(merged_family, family_columns, aroma_columns)

print(f"✓ Calculated {len(cor_results_family)} correlations")

✓ Calculated 7073 correlations


In [35]:
summarize_results(cor_results_family)


CORRELATION ANALYSIS SUMMARY

Total correlations tested: 7,073

Distribution of correlation coefficients (Spearman_rho):
count    7073.000000
mean       -0.004160
std         0.178902
min        -0.712525
25%        -0.097590
50%         0.000000
75%         0.090189
max         0.720158

Distribution of p-values:
count    7073.000000
mean        0.495997
std         0.291627
min         0.000328
25%         0.241590
50%         0.477006
75%         0.758125
max         1.000000

Distribution of FDR:
count    7073.000000
mean        0.966678
std         0.025243
min         0.943803
25%         0.943803
50%         0.953877
75%         1.000000
max         1.000000

------------------------------------------------------------------------
Significant correlations (FDR < 0.2, |ρ| > 0.1): 0
------------------------------------------------------------------------

⚠ No associations pass significance thresholds after FDR correction.
  Consider lowering thresholds or checking data quality.


Unnamed: 0,Fungus,Aroma,Spearman_rho,P_value,N_samples,FDR


In [1]:
top_families = family_rel.mean().sort_values(ascending=False).head(10)
print("\nTop 10 family:")
for family, abundance in top_families.items():
    print(f"  {family}: {abundance*100:.2f}%")

NameError: name 'family_rel' is not defined

Genus level

In [36]:
cor_results_genus = calculate_correlations(merged_family, family_columns, aroma_columns)

print(f"✓ Calculated {len(cor_results_genus)} correlations")

✓ Calculated 7073 correlations


In [37]:
summarize_results(cor_results_genus)


CORRELATION ANALYSIS SUMMARY

Total correlations tested: 7,073

Distribution of correlation coefficients (Spearman_rho):
count    7073.000000
mean       -0.004160
std         0.178902
min        -0.712525
25%        -0.097590
50%         0.000000
75%         0.090189
max         0.720158

Distribution of p-values:
count    7073.000000
mean        0.495997
std         0.291627
min         0.000328
25%         0.241590
50%         0.477006
75%         0.758125
max         1.000000

Distribution of FDR:
count    7073.000000
mean        0.966678
std         0.025243
min         0.943803
25%         0.943803
50%         0.953877
75%         1.000000
max         1.000000

------------------------------------------------------------------------
Significant correlations (FDR < 0.2, |ρ| > 0.1): 0
------------------------------------------------------------------------

⚠ No associations pass significance thresholds after FDR correction.
  Consider lowering thresholds or checking data quality.


Unnamed: 0,Fungus,Aroma,Spearman_rho,P_value,N_samples,FDR


In [41]:
top_genera = genus_rel.mean().sort_values(ascending=False).head(10)
print("\nTop 10 genera:")
for genus, abundance in top_genera.items():
    print(f"  {genus}: {abundance*100:.2f}%")


Top 10 genera:
  Saccharomyces: 70.61%
  Alternaria: 14.70%
  Unassigned: 5.44%
  Pyrenophora: 1.77%
  Pichia: 1.39%
  Aureobasidium: 1.22%
  Cladosporium: 1.08%
  Stemphylium: 0.44%
  Parastagonospora: 0.40%
  Sporobolomyces: 0.29%


<div style="background-color: aliceblue; padding: 10px;">
even at loose settings for rho (>0.1) and FRD (<0.2) no significant correlations

# MixOmics
<div style="background-color: skyblue; padding: 10px;">

Turn data frames into csv files, so that they can be read into R studio

In [40]:
family_rel.to_csv("Data/aroma/Analysis/X_fungi_family_rel.csv")
genus_rel.to_csv("Data/aroma/Analysis/X_fungi_genus_rel.csv")
metadata_sd[aroma_columns].to_csv("Data/aroma/Analysis/Y_aromas.csv")        
metadata_sd[['background']].to_csv("Data/aroma/Analysis/meta_background.csv")

This code was run on RStudio to create the circos plot

## --- 0. Install & load packages ---------------------------------------------

# install.packages("BiocManager")
# BiocManager::install("mixOmics")
# installing packages only once

library(mixOmics)

## --- 1. Load data -----------------------------------------------------------

## Read in fungi, aroma and metadata tables
setwd('C:/Users/nschw/Downloads/')

X_genus <- read.csv("X_fungi_genus_rel.csv",
                    row.names = 1, check.names = FALSE)
X_family <- read.csv("X_fungi_family_rel.csv",
                     row.names = 1, check.names = FALSE)
Y       <- read.csv("Y_aromas.csv",
                    row.names = 1, check.names = FALSE)
meta_bg <- read.csv("meta_background.csv",
                    row.names = 1, check.names = FALSE)

##first analysis only for genus
## Find common sample IDs (intersection of row names)
common <- Reduce(intersect, list(rownames(X_genus), rownames(Y), rownames(meta_bg)))
length(common)   # just to see how many common samples you have


## --- 2. Subset & align samples ----------------------------------------------

## Subset all three tables to the common samples
## (this also aligns the rows in the same order)
X_genus <- X_genus[common, , drop = FALSE]
Y       <- Y[common, , drop = FALSE]
meta_bg <- meta_bg[common, , drop = FALSE]

## Keep only rows with complete data in X, Y and background variable
keep <- complete.cases(X_genus, Y, meta_bg$background)
X_genus <- X_genus[keep, , drop = FALSE]
Y       <- Y[keep, , drop = FALSE]
meta_bg <- meta_bg[keep, , drop = FALSE]


## --- 3. Remove zero-variance variables --------------------------------------

## Fungi block: remove columns with zero standard deviation
sd_X_genus      <- apply(X_genus, 2, sd, na.rm = TRUE)
zero_var_X_genus <- names(sd_X_genus[sd_X_genus == 0])
if (length(zero_var_X_genus) > 0) {
  X_genus <- X_genus[, !(colnames(X_genus) %in% zero_var_X_genus), drop = FALSE]
}

## Aroma block: remove columns with zero standard deviation
sd_Y      <- apply(Y, 2, sd, na.rm = TRUE)
zero_var_Y <- names(sd_Y[sd_Y == 0])
if (length(zero_var_Y) > 0) {
  Y <- Y[, !(colnames(Y) %in% zero_var_Y), drop = FALSE]
}

## Sanity check: should both be FALSE
any(apply(X_genus, 2, sd) == 0)
any(apply(Y, 2, sd) == 0)

## Check final dimensions
dim(X_genus); dim(Y)


## --- 4. Prepare data for block.splsda ---------------------------------------

## Put blocks into a named list (one element per block)
data_list <- list(
  fungi = X_genus,
  aroma = Y
)

## Outcome: background as a factor
Y_class <- as.factor(meta_bg$background)
Y_class <- droplevels(Y_class)       # drop unused levels (just in case)
levels(Y_class)
table(Y_class)                        # check sample sizes per class


## --- 5. Design matrix between blocks ----------------------------------------

## 2 x 2 design matrix for 2 blocks (fungi, aroma)
design <- matrix(c(0, 1,
                   1, 0),
                 ncol = 2, byrow = TRUE)
diag(design) <- 0                     # ensure no self-links on diagonal
colnames(design) <- rownames(design) <- c("fungi", "aroma")
design


## --- 6. Model specification --------------------------------------------------

ncomp <- 3  # number of components; three because usually done like this and a lot of correlation until 3rd component

## Number of variables to keep per component and per block
keepX_genus_fungi <- rep(min(10, ncol(X_genus)), ncomp)
keepX_genus_aroma <- rep(min(10, ncol(Y)), ncomp)

keepX_genus_list <- list(
  fungi = keepX_genus_fungi,
  aroma = keepX_genus_aroma
)
keepX_genus_list

set.seed(300)  # for reproducibility of the sparse selection

## Fit block sPLS-DA model
res_block_class <- block.splsda(
  X      = data_list,          # <- argument name is X
  Y      = Y_class,
  ncomp  = ncomp,
  keepX  = keepX_genus_list,   # <- argument name is keepX
  design = design
)


## --- 7. Circos plots ---------------------------------------------------------

graphics.off()
par(mfrow = c(1, 1))   # single plot per device

## Circos plot for component 1
circosPlot(
  res_block_class,
  comp           = 1,
  cutoff         = 0.3,           # show only |correlation| ≥ 0.3
  line           = TRUE,
  color.blocks   = c("darkgreen", "darkorange"),
  title          = "Fungi–Aroma associations (Background as class) – Comp 1",
  size.variables = 1,
  size.blocks    = 2
)

## Circos plot for component 2
circosPlot(
  res_block_class,
  comp           = 2,
  cutoff         = 0.3,
  line           = TRUE,
  color.blocks   = c("darkgreen", "darkorange"),
  title          = "Fungi–Aroma associations (Background as class) – Comp 2",
  size.variables = 1,
  size.blocks    = 2
)

## Circos plot for component 3
circosPlot(
  res_block_class,
  comp           = 3,
  cutoff         = 0.3,
  line           = TRUE,
  color.blocks   = c("darkgreen", "darkorange"),
  title          = "Fungi–Aroma associations (Background as class) – Comp 3",
  size.variables = 1,
  size.blocks    = 2
)


##everything again for family
## --- 1. Load data -----------------------------------------------------------

## Find common sample IDs (intersection of row names)
common <- Reduce(intersect, list(rownames(X_family), rownames(Y), rownames(meta_bg)))
length(common)   # just to see how many common samples you have


## --- 2. Subset & align samples ----------------------------------------------

## Subset all three tables to the common samples
## (this also aligns the rows in the same order)
X_family <- X_family[common, , drop = FALSE]
Y       <- Y[common, , drop = FALSE]
meta_bg <- meta_bg[common, , drop = FALSE]

## Keep only rows with complete data in X, Y and background variable
keep <- complete.cases(X_family, Y, meta_bg$background)
X_family <- X_family[keep, , drop = FALSE]
Y       <- Y[keep, , drop = FALSE]
meta_bg <- meta_bg[keep, , drop = FALSE]


## --- 3. Remove zero-variance variables --------------------------------------

## Fungi block: remove columns with zero standard deviation
sd_X_family      <- apply(X_family, 2, sd, na.rm = TRUE)
zero_var_X_family <- names(sd_X_family[sd_X_family == 0])
if (length(zero_var_X_family) > 0) {
  X_family <- X_family[, !(colnames(X_family) %in% zero_var_X_family), drop = FALSE]
}

## Aroma block: remove columns with zero standard deviation
sd_Y      <- apply(Y, 2, sd, na.rm = TRUE)
zero_var_Y <- names(sd_Y[sd_Y == 0])
if (length(zero_var_Y) > 0) {
  Y <- Y[, !(colnames(Y) %in% zero_var_Y), drop = FALSE]
}

## Sanity check: should both be FALSE
any(apply(X_family, 2, sd) == 0)
any(apply(Y, 2, sd) == 0)

## Check final dimensions
dim(X_family); dim(Y)


## --- 4. Prepare data for block.splsda ---------------------------------------

## Put blocks into a named list (one element per block)
data_list <- list(
  fungi = X_family,
  aroma = Y
)

## Outcome: background as a factor
Y_class <- as.factor(meta_bg$background)
Y_class <- droplevels(Y_class)       # drop unused levels (just in case)
levels(Y_class)
table(Y_class)                        # check sample sizes per class


## --- 5. Design matrix between blocks ----------------------------------------

## 2 x 2 design matrix for 2 blocks (fungi, aroma)
design <- matrix(c(0, 1,
                   1, 0),
                 ncol = 2, byrow = TRUE)
diag(design) <- 0                     # ensure no self-links on diagonal
colnames(design) <- rownames(design) <- c("fungi", "aroma")
design


## --- 6. Model specification --------------------------------------------------

ncomp <- 2  # number of components; third component was very sparse in circosPlot

## Number of variables to keep per component and per block
keepX_family_fungi <- rep(min(10, ncol(X_family)), ncomp)
keepX_family_aroma <- rep(min(10, ncol(Y)), ncomp)

keepX_family_list <- list(
  fungi = keepX_family_fungi,
  aroma = keepX_family_aroma
)
keepX_family_list

set.seed(300)  # for reproducibility of the sparse selection

## Fit block sPLS-DA model
res_block_family <- block.splsda(
  X      = data_list,          # <- argument name is X
  Y      = Y_class,
  ncomp  = ncomp,
  keepX  = keepX_family_list,   # <- argument name is keepX
  design = design
)


## --- 7. Circos plots ---------------------------------------------------------

## Circos plot for component 1
circosPlot(
  res_block_family,
  comp           = 1,
  cutoff         = 0.3,           # show only |correlation| ≥ 0.3
  line           = TRUE,
  color.blocks   = c("darkgreen", "darkorange"),
  title          = "Fungi–Aroma associations (Background as class) – Comp 1",
  size.variables = 1,
  size.blocks    = 2
)

## Circos plot for component 2
circosPlot(
  res_block_family,
  comp           = 2,
  cutoff         = 0.3,
  line           = TRUE,
  color.blocks   = c("darkgreen", "darkorange"),
  title          = "Fungi–Aroma associations (Background as class) – Comp 2",
  size.variables = 1,
  size.blocks    = 2
)
