In [1]:
# importing all required packages & notebook extensions at the start of the notebook
import pandas as pd
import biom
from scipy.stats import spearmanr               
from statsmodels.stats.multitest import multipletests
from pathlib import Path

%matplotlib inline

In [None]:
Path("Data/aroma").mkdir(parents=True, exist_ok=True)

In [2]:
#needed variables
Data_raw='Data/raw'
Data_classified='Data/classified'
Data_aroma='Data/aroma'

<div style="background-color: skyblue; padding: 10px;">
    Titles
    </div>
<div style="background-color: aliceblue; padding: 10px;">
    Results

# Prepare Data
<div style="background-color: skyblue; padding: 10px;">


Export all necessary files, so that they can be read in as a data frame

In [3]:
!qiime tools export \
  --input-path $Data_classified/taxonomy.qza \
  --output-path $Data_aroma

  import pkg_resources
[32mExported Data/classified/taxonomy.qza as TSVTaxonomyDirectoryFormat to directory Data/aroma[0m
[0m[?25h

In [11]:
taxonomy = pd.read_csv("Data/aroma/taxonomy.tsv", sep="\t")

In [5]:
!qiime tools export \
  --input-path $Data_classified/table-filtered-sourdough_only.qza \
  --output-path $Data_aroma/table-filtered-sourdough_only

  import pkg_resources
[32mExported Data/classified/table-filtered-sourdough_only.qza as BIOMV210DirFmt to directory Data/aroma/table-filtered-sourdough_only[0m
[0m[?25h

filtered table needed to be transposed so that it matches with analysis

In [12]:
table = biom.load_table('Data/aroma/table-filtered-sourdough_only/feature-table.biom')

table_filtered_sourdough = table.to_dataframe()

table_filtered_sourdough = table_filtered_sourdough.T

Load metadata and check which are the columns with the aroma information and set index to sample ID so no issues with merging later on

In [42]:
metadata = pd.read_csv("Data/raw/merged_output_usable.tsv", sep="\t") #using merged_output_usable because the spaces already substitued with _
metadata = metadata.set_index('sample ID')
aroma_columns = metadata.columns[-222:]

print(aroma_columns)

#so last 222 columns are aroma associated

Index(['ALCOHOLIC_D7', 'ANIMAL_FEED_D7', 'ANIMAL_STABLE_D7', 'APPLE_D7',
       'BANANA_D7', 'BEER_D7', 'BERRIES_D7', 'BREAD_D7', 'BUTTER_MILK_D7',
       'BUTYRIC_ACID_D7',
       ...
       'animal_score_D28', 'chemical_score_D28', 'body_odour_score_D28',
       'fruity_score_D28', 'maillard_score_D28', 'sour_score_D28',
       'ocean_score_D28', 'earthy_score_D28', 'fermented_dairy_score_D28',
       'nutty_score_D28'],
      dtype='object', length=222)


take only sourdough data

In [43]:
metadata_sd = metadata[metadata['sample_type'] == 'sourdough'].copy()

Check if aromas have missing values or if there are sourdough samples with no aroma analysis at all (if this would be the case, the samples with no aroma analysis would be taken out)

In [13]:
missing_aromas = metadata_sd[aroma_columns].isnull().sum()
if missing_aromas.any():
    print(f"\n⚠ Warning: Missing aroma values detected:")
    print(missing_aromas[missing_aromas > 0])


PORRIDGE_D28                 21
WHOLE_GRAIN_D28              21
HAY_D28                      21
BREAD_D28                    21
CORN_D28                     21
                             ..
sour_score_D28               21
ocean_score_D28              21
earthy_score_D28             21
fermented_dairy_score_D28    21
nutty_score_D28              21
Length: 87, dtype: int64


In [14]:
metadata_sd[aroma_columns].isnull().sum(axis=1)

0       0
1       0
2       0
3       0
4      87
       ..
129     0
130     0
131     0
132    87
133     0
Length: 125, dtype: int64

In [15]:
print("Before:", metadata_sd.shape)
metadata_sd = metadata_sd.dropna(subset=aroma_columns, how='all')
print("After:", metadata_sd.shape)

Before: (125, 304)
After: (125, 304)


<div style="background-color: aliceblue; padding: 10px;">

so there are no samples that miss all aroma analyses

# Define functions

Define function to collaps on taxonomic level

In [8]:
def collapse_to_taxonomic_level(feature_table, taxonomy_df, level='Family'):
    """
    Collapse ASV table to specified taxonomic level
    
    Parameters:
    -----------
    feature_table : pd.DataFrame
        ASV abundance table (samples × features)
    taxonomy_df : pd.DataFrame
        Taxonomy table with 'Feature ID' and 'Taxon' columns
    level : str
        Taxonomic level ('Genus' or 'Family')
    
    Returns:
    --------
    pd.DataFrame : Collapsed abundance table
    """
    # Extract taxonomic level
    level_prefix = {'Genus': 'g__', 'Family': 'f__'}[level]
    
    taxonomy_df = taxonomy_df.copy()
    if 'Feature ID' in taxonomy_df.columns:
        taxonomy_df = taxonomy_df.set_index('Feature ID')
    
    taxonomy_df[level] = taxonomy_df['Taxon'].str.extract(f'{level_prefix}([^;]+)')
    taxonomy_df[level] = taxonomy_df[level].fillna('Unassigned')
    
    # Map features to taxonomy
    feature_to_taxon = taxonomy_df[level].to_dict()
    collapsed = feature_table.copy()
    collapsed.columns = [feature_to_taxon.get(col, 'Unknown') for col in collapsed.columns]
    
    # Sum by taxonomic group (suppress FutureWarning)
    collapsed = collapsed.T.groupby(level=0).sum().T
    
    print(f"  ✓ Collapsed to {level} level: {collapsed.shape[1]} taxa")
    
    # Show dominant taxa
    relative = collapsed.div(collapsed.sum(axis=1), axis=0)
    top5 = relative.mean().sort_values(ascending=False).head(5)
    print(f"\n  Top 5 {level.lower()}s by mean relative abundance:")
    for taxon, abund in top5.items():
        print(f"    {taxon}: {abund*100:.2f}%")
    
    return collapsed

Define function for correlation calculation

In [47]:
def calculate_correlations_optimized(data, taxa_columns, aroma_columns, 
                                     min_samples=10, min_presence=0.1):
    """
    Calculate Spearman correlations with data quality filters
    
    Parameters:
    -----------
    data : pd.DataFrame
        Merged dataset with taxa and aroma data
    taxa_columns : list
        Column names for taxonomic abundances
    aroma_columns : list
        Column names for aroma attributes
    min_samples : int
        Minimum samples required for correlation
    min_presence : float
        Minimum prevalence (fraction of samples) for taxa inclusion
    
    Returns:
    --------
    pd.DataFrame : Correlation results with FDR correction
    """
    results = []
    
    # Filter low-prevalence taxa upfront
    prevalence = (data[taxa_columns] > 0).sum() / len(data)
    taxa_filtered = [col for col in taxa_columns if prevalence[col] >= min_presence]
    print(f"  Analyzing {len(taxa_filtered)}/{len(taxa_columns)} taxa (≥{min_presence*100}% prevalence)")
    
    for taxon in taxa_filtered:
        for aroma in aroma_columns:
            # Create mask: taxon present AND aroma measured
            mask = (data[taxon] > 0) & (data[aroma].notna())
            n = mask.sum()
            
            if n < min_samples:
                continue
            
            x = data.loc[mask, taxon]
            y = data.loc[mask, aroma]
            
            # Skip if insufficient variation
            if x.nunique() < 2 or y.nunique() < 2:
                continue
            
            # Calculate correlation
            rho, p_val = spearmanr(x, y)
            
            results.append({
                'Taxon': taxon,
                'Aroma': aroma,
                'Spearman_rho': rho,
                'P_value': p_val,
                'N_samples': n
            })
    
    results_df = pd.DataFrame(results)
    
    # FDR correction
    if len(results_df) > 0:
        results_df['FDR'] = multipletests(results_df['P_value'], method='fdr_bh')[1]
        results_df = results_df.sort_values('Spearman_rho', key=abs, ascending=False)
    
    print(f"  ✓ Calculated {len(results_df):,} correlations")
    
    return results_df



In [None]:
def calculate_correlations(data, fungal_columns, aroma_columns, min_samples=10):
    """Calculate Spearman correlations between fungi and aromas"""
    
    results = []
    
    for fungus in fungal_columns:
        for aroma in aroma_columns:
            # Remove samples where fungus is absent OR aroma is missing
            mask = (data[fungus] > 0) & (data[aroma].notna())
            n_samples = mask.sum()
            
            if n_samples < min_samples:
                continue
            
            x = data.loc[mask, fungus]
            y = data.loc[mask, aroma]
            
            # Skip if either variable is constant
            if x.nunique() < 2 or y.nunique() < 2:
                continue
            
            # Calculate Spearman correlation
            rho, p_value = spearmanr(x, y)
            
            results.append({
                'Fungus': fungus,
                'Aroma': aroma,
                'Spearman_rho': rho,
                'P_value': p_value,
                'N_samples': n_samples
            })
    
    results_df = pd.DataFrame(results)
    
    # Multiple testing correction (FDR)
    if len(results_df) > 0:
        results_df['FDR'] = multipletests(results_df['P_value'], method='fdr_bh')[1]
    
    return results_df


Define function for summarizing results

In [5]:
def summarize_results(cor_results, fdr_threshold=0.1, rho_threshold=0.3):
    """Print summary statistics of correlation analysis"""
    print("\n" + "="*70)
    print("CORRELATION ANALYSIS SUMMARY")
    print("="*70)
    
    print(f"\nTotal correlations tested: {len(cor_results):,}")
    
    # Descriptive statistics
    print("\nDistribution of correlation coefficients:")
    print(cor_results['Spearman_rho'].describe().to_string())
    
    sig = cor_results[
        (cor_results['FDR'] < fdr_threshold) & 
        (cor_results['Spearman_rho'].abs() > rho_threshold)
    ]
    
    print(f"\n{'='*70}")
    print(f"Significant associations (FDR<{fdr_threshold}, |ρ|>{rho_threshold}): {len(sig)}")
    print(f"{'='*70}")
    
    if len(sig) > 0:
        print(f"\nTop 10 strongest associations:")
        display_cols = ['Taxon', 'Aroma', 'Spearman_rho', 'FDR', 'N_samples']
        print(sig.head(10)[display_cols].to_string(index=False))
    else:
        print("\n⚠ No associations pass significance thresholds after FDR correction")
        print(f"\nStrongest correlations (top 10, uncorrected p-values):")
        display_cols = ['Taxon', 'Aroma', 'Spearman_rho', 'P_value', 'N_samples']
        print(cor_results.head(10)[display_cols].to_string(index=False))


def export_for_mixomics(family_rel, metadata_sd, aroma_columns):
    """Export data for mixOmics analysis in R"""
    
    output_dir = Path(DATA_AROMA) / "Analysis"
    output_dir.mkdir(parents=True, exist_ok=True)
    
    print("\n" + "="*70)
    print("EXPORTING FOR MIXOMICS (R)")
    print("="*70)
    
    # Export fungi relative abundances
    family_rel.to_csv(output_dir / "X_fungi_family_rel.csv")
    print(f"  ✓ X_fungi_family_rel.csv ({family_rel.shape})")
    
    # Export aroma data
    metadata_sd[aroma_columns].to_csv(output_dir / "Y_aromas.csv")
    print(f"  ✓ Y_aromas.csv ({len(aroma_columns)} attributes)")
    
    # Export background metadata
    metadata_sd[['background']].to_csv(output_dir / "meta_background.csv")
    print(f"  ✓ meta_background.csv")
    
    print(f"\n✓ All files exported to: {output_dir}")

In [58]:
def summarize_results(
    cor_results: pd.DataFrame,
    rho_threshold: float = 0.3,
    fdr_threshold: float = 0.1,
    taxon_col: str = "Fungus",      # change to "Taxon" if that's your column name
    aroma_col: str = "Aroma",
    save_path: str | Path | None = "significant_correlations.csv",
    top_n: int = 10,
) -> pd.DataFrame:
    """
    Summarize correlation results with FDR and |rho| thresholds.
    
    Parameters
    ----------
    cor_results : pd.DataFrame
        DataFrame with at least: 'Spearman_rho', 'P_value', 'FDR', taxon_col, aroma_col, 'N_samples'
    rho_threshold : float
        Absolute correlation cutoff (|rho| > rho_threshold).
    fdr_threshold : float
        FDR cutoff (FDR < fdr_threshold).
    taxon_col : str
        Column name for the taxon (e.g. 'Fungus' or 'Taxon').
    aroma_col : str
        Column name for the aroma variable.
    save_path : str or Path or None
        If not None, path to save significant correlations as CSV.
    top_n : int
        Number of top associations to display.
    
    Returns
    -------
    pd.DataFrame
        DataFrame of significant correlations (possibly empty).
    """
    print("\n" + "="*72)
    print("CORRELATION ANALYSIS SUMMARY")
    print("="*72)

    # Handle empty input
    if cor_results is None or len(cor_results) == 0:
        print("\n⚠ cor_results is empty – no correlations to summarize.")
        return pd.DataFrame()

    # Basic count
    print(f"\nTotal correlations tested: {len(cor_results):,}")

    # Check required columns
    required = {"Spearman_rho", "P_value", "FDR", taxon_col, aroma_col}
    missing = required - set(cor_results.columns)
    if missing:
        print(f"\n⚠ Missing required columns: {missing}")
        print("   Available columns:", list(cor_results.columns))
        return pd.DataFrame()

    # Distribution of rho / p / FDR
    print("\nDistribution of correlation coefficients (Spearman_rho):")
    print(cor_results["Spearman_rho"].describe().to_string())

    print("\nDistribution of p-values:")
    print(cor_results["P_value"].describe().to_string())

    print("\nDistribution of FDR:")
    print(cor_results["FDR"].describe().to_string())

    # Select significant correlations
    sig_cors = cor_results[
        (cor_results["FDR"] < fdr_threshold) &
        (cor_results["Spearman_rho"].abs() > rho_threshold)
    ].copy()

    sig_cors = sig_cors.sort_values("Spearman_rho", key=abs, ascending=False)

    print("\n" + "-"*72)
    print(f"Significant correlations (FDR < {fdr_threshold}, |ρ| > {rho_threshold}): {len(sig_cors)}")
    print("-"*72)

    if len(sig_cors) > 0:
        print(f"\nTop {min(top_n, len(sig_cors))} strongest significant correlations:")
        display_cols = [taxon_col, aroma_col, "Spearman_rho", "FDR", "N_samples"]
        display_cols = [c for c in display_cols if c in sig_cors.columns]
        print(sig_cors.head(top_n)[display_cols].to_string(index=False))

        # Save if requested
        if save_path is not None:
            save_path = Path(save_path)
            sig_cors.to_csv(save_path, index=False)
            print(f"\n✓ Saved significant correlations to: {save_path}")
    else:
        print("\n⚠ No associations pass significance thresholds after FDR correction.")
        print("  Consider lowering thresholds or checking data quality.")

    # Show top correlations overall (regardless of significance)
    print("\nTop correlations by |Spearman_rho| (all results):")
    top_all = cor_results.sort_values("Spearman_rho", key=abs, ascending=False)
    display_cols = [taxon_col, aroma_col, "Spearman_rho", "P_value", "FDR", "N_samples"]
    display_cols = [c for c in display_cols if c in top_all.columns]
    print(top_all.head(top_n)[display_cols].to_string(index=False))

    return sig_cors


# main Workflow

Collapsing feature table to Family & Genus level

In [14]:
family_table = collapse_to_taxonomic_level(table_filtered_sourdough, taxonomy, level='Family')
family_rel = family_table.div(family_table.sum(axis=1), axis=0)

  ✓ Collapsed to Family level: 128 taxa

  Top 5 familys by mean relative abundance:
    Saccharomycetaceae: 70.61%
    Pleosporaceae: 17.10%
    Didymellaceae: 5.56%
    Pichiaceae: 1.39%
    Saccotheciaceae: 1.25%


In [15]:
genus_table = collapse_to_taxonomic_level(table_filtered_sourdough, taxonomy, level='Genus')
genus_rel = genus_table.div(genus_table.sum(axis=1), axis=0)

  ✓ Collapsed to Genus level: 238 taxa

  Top 5 genuss by mean relative abundance:
    Saccharomyces: 70.61%
    Alternaria: 14.70%
    Unassigned: 5.44%
    Pyrenophora: 1.77%
    Pichia: 1.39%


Merging collapsed feature tables with metadata

In [50]:
merged_family = family_rel.join(metadata_sd, how='inner')
merged_genus = genus_rel.join(metadata_sd, how='inner')
    
family_columns = family_rel.columns.tolist()
genus_columns = genus_rel.columns.tolist()

# Spearman correlations

Family level

In [51]:
cor_results_family = calculate_correlations_optimized(
        merged_family, 
        family_columns, 
        aroma_columns,
        min_samples=10,
        min_presence=0.1
    )
    
summarize_results(cor_results_family)

  Analyzing 0/128 taxa (≥10.0% prevalence)
  ✓ Calculated 0 correlations

CORRELATION ANALYSIS SUMMARY

Total correlations tested: 0

Distribution of correlation coefficients:


KeyError: 'Spearman_rho'

In [53]:
cor_results_family = calculate_correlations(merged_family, family_columns, aroma_columns)

print(f"✓ Calculated {len(cor_results_family)} correlations")

✓ Calculated 7073 correlations


In [54]:
summarize_results(cor_results_family)


CORRELATION ANALYSIS SUMMARY

Total correlations tested: 7,073

Distribution of correlation coefficients:
count    7073.000000
mean       -0.004160
std         0.178902
min        -0.712525
25%        -0.097590
50%         0.000000
75%         0.090189
max         0.720158

Significant associations (FDR<0.1, |ρ|>0.3): 0

⚠ No associations pass significance thresholds after FDR correction

Strongest correlations (top 10, uncorrected p-values):


KeyError: "['Taxon'] not in index"

Genus level

In [55]:
cor_results_genus = calculate_correlations(merged_family, family_columns, aroma_columns)

print(f"✓ Calculated {len(cor_results_genus)} correlations")

✓ Calculated 7073 correlations


In [56]:
summarize_results(cor_results_genus)


CORRELATION ANALYSIS SUMMARY

Total correlations tested: 7,073

Distribution of correlation coefficients:
count    7073.000000
mean       -0.004160
std         0.178902
min        -0.712525
25%        -0.097590
50%         0.000000
75%         0.090189
max         0.720158

Significant associations (FDR<0.1, |ρ|>0.3): 0

⚠ No associations pass significance thresholds after FDR correction

Strongest correlations (top 10, uncorrected p-values):


KeyError: "['Taxon'] not in index"

In [60]:
sig_cors = summarize_results(
    cor_results,
    rho_threshold=0.3,
    fdr_threshold=0.1,
    taxon_col="Fungus",           # or "Taxon" depending on your column name
    aroma_col="Aroma",
    save_path="Data/aroma/significant_correlations.csv",
    top_n=10,
)


CORRELATION ANALYSIS SUMMARY

Total correlations tested: 7,073

Distribution of correlation coefficients (Spearman_rho):
count    7073.000000
mean       -0.004160
std         0.178902
min        -0.712525
25%        -0.097590
50%         0.000000
75%         0.090189
max         0.720158

Distribution of p-values:
count    7073.000000
mean        0.495997
std         0.291627
min         0.000328
25%         0.241590
50%         0.477006
75%         0.758125
max         1.000000

Distribution of FDR:
count    7073.000000
mean        0.966678
std         0.025243
min         0.943803
25%         0.943803
50%         0.953877
75%         1.000000
max         1.000000

------------------------------------------------------------------------
Significant correlations (FDR < 0.1, |ρ| > 0.3): 0
------------------------------------------------------------------------

⚠ No associations pass significance thresholds after FDR correction.
  Consider lowering thresholds or checking data quality.


# Spearman correlation on genus level
<div style="background-color: skyblue; padding: 10px;">


In [16]:
# Extract genus from taxonomy
taxonomy['Genus'] = taxonomy['Taxon'].str.extract(r'g__([^;]+)')
taxonomy['Genus'] = taxonomy['Genus'].fillna('Unassigned')

print(f"\n✓ Extracted {taxonomy['Genus'].nunique()} unique genera")


✓ Extracted 1542 unique genera


In [17]:
# 1) Make sure taxonomy index is feature IDs
if 'Feature ID' in taxonomy.columns:
    taxonomy = taxonomy.set_index('Feature ID')

# 2) Extract genus
taxonomy['Genus'] = taxonomy['Taxon'].str.extract(r'g__([^;]+)')
taxonomy['Genus'] = taxonomy['Genus'].fillna('Unassigned')


# 3) Map ASVs to genus
feature_to_genus = taxonomy['Genus'].to_dict()

feature_table_genus = table_filtered_sourdough.copy()
feature_table_genus.columns = [
    feature_to_genus.get(col, 'Unknown') for col in feature_table_genus.columns
]

# 4) Collapse to genus
genus_table = feature_table_genus.groupby(level=0, axis=1).sum()

print("✓ Collapsed to genus level:", genus_table.shape[1], "genera")


  genus_table = feature_table_genus.groupby(level=0, axis=1).sum()


✓ Collapsed to genus level: 238 genera


In [49]:
genus_rel = genus_table.div(genus_table.sum(axis=1), axis=0)
top_genera = genus_rel.mean().sort_values(ascending=False).head(10)
print("\nTop 10 genera:")
for genus, abundance in top_genera.items():
    print(f"  {genus}: {abundance*100:.2f}%")



Top 10 genera:
  Saccharomyces: 70.61%
  Alternaria: 14.70%
  Unassigned: 5.44%
  Pyrenophora: 1.77%
  Pichia: 1.39%
  Aureobasidium: 1.22%
  Cladosporium: 1.08%
  Stemphylium: 0.44%
  Parastagonospora: 0.40%
  Sporobolomyces: 0.29%


Merging metadata & genus

In [19]:
metadata = metadata.set_index('sample ID')

In [20]:
# Merge genus data with metadata
merged_data = genus_table.join(metadata, how='inner')

print(f"✓ Merged data: {merged_data.shape}")

# Define fungal columns (from genus table)
fungal_columns = genus_table.columns.tolist()

print(f"\nFinal dataset:")
print(f"  Total samples: {len(merged_data)}")
print(f"  Fungal genera: {len(fungal_columns)}")
print(f"  Aroma attributes: {len(aroma_columns)}")
print(f"  Timepoints: {sorted(merged_data['day'].unique())}")
print(f"  Backgrounds: {merged_data['background'].unique().tolist()}")

✓ Merged data: (125, 541)

Final dataset:
  Total samples: 125
  Fungal genera: 238
  Aroma attributes: 222
  Timepoints: [7.0, 14.0, 21.0]
  Backgrounds: ['non-sterile', 'sterile']


In [21]:
fungal_columns = list(fungal_columns)
aroma_columns = list(aroma_columns)

complete_cases = merged_data[fungal_columns + aroma_columns].notna().all(axis=1).sum()
print(f"  Complete cases (no missing data): {complete_cases}/{len(merged_data)}")


  Complete cases (no missing data): 104/125


In [33]:
def calculate_correlations(data, fungal_columns, aroma_columns, min_samples=10):
    """Calculate Spearman correlations between fungi and aromas"""
    
    results = []
    
    for fungus in fungal_columns:
        for aroma in aroma_columns:
            # Remove samples where fungus is absent OR aroma is missing
            mask = (data[fungus] > 0) & (data[aroma].notna())
            n_samples = mask.sum()
            
            if n_samples < min_samples:
                continue
            
            x = data.loc[mask, fungus]
            y = data.loc[mask, aroma]
            
            # Skip if either variable is constant
            if x.nunique() < 2 or y.nunique() < 2:
                continue
            
            # Calculate Spearman correlation
            rho, p_value = spearmanr(x, y)
            
            results.append({
                'Fungus': fungus,
                'Aroma': aroma,
                'Spearman_rho': rho,
                'P_value': p_value,
                'N_samples': n_samples
            })
    
    results_df = pd.DataFrame(results)
    
    # Multiple testing correction (FDR)
    if len(results_df) > 0:
        results_df['FDR'] = multipletests(results_df['P_value'], method='fdr_bh')[1]
    
    return results_df


In [23]:
# Calculate correlations
cor_results = calculate_correlations(merged_data, fungal_columns, aroma_columns)

print(f"✓ Calculated {len(cor_results)} correlations")

✓ Calculated 9684 correlations


In [45]:
rho_threshold = 0.3
fdr_threshold = 0.1
sig_cors = cor_results[
    (cor_results['FDR'] < fdr_threshold) & 
    (cor_results['Spearman_rho'].abs() > rho_threshold)
].sort_values('Spearman_rho', key=abs, ascending=False)

print(f"✓ Significant correlations (FDR<{fdr_threshold}, |ρ|>{rho_threshold}): {len(sig_cors)}")

if len(sig_cors) > 0:
    print(f"\nTop 10 strongest correlations:")
    display_cols = ['Fungus', 'Aroma', 'Spearman_rho', 'FDR', 'N_samples']
    print(sig_cors.head(10)[display_cols].to_string(index=False))
    
    # Save results
    sig_cors.to_csv('significant_correlations.csv', index=False)
    print(f"\n✓ Saved to: significant_correlations.csv")
else:
    print("\n⚠ No significant correlations found!")
    print("  Consider lowering thresholds or checking data quality")

✓ Significant correlations (FDR<0.1, |ρ|>0.3): 0

⚠ No significant correlations found!
  Consider lowering thresholds or checking data quality


In [25]:
cor_results[['Spearman_rho', 'P_value', 'FDR']].describe()

Unnamed: 0,Spearman_rho,P_value,FDR
count,9684.0,9684.0,9684.0
mean,0.00857,0.461018,0.87539
std,0.177406,0.288921,0.072596
min,-0.717127,0.000178,0.793966
25%,-0.096576,0.200898,0.793966
50%,0.015846,0.431062,0.861939
75%,0.112836,0.712428,0.949057
max,0.736213,1.0,1.0


In [61]:
cor_results.sort_values("Spearman_rho", key=abs, ascending=False).head(20)

Unnamed: 0,Fungus,Aroma,Spearman_rho,P_value,N_samples,FDR
8700,Tricellula,fruity_score_D28,0.736213,0.00633,12,0.793966
4645,Microstromatales_gen_Incertae_sedis,dayx_leavening_D28,-0.717127,0.005798,13,0.793966
6615,Pseudopithomyces,fruity_score_D28,-0.711918,0.006339,13,0.793966
8108,Starmerella,LACTIC_ACID_D28,0.709327,0.021603,10,0.793966
1251,Candida,BEER_D14,0.701793,0.010962,12,0.793966
1905,Curvibasidium,PINEAPPLE_D28,-0.694677,0.00841,13,0.793966
1903,Curvibasidium,BERRIES_D28,-0.694677,0.00841,13,0.793966
6608,Pseudopithomyces,dayx_pH_D28,-0.6629,0.013532,13,0.793966
8667,Tricellula,PINEAPPLE_D28,0.655273,0.02072,12,0.793966
6639,Pseudozyma,VINEGAR_D7,0.652649,0.015602,13,0.793966


<div style="background-color: aliceblue; padding: 10px;">

correlations are statistically weak also due to high number of testing

# Re-run Spearman correlation on family basis
<div style="background-color: skyblue; padding: 10px;">

to reduce reduction of statistical results through multiple testing issue

Reload taxonomy to avoid issues

In [26]:
taxonomy = pd.read_csv("Data/aroma/taxonomy.tsv", sep="\t")

In [27]:
# Extract genus from taxonomy
taxonomy['Family'] = taxonomy['Taxon'].str.extract(r'f__([^;]+)')
taxonomy['Family'] = taxonomy['Family'].fillna('Unassigned')

print(f"\n✓ Extracted {taxonomy['Family'].nunique()} unique families")


✓ Extracted 523 unique families


In [52]:
# 1) Make sure taxonomy index is feature IDs
if 'Feature ID' in taxonomy.columns:
    taxonomy = taxonomy.set_index('Feature ID')

# 2) Extract genus
taxonomy['Family'] = taxonomy['Taxon'].str.extract(r'f__([^;]+)')
taxonomy['Family'] = taxonomy['Family'].fillna('Unassigned')


# 3) Map ASVs to genus
feature_to_family = taxonomy['Family'].to_dict()

feature_table_family = table_filtered_sourdough.copy()
feature_table_family.columns = [
    feature_to_family.get(col, 'Unknown') for col in feature_table_family.columns
]

# 4) Collapse to genus
family_table = feature_table_family.groupby(level=0, axis=1).sum()

print("✓ Collapsed to family level:", family_table.shape[1], "families")


  family_table = feature_table_family.groupby(level=0, axis=1).sum()


✓ Collapsed to family level: 128 families


In [53]:
family_rel = family_table.div(family_table.sum(axis=1), axis=0)
top_family = family_rel.mean().sort_values(ascending=False).head(10)
print("\nTop 10 families:")
for family, abundance in top_family.items():
    print(f"  {family}: {abundance*100:.2f}%")



Top 10 families:
  Saccharomycetaceae: 70.61%
  Pleosporaceae: 17.10%
  Didymellaceae: 5.56%
  Pichiaceae: 1.39%
  Saccotheciaceae: 1.25%
  Cladosporiaceae: 1.08%
  Phaeosphaeriaceae: 0.45%
  Sporidiobolaceae: 0.31%
  Aspergillaceae: 0.29%
  Filobasidiaceae: 0.27%


In [54]:
# Merge genus data with metadata
merged_data_family = family_table.join(metadata, how='inner')

print(f"✓ Merged data: {merged_data_family.shape}")

# Define fungal columns (from genus table)
fungal_columns_family = family_table.columns.tolist()

print(f"\nFinal dataset:")
print(f"  Total samples: {len(merged_data_family)}")
print(f"  Fungal families: {len(fungal_columns_family)}")
print(f"  Aroma attributes: {len(aroma_columns)}")
print(f"  Timepoints: {sorted(merged_data['day'].unique())}")
print(f"  Backgrounds: {merged_data['background'].unique().tolist()}")

✓ Merged data: (125, 431)

Final dataset:
  Total samples: 125
  Fungal families: 128
  Aroma attributes: 222
  Timepoints: [7.0, 14.0, 21.0]
  Backgrounds: ['non-sterile', 'sterile']


In [55]:
fungal_columns_family = list(fungal_columns_family)
aroma_columns_family = list(aroma_columns)

complete_cases = merged_data_family[fungal_columns_family + aroma_columns].notna().all(axis=1).sum()
print(f"  Complete cases (no missing data): {complete_cases}/{len(merged_data_family)}")

  Complete cases (no missing data): 104/125


In [56]:
def calculate_correlations(data, fungal_columns_family, aroma_columns, min_samples=10):
    """Calculate Spearman correlations between fungi and aromas"""
    
    results = []
    
    for fungus in fungal_columns_family:
        for aroma in aroma_columns:
            # Remove samples where fungus is absent OR aroma is missing
            mask = (data[fungus] > 0) & (data[aroma].notna())
            n_samples = mask.sum()
            
            if n_samples < min_samples:
                continue
            
            x = data.loc[mask, fungus]
            y = data.loc[mask, aroma]
            
            # Skip if either variable is constant
            if x.nunique() < 2 or y.nunique() < 2:
                continue
            
            # Calculate Spearman correlation
            rho, p_value = spearmanr(x, y)
            
            results.append({
                'Fungus': fungus,
                'Aroma': aroma,
                'Spearman_rho': rho,
                'P_value': p_value,
                'N_samples': n_samples
            })
    
    results_df = pd.DataFrame(results)
    
    # Multiple testing correction (FDR)
    if len(results_df) > 0:
        results_df['FDR'] = multipletests(results_df['P_value'], method='fdr_bh')[1]
    
    return results_df


In [57]:
# Calculate correlations
cor_results_family = calculate_correlations(merged_data_family, fungal_columns_family, aroma_columns)

print(f"✓ Calculated {len(cor_results_family)} correlations")

✓ Calculated 7073 correlations


In [58]:
rho_threshold = 0.3
fdr_threshold = 0.1
sig_cors_family = cor_results_family[
    (cor_results_family['FDR'] < fdr_threshold) & 
    (cor_results_family['Spearman_rho'].abs() > rho_threshold)
].sort_values('Spearman_rho', key=abs, ascending=False)

print(f"✓ Significant correlations (FDR<{fdr_threshold}, |ρ|>{rho_threshold}): {len(sig_cors_family)}")

if len(sig_cors_family) > 0:
    print(f"\nTop 10 strongest correlations:")
    display_cols = ['Fungus', 'Aroma', 'Spearman_rho', 'FDR', 'N_samples']
    print(sig_cors_family.head(10)[display_cols].to_string(index=False))
    
    # Save results
    sig_cors_family.to_csv('significant_correlations.csv', index=False)
    print(f"\n✓ Saved to: significant_correlations.csv")
else:
    print("\n⚠ No significant correlations found!")
    print("  Consider lowering thresholds or checking data quality")

✓ Significant correlations (FDR<0.1, |ρ|>0.3): 0

⚠ No significant correlations found!
  Consider lowering thresholds or checking data quality


In [59]:
cor_results_family[['Spearman_rho', 'P_value', 'FDR']].describe()

Unnamed: 0,Spearman_rho,P_value,FDR
count,7073.0,7073.0,7073.0
mean,0.006036,0.455335,0.862144
std,0.183923,0.288198,0.076272
min,-0.757588,0.000268,0.779112
25%,-0.10268,0.197269,0.779112
50%,0.012597,0.420228,0.840338
75%,0.112679,0.697379,0.929794
max,0.716981,1.0,1.0


In [60]:
cor_results_family.sort_values("Spearman_rho", key=abs, ascending=False).head(20)

Unnamed: 0,Fungus,Aroma,Spearman_rho,P_value,N_samples,FDR
3038,Metschnikowiaceae,dayx_leavening_D28,-0.757588,0.001698,14,0.779112
2354,Erysiphaceae,VINEGAR_D14,-0.734013,0.010119,11,0.779112
4277,Pezizomycotina_fam_Incertae_sedis,fruity_score_D28,0.716981,0.005812,13,0.779112
6593,Trichomonascaceae,LEAVES_D28,0.703856,0.003406,15,0.779112
6641,Trichomonascaceae,earthy_score_D28,0.703856,0.003406,15,0.779112
6614,Trichomonascaceae,MOLDY_D28,0.703856,0.003406,15,0.779112
6613,Trichomonascaceae,SOIL_D28,0.703856,0.003406,15,0.779112
3133,Microbotryomycetes_fam_Incertae_sedis,PINEAPPLE_D28,-0.694677,0.00841,13,0.779112
3131,Microbotryomycetes_fam_Incertae_sedis,BERRIES_D28,-0.694677,0.00841,13,0.779112
6574,Trichomonascaceae,LACTIC_ACID_D21,0.688973,0.001565,18,0.779112


### Continuing with sPLS

Prepare data to continue on R

In [78]:
metadata_sd = metadata_sd.set_index('sample ID')  # use your actual column name

In [86]:
family_rel.to_csv("Data/aroma/Analysis/X_fungi_family_rel.csv")             
metadata_sd[aroma_columns].to_csv("Data/aroma/Analysis/Y_aromas.csv")        
metadata_sd[['background']].to_csv("Data/aroma/Analysis/meta_background.csv")


This code was run on RStudio to create the circos plot