In [11]:
#importing plugins
import os
import pandas as pd
from qiime2 import Visualization
import matplotlib.pyplot as plt
import numpy as np
import biom
import qiime2 as q2

%matplotlib inline

In [2]:
Data_raw = 'Data/raw'
Data_classified = 'Data/classified'
Data_visualization = 'Data/visualization'

<div style="background-color: skyblue; padding: 10px;">
    Titles
    </div>
<div style="background-color: aliceblue; padding: 10px;">
    Results

### 1) Overview taxa barplot before Filtering ###
<div style="background-color: skyblue; padding: 10px;">
    </div>

In [18]:
! qiime taxa barplot \
  --i-table $Data_raw/modified-dada1-table-highschool.qza \
  --i-taxonomy $Data_classified/taxonomy.qza \
  --m-metadata-file $Data_raw/20250913_metadata_ITS.tsv \
  --o-visualization $Data_classified/taxa-barplot.qzv

  import pkg_resources
[32mSaved Visualization to: Data/classified/taxa-barplot.qzv[0m
[0m[?25h

In [5]:
Visualization.load(f"{Data_classified}/taxa-barplot.qzv")

### 2) Filtering of Table ###
<div style="background-color: skyblue; padding: 10px;">
    </div>

***2.1) Table before Filtering***

In [17]:
! qiime feature-table summarize \
  --i-table $Data_raw/modified-dada1-table-highschool.qza \
  --o-visualization $Data_visualization/modified-dada1-table-highschool.qzv

  import pkg_resources
[32mSaved Visualization to: Data/visualization/modified-dada1-table-highschool.qzv[0m
[0m[?25h

In [7]:
Visualization.load(f"{Data_visualization}/modified-dada1-table-highschool.qzv")

***2.2) Table after Filtering***

In [8]:
! qiime taxa filter-table \
    --i-table $Data_raw/modified-dada1-table-highschool.qza \
    --i-taxonomy $Data_classified/taxonomy.qza \
    --p-exclude mitochondria,chloroplast \
    --p-include "k__Fungi" \
    --o-filtered-table $Data_classified/table-filtered.qza

  import pkg_resources
[32mSaved FeatureTable[Frequency] to: Data/classified/table-filtered.qza[0m
[0m[?25h

In [9]:
! qiime feature-table summarize \
  --i-table $Data_classified/table-filtered.qza \
  --o-visualization $Data_classified/table-filtered.qzv

  import pkg_resources
[32mSaved Visualization to: Data/classified/table-filtered.qzv[0m
[0m[?25h

In [10]:
# Feature Table Visualization after Filtering
Visualization.load(f"{Data_classified}/table-filtered.qzv")

### 3) Filtering of Sequences ###
<div style="background-color: skyblue; padding: 10px;">
    </div>

***3.1) Squences before Filtering***

In [11]:
Visualization.load(f"{Data_visualization}/filtered-dada1-rep-seqs-highschool.qzv")

***3.2) Squences after Filtering***

In [12]:
! qiime taxa filter-seqs \
    --i-sequences $Data_raw/filtered-dada1-rep-seqs-highschool.qza \
    --i-taxonomy $Data_classified/taxonomy.qza \
    --p-exclude mitochondria,chloroplast \
    --p-include "k__Fungi" \
    --o-filtered-sequences $Data_classified/rep-seqs-filtered.qza

  import pkg_resources
[32mSaved FeatureData[Sequence] to: Data/classified/rep-seqs-filtered.qza[0m
[0m[?25h

In [13]:
! qiime feature-table tabulate-seqs \
  --i-data $Data_classified/rep-seqs-filtered.qza \
  --o-visualization $Data_classified/rep-seqs-filtered.qzv

  import pkg_resources
[32mSaved Visualization to: Data/classified/rep-seqs-filtered.qzv[0m
[0m[?25h

In [14]:
Visualization.load(f"{Data_classified}/rep-seqs-filtered.qzv")

### 4) Overview taxa barplot after Filtering - only Fungi left ###
<div style="background-color: skyblue; padding: 10px;">
    </div>

In [20]:
! qiime taxa barplot \
  --i-table $Data_classified/table-filtered.qza \
  --i-taxonomy $Data_classified/taxonomy.qza \
  --m-metadata-file $Data_raw/20250913_metadata_ITS.tsv \
  --o-visualization $Data_classified/taxa-barplot-filtered.qzv

  import pkg_resources
[32mSaved Visualization to: Data/classified/taxa-barplot-filtered.qzv[0m
[0m[?25h

In [21]:
Visualization.load(f"{Data_classified}/taxa-barplot-filtered.qzv")

### 5) Overview of present genera ###
<div style="background-color: skyblue; padding: 10px;">
    </div>

In [27]:
!qiime tools export \
  --input-path $Data_classified/taxonomy.qza \
  --output-path $Data_classified

  import pkg_resources
[32mExported Data/classified/taxonomy.qza as TSVTaxonomyDirectoryFormat to directory Data/classified[0m
[0m[?25h

In [28]:
taxonomy = pd.read_csv(f"{Data_classified}/taxonomy.tsv", sep="\t")

In [29]:
!qiime tools export \
  --input-path $Data_classified/table-filtered-sourdough_only.qza \
  --output-path $Data_classified/table-filtered-sourdough_only

  import pkg_resources
[32mExported Data/classified/table-filtered-sourdough_only.qza as BIOMV210DirFmt to directory Data/classified/table-filtered-sourdough_only[0m
[0m[?25h

filtered table needed to be transposed so that it matches with analysis

In [30]:
table = biom.load_table(f"{Data_classified}/table-filtered-sourdough_only/feature-table.biom')

table_filtered_sourdough = table.to_dataframe()

table_filtered_sourdough = table_filtered_sourdough.T

Load metadata and check which are the columns with the aroma information and set index to sample ID so no issues with merging later on

In [31]:
metadata = pd.read_csv(f"{Data_raw}/merged_output_usable.tsv", sep="\t") #using merged_output_usable because the spaces already substitued with _
metadata = metadata.set_index('sample ID')
aroma_columns = metadata.columns[-222:]

print(aroma_columns)

#so last 222 columns are aroma associated

Index(['ALCOHOLIC_D7', 'ANIMAL_FEED_D7', 'ANIMAL_STABLE_D7', 'APPLE_D7',
       'BANANA_D7', 'BEER_D7', 'BERRIES_D7', 'BREAD_D7', 'BUTTER_MILK_D7',
       'BUTYRIC_ACID_D7',
       ...
       'animal_score_D28', 'chemical_score_D28', 'body_odour_score_D28',
       'fruity_score_D28', 'maillard_score_D28', 'sour_score_D28',
       'ocean_score_D28', 'earthy_score_D28', 'fermented_dairy_score_D28',
       'nutty_score_D28'],
      dtype='object', length=222)


take only sourdough data

In [32]:
metadata_sd = metadata[metadata['sample_type'] == 'sourdough'].copy()

Check if aromas have missing values or if there are sourdough samples with no aroma analysis at all (if this would be the case, the samples with no aroma analysis would be taken out)

In [33]:
metadata_sd[aroma_columns].isnull().sum(axis=1)

sample ID
366291_386-LP4-ITS-0386     0
366291_387-LP4-ITS-0387     0
366291_388-LP4-ITS-0388     0
366291_389-LP4-ITS-0389     0
366291_390-LP4-ITS-0390    87
                           ..
366292_105-LP4-ITS-0605     0
366292_106-LP4-ITS-0606     0
366292_107-LP4-ITS-0607     0
366292_108-LP4-ITS-0608    87
366292_110-LP4-ITS-0610     0
Length: 125, dtype: int64

In [16]:
print("Before:", metadata_sd.shape)
metadata_sd = metadata_sd.dropna(subset=aroma_columns, how='all')
print("After:", metadata_sd.shape)

Before: (125, 303)
After: (125, 303)


### Define function

Define function to collaps on taxonomic level

In [17]:
def collapse_to_taxonomic_level(feature_table, taxonomy_df, level='Family'):
    """
    Collapse ASV table to specified taxonomic level
    
    Parameters:
    -----------
    feature_table : pd.DataFrame
        ASV abundance table (samples × features)
    taxonomy_df : pd.DataFrame
        Taxonomy table with 'Feature ID' and 'Taxon' columns
    level : str
        Taxonomic level ('Genus' or 'Family')
    
    Returns:
    --------
    pd.DataFrame : Collapsed abundance table
    """
    # Extract taxonomic level
    level_prefix = {'Genus': 'g__', 'Family': 'f__'}[level]
    
    taxonomy_df = taxonomy_df.copy()
    if 'Feature ID' in taxonomy_df.columns:
        taxonomy_df = taxonomy_df.set_index('Feature ID')
    
    taxonomy_df[level] = taxonomy_df['Taxon'].str.extract(f'{level_prefix}([^;]+)')
    taxonomy_df[level] = taxonomy_df[level].fillna('Unassigned')
    
    # Map features to taxonomy
    feature_to_taxon = taxonomy_df[level].to_dict()
    collapsed = feature_table.copy()
    collapsed.columns = [feature_to_taxon.get(col, 'Unknown') for col in collapsed.columns]
    
    # Sum by taxonomic group (suppress FutureWarning)
    collapsed = collapsed.T.groupby(level=0).sum().T
    
    print(f"  ✓ Collapsed to {level} level: {collapsed.shape[1]} taxa")
    
    # Show dominant taxa
    relative = collapsed.div(collapsed.sum(axis=1), axis=0)
    top5 = relative.mean().sort_values(ascending=False).head(5)
    print(f"\n  Top 5 {level.lower()}s by mean relative abundance:")
    for taxon, abund in top5.items():
        print(f"    {taxon}: {abund*100:.2f}%")
    
    return collapsed

Merging collapsed feature tables with metadata

In [34]:
merged_family = family_rel.join(metadata_sd, how='inner')
merged_genus = genus_rel.join(metadata_sd, how='inner')
    
family_columns = family_rel.columns.tolist()
genus_columns = genus_rel.columns.tolist()

Family level

In [35]:
top_families = family_rel.mean().sort_values(ascending=False).head(10)
print("\nTop 10 genera:")
for family, abundance in top_families.items():
    print(f"  {family}: {abundance*100:.2f}%")


Top 10 genera:
  Saccharomycetaceae: 70.61%
  Pleosporaceae: 17.10%
  Didymellaceae: 5.56%
  Pichiaceae: 1.39%
  Saccotheciaceae: 1.25%
  Cladosporiaceae: 1.08%
  Phaeosphaeriaceae: 0.45%
  Sporidiobolaceae: 0.31%
  Aspergillaceae: 0.29%
  Filobasidiaceae: 0.27%


Genus level

In [36]:
top_genera = genus_rel.mean().sort_values(ascending=False).head(10)
print("\nTop 10 genera:")
for genus, abundance in top_genera.items():
    print(f"  {genus}: {abundance*100:.2f}%")


Top 10 genera:
  Saccharomyces: 70.61%
  Alternaria: 14.70%
  Unassigned: 5.44%
  Pyrenophora: 1.77%
  Pichia: 1.39%
  Aureobasidium: 1.22%
  Cladosporium: 1.08%
  Stemphylium: 0.44%
  Parastagonospora: 0.40%
  Sporobolomyces: 0.29%


In [37]:
# Presence/absence check for the top 10 genera across all sourdough samples
top10 = top_genera.index.tolist()

presence = (genus_rel[top10] > 0)
n_samples = genus_rel.shape[0]

present_in_all = presence.all(axis=0)
n_present = presence.sum(axis=0)

summary = pd.DataFrame({
    "mean_rel_abundance": genus_rel[top10].mean(),
    "samples_present": n_present,
    "present_in_all_samples": present_in_all
}).sort_values("mean_rel_abundance", ascending=False)

print(f"\nPresence of top 10 genera across sourdough samples (n={n_samples}):")
print(summary.assign(mean_rel_abundance=lambda d: (d["mean_rel_abundance"]*100)))

# Optional: list which genera are NOT in all samples
missing_any = summary[~summary["present_in_all_samples"]]
if len(missing_any) > 0:
    print("\nGenera not present in all samples (and how many samples they're missing from):")
    for genus, row in missing_any.iterrows():
        print(f"  {genus}: missing in {n_samples - int(row['samples_present'])} samples")
else:
    print("\nAll top 10 genera are present in every sourdough sample.")



Presence of top 10 genera across sourdough samples (n=125):
                  mean_rel_abundance  samples_present  present_in_all_samples
Saccharomyces              70.611674             True                    True
Alternaria                 14.701565             True                    True
Unassigned                  5.439263             True                    True
Pyrenophora                 1.767181             True                   False
Pichia                      1.389014             True                   False
Aureobasidium               1.217856             True                    True
Cladosporium                1.082868             True                   False
Stemphylium                 0.439985             True                   False
Parastagonospora            0.403184             True                   False
Sporobolomyces              0.293762             True                   False

Genera not present in all samples (and how many samples they're missing from):
 

  for genus, row in missing_any.iterrows():


<div style="background-color: aliceblue; padding: 10px;">
Saccaromyces and Alternaria are the two most abundant genera and present in all samples