In [8]:
#importing plugins
import os
import pandas as pd
from qiime2 import Visualization
import matplotlib.pyplot as plt
import numpy as np

import qiime2 as q2

%matplotlib inline

In [9]:
# Data directories, relative to the notebook's working directory.
Data_raw, Data_classified, Data_visualization = (
    'Data/raw',
    'Data/classified',
    'Data/visualization',
)

<div style="background-color: skyblue; padding: 10px;">
    Titles
    </div>
<div style="background-color: aliceblue; padding: 10px;">
    Results
    </div>

### 1) Overview taxa barplot before Filtering ###
<div style="background-color: skyblue; padding: 10px;">
    </div>

In [18]:
# Build a taxa bar plot from the unfiltered feature table, using the taxonomy
# assignments and the ITS sample metadata; writes taxa-barplot.qzv.
! qiime taxa barplot \
  --i-table $Data_raw/modified-dada1-table-highschool.qza \
  --i-taxonomy $Data_classified/taxonomy.qza \
  --m-metadata-file $Data_raw/20250913_metadata_ITS.tsv \
  --o-visualization $Data_classified/taxa-barplot.qzv

  import pkg_resources
[32mSaved Visualization to: Data/classified/taxa-barplot.qzv[0m
[0m[?25h

In [5]:
# Display the unfiltered taxa bar plot inline.
Visualization.load(Data_classified + "/taxa-barplot.qzv")

### 2) Filtering of Table ###
<div style="background-color: skyblue; padding: 10px;">
    </div>

***2.1) Table before Filtering***

In [17]:
# Summarize the unfiltered feature table so it can be compared with the
# filtered table produced further below.
! qiime feature-table summarize \
  --i-table $Data_raw/modified-dada1-table-highschool.qza \
  --o-visualization $Data_visualization/modified-dada1-table-highschool.qzv

  import pkg_resources
[32mSaved Visualization to: Data/visualization/modified-dada1-table-highschool.qzv[0m
[0m[?25h

In [7]:
# Display the summary of the unfiltered feature table inline.
Visualization.load(Data_visualization + "/modified-dada1-table-highschool.qzv")

***2.2) Table after Filtering***

In [8]:
# Taxonomy-based filtering of the feature table: features matching "k__Fungi"
# are kept, features matching "mitochondria" or "chloroplast" are excluded.
# NOTE(review): the match mode is left at the plugin default - confirm against
# the `qiime taxa filter-table` documentation.
! qiime taxa filter-table \
    --i-table $Data_raw/modified-dada1-table-highschool.qza \
    --i-taxonomy $Data_classified/taxonomy.qza \
    --p-exclude mitochondria,chloroplast \
    --p-include "k__Fungi" \
    --o-filtered-table $Data_classified/table-filtered.qza

  import pkg_resources
[32mSaved FeatureTable[Frequency] to: Data/classified/table-filtered.qza[0m
[0m[?25h

In [9]:
# Summarize the filtered feature table written by the filter-table step above.
! qiime feature-table summarize \
  --i-table $Data_classified/table-filtered.qza \
  --o-visualization $Data_classified/table-filtered.qzv

  import pkg_resources
[32mSaved Visualization to: Data/classified/table-filtered.qzv[0m
[0m[?25h

In [10]:
# Display the summary of the feature table after taxonomy-based filtering.
Visualization.load(Data_classified + "/table-filtered.qzv")

### 3) Filtering of Sequences ###
<div style="background-color: skyblue; padding: 10px;">
    </div>

***3.1) Sequences before Filtering***

In [11]:
# Display the representative sequences before filtering.
# NOTE(review): this .qzv is not created in any visible cell of this notebook;
# presumably it comes from an earlier notebook - confirm it exists.
Visualization.load(Data_visualization + "/filtered-dada1-rep-seqs-highschool.qzv")

***3.2) Sequences after Filtering***

In [12]:
# Apply the same include/exclude taxonomy filter used for the feature table
# ("k__Fungi" kept; "mitochondria"/"chloroplast" excluded) to the
# representative sequences.
! qiime taxa filter-seqs \
    --i-sequences $Data_raw/filtered-dada1-rep-seqs-highschool.qza \
    --i-taxonomy $Data_classified/taxonomy.qza \
    --p-exclude mitochondria,chloroplast \
    --p-include "k__Fungi" \
    --o-filtered-sequences $Data_classified/rep-seqs-filtered.qza

  import pkg_resources
[32mSaved FeatureData[Sequence] to: Data/classified/rep-seqs-filtered.qza[0m
[0m[?25h

In [13]:
# Tabulate the filtered representative sequences into a viewable .qzv.
! qiime feature-table tabulate-seqs \
  --i-data $Data_classified/rep-seqs-filtered.qza \
  --o-visualization $Data_classified/rep-seqs-filtered.qzv

  import pkg_resources
[32mSaved Visualization to: Data/classified/rep-seqs-filtered.qzv[0m
[0m[?25h

In [14]:
# Display the filtered representative sequences inline.
Visualization.load(Data_classified + "/rep-seqs-filtered.qzv")

### 4) Overview taxa barplot after Filtering - only Fungi left ###
<div style="background-color: skyblue; padding: 10px;">
    </div>

In [20]:
# Taxa bar plot of the filtered feature table, for comparison with the
# unfiltered bar plot from section 1.
! qiime taxa barplot \
  --i-table $Data_classified/table-filtered.qza \
  --i-taxonomy $Data_classified/taxonomy.qza \
  --m-metadata-file $Data_raw/20250913_metadata_ITS.tsv \
  --o-visualization $Data_classified/taxa-barplot-filtered.qzv

  import pkg_resources
[32mSaved Visualization to: Data/classified/taxa-barplot-filtered.qzv[0m
[0m[?25h

In [7]:
# Display the taxa bar plot of the filtered table inline.
Visualization.load(Data_classified + "/taxa-barplot-filtered.qzv")

In [None]:
# Paths for the downstream analysis cells. Data_classified is assigned the same
# value as in the path cell at the top of the notebook.
# NOTE(review): metadata_file is not referenced by any visible cell - confirm
# whether it is still needed.
Data_classified, metadata_file = (
    'Data/classified',
    'Data/raw/merged_output.tsv',
)


In [18]:
# NOTE(review): `table_sourdough` is not defined in any visible cell, so this
# line raises NameError on a fresh kernel (Restart & Run All). Define it in an
# earlier cell or remove this preview.
table_sourdough.head()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from qiime2 import Artifact

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (16, 10)

# Define data paths
Data_diversity = 'Data/diversity'
Data_raw = 'Data/raw'

# Load metadata
meta = pd.read_csv(f'{Data_raw}/merged_output_usable.tsv', sep='\t')

# Load feature table (abundance data)
print("Loading feature table...")
feature_table_artifact = Artifact.load(f'{Data_diversity}/kmerizer-results-merged/feature_table.qza')
# QIIME 2 views a FeatureTable as a DataFrame with samples as rows and features
# as columns. Every helper below (and the shape summary printed here) expects
# features as rows and samples as columns, so transpose once at load time.
feature_table = feature_table_artifact.view(pd.DataFrame).T

print(f"Feature table shape: {feature_table.shape}")
print(f"Number of features (taxa): {feature_table.shape[0]}")
print(f"Number of samples: {feature_table.shape[1]}")

# Load taxonomy data (indexed by feature ID; the helpers read the 'Taxon' column)
print("\nLoading taxonomy...")
taxonomy_artifact = Artifact.load(f'{Data_diversity}/kmerizer-results-merged/taxonomy.qza')
taxonomy = taxonomy_artifact.view(pd.DataFrame)

print(f"Taxonomy table shape: {taxonomy.shape}")

def get_top_abundant_taxa(feature_table, taxonomy, n_top=10):
    """
    Rank features by summed abundance and describe the n_top largest.

    Parameters:
    -----------
    feature_table : pd.DataFrame
        Abundances with one row per feature (taxon) and one column per sample
    taxonomy : pd.DataFrame
        Indexed by feature ID; the 'Taxon' column supplies the lineage string
    n_top : int
        How many of the most abundant features to report

    Returns:
    --------
    pd.DataFrame with one row per reported feature: rank, feature ID,
    taxonomy ('Unknown' when the ID is missing from `taxonomy`), total and
    relative (%) abundance, and in how many samples the feature is non-zero
    """
    # Row sums, largest first; the order of this Series defines the ranking
    per_taxon_total = feature_table.sum(axis=1).sort_values(ascending=False)

    # Share of every feature in the grand total of the table, in percent
    grand_total = feature_table.sum().sum()
    per_taxon_pct = (per_taxon_total / grand_total) * 100

    n_samples = feature_table.shape[1]
    known_ids = taxonomy.index

    rows = [
        {
            'Rank': position,
            'Feature_ID': feature_id,
            'Taxonomy': (taxonomy.loc[feature_id, 'Taxon']
                         if feature_id in known_ids else 'Unknown'),
            'Total_Abundance': per_taxon_total[feature_id],
            'Relative_Abundance_%': per_taxon_pct[feature_id],
            'Present_in_N_samples': (feature_table.loc[feature_id] > 0).sum(),
            'Total_N_samples': n_samples,
        }
        for position, feature_id in enumerate(per_taxon_total.head(n_top).index, start=1)
    ]

    return pd.DataFrame(rows)

def check_taxa_in_sourdough(feature_table, metadata, top_taxa_ids):
    """
    Check presence of top taxa in sourdough samples

    Parameters:
    -----------
    feature_table : pd.DataFrame
        Feature table with taxa as rows and samples as columns
    metadata : pd.DataFrame
        Metadata with a 'sample_type' column and a 'sample ID' column
    top_taxa_ids : list
        List of feature IDs to check; IDs missing from the feature table
        are skipped

    Returns:
    --------
    pd.DataFrame with one row per checked taxon (presence counts, presence %,
    mean abundance where present, relative abundance within sourdough).
    The columns are always present, even when no taxon could be checked, so
    callers can merge on 'Feature_ID' safely. Percentages are 0 and
    'Present_in_all' is 'No' when there are no sourdough samples.
    """
    result_columns = [
        'Feature_ID',
        'Present_in_N_sourdough',
        'Total_sourdough_samples',
        'Presence_%',
        'Mean_abundance_when_present',
        'Relative_abundance_in_sourdough_%',
        'Present_in_all',
    ]

    # Filter metadata for sourdough samples only
    sourdough_meta = metadata[metadata['sample_type'] == 'sourdough']

    # Get sourdough sample IDs that are in the feature table
    sourdough_samples = [s for s in sourdough_meta['sample ID'].tolist()
                         if s in feature_table.columns]

    print(f"\nTotal sourdough samples in feature table: {len(sourdough_samples)}")

    # Both values are the same for every taxon, so compute them once
    total_samples = len(sourdough_samples)
    total_sourdough_reads = feature_table[sourdough_samples].sum().sum()

    results = []
    for taxon_id in top_taxa_ids:
        if taxon_id not in feature_table.index:
            continue

        # Abundance of this taxon in each sourdough sample
        abundances = feature_table.loc[taxon_id, sourdough_samples]
        present_samples = (abundances > 0).sum()

        # Guard the divisions: without sourdough samples (or reads) the
        # percentages are reported as 0 instead of NaN
        presence_pct = (present_samples / total_samples) * 100 if total_samples > 0 else 0
        relative_abundance = ((abundances.sum() / total_sourdough_reads) * 100
                              if total_sourdough_reads > 0 else 0)

        # Mean abundance over the samples where the taxon occurs at all
        mean_abundance_present = abundances[abundances > 0].mean() if present_samples > 0 else 0

        # "Present in all" is only meaningful when there is at least one sample
        present_in_all = total_samples > 0 and present_samples == total_samples

        results.append({
            'Feature_ID': taxon_id,
            'Present_in_N_sourdough': present_samples,
            'Total_sourdough_samples': total_samples,
            'Presence_%': presence_pct,
            'Mean_abundance_when_present': mean_abundance_present,
            'Relative_abundance_in_sourdough_%': relative_abundance,
            'Present_in_all': 'Yes' if present_in_all else 'No'
        })

    return pd.DataFrame(results, columns=result_columns)

def check_taxa_by_background(feature_table, metadata, top_taxa_ids,
                             backgrounds=('sterile', 'non-sterile')):
    """
    Check presence of top taxa in sourdough by background type

    Parameters:
    -----------
    feature_table : pd.DataFrame
        Feature table with taxa as rows and samples as columns
    metadata : pd.DataFrame
        Metadata with 'sample_type', 'background' and 'sample ID' columns
    top_taxa_ids : list
        List of feature IDs to check; IDs missing from the feature table
        are skipped
    backgrounds : iterable of str
        Values of the 'background' column to report on, in output order

    Returns:
    --------
    pd.DataFrame with one row per (background, taxon) pair. The columns are
    always present, even when nothing could be checked, so callers can merge
    on 'Feature_ID' safely. For a background without samples 'Presence_%' is
    0, 'Mean_abundance' is NaN and 'Present_in_all' is 'No'.
    """
    result_columns = [
        'Feature_ID',
        'Background',
        'Present_in_N',
        'Total_N',
        'Presence_%',
        'Mean_abundance',
        'Present_in_all',
    ]

    # Filter for sourdough only
    sourdough_meta = metadata[metadata['sample_type'] == 'sourdough']

    results = []

    for background in backgrounds:
        bg_meta = sourdough_meta[sourdough_meta['background'] == background]
        # Keep only samples that actually occur in the feature table
        bg_samples = [s for s in bg_meta['sample ID'].tolist()
                      if s in feature_table.columns]
        total = len(bg_samples)

        print(f"\n{background.upper()} background: {total} samples")

        for taxon_id in top_taxa_ids:
            if taxon_id not in feature_table.index:
                continue

            abundances = feature_table.loc[taxon_id, bg_samples]
            present = (abundances > 0).sum()

            results.append({
                'Feature_ID': taxon_id,
                'Background': background,
                'Present_in_N': present,
                'Total_N': total,
                'Presence_%': (present / total) * 100 if total > 0 else 0,
                'Mean_abundance': abundances.mean(),
                # Zero samples must not count as "present in all samples"
                'Present_in_all': 'Yes' if total > 0 and present == total else 'No'
            })

    return pd.DataFrame(results, columns=result_columns)

def create_presence_heatmap(feature_table, metadata, taxonomy, top_taxa_ids, save_path='taxa_presence_heatmap.png'):
    """
    Create a heatmap showing presence/absence of top taxa in each sourdough sample

    Parameters:
    -----------
    feature_table : pd.DataFrame
        Feature table with taxa as rows and samples as columns
    metadata : pd.DataFrame
        Metadata with 'sample_type' and 'sample ID' columns
    taxonomy : pd.DataFrame
        Indexed by feature ID; the 'Taxon' column supplies the row labels
    top_taxa_ids : list
        Feature IDs to show; IDs missing from the feature table are skipped
    save_path : str
        Where to save the figure. A PDF copy is written next to it unless
        save_path itself already ends in '.pdf'

    Returns:
    --------
    pd.DataFrame of 0/1 values (taxa labels x sourdough samples). When there
    are no sourdough samples or no matching taxa, nothing is plotted or saved
    and an empty frame is returned.
    """
    import os

    # Get sourdough samples that are present in the feature table
    sourdough_meta = metadata[metadata['sample_type'] == 'sourdough']
    sourdough_samples = [s for s in sourdough_meta['sample ID'].tolist()
                         if s in feature_table.columns]

    # Create presence/absence matrix
    presence_rows = []
    taxa_labels = []

    for taxon_id in top_taxa_ids:
        if taxon_id not in feature_table.index:
            continue

        # Presence (1) or absence (0) per sourdough sample
        presence = (feature_table.loc[taxon_id, sourdough_samples] > 0).astype(int)
        presence_rows.append(presence.values)

        # Row label: the deepest non-empty level of the lineage string, or a
        # shortened feature ID when no taxonomy is available
        fallback_label = str(taxon_id)[:20]
        if taxon_id in taxonomy.index:
            levels = [part.strip() for part in taxonomy.loc[taxon_id, 'Taxon'].split(';')]
            label = next((lvl for lvl in reversed(levels) if lvl), fallback_label)
        else:
            label = fallback_label
        taxa_labels.append(label)

    # Nothing to draw: return an empty frame instead of failing inside seaborn
    if not presence_rows or not sourdough_samples:
        print("\nNo sourdough samples or top taxa found in the feature table; heatmap skipped.")
        return pd.DataFrame(index=taxa_labels, columns=sourdough_samples, dtype=int)

    presence_df = pd.DataFrame(presence_rows,
                               index=taxa_labels,
                               columns=sourdough_samples)
    n_taxa = len(taxa_labels)

    # Create figure
    fig, ax = plt.subplots(figsize=(20, 8))

    # Pin the colour range to 0..1 so that "present" is always dark green,
    # even when every cell in the matrix has the same value
    sns.heatmap(presence_df, cmap=['white', 'darkgreen'], vmin=0, vmax=1,
                cbar_kws={'label': 'Present'},
                linewidths=0.5, linecolor='lightgray', ax=ax)

    ax.set_xlabel('Sourdough Samples', fontsize=12, fontweight='bold')
    ax.set_ylabel(f'Top {n_taxa} Abundant Taxa', fontsize=12, fontweight='bold')
    ax.set_title(f'Presence/Absence of Top {n_taxa} Most Abundant Taxa in Sourdough Samples',
                 fontsize=14, fontweight='bold', pad=20)

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')

    # Swap only the file extension; replacing the text '.png' anywhere in the
    # path could rewrite a directory name or overwrite the file just saved
    pdf_path = os.path.splitext(save_path)[0] + '.pdf'
    if pdf_path != save_path:
        plt.savefig(pdf_path, bbox_inches='tight')

    print(f"\nHeatmap saved to {save_path}")
    plt.show()

    return presence_df

def plot_abundance_distribution(feature_table, metadata, taxonomy, top_taxa_ids, 
                                save_path='taxa_abundance_distribution.png'):
    """
    Plot abundance distribution of top taxa in sourdough vs other sample types

    Draws one violin-plot panel per taxon (at most 10, on a 2 x 5 grid) with
    the per-sample relative abundance (%) of that taxon on a log scale. Only
    samples where the taxon is non-zero are plotted.

    Parameters:
    -----------
    feature_table : pd.DataFrame
        Feature table with taxa as rows and samples as columns
    metadata : pd.DataFrame
        Metadata with 'sample_type' and 'sample ID' columns
    taxonomy : pd.DataFrame
        Indexed by feature ID; the 'Taxon' column supplies the panel titles
    top_taxa_ids : list
        Feature IDs to plot; only the first 10 are used
    save_path : str
        Where to save the figure. A PDF copy is written next to it unless
        save_path itself already ends in '.pdf'
    """
    import os

    # Colour per sample type, keyed by the raw metadata value
    sample_type_colors = {'sourdough': '#8B4513', 'hand_swabs': '#FFA07A'}

    # Sample lists and per-sample sequencing depths do not depend on the
    # taxon, so compute them once instead of once per panel
    groups = []
    palette = {}
    for sample_type, color in sample_type_colors.items():
        display_name = sample_type.replace('_', ' ').title()
        palette[display_name] = color

        type_meta = metadata[metadata['sample_type'] == sample_type]
        type_samples = [s for s in type_meta['sample ID'].tolist()
                        if s in feature_table.columns]
        if type_samples:
            depths = feature_table[type_samples].sum()
            groups.append((display_name, type_samples, depths))

    fig, axes = plt.subplots(2, 5, figsize=(20, 10))
    axes = axes.flatten()

    panel_taxa = list(top_taxa_ids[:10])

    for idx, ax in enumerate(axes):
        # Hide grid slots that have no taxon to show
        if idx >= len(panel_taxa) or panel_taxa[idx] not in feature_table.index:
            ax.set_visible(False)
            continue

        taxon_id = panel_taxa[idx]

        # Panel title: deepest non-empty lineage level, or a shortened feature ID
        fallback_label = str(taxon_id)[:20]
        if taxon_id in taxonomy.index:
            levels = [part.strip() for part in taxonomy.loc[taxon_id, 'Taxon'].split(';')]
            label = next((lvl for lvl in reversed(levels) if lvl), fallback_label)
        else:
            label = fallback_label

        # Prepare data for plotting: relative abundance (%) per sample,
        # keeping only samples where the taxon is non-zero
        plot_data = []
        for display_name, type_samples, depths in groups:
            rel_abundance = (feature_table.loc[taxon_id, type_samples] / depths) * 100
            plot_data.extend(
                {'Sample_Type': display_name, 'Relative_Abundance_%': val}
                for val in rel_abundance[rel_abundance > 0]
            )

        ax.set_title(f'{label}', fontsize=10, fontweight='bold')

        if not plot_data:
            # Keep the panel so the taxon stays visible, but say why it is empty
            ax.text(0.5, 0.5, 'No non-zero abundances', transform=ax.transAxes,
                    ha='center', va='center', fontsize=9)
            ax.set_xticks([])
            ax.set_yticks([])
            continue

        plot_df = pd.DataFrame(plot_data)

        # A dict palette ties each colour to its sample type, so the colours
        # stay consistent across panels even when one group has no data
        sns.violinplot(data=plot_df, x='Sample_Type', y='Relative_Abundance_%',
                       ax=ax, palette=palette)

        ax.set_yscale('log')
        ax.set_xlabel('')
        ax.set_ylabel('Relative Abundance (%)' if idx % 5 == 0 else '', fontsize=9)
        ax.tick_params(axis='x', rotation=45)

    plt.suptitle(f'Abundance Distribution of Top {len(panel_taxa)} Taxa: Sourdough vs Hand Swabs',
                 fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')

    # Swap only the file extension; replacing the text '.png' anywhere in the
    # path could rewrite a directory name or overwrite the file just saved
    pdf_path = os.path.splitext(save_path)[0] + '.pdf'
    if pdf_path != save_path:
        plt.savefig(pdf_path, bbox_inches='tight')

    print(f"Abundance distribution plot saved to {save_path}")
    plt.show()

# ============================================================================
# MAIN ANALYSIS
# ============================================================================

print("\n" + "="*80)
print("IDENTIFYING TOP 10 MOST ABUNDANT TAXA")
print("="*80)

# Get top 10 abundant taxa overall
top_taxa_df = get_top_abundant_taxa(feature_table, taxonomy, n_top=10)

print("\nTop 10 Most Abundant Taxa (Overall):")
print("-"*80)
print(top_taxa_df.to_string(index=False))

# Save to file
top_taxa_df.to_csv('top_10_abundant_taxa_overall.csv', index=False)
print("\nSaved to: top_10_abundant_taxa_overall.csv")

# Get list of top taxa IDs
top_taxa_ids = top_taxa_df['Feature_ID'].tolist()

print("\n" + "="*80)
print("CHECKING PRESENCE IN SOURDOUGH SAMPLES")
print("="*80)

# Check presence in sourdough
sourdough_presence = check_taxa_in_sourdough(feature_table, meta, top_taxa_ids)

# Merge with taxonomy info
sourdough_results = top_taxa_df[['Rank', 'Feature_ID', 'Taxonomy', 'Relative_Abundance_%']].merge(
    sourdough_presence, on='Feature_ID'
)

print("\nPresence of Top 10 Taxa in Sourdough:")
print("-"*80)
print(sourdough_results.to_string(index=False))

# Save to file
sourdough_results.to_csv('top_10_taxa_presence_in_sourdough.csv', index=False)
print("\nSaved to: top_10_taxa_presence_in_sourdough.csv")

# Summary statistics
# The denominator is the number of taxa actually checked: it is smaller than
# 10 when the table holds fewer than 10 features, so it must not be hard-coded.
n_taxa_checked = len(sourdough_results)

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"\nTaxa present in ALL sourdough samples: {(sourdough_results['Present_in_all'] == 'Yes').sum()}/{n_taxa_checked}")
print(f"Taxa present in >90% of sourdough samples: {(sourdough_results['Presence_%'] > 90).sum()}/{n_taxa_checked}")
print(f"Taxa present in >75% of sourdough samples: {(sourdough_results['Presence_%'] > 75).sum()}/{n_taxa_checked}")
print(f"Taxa present in >50% of sourdough samples: {(sourdough_results['Presence_%'] > 50).sum()}/{n_taxa_checked}")

print("\n" + "="*80)
print("CHECKING PRESENCE BY BACKGROUND TYPE")
print("="*80)

# Check by background
background_presence = check_taxa_by_background(feature_table, meta, top_taxa_ids)

# Merge with taxonomy
background_results = background_presence.merge(
    top_taxa_df[['Feature_ID', 'Rank', 'Taxonomy']], 
    on='Feature_ID'
).sort_values(['Rank', 'Background'])

print("\nPresence by Background Type:")
print("-"*80)
print(background_results[['Rank', 'Taxonomy', 'Background', 'Present_in_N', 
                         'Total_N', 'Presence_%', 'Present_in_all']].to_string(index=False))

# Save to file
background_results.to_csv('top_10_taxa_by_background.csv', index=False)
print("\nSaved to: top_10_taxa_by_background.csv")

print("\n" + "="*80)
print("GENERATING VISUALIZATIONS")
print("="*80)

# Create presence/absence heatmap
presence_matrix = create_presence_heatmap(feature_table, meta, taxonomy, top_taxa_ids)

# Create abundance distribution plot
plot_abundance_distribution(feature_table, meta, taxonomy, top_taxa_ids)

print("\n" + "="*80)
print("ANALYSIS COMPLETE!")
print("="*80)
print("\nFiles generated:")
print("  1. top_10_abundant_taxa_overall.csv")
print("  2. top_10_taxa_presence_in_sourdough.csv")
print("  3. top_10_taxa_by_background.csv")
print("  4. taxa_presence_heatmap.png (and .pdf)")
print("  5. taxa_abundance_distribution.png (and .pdf)")