In [1]:
import os
import pandas as pd
import qiime2 as q2
from qiime2 import Visualization
from qiime2 import Artifact

%matplotlib inline

In [2]:
# Root data folder, given as a path relative to the notebook's working directory.
data = 'Data'

In [3]:
# Folder from which the cells below read the filtered feature table (table-filtered.qza / .qzv).
data_classified = 'Data/classified'

In [4]:
# Folder holding the metadata TSV files; the merged metadata file is written here as well.
data_raw = 'Data/raw'

In [5]:
# Folder for the alpha-diversity outputs (rarefaction visualization, core-metrics results).
data_alpha = 'Data/diversity'

### 1) Alpha rarefaction: choose the sampling depth ###

In [6]:
! qiime diversity alpha-rarefaction \
    --i-table $data_classified/table-filtered.qza \
    --p-max-depth 10000 \
    --p-iterations 10 \
    --m-metadata-file $data_raw/20250913_metadata_ITS.tsv \
    --o-visualization $data_alpha/alpha-rarefaction.qzv

  import pkg_resources
^C

Aborted!
[0m[?25h

In [7]:
# Display the alpha-rarefaction visualization; raises ValueError if the .qzv file does not exist yet.
Visualization.load(f"{data_alpha}/alpha-rarefaction.qzv")

ValueError: Data/diversity/alpha-rarefaction.qzv does not exist.

In [None]:
# Display the visualization of the filtered feature table.
# NOTE(review): presumably used to read off per-sample depths for the sampling-depth choice — confirm.
Visualization.load(f"{data_classified}/table-filtered.qzv")

### 2) We chose a sampling depth of 3000 because it retains 87% of the samples ###


### 3) Core metrics creation with sampling depth 3000 ###

In [None]:
# Compute the core diversity metrics on the filtered table at a sampling depth of 3000;
# all results go to the core-metrics-results folder.
# NOTE(review): depending on the q2cli version, a re-run may fail if the output directory already exists — confirm.
! qiime diversity core-metrics \
  --i-table $data_classified/table-filtered.qza \
  --m-metadata-file $data_raw/20250913_metadata_ITS.tsv \
  --p-sampling-depth 3000 \
  --output-dir $data_alpha/core-metrics-results

### 4) Test data for normal distribution (parametric testing) ###

In [None]:
# Shapiro-Wilk test: check whether the Shannon values are normally distributed.
from scipy.stats import shapiro

shannon_path = f"{data_alpha}/core-metrics-results/shannon_vector.qza"
shannon = q2.Artifact.load(shannon_path).view(pd.Series)

# shapiro() returns the test statistic and the p-value, in that order.
stat, p = shapiro(shannon)
print('Statistics=%.3f, p=%.3f' % (stat, p))
# p > 0.05 means the normality hypothesis is not rejected at the 5 % level.
print(
    "Daten wahrscheinlich normalverteilt"
    if p > 0.05
    else "Daten wahrscheinlich nicht normalverteilt"
)

### 5) Kruskal-Wallis: Non-parametric testing ###

### 5.1) Kruskal-Wallis for the metadata ITS: ###

In [None]:
# Group-significance test of the Shannon vector against the ITS metadata file;
# the result is written to shannon-group-significance.qzv.
! qiime diversity alpha-group-significance \
  --i-alpha-diversity $data_alpha/core-metrics-results/shannon_vector.qza \
  --m-metadata-file $data_raw/20250913_metadata_ITS.tsv \
  --o-visualization $data_alpha/core-metrics-results/shannon-group-significance.qzv

In [None]:
# Display the group-significance visualization for the ITS metadata.
Visualization.load(f"{data_alpha}/core-metrics-results/shannon-group-significance.qzv")

### 5.2) Kruskal-Wallis for the metadata personal environment: ###

**But first, merge the two metadata files**

In [8]:
# Merge the ITS sample metadata with the personal/environmental/sensory metadata
# and write the combined table to merged_output.tsv.
its = pd.read_csv(f"{data_raw}/20250913_metadata_ITS.tsv", sep="\t")

sensory = pd.read_csv(
    f"{data_raw}/20250914_metadata_personal_environmental_sensory_details.tsv",
    sep="\t",
)

# Left merge on 'person-id': every ITS row is kept; rows without a matching
# sensory entry get NaN in the sensory columns.
# validate='many_to_one' raises pandas.errors.MergeError if a person-id occurs
# more than once in `sensory`, which would otherwise silently duplicate ITS rows.
merged = its.merge(
    sensory,
    left_on='person-id',
    right_on='person-id',
    how='left',
    validate='many_to_one',
)
# Save the merged table as a tab-separated file without the pandas row index.
merged.to_csv(f"{data_raw}/merged_output.tsv", sep="\t", index=False)


left_on='person-id' → Wir nehmen die Personen-IDs aus der ITS-Tabelle.
right_on='person-id' → Wir vergleichen sie mit den Personen-IDs aus der Sensory-Tabelle.
how='left' → Wir behalten alle ITS-Samples, auch wenn es keinen passenden Eintrag in der Sensory-Tabelle gibt.
Ergebnis: Eine neue Tabelle (merged), die alle ITS-Samples enthält und die Sensory-Daten dort hinzufügt, wo sie verfügbar sind.

Wichtiger Punkt:

Wenn ein Sample keine Sensory-Daten hat, sind die entsprechenden Spalten NaN (leer).

In [None]:
# Same group-significance test, but with the merged metadata file (merged_output.tsv);
# the result is written to shannon-group-significance-environment.qzv.
! qiime diversity alpha-group-significance \
  --i-alpha-diversity $data_alpha/core-metrics-results/shannon_vector.qza \
  --m-metadata-file $data_raw/merged_output.tsv \
  --o-visualization $data_alpha/core-metrics-results/shannon-group-significance-environment.qzv

In [None]:
# Display the group-significance visualization computed with the merged metadata.
Visualization.load(f"{data_alpha}/core-metrics-results/shannon-group-significance-environment.qzv")

### 6) Alpha Correlation ###

### 6.1) Alpha Correlation for the metadata ITS ###

In [None]:
# Alpha correlation of the Shannon vector against the ITS metadata file.
# NOTE(review): the output is an alpha-correlation result although the file is
# named "shannon-group-significance-numeric.qzv"; the next cell loads it under that name.
! qiime diversity alpha-correlation \
  --i-alpha-diversity $data_alpha/core-metrics-results/shannon_vector.qza \
  --m-metadata-file $data_raw/20250913_metadata_ITS.tsv \
  --o-visualization $data_alpha/core-metrics-results/shannon-group-significance-numeric.qzv

In [None]:
# Display the alpha-correlation visualization for the ITS metadata.
Visualization.load(f"{data_alpha}/core-metrics-results/shannon-group-significance-numeric.qzv")

--> Kategoriale Spalten wurden mit Kruskal-Wallis ausgewertet, numerische Spalten mit Alpha-Correlation.


### 6.2) Alpha Correlation for the metadata personal environment: ###

In [None]:
from qiime2 import Artifact

# Shannon vector from the core-metrics results, viewed as a Series and
# wrapped into a one-column DataFrame so it can be joined.
shannon_series = Artifact.load(
    f"{data_alpha}/core-metrics-results/shannon_vector.qza"
).view(pd.Series)
shannon = shannon_series.to_frame()

# Merged metadata table; its first column becomes the index.
metadata_merged = pd.read_csv(
    f"{data_raw}/merged_output.tsv",
    sep="\t",
    index_col=0,
)

# Inner join on the index: only entries present in both tables are kept.
join = shannon.join(metadata_merged, how='inner')
join.index.name = 'sampleID'
join

### 6.2.1) Spearman correlation for numeric data ###

In [None]:
# Restrict the metadata to numeric columns and align them with the Shannon values.
numeric_metadata = metadata_merged.select_dtypes(include='number')
joined = shannon.join(numeric_metadata, how='inner')

# Spearman coefficient of every column with 'shannon_entropy'; the
# self-correlation entry and undefined (NaN) coefficients are removed.
correlations = (
    joined.corr(method='spearman')['shannon_entropy']
    .drop('shannon_entropy')
    .dropna()
)
correlations

In [None]:
from scipy.stats import spearmanr
from statsmodels.stats.multitest import multipletests

# p-value of the Spearman correlation between Shannon entropy and each numeric column.
p_values = {}
for col in numeric_metadata.columns:
    # Use only the samples where both values are present. spearmanr propagates
    # NaN by default, so a single missing value produced p = NaN and the column
    # was dropped below, although DataFrame.corr (which excludes missing values)
    # had produced a coefficient for it.
    pair = joined[['shannon_entropy', col]].dropna()
    if len(pair) < 3:
        # Too few paired observations for a test; NaN is removed by the dropna below.
        p_values[col] = float('nan')
        continue
    p_values[col] = spearmanr(pair['shannon_entropy'], pair[col]).pvalue

# One row per column: p-value plus the coefficient from `correlations`.
p_values_df = (pd.DataFrame.from_dict(p_values, orient='index', columns=['p_value'])).join(correlations, how = "inner")
p_values_df = p_values_df.dropna(subset=['p_value'])

pvals = p_values_df['p_value'].values

# Benjamini-Hochberg correction across all tested columns.
reject, pvals_corr, _, _ = multipletests(pvals, method='fdr_bh')

p_values_df['p_value_FDR'] = pvals_corr
p_values_df['significant_FDR'] = reject

# Show the result.

p_values_df

### 7) Kruskal Wallis evaluations ###

***7.1) label barcode 1***

In [None]:
# Pairwise Kruskal-Wallis table for 'label_barcode1'.
# NOTE(review): no cell in this notebook writes this CSV; presumably it was
# exported from the group-significance visualization — confirm.
kruskal_barcode1 = pd.read_csv(
    f"{data_alpha}/kruskal-wallis-pairwise-label_barcode1.csv",
    sep=",",
    index_col=0,
)
# Keep the comparisons whose p-value is at most 0.05.
sig = kruskal_barcode1.loc[kruskal_barcode1['p-value'] <= 0.05]


# Number of retained comparisons per 'Group 1' value.
grouped = sig.groupby('Group 1').size()
print(grouped)

***7.2) Plate***

In [None]:
# Pairwise Kruskal-Wallis table for 'plate', reduced to comparisons with p-value <= 0.05.
kruskal_plate = pd.read_csv(
    f"{data_alpha}/kruskal-wallis-pairwise-plate.csv",
    sep=",",
    index_col=0,
)
sig = kruskal_plate.loc[kruskal_plate['p-value'] <= 0.05]
sig

In [None]:
# Count the rows of `sig` per 'Group 1' value.
# `sig` is whatever the most recently executed filter cell assigned, so this
# cell must run directly after the plate cell.
plate = sig.groupby('Group 1').size()
plate

***7.3) DNA extraction***

In [None]:
# Pairwise Kruskal-Wallis table for 'DNA_extraction_plate', reduced to comparisons with p-value <= 0.05.
kruskal_DNA = pd.read_csv(
    f"{data_alpha}/kruskal-wallis-pairwise-DNA_extraction_plate.csv",
    sep=",",
    index_col=0,
)
sig = kruskal_DNA.loc[kruskal_DNA['p-value'] <= 0.05]
sig

***7.4) Project***

In [None]:
# Pairwise Kruskal-Wallis table for 'project', reduced to comparisons with p-value <= 0.05.
kruskal_project = pd.read_csv(
    f"{data_alpha}/kruskal-wallis-pairwise-project.csv",
    sep=",",
    index_col=0,
)
sig = kruskal_project.loc[kruskal_project['p-value'] <= 0.05]
sig

***7.5) Sample type***

In [None]:
# Pairwise Kruskal-Wallis table for 'sample_type', reduced to comparisons with p-value <= 0.05.
kruskal_type = pd.read_csv(
    f"{data_alpha}/kruskal-wallis-pairwise-sample_type.csv",
    sep=",",
    index_col=0,
)
sig = kruskal_type.loc[kruskal_type['p-value'] <= 0.05]
sig

### 8) Kruskal-Wallis: Non-parametric testing only for sourdoughs ###

***8.1) Filtering out sourdough***

In [None]:
# Keep only the samples whose metadata column sample_type equals 'sourdough'
# and write the reduced table to table-filtered-sourdough_only.qza.
! qiime feature-table filter-samples \
  --i-table $data_classified/table-filtered.qza \
  --m-metadata-file $data_raw/merged_output.tsv  \
  --p-where "sample_type='sourdough'" \
  --o-filtered-table $data_classified/table-filtered-sourdough_only.qza

In [None]:
# Core diversity metrics for the sourdough-only table at the same sampling depth (3000);
# results go to a separate core-metrics-results-sourdough_only folder.
! qiime diversity core-metrics \
  --i-table $data_classified/table-filtered-sourdough_only.qza \
  --m-metadata-file $data_raw/merged_output.tsv \
  --p-sampling-depth 3000 \
  --output-dir $data_alpha/core-metrics-results-sourdough_only

In [None]:
# Group-significance test of the sourdough-only Shannon vector against the merged metadata.
! qiime diversity alpha-group-significance \
  --i-alpha-diversity $data_alpha/core-metrics-results-sourdough_only/shannon_vector.qza \
  --m-metadata-file $data_raw/merged_output.tsv \
  --o-visualization $data_alpha/core-metrics-results-sourdough_only/shannon-group-significance-sourdough.qzv

In [None]:
# Display the sourdough-only group-significance visualization.
Visualization.load(f"{data_alpha}/core-metrics-results-sourdough_only/shannon-group-significance-sourdough.qzv")

Notes: conditions that are significant for sourdough only:
1) background (sterile, non-sterile)
2) sd_bake_last time – only two samples, so nominally significant but not meaningful

***8.2.1) Visualization sourdough***

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the metadata (the merged file merged_output.tsv); its first column becomes the index.
metadata = pd.read_csv(f"{data_raw}/merged_output.tsv", sep='\t', index_col=0)

# Sourdough-only Shannon vector as a one-column DataFrame named 'shannon'.
shannon_qza = q2.Artifact.load(f"{data_alpha}/core-metrics-results-sourdough_only/shannon_vector.qza")
shannon_df = shannon_qza.view(pd.Series).to_frame(name='shannon')
shannon_df.index.name = 'SampleID'

shannon_df.head()

In [None]:
# Attach the metadata columns to the Shannon values by index
# (DataFrame.join defaults to a left join, so every Shannon row is kept).
df = shannon_df.join(metadata)
df.head()


In [None]:
!pip install statannot

In [None]:
# TODO: clarify how the sterile / non-sterile boxplot should be presented.
# add_stat_annotation was called without ever being imported (NameError).
from statannot import add_stat_annotation

# Strip surrounding whitespace BEFORE plotting, so the categories drawn on the
# x axis are the same strings that box_pairs refers to.
df['background'] = df['background'].str.strip()

fig, ax = plt.subplots(figsize=(6, 5))
sns.boxplot(x='background', y='shannon', data=df, palette='Set2', ax=ax)
sns.swarmplot(x='background', y='shannon', data=df, color='0.25', ax=ax)

# NOTE(review): 't-test_ind' is a parametric test, while the rest of the notebook
# uses Kruskal-Wallis; confirm that a t-test is intended here.
add_stat_annotation(
    ax,
    data=df,
    x='background',
    y='shannon',
    box_pairs=[("Sterile", "Non sterile")],  # must match the values in df['background'].unique()
    test='t-test_ind',
    text_format='star',
    loc='outside',
    verbose=2
)

ax.set_xlabel('Background', labelpad=15)
ax.set_ylabel('Shannon Diversity', labelpad=15)
ax.set_title('Alpha-Diversity of Background', size=16, pad=15)

plt.show()


***8.2) Alpha-Correlation for sourdough***

In [None]:
from qiime2 import Artifact

# Sourdough-only Shannon vector, viewed as a Series and wrapped into a
# one-column DataFrame so it can be joined.
shannon_series = Artifact.load(
    f"{data_alpha}/core-metrics-results-sourdough_only/shannon_vector.qza"
).view(pd.Series)
shannon = shannon_series.to_frame()

# Merged metadata table; its first column becomes the index.
metadata_merged = pd.read_csv(
    f"{data_raw}/merged_output.tsv",
    sep="\t",
    index_col=0,
)

# Inner join on the index: only entries present in both tables are kept.
join = shannon.join(metadata_merged, how='inner')
join.index.name = 'sampleID'
join

In [None]:
# Restrict the metadata to numeric columns and align them with the sourdough Shannon values.
numeric_metadata = metadata_merged.select_dtypes(include='number')
joined = shannon.join(numeric_metadata, how='inner')

# Spearman coefficient of every column with 'shannon_entropy'; the
# self-correlation entry and undefined (NaN) coefficients are removed.
correlations = (
    joined.corr(method='spearman')['shannon_entropy']
    .drop('shannon_entropy')
    .dropna()
)
correlations

In [None]:
from scipy.stats import spearmanr
from statsmodels.stats.multitest import multipletests

# p-value of the Spearman correlation between Shannon entropy and each numeric column.
p_values = {}
for col in numeric_metadata.columns:
    # Use only the samples where both values are present. spearmanr propagates
    # NaN by default, so a single missing value produced p = NaN and the column
    # was dropped below, although DataFrame.corr (which excludes missing values)
    # had produced a coefficient for it.
    pair = joined[['shannon_entropy', col]].dropna()
    if len(pair) < 3:
        # Too few paired observations for a test; NaN is removed by the dropna below.
        p_values[col] = float('nan')
        continue
    p_values[col] = spearmanr(pair['shannon_entropy'], pair[col]).pvalue

# One row per column: p-value plus the coefficient from `correlations`.
p_values_df = (pd.DataFrame.from_dict(p_values, orient='index', columns=['p_value'])).join(correlations, how = "inner")
p_values_df = p_values_df.dropna(subset=['p_value'])

pvals = p_values_df['p_value'].values

# Benjamini-Hochberg correction across all tested columns.
reject, pvals_corr, _, _ = multipletests(pvals, method='fdr_bh')

p_values_df['p_value_FDR'] = pvals_corr
p_values_df['significant_FDR'] = reject

# Show the result.

p_values_df

In [None]:
# Keep only the rows whose 'significant_FDR' flag is True.
is_significant = p_values_df['significant_FDR'].eq(True)
significant_results = p_values_df.loc[is_significant]
significant_results

# Open question: what exactly does "significant FDR" mean?