In [10]:
import os
import pandas as pd
from qiime2 import Visualization
import matplotlib.pyplot as plt
import numpy as np
import re

import qiime2 as q2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Root folder for all inputs and outputs; the paths used in this notebook are built from it.
data_dir = 'data'

In [5]:
# Print the CLI help (parameters, choices and defaults) of `rescript get-unite-data`.
! qiime rescript get-unite-data --help

Usage: [94mqiime rescript get-unite-data[0m [OPTIONS]

  Download and import ITS sequences and taxonomy from the UNITE database,
  given a version number and taxon_group, with the option to select a
  cluster_id and include singletons. Downloads data directly from UNITE's
  PlutoF REST API. NOTE: THIS ACTION ACQUIRES DATA FROM UNITE, which is
  licensed under CC BY-SA 4.0. To learn more, please visit
  https://unite.ut.ee/cite.php and https://creativecommons.org/licenses/by-
  sa/4.0/.

[1mParameters[0m:
  [94m--p-version[0m TEXT [32mChoices('10.0', '9.0', '8.3', '8.2')[0m
                          UNITE version to download.         [35m[default: '10.0'][0m
  [94m--p-taxon-group[0m TEXT [32mChoices('fungi', 'eukaryotes')[0m
                          Download a database with only 'fungi' or including
                          all 'eukaryotes'.            [35m[default: 'eukaryotes'][0m
  [94m--p-cluster-id[0m TEXT [32mChoices('99', '97', 'dynamic')[0m
                

## Train Classifier
### Get Database UNITE (99, no singletons, all eukaryotes)

In [9]:
# This download was already done in the clustering part, so the UNITE database
# artifacts (taxonomy.qza / sequences.qza) are in the folder data/clustering.
# The command is kept commented out for reference only.
#! qiime rescript get-unite-data \
#    --p-version 10.0 \
#    --p-taxon-group eukaryotes \
#    --p-cluster-id 99 \
#    --p-no-singletons \
#    --verbose \
#    --output-dir $data_dir/clustering

[32mSaved FeatureData[Taxonomy] to: data/raw/uniteDB/taxonomy.qza[0m
[32mSaved FeatureData[Sequence] to: data/raw/uniteDB/sequences.qza[0m
[0m

In [12]:
# `rescript cull-seqs` on the reference sequences (currently disabled).
# NOTE(review): these paths point at uniteDB/, while the active cells in this
# section read from clustering/ -- adjust them before re-enabling this step.
#! qiime rescript cull-seqs \
#    --i-sequences $data_dir/uniteDB/sequences.qza \
#    --o-clean-sequences $data_dir/uniteDB/sequences_cleaned.qza

[32mSaved FeatureData[Sequence] to: data/raw/uniteDB/sequences_cleaned.qza[0m
[0m

Remove the species-hypothesis (SH) suffix (`;sh__...`) from the taxonomy strings to reduce the size of the taxonomy file.

In [21]:
! qiime rescript edit-taxonomy \
    --i-taxonomy $data_dir/clustering/taxonomy.qza \
    --o-edited-taxonomy $data_dir/clustering/taxonomy-no-SH.qza \
    --p-search-strings ';sh__.*' \
    --p-replacement-strings '' \
    --p-use-regex

[32mSaved FeatureData[Taxonomy] to: data/raw/uniteDB/taxonomy-no-SH.qza[0m
[0m

In [13]:
# Print the CLI help (inputs, parameters) of `taxa filter-seqs`.
! qiime taxa filter-seqs --help

Usage: [94mqiime taxa filter-seqs[0m [OPTIONS]

  This method filters sequences based on their taxonomic annotations. Features
  can be retained in the result by specifying one or more include search
  terms, and can be filtered out of the result by specifying one or more
  exclude search terms. If both include and exclude are provided, the
  inclusion critera will be applied before the exclusion critera. Either
  include or exclude terms (or both) must be provided.

[1mInputs[0m:
  [94m[4m--i-sequences[0m ARTIFACT [32mFeatureData[Sequence][0m
                          Feature sequences to be filtered.         [35m[required][0m
  [94m[4m--i-taxonomy[0m ARTIFACT [32mFeatureData[Taxonomy][0m
                          Taxonomic annotations for features in the provided
                          feature sequences. All features in the feature
                          sequences must have a corresponding taxonomic
                          annotation. Taxonomic annotations for

In [17]:
! qiime feature-table tabulate-seqs \
    --i-data $data_dir/clustering/sequences.qza \
    --o-visualization $data_dir/clustering/sequences.qzv

[32mSaved Visualization to: data/raw/uniteDB/sequences.qzv[0m
[0m

In [18]:
# Display the visualization written by the preceding tabulate-seqs cell.
# That cell saves to clustering/sequences.qzv, so load it from there; the
# former uniteDB/ path is not produced by any active cell in this notebook.
Visualization.load(f"{data_dir}/clustering/sequences.qzv")

## Taxonomy Classification Evaluation of data classified with pre-trained classifier (EULER)
All code that was run on Euler is in the "Euler2" Jupyter notebook; what follows here is only the evaluation.

In [2]:
! qiime metadata tabulate \
    --m-input-file $data_dir/taxonomy_classification/taxonomy.qza \
    --o-visualization $data_dir/taxonomy_classification/taxonomy.qzv

[32mSaved Visualization to: data/taxonomy_classification/taxonomy.qzv[0m
[0m

In [3]:
# for data with forward and reverse compliment trimmed
! qiime metadata tabulate \ 
    --m-input-file $data_dir/taxonomy_classification/taxonomy_30-10a.qza \
    --o-visualization $data_dir/taxonomy_classification/taxonomy_30-10a.qzv

[32mSaved Visualization to: data/taxonomy_classification/taxonomy_30-10a.qzv[0m
[0m

In [7]:
# Show the tabulated taxonomy of the trimmed data inline.
taxonomy_trimmed_qzv = f"{data_dir}/taxonomy_classification/taxonomy_30-10a.qzv"
Visualization.load(taxonomy_trimmed_qzv)

Looked up some of the unassigned reads with BLAST; the hits included:
- Eubacterium eligens
- Uncultured soil fungus
- Bacteroides uniformis
- Homo sapiens chromosome 5
- Phocaeicola vulgatus strain (Gram-negative bacterium)
- Debaryomyces hansenii (yeast)
- Bacteroides fragilis
- Homo sapiens 3 BAC RP11-575H20
- Mizuhopecten yessoensis (mollusc; German: Weichtier)
- Pseudomonas aeruginosa (bacterium) / Aeromonas veronii
- Caudoviricetes sp. (virus)
- Roseburia intestinalis (bacterium)

In [6]:
# Plot the results
! qiime taxa barplot \
    --i-table $data_dir/dada/dada2_table.qza \
    --i-taxonomy $data_dir/taxonomy_classification/taxonomy.qza \
    --m-metadata-file $data_dir/pundemic_metadata.tsv \
    --o-visualization $data_dir/taxonomy_classification/taxa-barplot.qzv

[32mSaved Visualization to: data/taxonomy_classification/taxa-barplot.qzv[0m
[0m

In [3]:
# Show the taxa bar plot inline.
taxa_barplot_qzv = f"{data_dir}/taxonomy_classification/taxa-barplot.qzv"
Visualization.load(taxa_barplot_qzv)

In [5]:
# Plot the results
# for data with forward and reverse compliment trimmed
! qiime taxa barplot \
    --i-table $data_dir/dada/dada2_table.qza \
    --i-taxonomy $data_dir/taxonomy_classification/taxonomy_30-10a.qza \
    --m-metadata-file $data_dir/pundemic_metadata.tsv \
    --o-visualization $data_dir/taxonomy_classification/taxa-barplot_30-10a.qzv

[32mSaved Visualization to: data/taxonomy_classification/taxa-barplot_30-10a.qzv[0m
[0m

In [6]:
# Show the taxa bar plot of the trimmed data inline.
taxa_barplot_trimmed_qzv = f"{data_dir}/taxonomy_classification/taxa-barplot_30-10a.qzv"
Visualization.load(taxa_barplot_trimmed_qzv)

In [9]:
# Read the tab-separated sample metadata; the first column becomes the index.
metadata_path = f'{data_dir}/pundemic_metadata.tsv'
meta_df = pd.read_csv(
    metadata_path,
    sep='\t',
    index_col=0,
)
meta_df.head()

Unnamed: 0_level_0,patient_id,age,sex,ethnicity,continent,country,region,city,group,disease_subgroup,blinded_clinical_response,puns_per_hour_pre_treatment,puns_per_hour_post_treatment,time_point
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
SRR10505051,1048,36,female,Caucasian,Europe,Switzerland,Zurich,Zurich,Puns,Placebo,NR,9.0,8.0,post-treatment
SRR10505052,1048,36,female,Caucasian,Europe,Switzerland,Zurich,Zurich,Puns,Placebo,NR,9.0,8.0,pre-treatment
SRR10505053,1045,29,male,Caucasian,Europe,Switzerland,Zurich,Zurich,Puns,Placebo,Res,6.0,0.0,pre-treatment
SRR10505054,1045,29,male,Caucasian,Europe,Switzerland,Zurich,Zurich,Puns,Placebo,Res,6.0,0.0,post-treatment
SRR10505055,1044,34,male,Indian Subcontinental,Europe,Switzerland,Zurich,Zurich,Puns,Placebo,,4.0,,pre-treatment


In [26]:
# Run accessions pasted as one unseparated string
# (presumably the samples with poor classification -- confirm where the list came from).
bad_classification_ids_string = "SRR10505118SRR10505099SRR10505121SRR10505142SRR10505109SRR10505113SRR10505131SRR10505070SRR10505122SRR10505127SRR10505084SRR10505059SRR10505078SRR10505083SRR10505089SRR10505077SRR10505134SRR10505107SRR10505090SRR10505076SRR10505106SRR10505148SRR10505075SRR10505065SRR10505132SRR10505112SRR10505081SRR10505068SRR10505062SRR10505152SRR10505066SRR10505133SRR10505074SRR10505116SRR10505114SRR10505103SRR10505126SRR10505064SRR10505151SRR10505136SRR10505056SRR10505135SRR10505146SRR10505108SRR10505141SRR10505147SRR10505101SRR10505123SRR10505069SRR10505117SRR10505067SRR10505053SRR10505079SRR10505104SRR10505143SRR10505115SRR10505097SRR10505140SRR10505061SRR10505087SRR10505098SRR10505138"

# Every accession is 'SRR' followed by digits, so the regex splits the run-together string.
bad_classification_ids_list = re.findall(r'SRR\d+', bad_classification_ids_string)

# Keep the metadata rows whose index is one of those accessions, then write them to CSV.
is_bad_classification = meta_df.index.isin(bad_classification_ids_list)
meta_df_bad_classification = meta_df[is_bad_classification]
meta_df_bad_classification.to_csv(f'{data_dir}/metadata_bad_classification.csv')

meta_df_bad_classification.head()

Unnamed: 0_level_0,patient_id,age,sex,ethnicity,continent,country,region,city,group,disease_subgroup,blinded_clinical_response,puns_per_hour_pre_treatment,puns_per_hour_post_treatment,time_point
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
SRR10505053,1045,29,male,Caucasian,Europe,Switzerland,Zurich,Zurich,Puns,Placebo,Res,6.0,0.0,pre-treatment
SRR10505056,1044,34,male,Indian Subcontinental,Europe,Switzerland,Zurich,Zurich,Puns,Placebo,,4.0,,post-treatment
SRR10505059,1042,40,male,Caucasian,Europe,Switzerland,Zurich,Zurich,Puns,Placebo,,9.0,8.0,post-treatment
SRR10505061,2212,27,male,Caucasian,Europe,Switzerland,Zurich,Zurich,Puns,FMT,NR,8.0,4.0,pre-treatment
SRR10505062,1041,39,female,Caucasian,Europe,Switzerland,Zurich,Zurich,Puns,FMT,Res,6.0,0.0,post-treatment
