### Extract information about 
- total ASV
- ASV filtered (right phylum)
- total ASV - black proteus
- ASV filtered (right phylum)
- stats taxa numbers for black

In [2]:
import os
import qiime2 as q2
import pandas as pd
from qiime2.plugins.taxa.methods import filter_table
from qiime2.plugins.metadata.visualizers import tabulate

import sys
sys.path.append(os.path.abspath("./../"))
from utils import *

In [3]:
denoisetable = q2.Artifact.load('./results/denoisetable.qza')
taxonomy = q2.Artifact.load(f'./results/taxonomy_sklearn.qza')

### subset *Proteus anguinus parkelj*

In [4]:
# filter denoisetable, keep only d_Bacteria
denoisetable_ba = filter_table(
    table = denoisetable,
    taxonomy = taxonomy, 
    include = 'd__Bacteria'
)

In [5]:
# view denoisetable as dataframe
denoisetable_df = denoisetable_ba.filtered_table.view(view_type=pd.DataFrame)
# drop white specimen & transpose
denoisetable_b_unfiltered = denoisetable_df.tail(4).T
# set index name
denoisetable_b_unfiltered.index.name = 'Feature ID'
# drop rows with all zeros
denoisetable_b= denoisetable_b_unfiltered.loc[(denoisetable_b_unfiltered != 0).any(axis=1)]
# show
denoisetable_b

Unnamed: 0_level_0,P19A,P20A,P21A,P22A
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3b8bea8f89395d7be2a7a8deecd54615,3.0,0.0,1.0,2.0
8d8b84e4b354cb7622160eab1030b91e,0.0,0.0,0.0,27.0
7ca265bf3fe12de6ce2347e73e17aa5d,0.0,0.0,0.0,84.0
b430cce6841abb6c1f317c8c5a4c4b18,8.0,22.0,0.0,0.0
472a17c774dff39b8c3a79b784ae1b04,32.0,181.0,111.0,5.0
...,...,...,...,...
bce6efa26cef65cf8fa89fc549f8d61e,0.0,0.0,0.0,129.0
190b1e37456385abc5bf83593355e625,0.0,7.0,23.0,0.0
937542d158e2e4fac57f30f3b77fbb87,0.0,0.0,0.0,166.0
2a198b1adea24585658eee9e099df7d9,0.0,48.0,35.0,0.0


In [6]:
# view taxonomy metadata as dataframe
taxonomy_metadata = taxonomy.view(q2.Metadata).to_dataframe()
taxonomy_metadata

Unnamed: 0_level_0,Taxon,Confidence
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1eba318014a80c7ef58c4fe5a25317e7,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.7048761204522636
6fb0ae5e8d7e91acc3481ae385563c37,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9999509990028024
4d4ae718b70724fc25f959b8888ba0c8,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.8440329231396565
b2573224b84130faa841a65345bd6686,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,0.7741138968163481
2d4b6ec0c2f78d209e9f3dea142e80ee,d__Bacteria; p__Planctomycetota; c__Phycisphae...,0.9454181775172625
...,...,...
d944d372f9e6f181de0b1e03964925b9,Unassigned,0.45809593428277495
06c409a280d0b5017226560ee5b53b94,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9247622511411105
d6e8e226b606c9e3a6a571b0276b623c,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.8512655400051062
7b0206c4b953c37bbe8652b6ca55c52e,d__Bacteria; p__Planctomycetota; c__Phycisphae...,0.9899707803532873


In [7]:
# merge denoisetable_b and taxonomy_metadata by Feature ID
merged_df = pd.merge(denoisetable_b, taxonomy_metadata, on="Feature ID")
merged_df

Unnamed: 0_level_0,P19A,P20A,P21A,P22A,Taxon,Confidence
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3b8bea8f89395d7be2a7a8deecd54615,3.0,0.0,1.0,2.0,d__Bacteria; p__Proteobacteria; c__Alphaproteo...,0.9971918421435888
8d8b84e4b354cb7622160eab1030b91e,0.0,0.0,0.0,27.0,d__Bacteria; p__Planctomycetota; c__Phycisphae...,0.9623742647121521
7ca265bf3fe12de6ce2347e73e17aa5d,0.0,0.0,0.0,84.0,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9998776142587156
b430cce6841abb6c1f317c8c5a4c4b18,8.0,22.0,0.0,0.0,d__Bacteria,0.8402281230045965
472a17c774dff39b8c3a79b784ae1b04,32.0,181.0,111.0,5.0,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,0.9999998911376039
...,...,...,...,...,...,...
bce6efa26cef65cf8fa89fc549f8d61e,0.0,0.0,0.0,129.0,d__Bacteria; p__Verrucomicrobiota; c__Verrucom...,0.9997011365646831
190b1e37456385abc5bf83593355e625,0.0,7.0,23.0,0.0,d__Bacteria,0.8034380538982779
937542d158e2e4fac57f30f3b77fbb87,0.0,0.0,0.0,166.0,d__Bacteria; p__Desulfobacterota; c__Desulfuro...,0.9978780512029678
2a198b1adea24585658eee9e099df7d9,0.0,48.0,35.0,0.0,d__Bacteria; p__Proteobacteria; c__Alphaproteo...,0.9000939879511664


In [8]:
# view merged_df as q2.viz
q2.plugins.metadata.visualizers.tabulate(q2.Metadata(merged_df)).visualization

In [9]:
# change column order
cols = merged_df.columns.tolist()
cols = cols[-2:] + cols[:-2]
merged_df = merged_df[cols]
merged_df

Unnamed: 0_level_0,Taxon,Confidence,P19A,P20A,P21A,P22A
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3b8bea8f89395d7be2a7a8deecd54615,d__Bacteria; p__Proteobacteria; c__Alphaproteo...,0.9971918421435888,3.0,0.0,1.0,2.0
8d8b84e4b354cb7622160eab1030b91e,d__Bacteria; p__Planctomycetota; c__Phycisphae...,0.9623742647121521,0.0,0.0,0.0,27.0
7ca265bf3fe12de6ce2347e73e17aa5d,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9998776142587156,0.0,0.0,0.0,84.0
b430cce6841abb6c1f317c8c5a4c4b18,d__Bacteria,0.8402281230045965,8.0,22.0,0.0,0.0
472a17c774dff39b8c3a79b784ae1b04,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,0.9999998911376039,32.0,181.0,111.0,5.0
...,...,...,...,...,...,...
bce6efa26cef65cf8fa89fc549f8d61e,d__Bacteria; p__Verrucomicrobiota; c__Verrucom...,0.9997011365646831,0.0,0.0,0.0,129.0
190b1e37456385abc5bf83593355e625,d__Bacteria,0.8034380538982779,0.0,7.0,23.0,0.0
937542d158e2e4fac57f30f3b77fbb87,d__Bacteria; p__Desulfobacterota; c__Desulfuro...,0.9978780512029678,0.0,0.0,0.0,166.0
2a198b1adea24585658eee9e099df7d9,d__Bacteria; p__Proteobacteria; c__Alphaproteo...,0.9000939879511664,0.0,48.0,35.0,0.0


In [10]:
# prepare new_taxonomy file with taxonomy corrections, same as above

merged_df['Domain'] = merged_df.Taxon.apply(extract_level, level_prefix='d__', null_data='Unassigned')
merged_df['Phylum'] = merged_df.Taxon.apply(extract_level, level_prefix='p__', null_data='Unassigned')
merged_df['Class'] = merged_df.Taxon.apply(extract_level, level_prefix='c__', null_data='Unassigned')
merged_df['Order'] = merged_df.Taxon.apply(extract_level, level_prefix='o__', null_data='Unassigned')
merged_df['Family'] = merged_df.Taxon.apply(extract_level, level_prefix='f__', null_data='Unassigned')
merged_df['Genus'] = merged_df.Taxon.apply(extract_level, level_prefix='g__', null_data='Unassigned')

invalid_words = ['Unknown_Family', "uncultured", "Unassigned"]
unassigned_like_words = ['Sva0081_sediment_group', "Marine_Group_II", 'Sva0081_sediment_group', 'Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium', 'Marine_Group_II', 'WD2101_soil_group', 'WWE3', 'CL500-29_marine_group', 'mle1-7', 'OM27_clade', 'MND1', 'Pir4_lineage','CCM11a', 'TRA3-20', 'hgcI_clade', 'BD2-11_terrestrial_group', 'Ellin6067', 'SC-I-84', 'NB1-j', 'OM182_clade', '[Eubacterium]_eligens_group', 'AKYG587','TM7a', 'OM60(NOR5)_clade', 'CCD24', 'vadinHA49', 'RCP2-54', 'SH-PL14', 'bacteriap25', 'ADurb.Bin063-1', 'NS11-12_marine_group', 'Mitochondria', 'SM1A02', 'Lachnospiraceae_UCG-010']
merged_df = merged_df.apply(backtrace_unassigned, unassigned_like_words=unassigned_like_words, invalid_words=invalid_words, axis=1)

merged_df['Taxon'] = merged_df.apply(reassemble_taxon, axis=1)
merged_df = merged_df.apply(remove_duplicates, axis=1)

columns = ['Phylum', 'Class', 'Order', 'Family', 'Genus']#, 'Species']

new_merged = q2.Artifact.import_data("FeatureData[Taxonomy]", merged_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['Domain'] = merged_df.Taxon.apply(extract_level, level_prefix='d__', null_data='Unassigned')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['Phylum'] = merged_df.Taxon.apply(extract_level, level_prefix='p__', null_data='Unassigned')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

In [11]:
new_merged_df = new_merged.view(view_type = pd.DataFrame)
new_merged_df

Unnamed: 0_level_0,Taxon,Confidence,P19A,P20A,P21A,P22A,Domain,Phylum,Class,Order,Family,Genus
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3b8bea8f89395d7be2a7a8deecd54615,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,0.9971918421435888,3.0,0.0,1.0,2.0,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingobium
8d8b84e4b354cb7622160eab1030b91e,d__Bacteria;p__Planctomycetota;c__Phycisphaera...,0.9623742647121521,0.0,0.0,0.0,27.0,Bacteria,Planctomycetota,Phycisphaerae,Tepidisphaerales,Unclass. Tepidisphaerales WD2101_soil_group,Unclass. Tepidisphaerales WD2101_soil_group
7ca265bf3fe12de6ce2347e73e17aa5d,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,0.9998776142587156,0.0,0.0,0.0,84.0,Bacteria,Proteobacteria,Gammaproteobacteria,Burkholderiales,Comamonadaceae,Unclass. Comamonadaceae
b430cce6841abb6c1f317c8c5a4c4b18,d__Bacteria;p__Unclass. Bacteria;c__Unclass. B...,0.8402281230045965,8.0,22.0,0.0,0.0,Bacteria,Unclass. Bacteria,Unclass. Bacteria,Unclass. Bacteria,Unclass. Bacteria,Unclass. Bacteria
472a17c774dff39b8c3a79b784ae1b04,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,0.9999998911376039,32.0,181.0,111.0,5.0,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides
...,...,...,...,...,...,...,...,...,...,...,...,...
bce6efa26cef65cf8fa89fc549f8d61e,d__Bacteria;p__Verrucomicrobiota;c__Verrucomic...,0.9997011365646831,0.0,0.0,0.0,129.0,Bacteria,Verrucomicrobiota,Verrucomicrobiae,Pedosphaerales,Pedosphaeraceae,Unclass. Pedosphaeraceae
190b1e37456385abc5bf83593355e625,d__Bacteria;p__Unclass. Bacteria;c__Unclass. B...,0.8034380538982779,0.0,7.0,23.0,0.0,Bacteria,Unclass. Bacteria,Unclass. Bacteria,Unclass. Bacteria,Unclass. Bacteria,Unclass. Bacteria
937542d158e2e4fac57f30f3b77fbb87,d__Bacteria;p__Desulfobacterota;c__Desulfuromo...,0.9978780512029678,0.0,0.0,0.0,166.0,Bacteria,Desulfobacterota,Desulfuromonadia,Geobacterales,Geobacteraceae,Unclass. Geobacteraceae
2a198b1adea24585658eee9e099df7d9,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,0.9000939879511664,0.0,48.0,35.0,0.0,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingomonas


In [12]:
for col in columns:
    cases = new_merged_df.query(f"not {col}.str.contains('Unclass.')")[col]
    print(f'{col}: {cases.nunique()}')
    print(cases.unique().tolist())
    print('')

Phylum: 12
['Proteobacteria', 'Planctomycetota', 'Bacteroidota', 'Patescibacteria', 'Firmicutes', 'Acidobacteriota', 'Spirochaetota', 'Verrucomicrobiota', 'Gemmatimonadota', 'Actinobacteriota', 'Desulfobacterota', 'Chloroflexi']

Class: 21
['Alphaproteobacteria', 'Phycisphaerae', 'Gammaproteobacteria', 'Bacteroidia', 'Microgenomatia', 'Clostridia', 'Vicinamibacteria', 'Spirochaetia', 'Bacilli', 'Verrucomicrobiae', 'Gemmatimonadetes', 'Acidimicrobiia', 'Desulfobacteria', 'Planctomycetes', 'Negativicutes', 'Omnitrophia', 'Kapabacteria', 'Anaerolineae', 'Actinobacteria', 'Acidobacteriae', 'Desulfuromonadia']

Order: 41
['Sphingomonadales', 'Tepidisphaerales', 'Burkholderiales', 'Bacteroidales', 'Phycisphaerales', 'Cytophagales', 'Candidatus_Amesbacteria', 'Lachnospirales', 'Vicinamibacterales', 'Pseudomonadales', 'Spirochaetales', 'Legionellales', 'Bacillales', 'Pedosphaerales', 'Caulobacterales', 'Lactobacillales', 'Chitinophagales', 'Gemmatimonadales', 'Verrucomicrobiales', 'Oscillospir