### Extract information about:
- total ASVs
- ASVs after filtering (right phylum)
- total ASVs - black proteus
- ASVs after filtering (right phylum) - black proteus
- taxa-count stats for black proteus

In [20]:
import qiime2 as q2
import pandas as pd
import re
from qiime2.plugins.taxa.methods import filter_table
from qiime2.plugins.metadata.visualizers import tabulate

In [37]:
# define functions
# extract the name of one taxonomic rank from a taxonomy string using a regular expression
def extract_level(tax, level_prefix, null_data='/'):
    """Return the name stored under one rank prefix of a taxonomy string.

    Parameters
    ----------
    tax : str
        Taxonomy string with ';'-separated ranks,
        e.g. 'd__Bacteria; p__Proteobacteria'.
    level_prefix : str
        Rank prefix to look for, e.g. 'p__'. It is matched literally.
    null_data : str
        Value returned when the prefix does not occur in `tax`.

    Returns
    -------
    str
        Text that follows the first occurrence of `level_prefix`, up to the
        next ';' or the end of the string, or `null_data` if nothing matches.
    """
    # re.escape keeps prefixes containing regex metacharacters literal.
    # Inside the character class, ';', '|', '^' and '$' are all literal
    # characters that terminate the name.
    pattern = fr'({re.escape(level_prefix)}[^;|^$]*)(?=;|$)'
    match = re.search(pattern, tax)
    if match is None:
        return null_data

    # The match always starts with the prefix, so slice it off instead of
    # using str.replace, which would also delete later occurrences of the
    # prefix inside the name itself.
    return match.group(1)[len(level_prefix):]

# relabel one pass of placeholder rank names; returns None when no usable name exists
def _relabel_placeholders(values, placeholders, keep_original):
    usable = [val for val in values if val not in placeholders]
    if not usable:
        return None

    # NOTE(review): the fallback name is the LAST non-placeholder value of the
    # whole row, not the nearest rank above each placeholder - confirm this is
    # intended when a lower rank is named but a higher one is not.
    valid_name = usable[-1]
    relabelled = []
    for val in values:
        if val not in placeholders:
            relabelled.append(val)
        elif keep_original:
            relabelled.append(f"Unclass. {valid_name} {val}")
        else:
            relabelled.append(f"Unclass. {valid_name}")
    return relabelled


# replace placeholder rank names with "Unclass. <name>" labels
def backtrace_unassigned(row, unassigned_like_words, invalid_words):
    """Relabel placeholder names in the six rank columns of `row`.

    Two passes are applied in order:

    1. values listed in `invalid_words` become ``"Unclass. <name>"``;
    2. values listed in `unassigned_like_words` become
       ``"Unclass. <name> <original value>"``.

    In each pass `<name>` is the last value of the row that is not in that
    pass's word list. A pass that finds no such value stops the processing
    and leaves the row as it is at that point (previously this case was
    handled by a bare ``except: pass``, which also hid every other error).

    Parameters
    ----------
    row : pandas.Series
        Must contain 'Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus'.
        It is modified in place.
    unassigned_like_words : list of str
        Names that are kept but prefixed with "Unclass. <name>".
    invalid_words : list of str
        Names that are replaced entirely.

    Returns
    -------
    pandas.Series
        The same `row` object.

    Raises
    ------
    KeyError
        If one of the rank columns is missing from `row`.
    """
    columns = ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus']

    for placeholders, keep_original in ((invalid_words, False), (unassigned_like_words, True)):
        relabelled = _relabel_placeholders(list(row[columns].values), placeholders, keep_original)
        if relabelled is None:
            return row
        row[columns] = relabelled

    return row

# rebuild the ';'-separated taxon string from the six rank columns
def reassemble_taxon(row):
    """Join the rank columns of `row` into one prefixed taxonomy string.

    Each of 'Domain', 'Phylum', 'Class', 'Order', 'Family' and 'Genus' is
    prefixed with its rank marker ('d__', 'p__', ...) and the six pieces are
    joined with ';' (no spaces).
    """
    columns = ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus']
    prefixes = ['d__', 'p__', 'c__', 'o__', 'f__', 'g__']

    labelled = []
    for prefix, name in zip(prefixes, row[columns].values):
        labelled.append(prefix + name)
    return ";".join(labelled)


# mark rank names that repeat a higher rank as unclassified
def remove_duplicates(row):
    """Prefix repeated rank names in `row` with "Unclass. ".

    The rank columns are scanned from 'Domain' down to 'Genus'. The first
    (highest-rank) occurrence of a name is kept; every later occurrence of
    the same name becomes ``"Unclass. <name>"``. Values that already contain
    "Unclass." and non-string values are left untouched.

    The previous implementation relied on ``np`` without numpy being imported
    and hid the resulting NameError behind a bare ``except``, so it never
    changed anything. It also passed the name to ``re.sub`` as an unescaped
    pattern. This version compares whole values and needs neither numpy nor
    regular expressions.

    Parameters
    ----------
    row : pandas.Series
        Must contain 'Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus'.
        It is modified in place.

    Returns
    -------
    pandas.Series
        The same `row` object.

    Raises
    ------
    KeyError
        If one of the rank columns is missing from `row`.
    """
    columns = ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus']

    seen = set()
    deduplicated = []
    for val in row[columns].values:
        if isinstance(val, str) and "Unclass." not in val:
            if val in seen:
                val = f"Unclass. {val}"
            else:
                seen.add(val)
        deduplicated.append(val)

    row[columns] = deduplicated
    return row

In [18]:
# load the denoised feature table and the sklearn taxonomy assignments
taxonomy = q2.Artifact.load('./results/taxonomy_sklearn.qza')
denoisetable = q2.Artifact.load('./results/denoisetable.qza')

# Stats for the black specimens

In [21]:
# restrict the feature table to features whose taxonomy matches 'd__Bacteria'
denoisetable_ba = filter_table(table=denoisetable, taxonomy=taxonomy, include='d__Bacteria')

In [22]:
# view the bacteria-only feature table as a DataFrame
denoisetable_df = denoisetable_ba.filtered_table.view(view_type=pd.DataFrame)
# keep only the black-specimen samples (drops the white specimens) and transpose.
# Samples are selected by ID rather than by position (previously `tail(4)`),
# so a reordered or extended table raises a KeyError instead of silently
# selecting the wrong samples.
black_samples = ['P19A', 'P20A', 'P21A', 'P22A']
denoisetable_b_unfiltered = denoisetable_df.loc[black_samples].T
# set index name
denoisetable_b_unfiltered.index.name = 'Feature ID'
# drop features that are zero in every black-specimen sample
denoisetable_b = denoisetable_b_unfiltered.loc[(denoisetable_b_unfiltered != 0).any(axis=1)]
# show
denoisetable_b

Unnamed: 0_level_0,P19A,P20A,P21A,P22A
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
b2573224b84130faa841a65345bd6686,8.0,13.0,3.0,2.0
38f3ccd8398955efb5c3fc543fffafb1,0.0,12.0,12.0,0.0
f04382e21599286007894f6b6749e547,0.0,0.0,0.0,23.0
19ccf2abfc4b4bed2b27e0708ec92c82,0.0,0.0,0.0,263.0
2f7e8ee13a8524af9a4e356bb0ad3d39,0.0,0.0,813.0,0.0
...,...,...,...,...
654554701c159ca465d2c658562fe954,0.0,0.0,0.0,154.0
c31fee29aee2e5edf707b17b6ef41f54,0.0,0.0,0.0,34.0
7b0206c4b953c37bbe8652b6ca55c52e,0.0,0.0,0.0,20.0
06301024de87837658c116d1a7bdc1f8,0.0,0.0,0.0,52.0


In [23]:
# convert the taxonomy artifact to QIIME 2 metadata, then to a DataFrame
taxonomy_as_metadata = taxonomy.view(q2.Metadata)
taxonomy_metadata = taxonomy_as_metadata.to_dataframe()
taxonomy_metadata

Unnamed: 0_level_0,Taxon,Confidence
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1eba318014a80c7ef58c4fe5a25317e7,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.7048761204522636
6fb0ae5e8d7e91acc3481ae385563c37,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9999509990028024
4d4ae718b70724fc25f959b8888ba0c8,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.8440329231396565
b2573224b84130faa841a65345bd6686,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,0.7741138968163481
2d4b6ec0c2f78d209e9f3dea142e80ee,d__Bacteria; p__Planctomycetota; c__Phycisphae...,0.9454181775172625
...,...,...
d944d372f9e6f181de0b1e03964925b9,Unassigned,0.45809593428277495
06c409a280d0b5017226560ee5b53b94,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9247622511411105
d6e8e226b606c9e3a6a571b0276b623c,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.8512655400051062
7b0206c4b953c37bbe8652b6ca55c52e,d__Bacteria; p__Planctomycetota; c__Phycisphae...,0.9899707803532873


In [27]:
# join the black-specimen counts with their taxonomy on Feature ID;
# only features present in both tables are kept (inner join)
merged_df = denoisetable_b.merge(taxonomy_metadata, how="inner", on="Feature ID")
merged_df

Unnamed: 0_level_0,P19A,P20A,P21A,P22A,Taxon,Confidence
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
b2573224b84130faa841a65345bd6686,8.0,13.0,3.0,2.0,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,0.7741138968163481
38f3ccd8398955efb5c3fc543fffafb1,0.0,12.0,12.0,0.0,d__Bacteria; p__Planctomycetota; c__Planctomyc...,0.9733552597667586
f04382e21599286007894f6b6749e547,0.0,0.0,0.0,23.0,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9442858547691642
19ccf2abfc4b4bed2b27e0708ec92c82,0.0,0.0,0.0,263.0,d__Bacteria; p__Desulfobacterota; c__Desulfoba...,0.7767584598602223
2f7e8ee13a8524af9a4e356bb0ad3d39,0.0,0.0,813.0,0.0,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9999844123885453
...,...,...,...,...,...,...
654554701c159ca465d2c658562fe954,0.0,0.0,0.0,154.0,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.7106313753247304
c31fee29aee2e5edf707b17b6ef41f54,0.0,0.0,0.0,34.0,d__Bacteria; p__Planctomycetota; c__Phycisphae...,0.9758808629991288
7b0206c4b953c37bbe8652b6ca55c52e,0.0,0.0,0.0,20.0,d__Bacteria; p__Planctomycetota; c__Phycisphae...,0.9899707803532873
06301024de87837658c116d1a7bdc1f8,0.0,0.0,0.0,52.0,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9999394803332947


In [28]:
# render merged_df as a QIIME 2 metadata visualization
merged_metadata = q2.Metadata(merged_df)
tabulate(merged_metadata).visualization

In [29]:
# move the last two columns (Taxon, Confidence) in front of the sample columns
original_order = list(merged_df.columns)
cols = original_order[-2:] + original_order[:-2]
merged_df = merged_df[cols]
merged_df

Unnamed: 0_level_0,Taxon,Confidence,P19A,P20A,P21A,P22A
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
b2573224b84130faa841a65345bd6686,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,0.7741138968163481,8.0,13.0,3.0,2.0
38f3ccd8398955efb5c3fc543fffafb1,d__Bacteria; p__Planctomycetota; c__Planctomyc...,0.9733552597667586,0.0,12.0,12.0,0.0
f04382e21599286007894f6b6749e547,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9442858547691642,0.0,0.0,0.0,23.0
19ccf2abfc4b4bed2b27e0708ec92c82,d__Bacteria; p__Desulfobacterota; c__Desulfoba...,0.7767584598602223,0.0,0.0,0.0,263.0
2f7e8ee13a8524af9a4e356bb0ad3d39,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9999844123885453,0.0,0.0,813.0,0.0
...,...,...,...,...,...,...
654554701c159ca465d2c658562fe954,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.7106313753247304,0.0,0.0,0.0,154.0
c31fee29aee2e5edf707b17b6ef41f54,d__Bacteria; p__Planctomycetota; c__Phycisphae...,0.9758808629991288,0.0,0.0,0.0,34.0
7b0206c4b953c37bbe8652b6ca55c52e,d__Bacteria; p__Planctomycetota; c__Phycisphae...,0.9899707803532873,0.0,0.0,0.0,20.0
06301024de87837658c116d1a7bdc1f8,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,0.9999394803332947,0.0,0.0,0.0,52.0


In [40]:
# prepare new_taxonomy file with taxonomy corrections, same as above

# split the Taxon string into one column per rank; ranks that are absent
# from the string are filled with 'Unassigned'
rank_prefixes = {
    'Domain': 'd__',
    'Phylum': 'p__',
    'Class': 'c__',
    'Order': 'o__',
    'Family': 'f__',
    'Genus': 'g__',
}
for rank, prefix in rank_prefixes.items():
    merged_df[rank] = merged_df.Taxon.apply(extract_level, level_prefix=prefix, null_data='Unassigned')

# names that carry no information and are replaced entirely
invalid_words = ['Unknown_Family', "uncultured", "Unassigned"]
# clade/group labels that are kept but prefixed with "Unclass. <name>"
# (duplicate entries of the original list removed)
unassigned_like_words = [
    'Sva0081_sediment_group',
    'Marine_Group_II',
    'Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium',
    'WD2101_soil_group',
    'WWE3',
    'CL500-29_marine_group',
    'mle1-7',
    'OM27_clade',
    'MND1',
    'Pir4_lineage',
    'CCM11a',
    'TRA3-20',
    'hgcI_clade',
    'BD2-11_terrestrial_group',
    'Ellin6067',
    'SC-I-84',
    'NB1-j',
    'OM182_clade',
    '[Eubacterium]_eligens_group',
    'AKYG587',
    'TM7a',
    'OM60(NOR5)_clade',
    'CCD24',
    'vadinHA49',
    'RCP2-54',
    'SH-PL14',
    'bacteriap25',
    'ADurb.Bin063-1',
    'NS11-12_marine_group',
    'Mitochondria',
    'SM1A02',
    'Lachnospiraceae_UCG-010',
]
merged_df = merged_df.apply(
    backtrace_unassigned,
    unassigned_like_words=unassigned_like_words,
    invalid_words=invalid_words,
    axis=1,
)

merged_df['Taxon'] = merged_df.apply(reassemble_taxon, axis=1)
# NOTE(review): Taxon is rebuilt before remove_duplicates runs, so any
# relabelling done by remove_duplicates is not reflected in the Taxon string -
# confirm whether these two steps should be swapped.
merged_df = merged_df.apply(remove_duplicates, axis=1)

# ranks reported in the per-rank statistics
columns = ['Phylum', 'Class', 'Order', 'Family', 'Genus']#, 'Species']

new_merged = q2.Artifact.import_data("FeatureData[Taxonomy]", merged_df)

In [41]:
# read the imported taxonomy artifact back as a DataFrame
new_merged_df = new_merged.view(pd.DataFrame)
new_merged_df

Unnamed: 0_level_0,Taxon,Confidence,P19A,P20A,P21A,P22A,Domain,Phylum,Class,Order,Family,Genus
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
b2573224b84130faa841a65345bd6686,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,0.7741138968163481,8.0,13.0,3.0,2.0,Bacteria,Bacteroidota,Bacteroidia,Cytophagales,Spirosomaceae,Arcicella
38f3ccd8398955efb5c3fc543fffafb1,d__Bacteria;p__Planctomycetota;c__Planctomycet...,0.9733552597667586,0.0,12.0,12.0,0.0,Bacteria,Planctomycetota,Planctomycetes,Planctomycetales,Unclass. Planctomycetales,Unclass. Planctomycetales
f04382e21599286007894f6b6749e547,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,0.9442858547691642,0.0,0.0,0.0,23.0,Bacteria,Proteobacteria,Gammaproteobacteria,Salinisphaerales,Solimonadaceae,Unclass. Solimonadaceae
19ccf2abfc4b4bed2b27e0708ec92c82,d__Bacteria;p__Desulfobacterota;c__Desulfobact...,0.7767584598602223,0.0,0.0,0.0,263.0,Bacteria,Desulfobacterota,Desulfobacteria,Desulfobacterales,Desulfosarcinaceae,Unclass. Desulfosarcinaceae Sva0081_sediment_g...
2f7e8ee13a8524af9a4e356bb0ad3d39,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,0.9999844123885453,0.0,0.0,813.0,0.0,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas
...,...,...,...,...,...,...,...,...,...,...,...,...
654554701c159ca465d2c658562fe954,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,0.7106313753247304,0.0,0.0,0.0,154.0,Bacteria,Proteobacteria,Gammaproteobacteria,Burkholderiales,Oxalobacteraceae,Unclass. Oxalobacteraceae
c31fee29aee2e5edf707b17b6ef41f54,d__Bacteria;p__Planctomycetota;c__Phycisphaera...,0.9758808629991288,0.0,0.0,0.0,34.0,Bacteria,Planctomycetota,Phycisphaerae,Tepidisphaerales,Unclass. Tepidisphaerales WD2101_soil_group,Unclass. Tepidisphaerales WD2101_soil_group
7b0206c4b953c37bbe8652b6ca55c52e,d__Bacteria;p__Planctomycetota;c__Phycisphaera...,0.9899707803532873,0.0,0.0,0.0,20.0,Bacteria,Planctomycetota,Phycisphaerae,Tepidisphaerales,Unclass. Tepidisphaerales WD2101_soil_group,Unclass. Tepidisphaerales WD2101_soil_group
06301024de87837658c116d1a7bdc1f8,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,0.9999394803332947,0.0,0.0,0.0,52.0,Bacteria,Proteobacteria,Gammaproteobacteria,Burkholderiales,Nitrosomonadaceae,Unclass. Nitrosomonadaceae MND1


In [42]:
# per rank: count and list the distinct names that are not "Unclass." labels
for col in columns:
    # 'Unclass.' is interpreted as a regular expression by str.contains,
    # so the trailing '.' matches any character
    is_unclassified = new_merged_df[col].str.contains('Unclass.')
    cases = new_merged_df.loc[~is_unclassified, col]
    print(f'{col}: {cases.nunique()}')
    print(cases.unique().tolist())
    print('')

Phylum: 12
['Bacteroidota', 'Planctomycetota', 'Proteobacteria', 'Desulfobacterota', 'Firmicutes', 'Actinobacteriota', 'Acidobacteriota', 'Verrucomicrobiota', 'Spirochaetota', 'Gemmatimonadota', 'Chloroflexi', 'Patescibacteria']

Class: 21
['Bacteroidia', 'Planctomycetes', 'Gammaproteobacteria', 'Desulfobacteria', 'Alphaproteobacteria', 'Phycisphaerae', 'Clostridia', 'Bacilli', 'Acidimicrobiia', 'Kapabacteria', 'Vicinamibacteria', 'Negativicutes', 'Verrucomicrobiae', 'Spirochaetia', 'Actinobacteria', 'Omnitrophia', 'Acidobacteriae', 'Gemmatimonadetes', 'Anaerolineae', 'Desulfuromonadia', 'Microgenomatia']

Order: 41
['Cytophagales', 'Planctomycetales', 'Salinisphaerales', 'Desulfobacterales', 'Pseudomonadales', 'Pirellulales', 'Methylococcales', 'Reyranellales', 'Phycisphaerales', 'Lachnospirales', 'Burkholderiales', 'Bacillales', 'Sphingomonadales', 'Gammaproteobacteria_Incertae_Sedis', 'Microtrichales', 'Bacteroidales', 'Kapabacteriales', 'Vicinamibacterales', 'Flavobacteriales', 'Ve