In [1]:
from qiime2 import Artifact
import pandas as pd
from Bio.Blast import NCBIWWW, NCBIXML
import time

In [None]:
def blast_classifier(rep_seqs_path, taxonomy_path, top_n=10, max_results=1):
    '''
    BLAST the representative sequences whose taxonomy lacks a species assignment.

    Sequences and taxonomy are loaded from QIIME 2 artifacts and joined on
    Feature ID. Every feature whose 'Taxon' string ends with 's__' is submitted
    to NCBI BLAST (blastn against nt) through Biopython, so this function needs
    network access and can take a long time per sequence.

    Parameters:
    rep_seqs_path (str): path to the representative sequences file
    taxonomy_path (str): path to the taxonomy file
    top_n (int): number of top alignments to consider for each BLAST search
    max_results (int): number of unassigned sequences to process

    Returns:
    pd.DataFrame: one row per HSP of the considered alignments, with the
        columns 'Feature ID', 'expect', 'score', 'bits', 'align_length',
        'identities', 'positives', 'gaps', 'query', 'sbjct', 'match' and
        'taxonomy' (the feature's existing Taxon string). The columns are
        present even when no hit was found.
    '''
    # fixed column order so an empty result still has the expected schema
    result_columns = [
        'Feature ID', 'expect', 'score', 'bits', 'align_length', 'identities',
        'positives', 'gaps', 'query', 'sbjct', 'match', 'taxonomy',
    ]

    # load in files
    rep_seqs_qza = Artifact.load(rep_seqs_path)
    taxonomy_qza = Artifact.load(taxonomy_path)

    # convert taxonomy to df
    taxonomy_df = taxonomy_qza.view(pd.DataFrame)

    # convert rep seqs to df
    rep_seqs = rep_seqs_qza.view(pd.Series).to_frame(name='Sequence')
    rep_seqs.index.name = 'Feature ID'

    # merge dfs on Feature ID
    merged = pd.merge(rep_seqs, taxonomy_df, left_index=True, right_index=True)

    # filter for entries without species assignment; na=False keeps the mask
    # strictly boolean when a Taxon value is missing
    lacks_species = merged['Taxon'].str.endswith('s__', na=False)
    unassigned = merged[lacks_species].head(max_results)

    # empty list for BLAST results
    blast_results = []

    # perform BLAST for each unassigned sequence
    for request_number, (feature_id, row) in enumerate(unassigned.iterrows()):
        # pause once between consecutive requests (not once per alignment)
        if request_number:
            time.sleep(1)

        taxonomy = row['Taxon']

        # BLAST search; str() so the query is sent as plain text whatever
        # sequence object the artifact view produced
        result_handle = NCBIWWW.qblast('blastn', 'nt', str(row['Sequence']))
        try:
            blast_record = NCBIXML.read(result_handle)
        finally:
            result_handle.close()

        for alignment in blast_record.alignments[:top_n]:
            for hsp in alignment.hsps:
                blast_results.append({
                    'Feature ID': feature_id,
                    'expect': hsp.expect,
                    'score': hsp.score,
                    'bits': hsp.bits,
                    'align_length': hsp.align_length,
                    'identities': hsp.identities,
                    'positives': hsp.positives,
                    'gaps': hsp.gaps,
                    'query': hsp.query,
                    'sbjct': hsp.sbjct,
                    'match': hsp.match,
                    'taxonomy': taxonomy,
                })

    return pd.DataFrame(blast_results, columns=result_columns)

# file paths
rep_seqs_path = 'qiime2_blast_data/ERA1115542-rep_seqs.qza'
taxonomy_path = 'qiime2_blast_data/ERA1115542-taxonomy.qza'

# Load the taxonomy artifact. It is the same file the classifier reads, so the
# path is reused instead of repeating the literal; the alias keeps the
# `taxonomy_qza_path` name defined.
taxonomy_qza_path = taxonomy_path
taxonomy_qza = Artifact.load(taxonomy_qza_path)

# Convert the taxonomy artifact to a DataFrame
taxonomy_df = taxonomy_qza.view(pd.DataFrame)

# Display the first few rows of the taxonomy DataFrame. print() is required
# here: only the last expression of a cell is displayed automatically.
print(taxonomy_df.head())

# Save the DataFrame to a CSV file for further inspection if needed
# (keep the index: it holds the Feature IDs)
#taxonomy_df.to_csv('taxonomy.csv')

# Perform BLAST classification and get results (network-bound, slow)
results_df = blast_classifier(rep_seqs_path, taxonomy_path)

# Display the results
results_df









                                                                              Taxon  \
Feature ID                                                                            
25cef8e738313df56dded85a4e57dbc5  k__Bacteria; p__Firmicutes; c__Bacilli; o__Bac...   
2c2ff5f860fa0e6a03a14152ce6ce0ce  k__Bacteria; p__Actinobacteria; c__Actinobacte...   
6722fabb0f59d2d6942dffb65c8ac0c3  k__Bacteria; p__Actinobacteria; c__Actinobacte...   
5299a0de540765765e98253cd86f65dd  k__Bacteria; p__Actinobacteria; c__Actinobacte...   
0203a5fdabe513861a8936651ef4019f  k__Bacteria; p__Firmicutes; c__Bacilli; o__Lac...   

                                          Confidence  
Feature ID                                            
25cef8e738313df56dded85a4e57dbc5  0.7605489472822277  
2c2ff5f860fa0e6a03a14152ce6ce0ce  0.8228727992486198  
6722fabb0f59d2d6942dffb65c8ac0c3  0.9900719420545704  
5299a0de540765765e98253cd86f65dd  0.9332780981959647  
0203a5fdabe513861a8936651ef4019f  0.9999994549609157  


In [None]:
#test

In [None]:
def get_blast_taxonomy(sequence, expect_threshold=0.001):
    """Derive a seven-rank taxonomy string from the best usable BLAST hit.

    The sequence is submitted to NCBI BLAST (blastn against nt), which needs
    network access. Alignments are scanned in the order BLAST returned them;
    the first one that has an HSP with an e-value below ``expect_threshold``
    and whose ``hit_def`` splits on ';' into at least seven fields is used.

    NOTE(review): this assumes ``hit_def`` carries a ';'-separated
    kingdom-to-species lineage. Hits without seven fields are skipped rather
    than raising IndexError -- confirm the database in use provides such
    definitions.

    Parameters:
    sequence (str): nucleotide sequence to query
    expect_threshold (float): exclusive upper bound on an HSP's e-value

    Returns:
    str or None: 'k__..; p__..; c__..; o__..; f__..; g__..; s__..' built from
        the first seven fields (surrounding whitespace stripped), or None when
        no alignment qualifies.
    """
    rank_prefixes = ('k', 'p', 'c', 'o', 'f', 'g', 's')

    result_handle = NCBIWWW.qblast('blastn', 'nt', sequence)
    try:
        blast_record = NCBIXML.read(result_handle)
    finally:
        result_handle.close()

    for alignment in blast_record.alignments:
        # split once instead of once per rank
        lineage = [field.strip() for field in alignment.hit_def.split(';')]
        if len(lineage) < len(rank_prefixes):
            continue  # not a full lineage; try the next hit
        if any(hsp.expect < expect_threshold for hsp in alignment.hsps):
            # zip() stops after the seventh rank, ignoring any extra fields
            return '; '.join(
                f'{prefix}__{name}' for prefix, name in zip(rank_prefixes, lineage)
            )
    return None

# perform BLAST for each sequence and create reference database files
# `rep_seqs` only exists as a local inside blast_classifier, so load the
# representative sequences here to keep this cell runnable on a fresh kernel
rep_seqs = Artifact.load(rep_seqs_path).view(pd.Series)

otu_list = []
taxonomy_list = []

for request_number, (feature_id, sequence) in enumerate(rep_seqs.items()):
    # pause between consecutive requests to the NCBI server
    if request_number:
        time.sleep(1)
    taxonomy = get_blast_taxonomy(str(sequence))
    if taxonomy:
        otu_list.append(feature_id)
        taxonomy_list.append(taxonomy)

# df for OTUs and Taxonomy
otu_df = pd.DataFrame({'Feature ID': otu_list, 'Taxonomy': taxonomy_list})

# FeatureData[Sequence] must be built from the sequences themselves, not from
# the taxonomy table: keep only the features that received a taxonomy
classified_seqs = rep_seqs.loc[otu_list]

# NOTE(review): FeatureData[Taxonomy] is expected to need a frame indexed by
# 'Feature ID' with a 'Taxon' column -- confirm against the installed q2-types
taxonomy_table = pd.DataFrame(
    {'Taxon': taxonomy_list},
    index=pd.Index(otu_list, name='Feature ID'),
)

otu_artifact = Artifact.import_data('FeatureData[Sequence]', classified_seqs)
taxonomy_artifact = Artifact.import_data('FeatureData[Taxonomy]', taxonomy_table)

# save artifacts
otu_artifact.save('otus.qza')
taxonomy_artifact.save('taxonomy.qza')

In [None]:
%%bash
# naive bayes
# The cell magic above makes Jupyter run this cell as a shell script; without
# it the commands are parsed as Python and fail with a SyntaxError.
set -euo pipefail

# otus.qza and taxonomy.qza are already QIIME 2 artifacts, so they must not be
# passed through `qiime tools import` (which expects raw data files).
# Copy them to the reference file names used for training instead.
cp otus.qza ref-seqs.qza
cp taxonomy.qza ref-taxonomy.qza

# Train the classifier
qiime feature-classifier fit-classifier-naive-bayes \
  --i-reference-reads ref-seqs.qza \
  --i-reference-taxonomy ref-taxonomy.qza \
  --o-classifier classifier.qza
