In [1]:
from qiime2 import Artifact
import pandas as pd
from Bio.Blast import NCBIWWW, NCBIXML
import time

In [2]:
# Locations of the two QIIME 2 artifacts used below
rep_seqs_path = 'qiime2_blast_data/ERA1115542-rep_seqs.qza'
taxonomy_path = 'qiime2_blast_data/ERA1115542-taxonomy.qza'

# Load both artifacts from disk
rep_seqs_qza = Artifact.load(rep_seqs_path)
taxonomy_qza = Artifact.load(taxonomy_path)

# View the taxonomy artifact as a DataFrame
taxonomy_df = taxonomy_qza.view(pd.DataFrame)

# View the representative sequences as a one-column DataFrame ('Sequence')
# whose index is labelled 'Feature ID'
rep_seqs = (
    rep_seqs_qza.view(pd.Series)
    .to_frame(name='Sequence')
    .rename_axis('Feature ID')
)

# Index-on-index merge (default inner join) pairs each sequence with its taxonomy row
merged = rep_seqs.merge(taxonomy_df, left_index=True, right_index=True)

# A 'Taxon' string ending in 's__' has an empty species rank;
# only the first three such rows are kept
species_missing = merged['Taxon'].str.endswith('s__')
unassigned = merged.loc[species_missing].head(3)

# Tunable limits for the BLAST step
MAX_ALIGNMENTS_PER_QUERY = 10  # only the first N alignments of each record are kept
SECONDS_BETWEEN_QUERIES = 1    # pause after each remote request

# Fixed column order for the results table. Passing it to the DataFrame
# constructor keeps the schema stable even when no query returns a hit.
# 'hit_id' and 'hit_def' identify the database sequence that was matched.
BLAST_COLUMNS = [
    'Feature ID', 'expect', 'score', 'bits', 'align_length', 'identities',
    'positives', 'gaps', 'query', 'sbjct', 'match', 'hit_id', 'hit_def',
]

# One dict per HSP, collected across all queried features
blast_results = []

# Run one remote BLAST search per unassigned sequence
for feature_id, row in unassigned.iterrows():
    # str() is a no-op for plain strings; it makes the query text explicit in
    # case the column holds sequence objects rather than str (not verified here).
    sequence = str(row['Sequence'])

    # Submit the query to NCBI (blastn against the 'nt' database) and parse the
    # XML reply; the handle is closed even if parsing fails.
    result_handle = NCBIWWW.qblast('blastn', 'nt', sequence)
    try:
        blast_record = NCBIXML.read(result_handle)
    finally:
        result_handle.close()

    # Slicing an empty list yields nothing, so no explicit emptiness check is needed
    for alignment in blast_record.alignments[:MAX_ALIGNMENTS_PER_QUERY]:
        for hsp in alignment.hsps:
            blast_results.append({
                'Feature ID': feature_id,
                'expect': hsp.expect,
                'score': hsp.score,
                'bits': hsp.bits,
                'align_length': hsp.align_length,
                'identities': hsp.identities,
                'positives': hsp.positives,
                'gaps': hsp.gaps,
                'query': hsp.query,
                'sbjct': hsp.sbjct,
                'match': hsp.match,
                'hit_id': alignment.hit_id,
                'hit_def': alignment.hit_def,
            })

    # Throttle once per remote request, not once per alignment
    time.sleep(SECONDS_BETWEEN_QUERIES)

blast_results_df = pd.DataFrame(blast_results, columns=BLAST_COLUMNS)
print(blast_results_df)

                          Feature ID        expect  score     bits  \
0   2c2ff5f860fa0e6a03a14152ce6ce0ce  3.002850e-61  274.0  248.348   
1   2c2ff5f860fa0e6a03a14152ce6ce0ce  3.002850e-61  274.0  248.348   
2   2c2ff5f860fa0e6a03a14152ce6ce0ce  3.002850e-61  274.0  248.348   
3   2c2ff5f860fa0e6a03a14152ce6ce0ce  3.002850e-61  274.0  248.348   
4   2c2ff5f860fa0e6a03a14152ce6ce0ce  3.002850e-61  274.0  248.348   
5   2c2ff5f860fa0e6a03a14152ce6ce0ce  3.002850e-61  274.0  248.348   
6   2c2ff5f860fa0e6a03a14152ce6ce0ce  3.002850e-61  274.0  248.348   
7   2c2ff5f860fa0e6a03a14152ce6ce0ce  3.002850e-61  274.0  248.348   
8   2c2ff5f860fa0e6a03a14152ce6ce0ce  3.002850e-61  274.0  248.348   
9   2c2ff5f860fa0e6a03a14152ce6ce0ce  3.002850e-61  274.0  248.348   
10  2c2ff5f860fa0e6a03a14152ce6ce0ce  3.002850e-61  274.0  248.348   
11  2c2ff5f860fa0e6a03a14152ce6ce0ce  3.002850e-61  274.0  248.348   
12  2c2ff5f860fa0e6a03a14152ce6ce0ce  3.002850e-61  274.0  248.348   
13  9df50183d1e1d434