In [1]:
from qiime2 import Artifact
import pandas as pd
from Bio.Blast import NCBIWWW, NCBIXML
import time

In [4]:
rep_seqs_qza = Artifact.load('qiime2_blast_data/SRA1038019-rep_seqs.qza') 
taxonomy_qza = Artifact.load('qiime2_blast_data/SRA1038019-taxonomy.qza')

rep_seqs = rep_seqs_qza.view(pd.Series).to_frame(name='Sequence')
rep_seqs.index.name = 'Feature ID'
tax = taxonomy_qza.view(pd.Series)
tax.index.name = 'Feature ID' 

merged = pd.merge(rep_seqs, tax, left_index=True, right_index=True)
merged.reset_index(inplace=True)
unassigned = merged[merged['Taxon'].str.endswith('s__')]
unique_values_count = unassigned.iloc[:, 0].nunique()
first_sequence = unassigned['Sequence'].iloc[0]
print (first_sequence)

GACTACAAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCCTCAGTGTCAGACATACCTTTGTGAGCTGCCTTCGCAATCGGAGTTCTGCGTAATATCTATGCATTTCACCGCTACACTACGCATTCCGCCCACATCATGTACTCTCAAGTCTTACAGTTTCAACGGCAAGGCTGAGGTTGAGCCTCAGCTTTTCACCACTGACTTGCAAGACCACCTGCGCACCCTTTAAACCCAATAAATCCGGATAACGCTCGTATCCTCCGTATTACCGCGGCTGCTGGCACGGAGTTAGCCGATACTTATTCTCGAGCTACTTTCAAACTCCCATCACGTGGAAGTCCTTACTCGCTCGCTAAAGAAGTTTACAATCCTTAGGACATTCTTCCTTCACGCGACTTGGCTGGTTCAGAGTTTCCTCCATTGACCAATATTCCTCACTGCTGCCTCCCGTAGG


In [5]:
def blast_classifier(rep_seqs_path, taxonomy_path, top_n=10):
    '''
    This classifier classifies the sequences by performing BLAST search on the unassigned sequences.
    
    Parameters:
    rep_seqs_path (str): path to the representative sequences file
    taxonomy_path (str): path to the taxonomy file
    top_n (int): number of top alignments to consider each BLAST search
     max_results (int): number of unassigned sequences to process
      
    Returns:
    pd.DataFrame: DataFrame containing BLAST results.  
    '''
    # load in files
    rep_seqs_qza = Artifact.load(rep_seqs_path) 
    taxonomy_qza = Artifact.load(taxonomy_path)

    # convert taxonomy to df
    taxonomy_df = taxonomy_qza.view(pd.DataFrame)

    # convert req seqs to df
    rep_seqs = rep_seqs_qza.view(pd.Series).to_frame(name='Sequence')
    rep_seqs.index.name = 'Feature ID'

    # merge dfs on Feature ID
    merged = pd.merge(rep_seqs, taxonomy_df, left_index=True, right_index=True)
    merged.reset_index(inplace=True)

    # filter for entries without species assignment
    unassigned = merged[merged['Taxon'].str.endswith('s__')]

    # empty list for BLAST results
    blast_results = []

    # perform BLAST for each unassigned sequencec
    for feature_id, row in unassigned.iterrows():
        sequence = row['Sequence']
        
        # BLAST search
        result_handle = NCBIWWW.qblast('blastn', 'nt', sequence)
        blast_record = NCBIXML.read(result_handle)
        if blast_record.alignments:
            for alignment in blast_record.alignments[:top_n]:
                for hsp in alignment.hsps:
                    #taxonomy = row['Taxon']
                    blast_results.append({
                        'Feature ID': feature_id,
                        'expect': hsp.expect,
                        'score': hsp.score,
                        'bits': hsp.bits, 
                        'align_length': hsp.align_length,
                        'identities': hsp.identities, 
                        'positives': hsp.positives,
                        'gaps': hsp.gaps,
                        'query': hsp.query,
                        'sbjct': hsp.sbjct,
                        'match': hsp.match,
                        'staxids': hsp.staxids
                        #'taxonomy': taxonomy  
                    })
                time.sleep(1)

    blast_results_df = pd.DataFrame(blast_results)

    return blast_results_df

# file paths
rep_seqs_path = 'qiime2_blast_data/SRA1038019-rep_seqs.qza'
taxonomy_path = 'qiime2_blast_data/SRA1038019-taxonomy.qza'

# Load the taxonomy artifact
#taxonomy_qza_path = 'qiime2_blast_data/ERA1115542-taxonomy.qza'
#taxonomy_qza = Artifact.load(taxonomy_qza_path)

# Convert the taxonomy artifact to a DataFrame
#taxonomy_df = taxonomy_qza.view(pd.DataFrame)

# Display the first few rows of the taxonomy DataFrame
#taxonomy_df.head()

# Display the first few rows of the taxonomy DataFrame
#print(taxonomy_df.head())

# Save the DataFrame to a CSV file for further inspection if needed
#taxonomy_df.to_csv('taxonomy.csv', index=False)

# Perform BLAST classification and get results
results_df = blast_classifier(rep_seqs_path, taxonomy_path) 

# Display the results
results_df









Unnamed: 0,Feature ID,expect,score,bits,align_length,identities,positives,gaps,query,sbjct,match
0,3,0.0,917.0,828.131,461,460,460,0,GACTACAAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,GACTACCAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,|||||| |||||||||||||||||||||||||||||||||||||||...
1,3,0.0,917.0,828.131,461,460,460,0,GACTACAAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,GACTACCAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,|||||| |||||||||||||||||||||||||||||||||||||||...
2,3,0.0,917.0,828.131,461,460,460,0,GACTACAAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,GACTACCAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,|||||| |||||||||||||||||||||||||||||||||||||||...
3,3,0.0,912.0,823.622,461,459,459,0,GACTACAAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,GACTACCAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,|||||| |||||||||||||||||||||||||||||||||||||||...
4,3,0.0,912.0,823.622,461,459,459,0,GACTACAAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,GACTACCAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,|||||| |||||||||||||||||||||||||||||||||||||||...
...,...,...,...,...,...,...,...,...,...,...,...
339,53,0.0,873.0,788.457,439,438,438,0,GACTACAAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,GACTACCAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,|||||| |||||||||||||||||||||||||||||||||||||||...
340,53,0.0,873.0,788.457,439,438,438,0,GACTACAAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,GACTACCAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,|||||| |||||||||||||||||||||||||||||||||||||||...
341,53,0.0,873.0,788.457,439,438,438,0,GACTACAAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,GACTACCAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,|||||| |||||||||||||||||||||||||||||||||||||||...
342,53,0.0,873.0,788.457,439,438,438,0,GACTACAAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,GACTACCAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,|||||| |||||||||||||||||||||||||||||||||||||||...


In [3]:
min_expect = results_df['expect'].min()
min_expect

NameError: name 'results_df' is not defined

In [None]:
#test

In [6]:
def get_blast_taxonomy(sequence, expect_threshold=0.001):
    result_handle = NCBIWWW.qblast('blastn', 'nt', sequence)
    blast_record = NCBIXML.read(result_handle)
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < expect_threshold:
                taxonomy = f"k__{alignment.hit_def.split(';')[0]}; p__{alignment.hit_def.split(';')[1]}; " \
                           f"c__{alignment.hit_def.split(';')[2]}; o__{alignment.hit_def.split(';')[3]}; " \
                           f"f__{alignment.hit_def.split(';')[4]}; g__{alignment.hit_def.split(';')[5]}; " \
                           f"s__{alignment.hit_def.split(';')[6]}"
                return taxonomy
    return None

# perform BLAST for each sequence and create reference database files
otu_list = []
taxonomy_list = []

for feature_id, sequence in rep_seqs.items():
    taxonomy = get_blast_taxonomy(sequence)
    if taxonomy:
        otu_list.append(feature_id)
        taxonomy_list.append(taxonomy)

# df for OTUs and Taxonomy
otu_df = pd.DataFrame({'Feature ID': otu_list, 'Taxonomy': taxonomy_list})
otu_artifact = Artifact.import_data('FeatureData[Sequence]', otu_df)
taxonomy_artifact = Artifact.import_data('FeatureData[Taxonomy]', otu_df)

# save artifacts
otu_artifact.save('otus.qza')
taxonomy_artifact.save('taxonomy.qza')

ValueError: Error message from NCBI: Message ID#24 Error: Failed to read the Blast query: Protein FASTA provided for nucleotide sequence

In [8]:
#naive bayes
# import necessary plugins
mkdir training-feature-classifiers
cd training-feature-classifiers

qiime tools import \
  --type 'FeatureData[Sequence]' \
  --input-path otus.qza \
  --output-path ref-seqs.qza

qiime tools import \
  --type 'FeatureData[Taxonomy]' \
  --input-path taxonomy.qza \
  --output-path ref-taxonomy.qza

# Train the classifier
qiime feature-classifier fit-classifier-naive-bayes \
  --i-reference-reads ref-seqs.qza \
  --i-reference-taxonomy ref-taxonomy.qza \
  --o-classifier classifier.qza


SyntaxError: invalid syntax (<ipython-input-8-d3da90829954>, line 3)