In [12]:
from qiime2 import Artifact
import pandas as pd
from Bio.Blast import NCBIWWW, NCBIXML
import time

In [13]:
# Load the representative sequences and the taxonomy assignments for run SRA1038019.
rep_seqs_qza = Artifact.load('qiime2_blast_data/SRA1038019-rep_seqs.qza')
taxonomy_qza = Artifact.load('qiime2_blast_data/SRA1038019-taxonomy.qza')

# View both artifacts through pandas and give their indexes a common name.
rep_seqs = rep_seqs_qza.view(pd.Series).to_frame(name='Sequence')
rep_seqs.index.name = 'Feature ID'
tax = taxonomy_qza.view(pd.Series)
tax.index.name = 'Feature ID'

# Join sequences and taxonomy on the index, then expose 'Feature ID' as a column.
# Chaining reset_index() (instead of inplace=True) keeps the cell idempotent.
merged = pd.merge(rep_seqs, tax, left_index=True, right_index=True).reset_index()

# Features whose lineage string ends in a bare 's__' rank have no species
# assignment. na=False keeps the mask boolean if any Taxon value is missing.
unassigned = merged[merged['Taxon'].str.endswith('s__', na=False)]
unique_values_count = unassigned['Feature ID'].nunique()

# NOTE: despite its name, this holds the *taxon string* of the first unassigned
# feature (not its sequence). Guard against an empty selection, where
# .iloc[0] would raise IndexError.
first_sequence = unassigned['Taxon'].iloc[0] if not unassigned.empty else None
if first_sequence is None:
    print('No features without a species-level assignment were found.')
else:
    print(first_sequence)

k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Porphyromonadaceae; g__Porphyromonas; s__


In [6]:
#def fetch_taxonomy_data(taxon_id): 
    #try: 
        #handle = Entrez.efetch(db="taxonomy", id=taxon_id, retmode="xml") 
        #record = Entrez.read(handle) 
        #return record 
    #except Exception as e: 
        #print(f"Error fetching taxonomy data: {e}") 
        #return None

In [8]:
def blast_classifier(rep_seqs_path, taxonomy_path, top_n=10):
    """BLAST every feature that lacks a species-level assignment against NCBI ``nt``.

    The representative sequences and taxonomy artifacts are loaded, joined on
    their shared index, and every feature whose ``Taxon`` string ends in
    ``'s__'`` is submitted to the NCBI web BLAST service (``blastn``). One
    output row is produced per HSP of the first ``top_n`` alignments.

    Parameters
    ----------
    rep_seqs_path : str
        Path to the representative-sequences ``.qza`` artifact.
    taxonomy_path : str
        Path to the taxonomy ``.qza`` artifact.
    top_n : int, default 10
        Maximum number of alignments kept per query sequence.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Feature ID', 'expect', 'score', 'bits', 'align_length',
        'identities', 'positives', 'gaps', 'query', 'sbjct', 'match',
        'taxonomy'. The columns are present even when no hit was found, so
        downstream code can rely on them.

    Notes
    -----
    Each query is a blocking network round trip, so run time grows with the
    number of unassigned features. Errors raised by ``qblast`` or the XML
    parser propagate to the caller.
    """
    result_columns = [
        'Feature ID', 'expect', 'score', 'bits', 'align_length', 'identities',
        'positives', 'gaps', 'query', 'sbjct', 'match', 'taxonomy',
    ]

    # load in files
    rep_seqs_qza = Artifact.load(rep_seqs_path)
    taxonomy_qza = Artifact.load(taxonomy_path)

    # convert taxonomy to df
    taxonomy_df = taxonomy_qza.view(pd.DataFrame)

    # convert rep seqs to df
    rep_seqs = rep_seqs_qza.view(pd.Series).to_frame(name='Sequence')
    rep_seqs.index.name = 'Feature ID'

    # merge dfs on the shared index and expose it as a 'Feature ID' column
    merged = pd.merge(
        rep_seqs, taxonomy_df, left_index=True, right_index=True
    ).reset_index()

    # filter for entries without species assignment; na=False keeps the mask
    # boolean when a Taxon value is missing
    unassigned = merged[merged['Taxon'].str.endswith('s__', na=False)]

    blast_results = []

    # perform BLAST for each unassigned sequence
    for query_number, (_, row) in enumerate(unassigned.iterrows()):
        # Pause between consecutive requests to the NCBI server. The pause is
        # tied to the number of requests, not to the number of alignments
        # returned, and is skipped before the very first request.
        if query_number:
            time.sleep(1)

        feature_id = row['Feature ID']
        # str() makes the query text explicit whatever object type the
        # artifact view yields for a sequence.
        sequence = str(row['Sequence'])

        # BLAST search; always release the handle, even if parsing fails
        result_handle = NCBIWWW.qblast('blastn', 'nt', sequence)
        try:
            blast_record = NCBIXML.read(result_handle)
        finally:
            result_handle.close()

        for alignment in blast_record.alignments[:top_n]:
            # Text after the last '|' of the hit title, i.e. the description
            # without the leading database identifiers. Same for every HSP of
            # the alignment, so compute it once.
            extracted_title = alignment.title.rsplit('|', 1)[-1].strip()
            for hsp in alignment.hsps:
                blast_results.append({
                    'Feature ID': feature_id,
                    'expect': hsp.expect,
                    'score': hsp.score,
                    'bits': hsp.bits,
                    'align_length': hsp.align_length,
                    'identities': hsp.identities,
                    'positives': hsp.positives,
                    'gaps': hsp.gaps,
                    'query': hsp.query,
                    'sbjct': hsp.sbjct,
                    'match': hsp.match,
                    'taxonomy': extracted_title,
                })

    # Passing the column list guarantees the schema even with zero hits.
    blast_results_df = pd.DataFrame(blast_results, columns=result_columns)

    return blast_results_df

# Input artifacts (representative sequences and taxonomy) for run SRA1038019
rep_seqs_path, taxonomy_path = (
    'qiime2_blast_data/SRA1038019-rep_seqs.qza',
    'qiime2_blast_data/SRA1038019-taxonomy.qza',
)

# --- Disabled scratch code: inspecting the ERA1115542 taxonomy artifact ---
# Load the taxonomy artifact
#taxonomy_qza_path = 'qiime2_blast_data/ERA1115542-taxonomy.qza'
#taxonomy_qza = Artifact.load(taxonomy_qza_path)

# Convert the taxonomy artifact to a DataFrame
#taxonomy_df = taxonomy_qza.view(pd.DataFrame)

# Display the first few rows of the taxonomy DataFrame
#taxonomy_df.head()
#print(taxonomy_df.head())

# Save the DataFrame to a CSV file for further inspection if needed
#taxonomy_df.to_csv('taxonomy.csv', index=False)

# Run the BLAST classification with the default number of alignments per query
results_df = blast_classifier(
    rep_seqs_path=rep_seqs_path,
    taxonomy_path=taxonomy_path,
)

# Last expression of the cell: render the results table
results_df









Unnamed: 0,Feature ID,expect,score,bits,align_length,identities,positives,gaps,query,sbjct,match,taxonomy
0,4900b14ef37cec718bf18a0d92210a03,0.0,917.0,828.131,461,460,460,0,GACTACAAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,GACTACCAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,|||||| |||||||||||||||||||||||||||||||||||||||...,Uncultured bacterium clone ncd2437e12c2 16S ri...
1,4900b14ef37cec718bf18a0d92210a03,0.0,917.0,828.131,461,460,460,0,GACTACAAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,GACTACCAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,|||||| |||||||||||||||||||||||||||||||||||||||...,Uncultured bacterium clone ncd2286f10c1 16S ri...
2,4900b14ef37cec718bf18a0d92210a03,0.0,917.0,828.131,461,460,460,0,GACTACAAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,GACTACCAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,|||||| |||||||||||||||||||||||||||||||||||||||...,Uncultured bacterium clone ncd1245g01c1 16S ri...
3,4900b14ef37cec718bf18a0d92210a03,0.0,912.0,823.622,461,459,459,0,GACTACAAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,GACTACCAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,|||||| |||||||||||||||||||||||||||||||||||||||...,Uncultured bacterium clone ncd2787d04c1 16S ri...
4,4900b14ef37cec718bf18a0d92210a03,0.0,912.0,823.622,461,459,459,0,GACTACAAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,GACTACCAGGGTATCTAATCCTGTTCGCTCCCCACGCTTTCGTGCC...,|||||| |||||||||||||||||||||||||||||||||||||||...,Uncultured bacterium clone ncd2554e10c1 16S ri...
...,...,...,...,...,...,...,...,...,...,...,...,...
349,9186e853f994fd8285d68ccd223cd34d,0.0,873.0,788.457,439,438,438,0,GACTACAAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,GACTACCAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,|||||| |||||||||||||||||||||||||||||||||||||||...,Uncultured organism clone ELU0143-T325-S-NIPCR...
350,9186e853f994fd8285d68ccd223cd34d,0.0,873.0,788.457,439,438,438,0,GACTACAAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,GACTACCAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,|||||| |||||||||||||||||||||||||||||||||||||||...,Levyella massiliensis strain 9403326 16S ribos...
351,9186e853f994fd8285d68ccd223cd34d,0.0,873.0,788.457,439,438,438,0,GACTACAAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,GACTACCAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,|||||| |||||||||||||||||||||||||||||||||||||||...,Uncultured bacterium clone ncd2716a04c1 16S ri...
352,9186e853f994fd8285d68ccd223cd34d,0.0,873.0,788.457,439,438,438,0,GACTACAAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,GACTACCAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACC...,|||||| |||||||||||||||||||||||||||||||||||||||...,Uncultured bacterium clone ncd2714d06c1 16S ri...


In [5]:
def get_best_blast_hits(results_df, expect_threshold=0.001):
    """Select the single best BLAST hit for each feature.

    Rows whose ``expect`` value exceeds ``expect_threshold`` are discarded.
    Among the remaining rows of each ``Feature ID`` the one with the lowest
    ``expect`` is kept; ties are broken by the highest ``bits`` and then by
    original row order.

    Parameters
    ----------
    results_df : pandas.DataFrame
        BLAST hits with at least the columns 'Feature ID', 'expect' and 'bits'.
    expect_threshold : float, default 0.001
        Largest E-value a hit may have to be considered.

    Returns
    -------
    pandas.DataFrame
        One row per feature that has a qualifying hit, ordered by
        'Feature ID'. Original row labels, columns and dtypes are preserved;
        the frame is empty (but keeps the input columns) when nothing
        qualifies.
    """
    if results_df.empty:
        # Nothing to rank (this also covers a column-less frame, on which the
        # column look-ups below would raise KeyError).
        return results_df.copy()

    # Rows with a missing Feature ID belong to no feature and are dropped.
    qualifies = (results_df['expect'] <= expect_threshold) & results_df['Feature ID'].notna()

    # Sort so that, within each feature, the best hit comes first. Selecting
    # by position afterwards (rather than looking a row label up with .loc)
    # stays correct when the index contains duplicate labels.
    ranked = results_df[qualifies].sort_values(
        ['Feature ID', 'expect', 'bits'], ascending=[True, True, False]
    )
    best_hits_df = ranked.drop_duplicates(subset='Feature ID', keep='first')
    return best_hits_df

# Keep, per feature, the top hit among those with an E-value of at most 0.001
best_hits_df = get_best_blast_hits(results_df, 0.001)

# Last expression of the cell: render the selected hits
best_hits_df


Unnamed: 0,Feature ID,expect,taxonomy
0,4900b14ef37cec718bf18a0d92210a03,0.0,Uncultured bacterium clone ncd2437e12c2 16S ri...


In [9]:
# Show the full, untruncated hit title of the first BLAST result row
print(results_df['taxonomy'].iat[0])

gi|322196871|gb|JF211466.1| Uncultured bacterium clone ncd2437e12c2 16S ribosomal RNA gene, partial sequence


In [6]:
# Download the reference sequences and the matching taxonomy file
# (the "85_otus" files from the QIIME 2 feature-classifier training tutorial data)
!wget -O "85_otus.fasta" "https://data.qiime2.org/2024.5/tutorials/training-feature-classifiers/85_otus.fasta"
!wget -O "85_otu_taxonomy.txt" "https://data.qiime2.org/2024.5/tutorials/training-feature-classifiers/85_otu_taxonomy.txt"

# Import the downloaded reference sequences into a QIIME 2 artifact
!qiime tools import \
  --type 'FeatureData[Sequence]' \
  --input-path 85_otus.fasta \
  --output-path 85_otus.qza

# Import the downloaded taxonomy into a QIIME 2 artifact
# (the input is declared as a headerless TSV taxonomy file)
!qiime tools import \
  --type 'FeatureData[Taxonomy]' \
  --input-format HeaderlessTSVTaxonomyFormat \
  --input-path 85_otu_taxonomy.txt \
  --output-path 85_otu_taxonomy.qza

# Verify the imports by peeking at the two artifacts
!qiime tools peek 85_otus.qza
!qiime tools peek 85_otu_taxonomy.qza

ValueError: Error message from NCBI: Message ID#24 Error: Failed to read the Blast query: Protein FASTA provided for nucleotide sequence

In [8]:
#load the existing database taxonomy
existing_taxonomy_path = '85_otu_taxonomy.qza'
existing_sequences_path = '85_otus.qza'
existing_taxonomy_qza = Artifact.load(existing_taxonomy_path)
existing_sequences_qza = Artifact.load(existing_sequences_path)

#convert to df
existing_taxonomy_df = existing_taxonomy_qza.view(pd.DataFrame)
existing_sequences_df = existing_sequences_qza.view(pd.Series).to_frame(name='Sequence')
existing_sequences_df.index.name = 'Feature ID'

best_hits_df = pd.read_csv('best_blast_hits.csv')

#combine taxonomy
best_hits_df = best_hits_df[['Feature ID', 'taxonomy']]
best_hits_df.columns = ['Feature ID', 'Taxon']  # Rename for consistency
combined_taxonomy_df = pd.concat([existing_taxonomy_df, best_hits_df], ignore_index=True)
combined_taxonomy_df.to_csv('combined_taxonomy.csv', index=False)

#combine sequences
combined_sequences_df = pd.concat([existing_sequences_df, best_hits_df[['Feature ID', 'Sequence']]], ignore_index=False)
combined_sequences_df.to_csv('combined_sequences.fasta', index=True, header=False)


#import combined taxonomy
!qiime tools import \
  --type 'FeatureData[Taxonomy]' \
  --input-format HeaderlessTSVTaxonomyFormat \
  --input-path combined_taxonomy.csv \
  --output-path combined_taxonomy.qza

#import combined sequences
!qiime tools import \
  --type 'FeatureData[Sequence]' \
  --input-path combined_sequences.fasta \
  --output-path combined_sequences.qza

#extract reference reads
!qiime feature-classifier extract-reads \
  --i-sequences combined_sequences.qza \
  --p-f-primer GTGCCAGCMGCCGCGGTAA \
  --p-r-primer GGACTACHVGGGTWTCTAAT \
  --p-trunc-len 120 \
  --o-reads ref-seqs.qza

SyntaxError: invalid syntax (<ipython-input-8-d3da90829954>, line 3)

In [None]:
# Train a naive Bayes classifier from the extracted reference reads
# (ref-seqs.qza) and the combined taxonomy (combined_taxonomy.qza);
# the trained classifier is written to classifier.qza
!qiime feature-classifier fit-classifier-naive-bayes \
  --i-reference-reads ref-seqs.qza \
  --i-reference-taxonomy combined_taxonomy.qza \
  --o-classifier classifier.qza