Load Dependencies

In [None]:
import subprocess
import pandas as pd
# Database location handed to run_blast as its `db` argument, which forwards
# it to the BLAST executable's `-db` option.
db = 'db/mibig_prot_seqs_3.1_db'

def run_blast(query_file, db, output_file, blast_type="blastp", evalue=1e-5, max_target_seqs=10, outfmt="6"):
    """Run a BLAST command-line search and report whether it succeeded.

    The executable named by ``blast_type`` is started as a subprocess with
    its stdout/stderr captured; results are written by the executable itself
    to ``output_file``.

    Args:
        query_file: Path passed to the ``-query`` option.
        db: Database passed to the ``-db`` option.
        output_file: Path passed to the ``-out`` option.
        blast_type: Name (or path) of the executable to run.
        evalue: Value passed to ``-evalue``.
        max_target_seqs: Value passed to ``-max_target_seqs``.
        outfmt: Value passed to ``-outfmt``; may be given as str or int.

    Returns:
        bool: True when the process exits with status 0; False when it exits
        with a non-zero status or could not be started at all.
    """
    blast_command = [
        blast_type,
        "-query", query_file,
        "-db", db,
        "-out", output_file,
        # Stringify so that numeric formats such as outfmt=6 are accepted too.
        "-outfmt", str(outfmt),
        "-evalue", str(evalue),
        "-max_target_seqs", str(max_target_seqs),
    ]

    try:
        result = subprocess.run(blast_command, capture_output=True, text=True)
    except OSError as exc:
        # Raised when the executable is missing or cannot be executed; report
        # it the same way as a failed search instead of dumping a traceback.
        print("Error:", exc)
        return False

    if result.returncode != 0:
        print("Error:", result.stderr)
        return False

    print("BLAST search completed successfully.")
    return True

Define input and parameters, then run the cell

In [None]:
# Drag and drop your input FASTA file into the file browser on the left, then
# put its name here WITHOUT the file extension ('.fasta' is appended below).
query_file = 'example/blast_example'
# Base name for the output files, again WITHOUT an extension
# ('.csv' and '.txt' are appended below).
output_file = 'example/blast_example_out'
# The e-value is the number of hits (alignments) expected to occur by chance
# in a database of a given size; lower values mean more significant matches.
evalue = 1e-5
# Upper limit on the number of aligned sequences returned; higher values
# return more results (up to this maximum).
max_target_seqs = 10

# Options shared by both searches below
search_options = dict(blast_type="blastp", evalue=evalue, max_target_seqs=max_target_seqs)
fasta_path = query_file + '.fasta'

# First run: run_blast's default output format, written to the .csv file
run_blast(fasta_path, db, output_file + '.csv', **search_options)
# Second run: same search with outfmt "0", written to the .txt file
run_blast(fasta_path, db, output_file + '.txt', outfmt="0", **search_options)

View top X results

In [None]:
# Define how many top hits you want to get printed in the console
X = 10

# Column names assigned to the 12 columns of the tabular output, in order
blast_columns = ["query_id", "subject_id", "percent_identity", "alignment_length", "mismatches", "gap_opens", "q_start", "q_end", "s_start", "s_end", "e_value", "bit_score"]

# Load the tab-separated output file into a DataFrame
try:
    results_df = pd.read_csv(output_file + '.csv', sep="\t", header=None)
except pd.errors.EmptyDataError:
    # pandas cannot parse a completely empty file (presumably the search
    # produced no hits); continue with an empty table instead of crashing.
    print("No results found in", output_file + '.csv')
    results_df = pd.DataFrame(columns=blast_columns)
else:
    # Define column names for easy reference
    results_df.columns = blast_columns

# Sort the DataFrame by 'percent_identity' in descending order
results_df_sorted = results_df.sort_values(by='percent_identity', ascending=False)
# Save the sorted results back to a CSV file
results_df_sorted.to_csv(output_file + '_sorted.csv', index=False)
# Display the sorted results
results_df_sorted.head(X)