Load Dependencies

In [1]:
import subprocess
import pandas as pd
# BLAST database handed to the `-db` option of every search in this notebook.
# NOTE(review): the name suggests MIBiG 3.1 protein sequences -- confirm the files exist under db/.
db = 'db/mibig_prot_seqs_3.1_db'

def run_blast(query_file, db, output_file, blast_type="blastp", evalue=1e-5, max_target_seqs=10, outfmt="6"):
    """Run a BLAST search as a subprocess and report whether it succeeded.

    Parameters
    ----------
    query_file : str or path-like
        Value passed to ``-query``.
    db : str or path-like
        Value passed to ``-db``.
    output_file : str or path-like
        Value passed to ``-out``.
    blast_type : str
        Name (or path) of the executable to launch, e.g. ``"blastp"``.
    evalue : float
        Value passed to ``-evalue``.
    max_target_seqs : int
        Value passed to ``-max_target_seqs``.
    outfmt : str or int
        Value passed to ``-outfmt``.

    Returns
    -------
    bool
        ``True`` when the program exited with return code 0, ``False`` when it
        exited with any other code or could not be launched at all. A message
        is printed in every case.
    """
    # Every argument is stringified so numeric or path-like values
    # (e.g. outfmt=6) do not make subprocess raise a TypeError.
    blast_command = [
        str(blast_type),
        "-query", str(query_file),
        "-db", str(db),
        "-out", str(output_file),
        "-outfmt", str(outfmt),
        "-evalue", str(evalue),
        "-max_target_seqs", str(max_target_seqs)
    ]

    try:
        result = subprocess.run(blast_command, capture_output=True, text=True)
    except OSError as exc:
        # Raised when the executable is missing or not executable; report it
        # the same way as any other failure instead of dumping a traceback.
        print("Error:", f"could not launch '{blast_type}': {exc}")
        return False

    if result.returncode == 0:
        print("BLAST search completed successfully.")
        return True

    print("Error:", result.stderr)
    return False

Define input and parameters, then run the cell

In [3]:
# Drag and drop your input FASTA file into the file browser on the left,
# then enter its name here (without the .fasta extension)
query_file = 'example/blast_example'

# Base name for the output files (without extension; .csv and .txt are appended below)
output_file = 'example/blast_example_out'

# The e-value is the number of hits (alignments) expected to occur by chance in a
# database of a given size; lower e-values indicate more statistically significant matches
evalue = 1e-5

# max_target_seqs caps the number of aligned sequences returned;
# higher values return more results (up to the specified maximum)
max_target_seqs = 10

# Run the search twice: once with outfmt "6" written to <output_file>.csv,
# and once with outfmt "0" written to <output_file>.txt
for extension, fmt in (('.csv', "6"), ('.txt', "0")):
    run_blast(
        query_file + '.fasta',
        db,
        output_file + extension,
        blast_type="blastp",
        evalue=evalue,
        max_target_seqs=max_target_seqs,
        outfmt=fmt,
    )

BLAST search completed successfully.
BLAST search completed successfully.


View top X results

In [4]:
# Number of top hits to display below this cell
X=10

# Column names assigned to the 12 fields of the tab-separated BLAST output, in file order
blast_columns = ["query_id", "subject_id", "percent_identity", "alignment_length", "mismatches", "gap_opens", "q_start", "q_end", "s_start", "s_end", "e_value", "bit_score"]

# Load the tab-separated output file (it has no header row) into a DataFrame
try:
    results_df = pd.read_csv(output_file+'.csv', sep="\t", header=None)
    results_df.columns = blast_columns
except pd.errors.EmptyDataError:
    # An empty output file (e.g. a search that produced no hits) cannot be parsed
    # by read_csv; fall back to an empty table so the rest of the cell still runs.
    print("No BLAST hits found in", output_file+'.csv')
    results_df = pd.DataFrame(columns=blast_columns)

# Sort the hits by 'percent_identity', best first
results_df_sorted = results_df.sort_values(by='percent_identity', ascending=False)
# Save the sorted results to a separate CSV file
results_df_sorted.to_csv(output_file+'_sorted.csv', index=False)
# Display the top X sorted hits
results_df_sorted.head(X)

Unnamed: 0,query_id,subject_id,percent_identity,alignment_length,mismatches,gap_opens,q_start,q_end,s_start,s_end,e_value,bit_score
82,globG,BGC0002072|c1|32857-55326|-|M444_04815|polyket...,58.774,359,143,2,2834,3192,5290,5643,6.479999999999999e-100,364.0
28,globG,BGC0001856|c1|71238-96572|+|QBF51758.1|type_I_...,58.64,897,350,9,1113,2004,34,914,0.0,987.0
83,globG,BGC0002072|c1|32857-55326|-|M444_04815|polyket...,58.038,367,151,2,2834,3200,1472,1835,4.5999999999999996e-89,328.0
22,globG,BGC0002355|c1|108583-131049|+|BAW35658.1|modul...,57.967,364,151,1,2834,3197,1452,1813,2.48e-97,355.0
23,globG,BGC0002355|c1|108583-131049|+|BAW35658.1|modul...,57.3,363,153,2,2837,3197,5312,5674,3.56e-97,355.0
71,globG,BGC0002356|c1|22380-39473|-|BAW35634.1|modular...,56.986,365,154,3,2834,3197,1423,1785,5.23e-97,354.0
53,globG,BGC0002498|c1|142496-159757|+|QQZ01588.1|PKS|Q...,55.707,368,160,2,2834,3200,1429,1794,3.74e-96,351.0
27,globG,BGC0001856|c1|71238-96572|+|QBF51758.1|type_I_...,54.512,1075,438,17,985,2038,1430,2474,0.0,1024.0
13,globG,BGC0002355|c1|108583-131049|+|BAW35658.1|modul...,54.277,2233,910,35,1021,3204,1739,3909,0.0,2034.0
66,globG,BGC0002356|c1|22380-39473|-|BAW35634.1|modular...,53.985,1217,520,18,1113,2309,34,1230,0.0,1186.0
