# MBT collection

### How to run antismash

Options
- if not annotated: --genefinding-tool prodigal 
- also run ClusterBlast etc.: --cb-general --cb-knownclusters --cb-subclusters --asf --smcog-trees


Run entire folder
- for FILE in *; do antismash --genefinding-tool prodigal --cb-general --cb-knownclusters --cb-subclusters --asf --smcog-trees "$FILE"; done


Run one file
- antismash --genefinding-tool prodigal --cb-general --cb-knownclusters --cb-subclusters --asf --smcog-trees Hm107.contig-sequences-gapclosed.fa



### Extract BGC types

In [1]:
import os

from glob import glob

from Bio.Seq import Seq
from Bio import SeqIO

In [2]:
def dir_folder(foldername):
    """Return the path of `foldername` inside the current working directory.

    The result is the working directory, a '/' and `foldername` glued
    together as plain strings; no path normalisation is applied.
    """
    return "/".join((os.getcwd(), foldername))


In [3]:
def folders_in_directory(directory):
    """Return the names (not full paths) of the subfolders of `directory`."""
    # Every match of the '/*/' pattern ends with a path separator; it is
    # dropped so that basename() yields the folder name instead of ''.
    return [os.path.basename(match[:-1]) for match in glob(directory + '/*/')]


In [4]:
def predicted_bgc_types(directory, start_string):
    """Map each antiSMASH GenBank file in `directory` to its predicted BGC type.

    Only the files selected by `files_in_directory(directory, start_string)`
    are read. The returned dict is keyed by filename; each value is whatever
    `extract_bgc_type` returns for the parsed record.
    """
    return {
        gbk_name: extract_bgc_type(open_genbank_file(directory, gbk_name))
        for gbk_name in files_in_directory(directory, start_string)
    }


In [12]:
def files_in_directory(directory, start_string):
    """List the GenBank filenames in `directory` that begin with `start_string`.

    A file qualifies when its name starts with `start_string` and ends with
    ".gbk". Names are returned in the order `os.listdir` yields them.
    """
    return [
        entry
        for entry in os.listdir(directory)
        if entry.startswith(start_string) and entry.endswith(".gbk")
    ]


In [6]:
def open_genbank_file(directory, filename):
    """Parse the GenBank file `filename` located in `directory`.

    Returns whatever `SeqIO.read` produces for that file in "genbank" format.
    """
    return SeqIO.read("/".join((directory, filename)), "genbank")


In [7]:
def extract_bgc_type(record):
    """Return the predicted BGC type(s) stored in a parsed GenBank record.

    Args:
        record: an object exposing a `features` iterable whose items have a
            `type` attribute and a `qualifiers` mapping.

    Returns:
        The "product" qualifier of the first feature whose type is
        "cand_cluster". When the record holds no such feature an empty list
        is returned, instead of failing on an unassigned variable.

    Raises:
        KeyError: if the first "cand_cluster" feature has no "product"
            qualifier.
    """
    for feat in record.features:
        if feat.type == "cand_cluster":
            # Only the first candidate cluster is considered.
            return feat.qualifiers["product"]

    return []

In [13]:
# Extract the predicted BGC type from every antiSMASH output folder found
# under one parent folder of the current working directory.

# Parent folder holding the antiSMASH output folders, and the filename prefix
# used to select the GenBank files inside each of them.
# NOTE: dir_folder() already inserts a '/' before the folder name, so the
# leading '/' here yields a double slash in the resulting path.
foldername = "/test"
start_string = "contig"

# Names of all antiSMASH output folders inside the parent folder
subfoldernames = folders_in_directory(dir_folder(foldername))

# Store the predicted types of each BGC of each genome in a dictionary:
# data[subfolder name][GenBank filename] -> predicted BGC type

data = dict()

for subfoldername in subfoldernames:
    directory = dir_folder(foldername + '/' + subfoldername)
    dict_hits = predicted_bgc_types(directory, start_string)
    data[subfoldername] = dict_hits

In [10]:
# Display the collected mapping: subfolder name -> {GenBank filename: predicted BGC type}
data

{'Hm107.contig-sequences-gapclosed': {'contig_00092.region001.gbk': ['lanthipeptide-class-ii'],
  'contig_00003.region001.gbk': ['T3PKS'],
  'contig_00066.region001.gbk': ['butyrolactone'],
  'contig_00020.region001.gbk': ['T2PKS', 'T1PKS', 'PKS-like'],
  'contig_00142.region001.gbk': ['ectoine'],
  'contig_00198.region001.gbk': ['NRPS-like'],
  'contig_00026.region001.gbk': ['PKS-like', 'butyrolactone'],
  'contig_00138.region001.gbk': ['NRPS'],
  'contig_00417.region001.gbk': ['terpene'],
  'contig_00090.region001.gbk': ['terpene', 'NAPAA'],
  'contig_00083.region001.gbk': ['lanthipeptide-class-i'],
  'contig_00038.region001.gbk': ['NRPS', 'terpene'],
  'contig_00005.region001.gbk': ['NRPS'],
  'contig_00029.region001.gbk': ['T3PKS'],
  'contig_00371.region001.gbk': ['lanthipeptide-class-iii'],
  'contig_00170.region001.gbk': ['melanin'],
  'contig_00207.region001.gbk': ['terpene'],
  'contig_00112.region001.gbk': ['NRPS'],
  'contig_00023.region001.gbk': ['siderophore'],
  'contig_0

### Extract most similar known cluster for T2PKS clusters

In [None]:
# Build the path to one knownclusterblast result file below the current
# working directory.
cwd = os.getcwd()
filename = 'contig_00002_c1.txt'
folder = "".join((cwd, '/Hm77.contig-sequences-gapclosed/knownclusterblast'))
filepath = "/".join((folder, filename))

In [None]:
# Read the whole file into a list of lines; every entry keeps its line ending.

count = 0  # NOTE(review): not used by any cell visible here

with open(filepath, 'r') as f:
    knownclusterblast = list(f)

In [None]:
# Extract the best known hit: the entry numbered "1." that follows the
# 'Details:' line of the knownclusterblast output.
# list.index raises ValueError when the file has no 'Details:\n' line.
start = knownclusterblast.index('Details:\n')
hit_bgc_num = ""
hit_bgc_name = ""
hit_bgc_type = ""

# Inspect at most 10 lines, starting at the 'Details:' line. Iterating over a
# slice (rather than indexing start .. start + 9) avoids an IndexError when
# the file ends sooner than that.
for line in knownclusterblast[start:start + 10]:
    if line.startswith('1.'):
        # skip the 3-character prefix (presumably "1. ")
        hit_bgc_num = line[3:].strip()
    elif line.startswith('Source'):
        # skip the 8-character prefix (presumably "Source: ")
        hit_bgc_name = line[8:].strip()
    elif line.startswith('Type'):
        # skip the 6-character prefix (presumably "Type: ")
        hit_bgc_type = line[6:].strip()
        break  # the type is the last field collected for the hit

# Fields that were not found stay empty strings.
dict_hits = {filename: [hit_bgc_num, hit_bgc_name, hit_bgc_type]}

In [None]:
# Show the current contents of dict_hits
print(dict_hits)

In [None]:
# Show the identifier string extracted for the best hit
print(hit_bgc_num)

In [None]:
# Display the raw knownclusterblast lines for manual inspection
knownclusterblast