# MBT collection

### Extract BGC types
Extract the predicted BGC type(s) of each cluster from the antiSMASH output folders.

In [1]:
import os
import pandas as pd

from glob import glob
from collections import Counter

import my_functions as fun

In [2]:
# Run this cell to use the "test" folder inside the current working directory as input.
cwd = fun.cur_dir()
cwd_subfolders = fun.folders_in_directory2(cwd)

# Keep every subfolder whose path ends in "test"; if several match, the last one
# is used, and if none match the placeholder string 'not found' is kept.
test_folders = [folder for folder in cwd_subfolders if folder.endswith("test")]
dir_input = test_folders[-1] if test_folders else 'not found'

print(dir_input)

C:\Users\mandy\Documents\Programming\mbt-collection\test


In [3]:
# run this cell to add a manual input folder
# NOTE: when executed, this replaces whatever dir_input an earlier cell assigned.
# The path below is an absolute, machine-specific location; adjust it before
# running the notebook on another system.
dir_input = "C:\\Users\\mandy\\OneDrive - Universiteit Leiden\\Data\\MBT collection\\Antismash results"

print(dir_input)

C:\Users\mandy\OneDrive - Universiteit Leiden\Data\MBT collection\Antismash results


In [4]:
# Collect the predicted BGC types of every BGC in all input subfolders.

# list of folders to search through
input_subfolders = fun.folders_in_directory2(dir_input)

# Key: subfolder name (used as the strain name).
# Value: whatever fun.predicted_bgc_types returns for that folder; judging by the
# displayed output this is a dict of region .gbk filename -> list of BGC types.
dict_bgc_types = {
    os.path.basename(folder): fun.predicted_bgc_types(folder)
    for folder in input_subfolders
}


In [5]:
# Extract the number of BGCs of each type present in each strain.
# Result: dict_all[strain] = {bgc type: number of regions carrying that type}

dict_all = dict()   # initialize dictionary to store data

for strain, bgcs in dict_bgc_types.items():
    # Each BGC maps to a list of predicted types (a hybrid region has several),
    # so flatten the per-BGC lists into one stream of type labels and tally them
    # in a single pass. A hybrid region therefore counts once for each of its types.
    type_counts = Counter(
        bgc_type
        for region_types in bgcs.values()
        for bgc_type in region_types
    )

    # store as a plain dict (keys stay in first-seen order)
    dict_all[strain] = dict(type_counts)
    

In [6]:
# Build the count table: one column per strain (outer keys of dict_all), one row
# per BGC type. A type that is absent from a strain is filled in as 0.
df_bgcs = pd.DataFrame(dict_all).fillna(0)


In [7]:
# Swap rows and columns of the table in place of the old name.
# NOTE: this cell is not idempotent - running it a second time flips the table back.
df_bgcs = df_bgcs.T

In [8]:
# Read 'dmDoxo.txt' into a dict: the first whitespace-separated token of each line
# is the key, the remainder of the line is the value.
with open('dmDoxo.txt') as f:
    # Skip blank lines (e.g. a trailing empty line): they split into an empty list,
    # which dict() rejects with a ValueError.
    dict_dmDoxo = dict(
        line.rstrip().split(None, 1) for line in f if line.strip()
    )

# The values are kept as the strings read from the file (no numeric conversion).
# Assigning a Series aligns on the row index of df_bgcs, so rows whose label is
# not a key of dict_dmDoxo receive NaN in the new column.
df_bgcs["dmDoxo"] = pd.Series(dict_dmDoxo)

In [9]:
dict_dmDoxo
# NOTE: the values shown here are strings; they should be converted to integers
# before being used numerically.

{'MBT1': '4',
 'MBT3': '5',
 'MBT5': '0',
 'MBT7': '0',
 'MBT10': '0',
 'MBT12': '4',
 'MBT13': '0',
 'MBT14': '0',
 'MBT21': '0',
 'MBT34': '4',
 'MBT37': '5',
 'MBT38': '0',
 'MBT39': '0',
 'MBT44': '5',
 'MBT47': '0',
 'Hm77': '0',
 'Hm107': '4',
 'Hm108': '4',
 'Hm111': '4',
 'MBT63': '0',
 'MBT64': '0',
 'Hm129': '4',
 'Hm155': '5',
 'MBT74': '5',
 'MBT77': '0',
 'MBT82': '0',
 'MBT90': '0',
 'DSM40127': '5',
 'Go-475': '5',
 'Streptomyces-scopuliridis': '0',
 'CS057': '5',
 'MBT27': '0'}

In [10]:
# Display the table of BGC-type counts together with the added dmDoxo column.
df_bgcs

Unnamed: 0,T1PKS,NRPS,NRPS-like,lanthipeptide-class-iv,transAT-PKS,T3PKS,melanin,RiPP-like,terpene,T2PKS,...,linaridin,transAT-PKS-like,nucleoside,thioamide-NRP,phenazine,guanidinotides,redox-cofactor,thioamitides,prodigiosin,dmDoxo
CS057,5.0,10.0,3.0,1.0,1.0,2.0,2.0,2.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
DSM40127,16.0,21.0,8.0,0.0,0.0,3.0,0.0,2.0,6.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
Go-475,6.0,6.0,3.0,0.0,0.0,1.0,2.0,2.0,8.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
Hm107,3.0,8.0,3.0,0.0,0.0,2.0,1.0,2.0,6.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
Hm108,4.0,7.0,2.0,0.0,0.0,2.0,1.0,2.0,6.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
Hm111,3.0,7.0,3.0,0.0,0.0,2.0,1.0,2.0,6.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
Hm129,3.0,8.0,3.0,0.0,0.0,2.0,1.0,2.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
Hm155,4.0,7.0,3.0,0.0,0.0,2.0,1.0,2.0,7.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
Hm77,6.0,7.0,1.0,0.0,0.0,0.0,1.0,1.0,6.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
MBT1,16.0,6.0,4.0,0.0,0.0,1.0,1.0,2.0,8.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


In [11]:
# Show the rows whose dmDoxo value is zero.
# The column is compared numerically, so the filter works whether the values are
# stored as strings ('0') or as numbers (0); values that cannot be parsed, and
# missing values, become NaN and are simply not selected.
df_bgcs[pd.to_numeric(df_bgcs["dmDoxo"], errors="coerce") == 0]

Unnamed: 0,T1PKS,NRPS,NRPS-like,lanthipeptide-class-iv,transAT-PKS,T3PKS,melanin,RiPP-like,terpene,T2PKS,...,linaridin,transAT-PKS-like,nucleoside,thioamide-NRP,phenazine,guanidinotides,redox-cofactor,thioamitides,prodigiosin,dmDoxo
Hm77,6.0,7.0,1.0,0.0,0.0,0.0,1.0,1.0,6.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
MBT10,10.0,13.0,3.0,0.0,0.0,0.0,1.0,2.0,6.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
MBT13,8.0,12.0,3.0,0.0,0.0,1.0,2.0,1.0,7.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
MBT14,0.0,7.0,5.0,0.0,0.0,1.0,1.0,3.0,7.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
MBT21,3.0,9.0,1.0,0.0,0.0,0.0,2.0,2.0,7.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
MBT27,2.0,20.0,5.0,2.0,0.0,3.0,1.0,4.0,6.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
MBT38,16.0,5.0,4.0,0.0,0.0,1.0,1.0,2.0,7.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
MBT39,28.0,24.0,8.0,0.0,0.0,4.0,2.0,6.0,13.0,2.0,...,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
MBT47,2.0,14.0,7.0,0.0,0.0,1.0,1.0,5.0,5.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
MBT5,13.0,5.0,4.0,0.0,0.0,1.0,1.0,2.0,7.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [12]:
# Display the raw nested dictionary of predicted BGC types (large output).
dict_bgc_types

{'CS057': {'NEVF01000001.1.region001.gbk': ['T1PKS', 'NRPS'],
  'NEVF01000001.1.region002.gbk': ['NRPS-like', 'lanthipeptide-class-iv'],
  'NEVF01000002.1.region001.gbk': ['transAT-PKS'],
  'NEVF01000002.1.region002.gbk': ['NRPS', 'T3PKS'],
  'NEVF01000002.1.region003.gbk': ['melanin'],
  'NEVF01000002.1.region004.gbk': ['NRPS', 'T1PKS'],
  'NEVF01000002.1.region005.gbk': ['T1PKS'],
  'NEVF01000003.1.region001.gbk': ['RiPP-like'],
  'NEVF01000003.1.region002.gbk': ['NRPS', 'T1PKS'],
  'NEVF01000003.1.region003.gbk': ['NRPS', 'NRPS-like'],
  'NEVF01000003.1.region004.gbk': ['terpene'],
  'NEVF01000004.1.region001.gbk': ['terpene'],
  'NEVF01000005.1.region001.gbk': ['RiPP-like'],
  'NEVF01000006.1.region001.gbk': ['T2PKS', 'oligosaccharide', 'NRPS'],
  'NEVF01000006.1.region002.gbk': ['siderophore'],
  'NEVF01000010.1.region001.gbk': ['terpene'],
  'NEVF01000012.1.region001.gbk': ['lanthipeptide-class-iii'],
  'NEVF01000012.1.region002.gbk': ['melanin'],
  'NEVF01000012.1.region003.gbk'

In [13]:
# NOTE(review): `dict_data` is not defined in any visible cell, so this raises
# NameError on a fresh kernel. Replace it with the intended variable or delete
# this cell - TODO confirm which dictionary was meant.
dict_data

NameError: name 'dict_data' is not defined

In [None]:
# NOTE(review): `stop` is not defined in any visible cell; evaluating it raises
# NameError. Presumably this is a deliberate way to halt "Run All" before the
# cells below - confirm.
stop

### Extract most similar known cluster for T2PKS clusters

In [None]:
# Build the path to one knownclusterblast result file below the working directory.
filename = 'contig_00002_c1.txt'
cwd = os.getcwd()
folder = cwd + '/Hm77.contig-sequences-gapclosed/knownclusterblast'
# the parts are joined with a forward slash, exactly as in the folder string above
filepath = '/'.join([folder, filename])

In [None]:
# Read the whole file into a list with one entry per line (newlines are kept).

count = 0   # not used by any of the visible cells

with open(filepath, 'r') as handle:
    knownclusterblast = list(handle)

In [None]:
# Extract the best known hit from the knownclusterblast lines.
# Raises ValueError if the file contains no line that is exactly 'Details:'.
start = knownclusterblast.index('Details:\n')
hit_bgc_num = ""
hit_bgc_name = ""
hit_bgc_type = ""

# Only the 10 lines starting at 'Details:' are inspected. Slicing (instead of
# indexing start .. start+9) avoids an IndexError when the file ends sooner.
for entry in knownclusterblast[start:start + 10]:
    if entry.startswith('1.'):
        # presumably drops a "1. " rank prefix - verify against the file format
        hit_bgc_num = entry[3:].strip()
    if entry.startswith('Source'):
        # presumably drops a "Source: " label - verify against the file format
        hit_bgc_name = entry[8:].strip()
    if entry.startswith('Type'):
        # presumably drops a "Type: " label - verify against the file format
        hit_bgc_type = entry[6:].strip()
        break   # nothing after the Type line is read

# Store the hit under the file name; a field that was not found stays "".
dict_hits = {filename: [hit_bgc_num, hit_bgc_name, hit_bgc_type]}

In [None]:
# Show the extracted hit: {file name: [hit number, hit name, hit type]}.
print(dict_hits)

In [None]:
# Print the identifier of the best hit (the whole string).
print(hit_bgc_num)

In [None]:
# Display the raw lines of the knownclusterblast file for inspection.
knownclusterblast