# Summary
Get antiSMASH classes for the relevant MIBiG BGCs.

In [20]:
import glob
import json
import os
import time
import urllib
import urllib.error
import urllib.request
from copy import deepcopy

In [2]:
# Folder with MIBiG JSON files (not referenced by the cells shown here).
mibig_folder = "/mnt/scratch/louwe015/mibig_json_2.0/"

# The classes table sits one directory above the classifications folder;
# dropping the trailing slash first lets dirname step up one level.
base_path = "/mnt/scratch/louwe015/NPLinker/classifying/mibig_classifications/"
out_base = os.path.dirname(base_path[:-1])
mibig_classes = os.path.join(
    out_base, "All_MIBiG_compounds_with_BGC_CF_NPC_classes.txt")
classes_file_found = os.path.isfile(mibig_classes)
print("mibig_classes exists is", classes_file_found)

mibig_classes exists is True


## Read input classes file

In [24]:
# Parse the tab-separated classes table into {compound id: remaining columns}.
mibig_dict = {}
with open(mibig_classes) as inf:
    header = inf.readline()
    for line in inf:
        # Remove only the line terminator: a plain strip() would also eat
        # trailing tabs and silently drop empty columns at the end of a row,
        # shifting the row out of alignment with the header.
        row = line.rstrip('\r\n')
        if not row.strip():
            continue  # skip blank lines instead of storing an '' key
        fields = row.split('\t')
        chem_id = fields.pop(0)
        mibig_dict[chem_id] = fields
print(header.strip())
list(mibig_dict.items())[:2]

compound_name	class:subclass	smiles	inchi_key	kingdom	superclass	class	subclass	direct_parent	class_results	superclass_results	pathway_results	isglycoside


[('BGC0000001_abyssomicin C',
  ['Polyketide:Other',
   'CC1C[C@]23OC(=O)C4=C2OC1C(O)C3\\C=C/C(=O)[C@@H](C)C[C@@H](C)C4=O',
   'FNEADFUPWHAVTA-UHFFFAOYSA-N',
   'Organic compounds',
   'Organoheterocyclic compounds',
   'Oxanes',
   '',
   'Oxanes',
   'Spirotetronate macrolides',
   'Macrolides',
   'Polyketides',
   '0']),
 ('BGC0000001_atrop-abyssomicin C',
  ['Polyketide:Other',
   'CC1CC23OC(=O)C4=C2OC1C(O)C3\\C=C/C(=O)C(C)CC(C)C4=O',
   'FNEADFUPWHAVTA-UHFFFAOYSA-N',
   'Organic compounds',
   'Organoheterocyclic compounds',
   'Oxanes',
   '',
   'Oxanes',
   'Spirotetronate macrolides',
   'Macrolides',
   'Polyketides',
   '0'])]

## Fetching antiSMASH results from MIBiG and getting class info

In [5]:
# Use the third table entry as a test case; the BGC accession is the part of
# the compound id before the first underscore.
third_entry = list(mibig_dict.items())[2]
test_bgc_id, test_vals = third_entry
test_bgc = test_bgc_id.split('_', 1)[0]
test_bgc

'BGC0000002'

In [6]:
# Download the generated region JSON for the test BGC from the MIBiG repository.
url_template = "https://mibig.secondarymetabolites.org/repository/{}/generated/{}.1.region001.json"
url = url_template.format(test_bgc, test_bgc)
with urllib.request.urlopen(url) as response:
    test_result = json.loads(response.read())

In [7]:
# Show the top-level layout of the result and of its first record, then
# collect every candidate-cluster feature from that record.
print(test_result.keys())
first_record = test_result['records'][0]
print(first_record.keys())
cand_cluster = [
    feature for feature in first_record['features']
    if feature['type'] == 'cand_cluster'
]
cand_cluster

dict_keys(['version', 'input_file', 'records', 'timings', 'taxon', 'schema'])
dict_keys(['id', 'seq', 'features', 'name', 'description', 'dbxrefs', 'annotations', 'letter_annotations', 'modules'])


[{'location': '[0:138784]',
  'type': 'cand_cluster',
  'id': '<unknown id>',
  'qualifiers': {'candidate_cluster_number': ['1'],
   'contig_edge': ['True'],
   'detection_rules': ['cds(PKS_AT and (PKS_KS or ene_KS or mod_KS or hyb_KS or itr_KS or tra_KS))',
    '(minimum(3, [DUF1205, Glyco_transf_28, Glycos_transf_1, Glycos_transf_2, MGT]) and minscore(MGT, 150))'],
   'kind': ['neighbouring'],
   'product': ['T1PKS', 'oligosaccharide'],
   'protoclusters': ['1', '2'],
   'tool': ['antismash']}},
 {'location': '[0:126784]',
  'type': 'cand_cluster',
  'id': '<unknown id>',
  'qualifiers': {'candidate_cluster_number': ['2'],
   'contig_edge': ['True'],
   'detection_rules': ['cds(PKS_AT and (PKS_KS or ene_KS or mod_KS or hyb_KS or itr_KS or tra_KS))'],
   'kind': ['single'],
   'product': ['T1PKS'],
   'protoclusters': ['1'],
   'tool': ['antismash']}},
 {'location': '[98200:138784]',
  'type': 'cand_cluster',
  'id': '<unknown id>',
  'qualifiers': {'candidate_cluster_number': ['3'],


In [25]:
def fetch_as_classes(bgc_num):
    """Fetch the antiSMASH products of one MIBiG BGC.

    Downloads the region001 JSON that the MIBiG repository serves for
    ``bgc_num`` and joins the ``product`` qualifiers of the candidate cluster
    numbered '1' in the first record.

    Args:
        bgc_num: BGC accession, e.g. "BGC0000002".

    Returns:
        Comma-separated product string, "" when the first record has no
        candidate cluster numbered '1', or None when the request fails with
        an HTTP error.
    """
    url = "https://mibig.secondarymetabolites.org/repository/{}/generated/{}.1.region001.json".format(bgc_num, bgc_num)
    try:
        with urllib.request.urlopen(url) as inf:
            bgc_json = json.load(inf)
    except urllib.error.HTTPError:
        return None

    # parse json to get classes (products)
    as_classes = ""
    if bgc_json:
        for feature in bgc_json['records'][0]['features']:
            if feature['type'] == 'cand_cluster':
                # 'qualifiers': {'candidate_cluster_number': ['1'],
                qualifiers = feature['qualifiers']
                if qualifiers['candidate_cluster_number'][0] == '1':
                    as_classes = ','.join(qualifiers['product'])
    return as_classes


all_as_classes = []
mibig_as_classes = {}
# Several compounds share one BGC, so remember the result per accession and
# download each BGC only once.
fetched_classes = {}
start = time.time()
for bgc_id, vals in mibig_dict.items():
    bgc_num = bgc_id.partition('_')[0]
    if bgc_num not in fetched_classes:
        fetched_classes[bgc_num] = fetch_as_classes(bgc_num)
    as_classes = fetched_classes[bgc_num]
    if as_classes is None:
        # Still reported once per compound, as before.
        print("No AS record for", bgc_id)
        as_classes = ""

    # populate new dict, with added AS class; building a new list leaves
    # mibig_dict untouched, so rerunning the cell is safe without a deepcopy
    mibig_as_classes[bgc_id] = vals + [as_classes]
    all_as_classes.append(as_classes)  # record to inspect results

len_recorded = sum(1 for cls in all_as_classes if cls)
len_tot = len(all_as_classes)
print("Recorded {}/{} antismash classes. Missing {}".format(len_recorded, len_tot, len_tot-len_recorded))
end = time.time()
print("Took {:.2f} minutes".format((end-start)/60))

No AS record for BGC0000053_elaiophylin
No AS record for BGC0000350_ET-743
No AS record for BGC0000669_marneral
No AS record for BGC0000670_thaliandiol
No AS record for BGC0000670_thalianol
No AS record for BGC0000671_momilactone
No AS record for BGC0000672_oryzalides
No AS record for BGC0000672_phytocassane
No AS record for BGC0000798_dhurrin
No AS record for BGC0000810_benzoxazinone DIMBOA
No AS record for BGC0001068_pyripyropene A
No AS record for BGC0001109_pederin
No AS record for BGC0001314_tirucalla
No AS record for BGC0001316_linamarin
No AS record for BGC0001316_lotaustralin
No AS record for BGC0001317_lupeol
No AS record for BGC0001318_linamarin
No AS record for BGC0001322_lycosantalonol
No AS record for BGC0001325_noscapine
No AS record for BGC0001533_borrelidin
No AS record for BGC0001554_coenzyme F430
No AS record for BGC0001799_thebaine
No AS record for BGC0001816_valactamide A
No AS record for BGC0001883_porphyra-334
No AS record for BGC0001949_domoic acid
No AS record f

In [23]:
# Peek at the first ten collected class strings ('' means none was recorded).
all_as_classes[:10]

['T1PKS',
 'T1PKS',
 'T1PKS,oligosaccharide',
 'T1PKS',
 'T1PKS',
 '',
 'T1PKS',
 'T1PKS',
 'T1PKS',
 'T1PKS']

## Writing to the output file

In [26]:
# Write the table back out with the antiSMASH classes as a new third column.
out_file = os.path.join(out_base, "All_MIBiG_compounds_with_AS_BGC_CF_NPC_classes.txt")
header_split = header.strip().split('\t')
new_header = header_split[:2] + ["AS_classes"] + header_split[2:]
print(new_header)
out_str = ""  # keeps the example print below valid when there are no rows
with open(out_file, 'w') as outf:
    outf.write("{}\n".format('\t'.join(new_header)))
    for key, vals in mibig_as_classes.items():
        # vals is [class:subclass, <other columns...>, AS classes]. Unpacking
        # instead of pop() leaves mibig_as_classes unchanged, so the cell can
        # be rerun without working on a deepcopy; a row with fewer than two
        # entries still raises instead of being written misaligned.
        classes, *other_cols, as_classes = vals
        out_str = "{}\t{}\t{}\t{}\n".format(key, classes, as_classes, '\t'.join(other_cols))
        outf.write(out_str)

print(out_str)  # example

['compound_name', 'class:subclass', 'AS_classes', 'smiles', 'inchi_key', 'kingdom', 'superclass', 'class', 'subclass', 'direct_parent', 'class_results', 'superclass_results', 'pathway_results', 'isglycoside']
BGC0002036_dehydrofosmidomycin	Other:	phosphonate	C(C=CP(=O)(O)O)N(C=O)O	YTTBDWKRMPYBDN-UHFFFAOYSA-N									0

