# Summary
In this notebook I gather BGC classes information from mibig jsons. This continues from notebook 2 where I gathered CF and NPC chemical class information.

In [1]:
import os
import urllib
import glob
import json
from rdkit import Chem

In [2]:
mibig_folder = "/mnt/scratch/louwe015/mibig_json_2.0/"

base_path = "/mnt/scratch/louwe015/NPLinker/classifying/mibig_classifications/"
mibig_smiles_path = os.path.join(base_path, "files/All_MIBiG_compounds_with_SMILES_and_PMID_MAS.txt")
print("mibig_smiles_path exists is", os.path.isfile(mibig_smiles_path))

mibig_smiles_path exists is True


In [3]:
chem_classes_mibig = os.path.join(os.path.split(base_path[:-1])[0], "All_MIBiG_compounds_with_CF_NPC_classes.txt")
print(chem_classes_mibig, os.path.isfile(chem_classes_mibig))

/mnt/scratch/louwe015/NPLinker/classifying/All_MIBiG_compounds_with_CF_NPC_classes.txt True


## Which entries are needed from the jsons?
Original list from Oscar:

['compound_name','biosyn_class','biosyn_subclass',\
    'chem_synonyms','chem_target','molecular_formula','mol_mass',\
    'chem_struct','pubchem_id','chemspider_id','chebi_id','chembl_id',\
    'chem_act','other_chem_act','loci','publications','rdkit_smile_1',\
    'rdkit_smile_2','rdkit_smile_3','rdkit_smile_4','rdkit_inchi_key',\
    'rdkit_inchi_key1']

In [4]:
# as ids take BGC id + '_' + compound name, for now this is all we need
wanted_names = ['biosyn_class','biosyn_subclass']

In [5]:
mibig_files = glob.glob(os.path.join(mibig_folder, "*.json"))

In [30]:
test = mibig_files[4]
test

'/mnt/scratch/louwe015/mibig_json_2.0/BGC0001001.json'

In [31]:
with open(test) as inf:
    test_json = json.load(inf)

In [32]:
test_json["cluster"]

{'biosyn_class': ['NRP', 'Polyketide'],
 'compounds': [{'chem_struct': 'CO\\C(CCNC(=O)CC\\C=C\\C(C)CC\\C(CCCC#CBr)=C\\Cl)=C\\C(=O)N1C(C)C=CC1=O',
   'compound': 'jamaicamide A',
   'database_id': ['npatlas:13036'],
   'mol_mass': 566.1546974120001,
   'molecular_formula': 'C27H36BrClN2O4'},
  {'chem_acts': ['Sodium channel blocking'],
   'chem_struct': 'CO\\C(CCNC(=O)CC\\C=C\\C(C)CC\\C(CCCC#C)=C\\Cl)=C\\C(=O)N1C(C)C=CC1=O',
   'chem_targets': [{'target': 'Sodium channel blocking'}],
   'compound': 'jamaicamide B',
   'database_id': ['npatlas:374', 'pubchem:49787032'],
   'mol_mass': 488.244185344,
   'molecular_formula': 'C27H37ClN2O4'},
  {'chem_struct': 'CO\\C(CCNC(=O)CC\\C=C\\C(C)CC\\C(CCCC=C)=C\\Cl)=C\\C(=O)N1C(C)C=CC1=O',
   'chem_targets': [{'target': 'Sodium channel blocking'}],
   'compound': 'jamaicamide C',
   'database_id': ['npatlas:1111', 'pubchem:49787033'],
   'mol_mass': 490.259835408,
   'molecular_formula': 'C27H39ClN2O4'}],
 'genes': {'annotations': [{'id': 'AAS98794

## Investigate subclasses
It seems (also according to the json schema for mibig) that I need to look for all 'subclass' in biosyn_class, except for pks that have 'subclasses'

In [9]:
all_class_info = []
for mibig_file in mibig_files:
    with open(mibig_file) as inf:
        mibig_js = json.load(inf)
    class_list = mibig_js["cluster"].get("biosyn_class")
    if class_list:
        for elem in class_list:
            low_elem = elem.lower()
            class_dict = mibig_js["cluster"].get(low_elem)
            if class_dict:
                all_class_info.append(class_dict)

In [42]:
print(len(all_class_info))
subcl_len = len([dct["subclass"] for dct in all_class_info if "subclass" in dct])
subcls_len = len([dct["subclasses"] for dct in all_class_info if "subclasses" in dct])
print("total number of subclasses received:", subcl_len+subcls_len)

759
total number of subclasses received: 599


In [15]:
all_class_info[:5]

[{'cyclic': True},
 {'cyclic': True,
  'release_type': ['Macrolactamization'],
  'subclasses': ['Other']},
 {'subclass': 'Lanthipeptide'},
 {'cyclic': False,
  'nrps_genes': [{'gene_id': 'AHB38497.1',
    'modules': [{'a_substr_spec': {'epimerized': False,
       'evidence': ['Sequence-based prediction'],
       'proteinogenic': ['Isoleucine']},
      'active': False,
      'c_dom_subtype': 'Starter',
      'modification_domains': ['Methylation'],
      'module_number': '1'},
     {'a_substr_spec': {'epimerized': False,
       'evidence': ['Sequence-based prediction'],
       'proteinogenic': ['Isoleucine']},
      'active': False,
      'c_dom_subtype': 'LCL',
      'module_number': '2'},
     {'a_substr_spec': {'epimerized': False,
       'evidence': ['Sequence-based prediction'],
       'proteinogenic': ['Threonine']},
      'active': False,
      'c_dom_subtype': 'LCL',
      'module_number': '3'},
     {'a_substr_spec': {'epimerized': False, 'proteinogenic': ['Leucine']},
      'a

In [16]:
[class_info.get('subclasses') for class_info in all_class_info if 'subclasses' in class_info]

[['Other'],
 ['Other'],
 ['Tetracenomycin'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Polyene'],
 ['Polyene'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Angucycline'],
 ['Other'],
 ['Enediyine'],
 ['Macrolide'],
 ['Other'],
 ['Aryl polyene'],
 ['Polyene'],
 ['Angucycline'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Macrolide'],
 ['Enediyine'],
 ['Other'],
 ['Macrolide'],
 ['Other'],
 ['Macrolide'],
 ['Enediyine'],
 ['Polyene'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Polyphenol'],
 ['Polyene'],
 ['Polyene'],
 ['Ansamycin'],
 ['Polyene'],
 ['Macrolide'],
 ['Macrolide'],
 ['Chalcone'],
 ['Other'],
 ['Other'],
 ['Polyene'],
 ['Polyene'],
 ['Tetracycline'],
 ['Other'],
 ['Macrolide'],
 ['Macrolide'],
 ['Macrolide'],
 ['Ansamycin'],
 ['Aryl polyene'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Other'],
 ['Polyene'],
 ['Tetracycline'],
 ['Other'],
 ['Other'],
 ['Anthracyclin

## Getting info from jsons

In [36]:
# make a list of tuples with info for each line (so it can be sorted)
class_results = []
for bgc_json in mibig_files:
    with open(bgc_json, 'r') as inf:
        bgc_dict = json.load(inf)
    mibig_acc = bgc_dict['cluster']['mibig_accession']
    for compound in bgc_dict['cluster']['compounds']:
        compound_name = compound.get('compound', '')
        comp_id = f'{mibig_acc}_{compound_name}'
        biosyn_classes = bgc_dict["cluster"].get("biosyn_class", [])  # list
        
        result_tuple = (comp_id, '', '')
        classes = []
        subclasses = []
        for biosyn_class in biosyn_classes:
            low_class = biosyn_class.lower()
            class_dict = bgc_dict["cluster"].get(low_class, {})
            subclass = class_dict.get('subclass', '')
            if not subclass:  # pks have 'subclasses'
                subclass = class_dict.get('subclasses', [''])[0]
            combined_class = f"{biosyn_class}:{subclass}"
            classes.append(combined_class)
        result_tuple = (comp_id, ','.join(classes))
        class_results.append(result_tuple)

In [38]:
print(len(class_results))
class_results[:10]

2689


[('BGC0000020_maytansine', 'Polyketide:'),
 ('BGC0000020_ansamitocin P-3', 'Polyketide:'),
 ('BGC0000284_phenolic lipids', 'Polyketide:'),
 ('BGC0001483_5-isoprenylindole-3-carboxylate β-D-glycosyl ester', 'Other:'),
 ('BGC0000650_carotenoid', 'Terpene:'),
 ('BGC0001001_jamaicamide A', 'NRP:,Polyketide:Other'),
 ('BGC0001001_jamaicamide B', 'NRP:,Polyketide:Other'),
 ('BGC0001001_jamaicamide C', 'NRP:,Polyketide:Other'),
 ('BGC0001536_brevicidine', 'NRP:'),
 ('BGC0000543_pep5', 'RiPP:Lanthipeptide')]

## Reading chemical classes info

In [47]:
out_base = os.path.split(base_path[:-1])[0]
chem_file = os.path.join(out_base, "All_MIBiG_compounds_with_CF_NPC_classes.txt")

chem_dict = {}
with open(chem_file) as inf:
    header = inf.readline()
    for line in inf:
        line = line.strip().split('\t')
        chem_id = line.pop(0)
        chem_dict[chem_id] = line
print(header.strip())
list(chem_dict.items())[:2]

compound_name	smiles	inchi_key	kingdom	superclass	class	subclass	direct_parent	class_results	superclass_results	pathway_results	isglycoside


[('BGC0000001_abyssomicin C',
  ['CC1C[C@]23OC(=O)C4=C2OC1C(O)C3\\C=C/C(=O)[C@@H](C)C[C@@H](C)C4=O',
   'FNEADFUPWHAVTA-UHFFFAOYSA-N',
   'Organic compounds',
   'Organoheterocyclic compounds',
   'Oxanes',
   '',
   'Oxanes',
   'Spirotetronate macrolides',
   'Macrolides',
   'Polyketides',
   '0']),
 ('BGC0000001_atrop-abyssomicin C',
  ['CC1CC23OC(=O)C4=C2OC1C(O)C3\\C=C/C(=O)C(C)CC(C)C4=O',
   'FNEADFUPWHAVTA-UHFFFAOYSA-N',
   'Organic compounds',
   'Organoheterocyclic compounds',
   'Oxanes',
   '',
   'Oxanes',
   'Spirotetronate macrolides',
   'Macrolides',
   'Polyketides',
   '0'])]

## Combining infos and writing

In [48]:
errors = []
for chem_id, classes in class_results:
    if chem_id in chem_dict:
        chem_dict[chem_id].append(classes)
    else:
        errors.append(chem_id)
print('Amount of mismatches for combining class and chemical info:', len(errors))

# The missing matches do not have a structure

Amount of mismatches for combining class and chemical info: 577


In [49]:
out_file = os.path.join(out_base, "All_MIBiG_compounds_with_BGC_CF_NPC_classes.txt")
header_split = header.strip().split('\t')
new_header = header_split[:1] + ["class:subclass"] + header_split[1:]
print(new_header)
with open(out_file, 'w') as outf:
    outf.write("{}\n".format('\t'.join(new_header)))
    for key, vals in chem_dict.items():
        classes = vals.pop(-1)
        outf.write("{}\t{}\t{}\n".format(key, classes, '\t'.join(vals)))

['compound_name', 'class:subclass', 'smiles', 'inchi_key', 'kingdom', 'superclass', 'class', 'subclass', 'direct_parent', 'class_results', 'superclass_results', 'pathway_results', 'isglycoside']
