# Summary
This notebook extracts smiles from a directory of mibig jsons.

I used my spec_analysis environment for this. (see https://github.com/louwenjjr/ms2_mass_differences)

In [1]:
import glob
import os
import json


mibig_folder = "/mnt/scratch/louwe015/mibig_json_2.0/"

In [2]:
mibig_files = glob.glob(os.path.join(mibig_folder, "*.json"))

In [3]:
bgc_json = mibig_files[0]
bgc_json, os.path.isfile(bgc_json)

('/mnt/scratch/louwe015/mibig_json_2.0/BGC0000020.json', True)

In [4]:
with open(bgc_json, 'r') as inf:
    bgc_dict = json.load(inf)
bgc_dict

{'changelog': [{'comments': ['Submitted'],
   'contributors': ['AAAAAAAAAAAAAAAAAAAAAAAA'],
   'version': '1.0'},
  {'comments': ['Migrated from v1.4',
    'Updated compound(s) information (NPAtlas curation)'],
   'contributors': ['AAAAAAAAAAAAAAAAAAAAAAAA'],
   'version': '2.0'}],
 'cluster': {'biosyn_class': ['Polyketide'],
  'compounds': [{'chem_struct': 'C[C@@H]1[C@@H]2C[C@]([C@@H](/C=C/C=C(/CC3=CC(=C(C(=C3)OC)Cl)N(C(=O)C[C@@H]([C@]4([C@H]1O4)C)OC(=O)[C@H](C)N(C)C(=O)C)C)\\C)OC)(NC(=O)O2)O',
    'compound': 'maytansine',
    'database_id': ['pubchem:5281828'],
    'mol_mass': 691.2871723520001,
    'molecular_formula': 'C34H46ClN3O10'},
   {'chem_struct': 'C[C@@H]1[C@@H]2C[C@]([C@@H](/C=C/C=C(/CC3=CC(=C(C(=C3)OC)Cl)N(C(=O)C[C@@H]([C@]4([C@H]1O4)C)OC(=O)C(C)C)C)\\C)OC)(NC(=O)O2)O',
    'compound': 'ansamitocin P-3',
    'mol_mass': 634.2657086359999,
    'molecular_formula': 'C32H43ClN2O9'}],
  'loci': {'accession': 'AF453501.1', 'completeness': 'Unknown'},
  'mibig_accession': 'BGC

In [5]:
# gather: mibig accession, compound name, SMILES, PMID/DOI (comma seperated)
mibig_acc = bgc_dict['cluster']['mibig_accession']
publications = ','.join(pub.rpartition(":")[-1] for pub in bgc_dict['cluster']['publications'])
for compound in bgc_dict['cluster']['compounds']:
    compound_name = compound['compound']
    smiles = compound['chem_struct']
    print(mibig_acc, compound_name, smiles, publications)

BGC0000020 maytansine C[C@@H]1[C@@H]2C[C@]([C@@H](/C=C/C=C(/CC3=CC(=C(C(=C3)OC)Cl)N(C(=O)C[C@@H]([C@]4([C@H]1O4)C)OC(=O)[C@H](C)N(C)C(=O)C)C)\C)OC)(NC(=O)O2)O 12060743
BGC0000020 ansamitocin P-3 C[C@@H]1[C@@H]2C[C@]([C@@H](/C=C/C=C(/CC3=CC(=C(C(=C3)OC)Cl)N(C(=O)C[C@@H]([C@]4([C@H]1O4)C)OC(=O)C(C)C)C)\C)OC)(NC(=O)O2)O 12060743


In [6]:
# make a list of tuples with info for each line (so it can be sorted)
info_list = []
for bgc_json in mibig_files:
    with open(bgc_json, 'r') as inf:
        bgc_dict = json.load(inf)
    mibig_acc = bgc_dict['cluster']['mibig_accession']
    publications = ','.join(pub.rpartition(":")[-1] for pub in bgc_dict['cluster']['publications'])
    for compound in bgc_dict['cluster']['compounds']:
        compound_name = compound.get('compound', '')
        smiles = compound.get('chem_struct', '')
        info_list.append((mibig_acc, compound_name, smiles, publications))

In [7]:
print(info_list[-1])
print(len(info_list))

('BGC0000522', 'lacticin Z', 'CC[C@H](C)[C@H](NC(=O)CNC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(O)=O)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H](CC1=CNC2=C1C=CC=C2)NC(=O)[C@H](CC(O)=O)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](CCC(N)=O)NC(=O)CNC(=O)[C@H](C)NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](NC(=O)[C@H](CC1=CNC2=C1C=CC=C2)NC(=O)[C@H](CC(O)=O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](C)NC(=O)[C@H](CC1=CNC2=C1C=CC=C2)NC(=O)[C@H](C)NC(=O)[C@H](CC1=CNC2=C1C=CC=C2)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CO)NC(=O)CNC(=O)[C@H](CC1=CC=C(O)C=C1)NC(=O)[C@H](CCCCN)NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)CNC(=O)[C@H](C)NC(=O)[C@@H](N)CCSC)C(C)C)C(C)C)[C@@H](C)CC)C(C)C)[C@@H](C)CC)[C@@H](C)CC)[C@@H](C)CC)C(C)C)C(C)C)[

In [8]:
# sort
info_list.sort()
info_list[0], info_list[-1]

(('BGC0000001',
  'abyssomicin C',
  'CC1C[C@]23OC(=O)C4=C2OC1C(O)C3\\C=C/C(=O)[C@@H](C)C[C@@H](C)C4=O',
  '21656887'),
 ('BGC0002036', 'dehydrofosmidomycin', '', '10.1038/s41589-019-0343-'))

In [9]:
len(info_list)

2689

In [11]:
old_mibig_file = "/mnt/scratch/louwe015/NPLinker/classifying/ClassifyNPDB/InFiles/All_MIBiG_compounds_with_SMILES_and_PMID_MAS.txt"
with open(old_mibig_file, "r") as inf:
    header = inf.readline()
header

'MIBiG accession\tcompound name\tSMILES\tPMID / DOI\tColumn1\tColumn2\tColumn3\n'

In [31]:
out_file = "All_MIBiG2.0_compounds_with_SMILES_and_PMID_MAS.txt"
with open(out_file, 'w') as outf:
    outf.write(header)
    for bgc_info in info_list:
        outf.write("{}\n".format('\t'.join(bgc_info)))