# MIBiG JSON -> SMILES

Load the MIBiG database in JSON format and extract the SMILES string of each compound (either already stored in the JSON file or retrieved from PubChem).

In [4]:
%load_ext autoreload
%autoreload 2

%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import os
from glob import glob
import json

import sys
sys.path.append('/Users/joewandy/anaconda/envs/mfgcf/lib/python2.7/site-packages')
from pubchempy import *

In [8]:
# Directory holding the MIBiG JSON files (the '*.json' files in it are processed).
# Set the MIBIG_JSON_DIR environment variable to run against a different copy of
# the data; when it is unset the original location is used.
input_dir = os.environ.get(
    'MIBIG_JSON_DIR',
    '/Users/joewandy/Dropbox/Meta_clustering/MS2LDA/BGC/data/mibig_json_1.3/',
)

In [9]:
def get_isomeric_smiles(prop):
    """Return the isomeric SMILES string found in a list of property records.

    Args:
        prop: Iterable of property records, e.g. the ``record['props']`` list
            of a PubChem compound. Each record is a dict with a ``'urn'``
            dict and a ``'value'`` dict. (The parameter keeps its historical
            singular name even though it holds the whole list.)

    Returns:
        The ``value['sval']`` of the first record whose URN has
        ``name == 'Isomeric'`` and ``label == 'SMILES'``, or ``None`` when
        no such record exists.
    """
    # Iterate over the argument itself rather than a global compound object,
    # so the function works for whatever property list it is given.
    for entry in prop:
        urn = entry['urn']
        # Some URNs have no 'name' (or 'label') key; .get() treats them as
        # non-matching instead of raising KeyError.
        if urn.get('name') == 'Isomeric' and urn.get('label') == 'SMILES':
            return entry['value']['sval']
    return None

Sometimes a MIBiG JSON entry doesn't contain the chemical structure (SMILES) of a compound but does have its PubChem ID. In that case, we fetch the isomeric SMILES from PubChem instead.

In [11]:
# Build `data`: one (compound_name, smiles) tuple for every compound in the
# MIBiG JSON files for which a SMILES string could be obtained.
data = []
json_files = glob(os.path.join(input_dir, '*.json'))
for fn in sorted(json_files):
    # Only parsing needs the file handle; close it before any PubChem lookups.
    with open(fn, 'r') as f:
        d = json.load(f)

    # The BGC id is the file name up to the first dot.
    bgc_id = os.path.basename(fn).split('.')[0]
    compounds = d['general_params']['compounds']
    for c in compounds:
        compound_name = bgc_id + '_' + c['compound'].lower().replace(' ', '_')

        # Reset for every compound. Without this, a compound that has neither
        # 'chem_struct' nor 'pubchem_id' would silently reuse the SMILES of
        # the previous compound (or raise NameError if it is the first one).
        smiles = None
        if 'chem_struct' in c:
            # Structure is stored directly in the MIBiG entry.
            smiles = c['chem_struct']
        elif 'pubchem_id' in c:
            # Otherwise fall back to a PubChem lookup by compound id.
            try:
                pubchem_id = c['pubchem_id']
                pubchem_compound = Compound.from_cid(pubchem_id)
                props = pubchem_compound.record['props']
                smiles = get_isomeric_smiles(props)
            except NotFoundError:
                # Unknown PubChem id: skip this compound.
                smiles = None

        if smiles is not None:
            row = (compound_name, smiles,)
            data.append(row)
            if len(data) % 100 == 0:
                # Progress indicator. Note the left number counts collected
                # compounds while the right one counts JSON files.
                # Single-argument print() gives the same output on Python 2 and 3.
                print('%d / %d' % (len(data), len(json_files)))

<open file '/Users/joewandy/Dropbox/Meta_clustering/MS2LDA/BGC/data/mibig_json_1.3/BGC0000001.json', mode 'r' at 0x1130f0660>


Export the results to a space-separated CSV file (`smiles.csv`) with one `name SMILES` pair per line and no header row.

In [6]:
import pandas as pd
# One row per compound: its name ('mol') and its SMILES string ('smiles').
df = pd.DataFrame(data, columns=['mol', 'smiles'])
# Write space-separated "<mol> <smiles>" lines without header or index column.
# header=False is the documented way to omit the header row (None only worked
# because it is falsy); the file written is the same.
df.to_csv('smiles.csv', sep=' ', header=False, index=False)