**This notebook contains a function that fetches a SMILES string starting from a KEGG compound page. It also contains the associated unit tests.**

**Use `kegg_df_to_smiles` and `sid_to_smiles`.**

This notebook contains the basis for one or more functions that take a PubChem ID number and fetch the associated SMILES string from PubChem.

It also contains code pieces to pull an SID from a KEGG webpage.

In [52]:
import numpy as np
import pandas as pd
import pubchempy as pc
import requests
import re
from time import sleep

from bs4 import BeautifulSoup


There are multiple identifier types for each chemical in PubChem. The two we are interacting with here are **SID** (substance ID) and **CID** (compound ID). A CID can be used to access SMILES directly with PubChemPy. **KEGG does not have CIDs**, only SIDs. An SID can be mapped to a CID, from which the SMILES can be found.

### Get SMILES from CID and SID

In [3]:
# Get SMILES directly from a CID: look up the matching compound record(s)
# and print the isomeric SMILES of each one.
# CID 243 is benzoic acid (compare the SMILES in the output below).
for compound in pc.get_compounds('243'):
    print(compound.isomeric_smiles)

C1=CC=C(C=C1)C(=O)O


In [4]:
# Get SMILES from an SID by first mapping the SID to a CID.
# SID 3305 is the NAD record used in the example DataFrame in this notebook.
substance = pc.Substance.from_sid('3305')
# the CID recorded on the substance as its standardized compound
cid = substance.standardized_cid
# get_compounds returns a sequence of matches; take the first one
compound = pc.get_compounds(cid)[0]
# print the SMILES and the Compound object (its repr shows the CID)
print(compound.isomeric_smiles, compound)

C1=CC(=C[N+](=C1)[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O)(O)OP(=O)(O)OC[C@@H]3[C@H]([C@H]([C@@H](O3)N4C=NC5=C4N=CN=C5N)O)O)O)O)C(=O)N Compound(5893)


In [35]:
def sid_to_smiles(df):
    """Look up the isomeric SMILES for the SID stored under ``df['SID']``.

    Parameters
    ----------
    df
        Any object that supports ``df['SID']`` and yields a single PubChem
        SID (presumably a DataFrame row or a dict -- confirm against callers).

    Returns
    -------
    The isomeric SMILES of the first compound PubChem returns for the
    substance's standardized CID. The SMILES is also printed, as before.
    """
    sid = df['SID']
    substance = pc.Substance.from_sid(sid)
    # short pause between the two PubChem requests
    # (presumably to stay under PubChem's request-rate limits)
    sleep(0.05)
    cid = substance.standardized_cid
    compound = pc.get_compounds(cid)[0]
    smiles = compound.isomeric_smiles
    print(smiles)
    # return the value as well, so callers can collect results instead of
    # only seeing them on stdout
    return smiles

In [None]:
# for compound in pc.get_compounds('glucose', 'name'):
#    print(compound.cid, compound.isomeric_smiles, compound.smiles)

In [2]:
#%%writefile make_smiles_utils.py

import pubchempy as pc
# I could easily make this handle SIDs, too, but then the user would have to specify whether it is an SID or a CID.
def user_input_to_smiles(input_cid):
    """Takes a PubChem CID input and outputs the associated SMILES.

    Parameters
    ----------
    input_cid : int
        PubChem compound ID. Anything that is not exactly an ``int`` fails
        the assertion below.

    Returns
    -------
    The isomeric SMILES of the first compound returned by PubChem, or
    ``None`` if no compound was returned. Every SMILES found is also
    printed, as before.
    """
    assert type(input_cid) is int, 'Expected an integer ID input'
    smiles = None
    for compound in pc.get_compounds(input_cid):
        print(compound.isomeric_smiles)
        # remember the first hit so it can be returned to the caller
        if smiles is None:
            smiles = compound.isomeric_smiles
    return smiles

Overwriting make_smiles_utils.py


In [72]:
# Quick manual check with CID 243; the function prints the SMILES it finds.
user_input_to_smiles(243)

C1=CC=C(C=C1)C(=O)O


In [33]:
#%%writefile test_make_smiles_utils.py

import pubchempy as pc
import make_smiles_utils

def test_user_input_to_smiles():
    """Check user_input_to_smiles against the known SMILES for thiophene.

    Verifies that a non-integer CID is rejected, that the result is a
    string, and that it matches the expected thiophene SMILES.
    """
    # check that the input is an integer: a string CID must be rejected
    try:
        make_smiles_utils.user_input_to_smiles('8030')
    except AssertionError:
        pass
    else:
        raise AssertionError('A non-integer CID was accepted')

    # CID for thiophene
    inp_cid = 8030
    expected = 'C1=CSC=C1'
    smiles = make_smiles_utils.user_input_to_smiles(inp_cid)

    # check that the output is a string
    assert isinstance(smiles, str), 'SMILES was not returned as a string'
    assert len(smiles) == len(expected), 'This is not the correct SMILES length for thiophene'
    assert smiles == expected, 'This is not the correct SMILES for thiophene'

In [35]:
# Run the thiophene test directly; a failing assert raises AssertionError here.
test_user_input_to_smiles()

C1=CSC=C1


__________________

### Manipulate DF containing SID into SMILES

In [111]:
# Small development fixture: compound names paired with their PubChem SIDs.
compounds = ['NAD', 'glucose', 'benzoic acid', 'methanol']
sids = ['3305', '3333', '3480', '3432']

# one (name, sid) tuple per row
tuple_list = [(name, sid) for name, sid in zip(compounds, sids)]

devo_df = pd.DataFrame(tuple_list, columns=['Compound Name', 'SID'])
# last expression in the cell, so the notebook displays the frame
devo_df

Unnamed: 0,Compound Name,SID
0,NAD,3305
1,glucose,3333
2,benzoic acid,3480
3,methanol,3432


In [76]:
def sid_to_smiles(sid):
    """Return the isomeric SMILES for a PubChem SID.

    The SID is mapped to the substance's standardized CID, and the first
    compound returned for that CID supplies the SMILES.
    """
    standardized = pc.Substance.from_sid(sid).standardized_cid
    matches = pc.get_compounds(standardized)
    return matches[0].isomeric_smiles

In [109]:
def kegg_df_to_smiles(kegg_df, sid_column=1, smiles_position=2):
    """Add a SMILES column to a DataFrame that contains a column of SIDs.

    Parameters
    ----------
    kegg_df : pandas.DataFrame
        DataFrame with one column of PubChem SIDs.
    sid_column : int or column label, optional
        Where to find the SIDs. An ``int`` is treated as a column position;
        anything else (e.g. ``'SID'``) is treated as a column label.
        Defaults to position 1, the value that was previously hard-coded.
    smiles_position : int, optional
        Position at which the new ``'SMILES'`` column is inserted.
        Defaults to 2, the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``kegg_df`` object, with the ``'SMILES'`` column inserted
        in place (the input is modified, not copied).
    """
    if isinstance(sid_column, int):
        sids = kegg_df.iloc[:, sid_column]
    else:
        sids = kegg_df[sid_column]

    # one PubChem lookup per SID, in row order
    smiles = [sid_to_smiles(sid) for sid in sids]

    kegg_df.insert(smiles_position, column='SMILES', value=smiles)

    return kegg_df

In [112]:
# Add the SMILES column to the development DataFrame. The function inserts
# the column into devo_df itself and returns that same DataFrame.
kegg_df_to_smiles(devo_df)

Unnamed: 0,Compound Name,SID,SMILES
0,NAD,3305,C1=CC(=C[N+](=C1)[C@H]2[C@@H]([C@@H]([C@H](O2)...
1,glucose,3333,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O
2,benzoic acid,3480,C1=CC=C(C=C1)C(=O)O
3,methanol,3432,CO


---
### Get SID from KEGG url 

This currently works from the compound page. I have not yet checked whether it can also be pulled from the reaction page.


In [60]:
#%%writefile kegg_utils.py

import pubchempy as pc
import re
import requests

from bs4 import BeautifulSoup


def kegg_to_sid(url, timeout=30):
    """Return the PubChem SID shown on a KEGG compound page.

    Parameters
    ----------
    url : str
        URL of the KEGG compound page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str
        The text of the first link whose href matches
        ``https://pubchem.ncbi``.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    ValueError
        If the page has no PubChem link, or the link has no single text.
    """
    # access the url; the timeout keeps a stalled server from hanging the call
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # turn the webpage into html
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the link that contains 'pubchem'
    link = soup.find('a', href=re.compile(r'https://pubchem\.ncbi'))
    if link is None or link.string is None:
        raise ValueError('No PubChem SID link found on {}'.format(url))

    # convert to a plain str so the caller does not hold on to the parse tree
    return str(link.string)


def kegg_to_smiles(url):
    """Uses the KEGG compound page url to find the compound's PubChem SID, then to find the SMILES for that compound using the SID.

    Parameters
    ----------
    url : str
        URL of the KEGG compound page.

    Returns
    -------
    The isomeric SMILES of the first compound PubChem returns for the
    substance's standardized CID. The SMILES is also printed, as before.
    """
    # reuse the page-scraping helper instead of repeating it here
    sid = kegg_to_sid(url)

    substance = pc.Substance.from_sid(sid)
    cid = substance.standardized_cid
    compound = pc.get_compounds(cid)[0]

    smiles = compound.isomeric_smiles
    print(smiles)
    # return the value as well, so callers and tests can use it
    return smiles

Overwriting kegg_utils.py


In [None]:
#%%writefile kegg_utils.py

import pubchempy as pc
import re
import requests

from bs4 import BeautifulSoup


def kegg_to_sid(url, timeout=30):
    """Return the PubChem SID shown on a KEGG compound page.

    Parameters
    ----------
    url : str
        URL of the KEGG compound page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str
        The text of the first link whose href matches
        ``https://pubchem.ncbi``.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    ValueError
        If the page has no PubChem link, or the link has no single text.
    """
    # access the url; the timeout keeps a stalled server from hanging the call
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # turn the webpage into html
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the link that contains 'pubchem'
    link = soup.find('a', href=re.compile(r'https://pubchem\.ncbi'))
    if link is None or link.string is None:
        raise ValueError('No PubChem SID link found on {}'.format(url))

    # convert to a plain str so the caller does not hold on to the parse tree
    return str(link.string)

In [7]:
# url of the desired KEGG compound page
url = 'https://www.genome.jp/dbget-bin/www_bget?cpd:C00180'
# access the url
response = requests.get(url)

# turn the webpage into html
soup = BeautifulSoup(response.content, 'html.parser')

# find the link that contains 'pubchem'
# (raw string: '\.' in a normal string literal is an invalid escape sequence)
sid = soup.find('a', href=re.compile(r'https://pubchem\.ncbi'))

# print the string that is displayed as the link
# (this is the SID, which works with pubchempy to get the SMILES)
print(sid.string)

3480


In [62]:
#%%writefile test_kegg_utils.py
import unittest

import kegg_utils


def test_kegg_to_sid():
    """Check that kegg_to_sid returns an all-digit SID for several KEGG pages."""
    url_list = [
        'https://www.genome.jp/dbget-bin/www_bget?cpd:C00180',
        'https://www.genome.jp/dbget-bin/www_bget?cpd:C00587',
        'https://www.genome.jp/dbget-bin/www_bget?cpd:C00002']
    for url in url_list:
        sid_str = kegg_utils.kegg_to_sid(url)
        # without `assert` this line was a discarded tuple and checked nothing
        assert sid_str.isdigit(), 'SID contains characters other than numbers'


def test_kegg_to_smiles():
    """Check that kegg_to_smiles returns a non-empty SMILES string.

    Expects kegg_utils.kegg_to_smiles to return the SMILES rather than only
    printing it.
    """
    url = 'https://www.genome.jp/dbget-bin/www_bget?cpd:C00587'
    # exercise kegg_to_smiles itself, not kegg_to_sid
    smiles = kegg_utils.kegg_to_smiles(url)

    # type first, so a None result fails with a clear message
    assert isinstance(smiles, str), 'SMILES not returned as string.'
    assert len(smiles) >= 1, 'SMILES string is very short. Check SMILES.'

Overwriting test_kegg_utils.py


biopython kegg api to pull SID

store smiles into dataframe to join later 
