### Structure Analysis

This notebook aims to explore basic RDKit usage and to develop a code block for assigning reaction centers.

In [4]:
# numpy was previously imported under the alias `pd`, which the pandas import
# on the next line immediately overwrote; use the conventional `np` alias.
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Descriptors import MolWt

In [2]:
def count_C(mol):
    """Return how many atoms of ``mol`` have atomic number 6 (carbon)."""
    carbon_total = 0
    for member in mol.GetAtoms():
        if member.GetAtomicNum() == 6:
            carbon_total += 1
    return carbon_total
def count_O(mol):
    """Return how many atoms of ``mol`` have atomic number 8 (oxygen)."""
    atomic_numbers = [member.GetAtomicNum() for member in mol.GetAtoms()]
    return atomic_numbers.count(8)
def count_N(mol):
    """Return how many atoms of ``mol`` have atomic number 7 (nitrogen)."""
    nitrogens = [member for member in mol.GetAtoms() if member.GetAtomicNum() == 7]
    return len(nitrogens)
def count_P(mol):
    """Return how many atoms of ``mol`` have atomic number 15 (phosphorus)."""
    phosphorus_total = 0
    for member in mol.GetAtoms():
        # Skip everything that is not phosphorus.
        if member.GetAtomicNum() != 15:
            continue
        phosphorus_total += 1
    return phosphorus_total
def count_S(mol):
    """Return how many atoms of ``mol`` have atomic number 16 (sulfur)."""
    sulfur_flags = [1 for member in mol.GetAtoms() if member.GetAtomicNum() == 16]
    return sum(sulfur_flags)
def count_X(mol):
    """Return how many atoms of ``mol`` are halogens.

    An atom counts when its atomic number is 9, 17, 35 or 53
    (fluorine, chlorine, bromine or iodine).
    """
    halogen_numbers = (9, 17, 35, 53)
    halogen_total = 0
    for member in mol.GetAtoms():
        if member.GetAtomicNum() in halogen_numbers:
            halogen_total += 1
    return halogen_total
def count_H(mol):
    """Return the hydrogen count of ``mol``.

    Adds up ``GetTotalNumHs(includeNeighbors=True)`` over every atom index
    from 0 to ``mol.GetNumAtoms() - 1``.
    """
    return sum(
        mol.GetAtomWithIdx(position).GetTotalNumHs(includeNeighbors=True)
        for position in range(mol.GetNumAtoms())
    )

def DoU(mol):
    """Estimate the degree of unsaturation of ``mol``.

    Evaluates ``(2*C + 2 + N + P - X - H) / 2``, where each letter is the
    element count returned by the matching ``count_*`` helper (``X`` is the
    combined F/Cl/Br/I count). Oxygen and sulfur do not appear in the
    formula, so they are not counted here.

    Parameters
    ----------
    mol
        Molecule object accepted by the ``count_*`` helpers.

    Returns
    -------
    int
        The formula value, truncated toward zero by ``int()``.
    """
    C = count_C(mol)
    H = count_H(mol)
    N = count_N(mol)
    P = count_P(mol)
    X = count_X(mol)
    # True division followed by int() truncates half-integer results toward
    # zero (e.g. 0.5 -> 0) rather than rounding.
    return int((2*C+2+N+P-X-H)/2)

from rdkit.Chem.Descriptors import MolWt

def cpd_inform(SMILES):
    """Collect basic composition descriptors for a compound given as SMILES.

    Parameters
    ----------
    SMILES : str
        SMILES string, parsed with ``Chem.MolFromSmiles``.

    Returns
    -------
    dict
        Keys ``'n_C'``, ``'n_H'``, ``'n_O'``, ``'n_N'``, ``'n_P'``, ``'n_S'``
        and ``'n_X'`` hold the element counts from the ``count_*`` helpers,
        ``'DoU'`` holds the result of ``DoU`` and ``'MW'`` the result of
        ``MolWt``.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` returns ``None`` for ``SMILES``.
    """
    mol = Chem.MolFromSmiles(SMILES)
    if mol is None:
        # Fail here with the offending input instead of letting the first
        # count_* helper raise an AttributeError on None.
        raise ValueError("RDKit could not parse SMILES: {!r}".format(SMILES))

    return {
        'n_C': count_C(mol),
        'n_H': count_H(mol),
        'n_O': count_O(mol),
        'n_N': count_N(mol),
        'n_P': count_P(mol),
        'n_S': count_S(mol),
        'n_X': count_X(mol),
        'DoU': DoU(mol),
        'MW': MolWt(mol),
    }

In [5]:
df_master = pd.read_csv('../../../big-datasets/selected_with_smiles.csv')

In [7]:
df_master = df_master[['enzyme', 'product', 'reacts', 'PubChemID', 'SMILES']]

In [8]:
# Per-compound descriptor values, one list per feature. Every list receives
# exactly one entry per row of df_master, in row order.
l_C = []
l_H = []
l_O = []
l_N = []
l_P = []
l_S = []
l_X = []
l_DoU = []
l_MW = []

# Iterate over the SMILES values themselves rather than looking each row up
# by integer label, so the loop does not depend on df_master carrying a
# default 0..n-1 index.
for index, smiles in enumerate(df_master['SMILES']):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Report the offending row instead of failing later with an
        # AttributeError on None inside a count_* helper.
        raise ValueError(
            "Row {}: RDKit could not parse SMILES {!r}".format(index, smiles)
        )
    l_C.append(count_C(mol))
    l_H.append(count_H(mol))
    l_O.append(count_O(mol))
    l_N.append(count_N(mol))
    l_P.append(count_P(mol))
    l_S.append(count_S(mol))
    l_X.append(count_X(mol))
    l_DoU.append(DoU(mol))
    l_MW.append(MolWt(mol))

In [9]:
# Assemble the per-compound feature lists into one table, adding one column
# per descriptor in the order listed here.
feature_columns = (
    ('n_C', l_C),
    ('n_H', l_H),
    ('n_O', l_O),
    ('n_N', l_N),
    ('n_P', l_P),
    ('n_S', l_S),
    ('n_X', l_X),
    ('n_DoU', l_DoU),
    ('MW', l_MW),
)

df_info = pd.DataFrame()
for column_name, column_values in feature_columns:
    df_info[column_name] = column_values

df_info

Unnamed: 0,n_C,n_H,n_O,n_N,n_P,n_S,n_X,n_DoU,MW
0,10,18,2,0,0,0,0,2,170.252
1,9,10,1,0,0,0,0,5,134.178
2,8,7,3,0,0,0,0,5,151.141
3,0,0,0,0,0,0,1,0,35.453
4,20,28,19,3,2,0,0,9,676.394
5,21,30,4,0,0,0,0,7,346.467
6,16,19,3,1,0,0,0,8,273.332
7,0,2,0,0,0,1,0,0,34.083
8,0,0,3,0,0,1,0,1,80.064
9,11,19,6,4,0,0,0,4,303.295


In [74]:
# NOTE(review): this cell's saved execution count (74) is out of sequence with
# its neighbours, and its saved output lists only a SMILES column rather than
# the five columns selected into df_master earlier. The output looks stale;
# re-run after Restart & Run All to confirm.
df_master

Unnamed: 0,SMILES
0,C([C@@H]1[C@H]([C@@H]([C@H](C(O1)O)O)O)O)O
1,C([C@@H]1[C@@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O
2,C([C@H]([C@H]([C@@H](C(=O)CO)O)O)O)O
3,C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C/[C@H](C[...


In [10]:
# Put the descriptor columns to the right of the master table; pandas matches
# the rows of the two frames on their index.
df_result = pd.concat(objs=[df_master, df_info], axis='columns')
df_result

Unnamed: 0,enzyme,product,reacts,PubChemID,SMILES,n_C,n_H,n_O,n_N,n_P,n_S,n_X,n_DoU,MW
0,EC-1.1.1.321,CPD-685,1.0,5363397,CC(=CCO)CCC=C(C)CO,10,18,2,0,0,0,0,2,170.252
1,EC-1.1.1.111,1-INDANOL,1.0,22819,C1CC2=CC=CC=C2C1O,9,10,1,0,0,0,0,5,134.178
2,EC-1.21.99.M2,4-HYDROXYPHENYLACETATE,1.0,4693933,C1=CC(=CC=C1CC(=O)[O-])O,8,7,3,0,0,0,0,5,151.141
3,EC-1.21.99.M2,CL-,1.0,312,[Cl-],0,0,0,0,0,0,1,0,35.453
4,EC-1.3.1.97,UDP-N-ACETYLMURAMATE,1.0,24772978,CC(C(=O)[O-])OC1C(C(OC(C1O)CO)OP(=O)([O-])OP(=...,20,28,19,3,2,0,0,9,676.394
5,EC-1.14.15.5,CORTICOSTERONE,1.0,5753,CC12CCC(=O)C=C1CCC3C2C(CC4(C3CCC4C(=O)CO)C)O,21,30,4,0,0,0,0,7,346.467
6,EC-1.14.19.50,CPD-19421,1.0,253994,COC1=C(C=C(C=C1)CNCCC2=CC=C(C=C2)O)O,16,19,3,1,0,0,0,8,273.332
7,EC-1.8.2.4,HS,1.0,402,S,0,2,0,0,0,1,0,0,34.083
8,EC-1.8.2.4,SO3,1.0,1099,[O-]S(=O)[O-],0,0,3,0,0,1,0,1,80.064
9,EC-1.5.1.18,CPD-308,1.0,49791983,C(CC(C(=O)[O-])[NH2+]C(CCC(=O)[O-])C(=O)[O-])C...,11,19,6,4,0,0,0,4,303.295


In [11]:
# Persist the featurized table; index=False keeps the row index out of the CSV.
df_result.to_csv(
    path_or_buf='../../../big-datasets/master_df_featurized.csv',
    index=False,
)

In [12]:
# Read the saved CSV back and preview its first five rows.
loaded = pd.read_csv(
    filepath_or_buffer='../../../big-datasets/master_df_featurized.csv'
)
loaded.head(n=5)

Unnamed: 0,enzyme,product,reacts,PubChemID,SMILES,n_C,n_H,n_O,n_N,n_P,n_S,n_X,n_DoU,MW
0,EC-1.1.1.321,CPD-685,1.0,5363397,CC(=CCO)CCC=C(C)CO,10,18,2,0,0,0,0,2,170.252
1,EC-1.1.1.111,1-INDANOL,1.0,22819,C1CC2=CC=CC=C2C1O,9,10,1,0,0,0,0,5,134.178
2,EC-1.21.99.M2,4-HYDROXYPHENYLACETATE,1.0,4693933,C1=CC(=CC=C1CC(=O)[O-])O,8,7,3,0,0,0,0,5,151.141
3,EC-1.21.99.M2,CL-,1.0,312,[Cl-],0,0,0,0,0,0,1,0,35.453
4,EC-1.3.1.97,UDP-N-ACETYLMURAMATE,1.0,24772978,CC(C(=O)[O-])OC1C(C(OC(C1O)CO)OP(=O)([O-])OP(=...,20,28,19,3,2,0,0,9,676.394


In [17]:
# Count the distinct enzymes within each `reacts` group.
enzymes_by_reacts = loaded.groupby(by='reacts')['enzyme']
unique = enzymes_by_reacts.nunique()

In [18]:
# Display the number of distinct enzymes per `reacts` value computed in the
# previous cell.
unique

reacts
0.0    4626
1.0    4952
Name: enzyme, dtype: int64