In [1]:
import os
import re

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem.AtomPairs import Torsions

from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

In [2]:
# Load the base table. The CSV contains an 'Unnamed: 0' column, which is
# discarded right after reading so it does not end up as a data column.
base_path = './data/2020/base_df.csv'
df_base = pd.read_csv(base_path).drop(columns=['Unnamed: 0'])
df_base.head()

Unnamed: 0,pdbcode,year,affinity,set,path,ligand_mol2,ligand_std,protein_pdb,pocket_pdb
0,3zzf,2012,0.4,general,./data/2020/sets/general-set/3zzf,./data/2020/sets/general-set/3zzf/3zzf_ligand....,./data/2020/sets/general-set/3zzf/3zzf_ligand.std,./data/2020/sets/general-set/3zzf/3zzf_protein...,./data/2020/sets/general-set/3zzf/3zzf_pocket.pdb
1,3gww,2009,0.45,general,./data/2020/sets/general-set/3gww,./data/2020/sets/general-set/3gww/3gww_ligand....,./data/2020/sets/general-set/3gww/3gww_ligand.std,./data/2020/sets/general-set/3gww/3gww_protein...,./data/2020/sets/general-set/3gww/3gww_pocket.pdb
2,1w8l,2004,0.49,general,./data/2020/sets/general-set/1w8l,./data/2020/sets/general-set/1w8l/1w8l_ligand....,./data/2020/sets/general-set/1w8l/1w8l_ligand.std,./data/2020/sets/general-set/1w8l/1w8l_protein...,./data/2020/sets/general-set/1w8l/1w8l_pocket.pdb
3,3fqa,2009,0.49,general,./data/2020/sets/general-set/3fqa,./data/2020/sets/general-set/3fqa/3fqa_ligand....,./data/2020/sets/general-set/3fqa/3fqa_ligand.std,./data/2020/sets/general-set/3fqa/3fqa_protein...,./data/2020/sets/general-set/3fqa/3fqa_pocket.pdb
4,1zsb,1996,0.6,general,./data/2020/sets/general-set/1zsb,./data/2020/sets/general-set/1zsb/1zsb_ligand....,./data/2020/sets/general-set/1zsb/1zsb_ligand.std,./data/2020/sets/general-set/1zsb/1zsb_protein...,./data/2020/sets/general-set/1zsb/1zsb_pocket.pdb


In [12]:
# Read the 10gs pocket PDB file into an RDKit molecule, convert it to a
# SMILES string, and display the length of that string.
# `path`, `mol` and `smi` are notebook-level names that later cells rebind.
path = './data/2020/sets/refined-set/10gs/10gs_pocket.pdb'
mol = Chem.MolFromPDBFile(path)
smi = Chem.MolToSmiles(mol)
len(smi)

1022

In [13]:
# Same SMILES-length check for the 10gs.pdb file under ref_compl
# (presumably a reference complex structure -- TODO confirm what this
# directory holds). Rebinds the notebook-level `path`, `mol` and `smi`.
path = './data/2020/sets/ref_compl/10gs.pdb'
mol = Chem.MolFromPDBFile(path)
smi = Chem.MolToSmiles(mol)
len(smi)

208

In [15]:
# Display the SMILES string currently bound to `smi`.
# NOTE(review): this cell's execution count (15) is higher than that of the
# ligand cell that follows it in the notebook (14), so the recorded value was
# produced after that cell ran; its 81 characters match the length reported
# there. On a top-to-bottom run this cell would show a different `smi`.
smi

'[NH3+][C@@H](CCC(=O)N[C@@H](CSCc1ccccc1)C(=O)N[C@@H](C(=O)[O-])c1ccccc1)C(=O)[O-]'

In [14]:
# Read the 10gs ligand from its mol2 file, convert it to a SMILES string,
# and display the length of that string.
# Rebinds the notebook-level `path`, `mol` and `smi`.
path = './data/2020/sets/refined-set/10gs/10gs_ligand.mol2'
mol = Chem.MolFromMol2File(path)
smi = Chem.MolToSmiles(mol)
len(smi)

81

### Adding SMILES to df

In [7]:
# Parse every pocket PDB file listed in df_base.pocket_pdb and collect both
# the RDKit molecule and its SMILES string.
#
# mol_list and smiles_list receive exactly one entry per path, in the same
# order as df_base.pocket_pdb. When RDKit does not hand back a molecule
# (it logs the reason, e.g. "Explicit valence ... is greater than permitted"),
# a NaN placeholder is stored in both lists so they stay aligned with the
# DataFrame rows.
smiles_list = []
mol_list = []

for path in df_base.pocket_pdb:
    mol = Chem.MolFromPDBFile(path)
    if isinstance(mol, Chem.rdchem.Mol):
        # Compute the SMILES before appending so both lists grow together.
        smi = Chem.MolToSmiles(mol)
        mol_list.append(mol)
        smiles_list.append(smi)
    else:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        mol_list.append(np.nan)
        smiles_list.append(np.nan)

[15:24:43] Explicit valence for atom # 64 O, 3, is greater than permitted
[15:24:44] Explicit valence for atom # 202 O, 3, is greater than permitted
[15:24:49] Explicit valence for atom # 527 C, 6, is greater than permitted
[15:24:55] Explicit valence for atom # 143 O, 3, is greater than permitted
[15:25:04] Explicit valence for atom # 31 O, 3, is greater than permitted
[15:25:10] Explicit valence for atom # 32 O, 3, is greater than permitted
[15:25:15] Explicit valence for atom # 73 O, 3, is greater than permitted
[15:25:25] Explicit valence for atom # 103 O, 3, is greater than permitted
[15:25:33] Explicit valence for atom # 351 O, 3, is greater than permitted
[15:25:52] Explicit valence for atom # 51 O, 3, is greater than permitted
[15:26:02] Explicit valence for atom # 367 O, 3, is greater than permitted
[15:26:05] Explicit valence for atom # 316 O, 3, is greater than permitted
[15:26:18] Explicit valence for atom # 211 O, 3, is greater than permitted
[15:26:25] Explicit valence fo

In [9]:
# Parse every ligand mol2 file listed in df_base.ligand_mol2 and collect both
# the RDKit molecule and its SMILES string.
#
# mol_ligand and smiles_ligand receive exactly one entry per path, in the
# same order as df_base.ligand_mol2. When RDKit does not hand back a molecule
# (it logs the reason, e.g. "Can't kekulize mol."), a NaN placeholder is
# stored in both lists so they stay aligned with the DataFrame rows.
smiles_ligand = []
mol_ligand = []

for path in df_base.ligand_mol2:
    mol = Chem.MolFromMol2File(path)
    if isinstance(mol, Chem.rdchem.Mol):
        # Compute the SMILES before appending so both lists grow together.
        smi = Chem.MolToSmiles(mol)
        mol_ligand.append(mol)
        smiles_ligand.append(smi)
    else:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        mol_ligand.append(np.nan)
        smiles_ligand.append(np.nan)

[15:34:05] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[15:34:05] Can't kekulize mol.  Unkekulized atoms: 2 4 6 7 8 9 10 11 12
[15:34:15] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[15:34:16] Can't kekulize mol.  Unkekulized atoms: 0 2 3 6 7 8 9 10 11
[15:34:16] Can't kekulize mol.  Unkekulized atoms: 3 4 19 20 22
[15:34:16] Explicit valence for atom # 26 C, 5, is greater than permitted
[15:34:34] Explicit valence for atom # 1 C, 6, is greater than permitted
[15:34:37] Explicit valence for atom # 10 C, 5, is greater than permitted
[15:34:38] Explicit valence for atom # 2 O, 3, is greater than permitted
[15:34:41] Explicit valence for atom # 15 C, 5, is greater than permitted
[15:34:42] Can't kekulize mol.  Unkekulized atoms: 3 4 19 20 22
[15:34:45] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[15:34:46] Explicit valence for atom # 1 C, 5, is greater than permitted
[15:34:53] Explicit valence for atom # 1 C, 6, is greater than permitted
[15:35:01] Can't kekulize mol.  

In [18]:
# Wrap the collected SMILES in a Series (default 0..n-1 index) and attach it
# as a new column; the assignment lines values up with df_base by index label.
pocket_smiles = pd.Series(smiles_list)
df_base['pocket_smiles'] = pocket_smiles

In [19]:
# Count the rows whose pocket SMILES is missing.
df_base['pocket_smiles'].isnull().sum()

30

In [20]:
# Load the protein structure of the row labelled 1100 and compute its RDKit
# fingerprint as a Python list.
tmp = Chem.MolFromPDBFile(df_base.loc[1100, 'protein_pdb'])
fp = Chem.RDKFingerprint(tmp).ToList()
# The displayed value is the text form of that list split into single
# characters, so brackets, commas and spaces show up as separate entries.
[ch for ch in str(fp)]

['[',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '0',
 ',',
 ' ',
 '1',
 ',',
 ' ',
 '1'

### Fingerprints

In [22]:
# Compute two bit fingerprints for every entry of mol_list, each stored as a
# plain Python list:
#   * fp_rdk    -- Chem.RDKFingerprint with its default settings
#   * fp_morgan -- Morgan fingerprint, radius 4, folded to 2048 bits
# Entries of mol_list that are not RDKit molecules (the NaN placeholders put
# there for files that could not be parsed) get a NaN placeholder in both
# output lists, so fp_rdk and fp_morgan stay aligned with mol_list.
fp_rdk = []
#fp_atomPair = []
fp_morgan = []

for mol in mol_list:
    if isinstance(mol, Chem.rdchem.Mol):
        fp_rdk.append(Chem.RDKFingerprint(mol).ToList())
        #fp_atomPair.append(Pairs.GetAtomPairFingerprint(mol).ToList())
        fp_morgan.append(AllChem.GetMorganFingerprintAsBitVect(mol,4,nBits=2048).ToList())
    else:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        fp_rdk.append(np.nan)
        #fp_atomPair.append(np.nan)
        fp_morgan.append(np.nan)

In [23]:
# Attach both fingerprint lists as new columns. Each list is wrapped in a
# Series first, so values are matched to df_base rows by index label.
for column, values in (('pocket_rdk_fp', fp_rdk), ('pocket_morgan_fp', fp_morgan)):
    df_base[column] = pd.Series(values)
df_base.head()

Unnamed: 0,pdbcode,year,affinity,set,path,ligand_mol2,ligand_std,protein_pdb,pocket_pdb,pocket_smiles,pocket_rdk_fp,pocket_morgan_fp
0,3zzf,2012,0.4,general,./data/2020/sets/general-set/3zzf,./data/2020/sets/general-set/3zzf/3zzf_ligand....,./data/2020/sets/general-set/3zzf/3zzf_ligand.std,./data/2020/sets/general-set/3zzf/3zzf_protein...,./data/2020/sets/general-set/3zzf/3zzf_pocket.pdb,CC(C)C[C@@H](C=O)NC(=O)[C@H](CO)NC(=O)[C@@H](N...,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,3gww,2009,0.45,general,./data/2020/sets/general-set/3gww,./data/2020/sets/general-set/3gww/3gww_ligand....,./data/2020/sets/general-set/3gww/3gww_ligand.std,./data/2020/sets/general-set/3gww/3gww_protein...,./data/2020/sets/general-set/3gww/3gww_pocket.pdb,CC(C)C[C@H](N)C(=O)N[C@H](C=O)Cc1ccc(O)cc1.CC(...,"[1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
2,1w8l,2004,0.49,general,./data/2020/sets/general-set/1w8l,./data/2020/sets/general-set/1w8l/1w8l_ligand....,./data/2020/sets/general-set/1w8l/1w8l_ligand.std,./data/2020/sets/general-set/1w8l/1w8l_protein...,./data/2020/sets/general-set/1w8l/1w8l_pocket.pdb,CC(C)C[C@@H](C=O)NC(=O)[C@@H](N)Cc1c[nH]c2cccc...,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
3,3fqa,2009,0.49,general,./data/2020/sets/general-set/3fqa,./data/2020/sets/general-set/3fqa/3fqa_ligand....,./data/2020/sets/general-set/3fqa/3fqa_ligand.std,./data/2020/sets/general-set/3fqa/3fqa_protein...,./data/2020/sets/general-set/3fqa/3fqa_pocket.pdb,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)CNC(=O)[C@H](C)...,"[1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,1zsb,1996,0.6,general,./data/2020/sets/general-set/1zsb,./data/2020/sets/general-set/1zsb/1zsb_ligand....,./data/2020/sets/general-set/1zsb/1zsb_ligand.std,./data/2020/sets/general-set/1zsb/1zsb_protein...,./data/2020/sets/general-set/1zsb/1zsb_pocket.pdb,CC(C)C[C@@H](C=O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@...,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [24]:
#mol = Chem.MolFromPDBFile(tmp_path)
#fp1 = Chem.RDKFingerprint(mol).ToList()
#fp2 = Pairs.GetAtomPairFingerprintAsBitVect(mol).ToList()
#fp3 = AllChem.GetMorganFingerprintAsBitVect(mol,3,nBits=2048).ToList()
#fp4 = Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol).ToList()

In [25]:
# Collect the index labels of all rows that have no pocket SMILES.
#
# Labels are taken straight from df_base.index, because the list is handed
# to DataFrame.drop(index=...) which works on labels; a running position
# counter only matches the labels while the index is still the default
# 0..n-1. isna() is used to detect the missing entries directly instead of
# comparing each value's type against float.
drop_list = df_base.index[df_base['pocket_smiles'].isna()].tolist()

In [26]:
# Remove, in place, the rows whose index labels were collected in drop_list,
# then preview the result.
df_base.drop(labels=drop_list, axis='index', inplace=True)
df_base.head()

Unnamed: 0,pdbcode,year,affinity,set,path,ligand_mol2,ligand_std,protein_pdb,pocket_pdb,pocket_smiles,pocket_rdk_fp,pocket_morgan_fp
0,3zzf,2012,0.4,general,./data/2020/sets/general-set/3zzf,./data/2020/sets/general-set/3zzf/3zzf_ligand....,./data/2020/sets/general-set/3zzf/3zzf_ligand.std,./data/2020/sets/general-set/3zzf/3zzf_protein...,./data/2020/sets/general-set/3zzf/3zzf_pocket.pdb,CC(C)C[C@@H](C=O)NC(=O)[C@H](CO)NC(=O)[C@@H](N...,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,3gww,2009,0.45,general,./data/2020/sets/general-set/3gww,./data/2020/sets/general-set/3gww/3gww_ligand....,./data/2020/sets/general-set/3gww/3gww_ligand.std,./data/2020/sets/general-set/3gww/3gww_protein...,./data/2020/sets/general-set/3gww/3gww_pocket.pdb,CC(C)C[C@H](N)C(=O)N[C@H](C=O)Cc1ccc(O)cc1.CC(...,"[1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
2,1w8l,2004,0.49,general,./data/2020/sets/general-set/1w8l,./data/2020/sets/general-set/1w8l/1w8l_ligand....,./data/2020/sets/general-set/1w8l/1w8l_ligand.std,./data/2020/sets/general-set/1w8l/1w8l_protein...,./data/2020/sets/general-set/1w8l/1w8l_pocket.pdb,CC(C)C[C@@H](C=O)NC(=O)[C@@H](N)Cc1c[nH]c2cccc...,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
3,3fqa,2009,0.49,general,./data/2020/sets/general-set/3fqa,./data/2020/sets/general-set/3fqa/3fqa_ligand....,./data/2020/sets/general-set/3fqa/3fqa_ligand.std,./data/2020/sets/general-set/3fqa/3fqa_protein...,./data/2020/sets/general-set/3fqa/3fqa_pocket.pdb,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)CNC(=O)[C@H](C)...,"[1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,1zsb,1996,0.6,general,./data/2020/sets/general-set/1zsb,./data/2020/sets/general-set/1zsb/1zsb_ligand....,./data/2020/sets/general-set/1zsb/1zsb_ligand.std,./data/2020/sets/general-set/1zsb/1zsb_protein...,./data/2020/sets/general-set/1zsb/1zsb_pocket.pdb,CC(C)C[C@@H](C=O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@...,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [27]:
# Length of the RDKit fingerprint stored for the row labelled 1258.
len(df_base.loc[1258, 'pocket_rdk_fp'])

2048

In [28]:
# Persist the table with SMILES and fingerprint columns. No `index` argument
# is passed, so the row index is written as well (pandas' default).
df_base.to_csv(path_or_buf='./data/2020/sm_fp_df.csv')

In [28]:
# Load the protein structure of the row labelled 11100 as an RDKit molecule.
tmp = Chem.MolFromPDBFile(df_base.loc[11100, 'protein_pdb'])

In [29]:
# Number of heavy atoms in the molecule currently bound to `tmp`.
tmp.GetNumHeavyAtoms()

5406

In [32]:
# Show the row(s) whose PDB code is 6cex.
df_base.loc[df_base['pdbcode'] == '6cex']

Unnamed: 0,pdbcode,year,affinity,set,path,ligand_mol2,ligand_std,protein_pdb,pocket_pdb
73,6cex,2018,1.7,general,./data/2020/sets/general-set/6cex,./data/2020/sets/general-set/6cex/6cex_ligand....,./data/2020/sets/general-set/6cex/6cex_ligand.std,./data/2020/sets/general-set/6cex/6cex_protein...,./data/2020/sets/general-set/6cex/6cex_pocket.pdb
