In [1]:
import errno
import os
import re

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem

from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

In [2]:
# Folder holding the index files, and the refined-set index file inside it.
dataPath = './data/2020/index'
refinedFile = "INDEX_refined_data.2020"
refinedFullPath = os.path.join(dataPath, refinedFile)

# Column names handed to getPDBbind when the index file is loaded.
df_columns = [
    'pdbcode',
    'year',
    'set',
    'affinity',
]

In [3]:
def getPDBbind(fileFullPath, columnNames, setName=None):

    """
    Function to convert a PDBbind index textfile to a dataframe.

    Lines starting with "#" and blank lines are skipped. Every other line is
    split on whitespace: field 0 is stored as the PDB code, field 2 as the
    year and field 3 as the affinity. The values are kept as the strings read
    from the file; no numeric conversion is done here.

    Parameters:
        fileFullPath (string): location of textfile
        columnNames (list): column names used to initialise the dataframe
        setName (string): set from PDBbind (general, refined, or core);
            written unchanged into the 'set' column
    Return:
        df: Dataframe object with the columns in columnNames, plus
            'pdbcode', 'year', 'affinity' and 'set' if not already present
    Raises:
        FileNotFoundError: if fileFullPath is not an existing regular file
    """

    # Fail early, and report the path that was actually requested.
    if not os.path.isfile(fileFullPath):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), fileFullPath)

    print ("Processing File: %s. \n" % fileFullPath )

    pdbCodes = []
    year = []
    realAffinity = []

    with open(fileFullPath, 'rt') as pdbbindSet:
        for line in pdbbindSet:
            entry = line.strip()
            # Skip comment/header lines and empty lines (e.g. a trailing newline).
            if not entry or entry.startswith("#"):
                continue
            columns = entry.split()
            pdbCodes.append(columns[0])
            year.append(columns[2])
            realAffinity.append(columns[3])

    # Start from the requested column layout, then fill the parsed columns.
    df = pd.DataFrame(columns=columnNames)
    df["pdbcode"] = pdbCodes
    df["year"] = year
    df["affinity"] = realAffinity
    df["set"] = setName

    print (df.head())

    print ("\nTotal Complexes in %s Set: %6d \n" % (setName, df.shape[0]) )

    return (df)

In [4]:
# Parse the refined-set index file; every row is tagged with set 'refined'.
df_refined = getPDBbind(
    fileFullPath=refinedFullPath,
    columnNames=df_columns,
    setName='refined',
)

Processing File: ./data/2020/index\INDEX_refined_data.2020. 

  pdbcode  year      set affinity
0    2r58  2007  refined     2.00
1    3c2f  2008  refined     2.00
2    3g2y  2009  refined     2.00
3    3pce  1998  refined     2.00
4    4qsu  2014  refined     2.00

Total Complexes in refined Set:   5316 



In [7]:
# Show the last five rows of the parsed index.
df_refined.tail(5)

Unnamed: 0,pdbcode,year,set,affinity
5311,4f3c,2013,refined,11.82
5312,5bry,2015,refined,11.82
5313,1sl3,2004,refined,11.85
5314,1ctu,1995,refined,11.92
5315,6e9a,2018,refined,11.92


## Path

In [13]:
startPath = './data/2020/sets/'


def scanSetFolders(rootPath):
    """
    Walk rootPath and collect the set folders and the pdb-code folders below them.

    Depth is measured on the path relative to rootPath, so the result does not
    depend on whether rootPath ends with a path separator. Sub-directories are
    visited in sorted order so the output order is the same on every platform.

    Parameters:
        rootPath (string): directory whose direct children are the set folders
            (e.g. 'refined-set', 'general-set'), each containing one folder
            per complex
    Return:
        tuple (maxLevel, setFolders, pdbCodeFolders, pdbCodePaths, split):
            maxLevel (int): number of directory levels found below rootPath
                (1 if nothing was found)
            setFolders (list): names of the first-level folders
            pdbCodeFolders (list): names of the second-level folders
            pdbCodePaths (list): paths of the second-level folders as yielded
                by os.walk
            split (list): 'refined' / 'general' per second-level folder, or
                None when the set folder matches neither name; always the same
                length as pdbCodeFolders
    Raises:
        ValueError: if folders are nested more than two levels below rootPath
    """
    deepestLevel = 0
    foundSets = []
    foundCodes = []
    foundPaths = []
    foundSplit = []

    for subdirFullPath, dirs, files in os.walk(rootPath):
        dirs.sort()  # in-place sort steers os.walk's traversal order
        relPath = os.path.relpath(subdirFullPath, rootPath)
        if relPath == os.curdir:
            continue  # the root itself is neither a set nor a pdb-code folder

        level = relPath.count(os.sep)
        deepestLevel = max(deepestLevel, level)

        if level == 0:
            foundSets.append(relPath)
        elif level == 1:
            foundPaths.append(subdirFullPath)
            foundCodes.append(os.path.basename(subdirFullPath))
            # Match on the relative path so the name of rootPath cannot
            # influence the label; refined is checked first, as before.
            if re.search("refined-set", relPath):
                foundSplit.append('refined')
            elif re.search("general-set", relPath):
                foundSplit.append('general')
            else:
                # Keep the lists aligned even for an unrecognised set folder.
                foundSplit.append(None)

    levelCount = deepestLevel + 1
    if levelCount > 2:
        raise ValueError ("Expected 2 subdirectory (refined, general) in dataset path. Got %d !" % (levelCount))

    return levelCount, foundSets, foundCodes, foundPaths, foundSplit


maxLevel, setFolders, pdbCodeFolders, pdbCodePaths, split = scanSetFolders(startPath)

print ("Found %d Directory Levels" % maxLevel)

# os.walk builds each path as join(top, name), so this prints the same strings.
for setFolder in setFolders:
    print (os.path.join(startPath, setFolder))

print ("\nFound %d pdbCode Folders." % len(pdbCodeFolders))

Found 2 Directory Levels
./data/2020/sets/general-set
./data/2020/sets/refined-set

Found 19443 pdbCode Folders.


In [14]:
# create dataset
columnNames = ["pdbcode", "path", "is4files"]

# 'is4files' is declared but not filled here, so it stays empty;
# 'set' is not in columnNames and is therefore appended as the last column.
df_path = pd.DataFrame(columns=columnNames).assign(
    **{
        'pdbcode': pdbCodeFolders,
        'path': pdbCodePaths,
        'set': split,
    }
)

df_path.tail()

Unnamed: 0,pdbcode,path,is4files,set
19438,7std,./data/2020/sets/refined-set\7std,,refined
19439,7upj,./data/2020/sets/refined-set\7upj,,refined
19440,8a3h,./data/2020/sets/refined-set\8a3h,,refined
19441,8cpa,./data/2020/sets/refined-set\8cpa,,refined
19442,966c,./data/2020/sets/refined-set\966c,,refined


In [15]:
# Replace backslashes with forward slashes so every path uses one separator.
# regex=False makes "\\" a literal backslash rather than a regex escape.
df_path['path'] = df_path['path'].str.replace("\\", "/", regex=False)

In [16]:
# Show the first five rows to check the normalised paths.
df_path.head(5)

Unnamed: 0,pdbcode,path,is4files,set
0,11gs,./data/2020/sets/general-set/11gs,,general
1,13gs,./data/2020/sets/general-set/13gs,,general
2,16pk,./data/2020/sets/general-set/16pk,,general
3,1a07,./data/2020/sets/general-set/1a07,,general
4,1a08,./data/2020/sets/general-set/1a08,,general


In [17]:
# Remove the placeholder column that was never filled.
df_path = df_path.drop(columns=['is4files'])

In [19]:
# Keep only the folders labelled 'refined'; copy so later edits act on an
# independent frame rather than a slice of the full listing.
df_path = df_path.loc[df_path['set'] == 'refined'].copy()
df_path.shape

(5316, 3)

In [20]:
# Drop the 'set' label from the path table.
df_path = df_path.drop(columns=['set'])

## Merging

In [22]:
# Outer-join index entries with folder paths on the PDB code; the '_merge'
# indicator column records whether each code was found on both sides.
df_final = pd.merge(
    df_refined,
    df_path,
    how='outer',
    on=['pdbcode'],
    indicator=True,
)
df_final.head()

Unnamed: 0,pdbcode,year,set,affinity,path,_merge
0,2r58,2007,refined,2.0,./data/2020/sets/refined-set/2r58,both
1,3c2f,2008,refined,2.0,./data/2020/sets/refined-set/3c2f,both
2,3g2y,2009,refined,2.0,./data/2020/sets/refined-set/3g2y,both
3,3pce,1998,refined,2.0,./data/2020/sets/refined-set/3pce,both
4,4qsu,2014,refined,2.0,./data/2020/sets/refined-set/4qsu,both


## Features

In [23]:
# Work on an independent copy so df_final stays available unchanged.
df = df_final.copy(deep=True)

In [25]:
# Drop the bookkeeping columns from the feature table.
df = df.drop(columns=['year', 'set', '_merge'])

In [26]:
tail_pocket_pdb = '_pocket.pdb'

# Pocket file path = <folder>/<last path component of folder>_pocket.pdb
df['pocket_pdb'] = df_final['path'].apply(
    lambda folder: folder + '/' + folder.rsplit('/', 1)[-1] + tail_pocket_pdb
)

### smiles

In [28]:
# Read every pocket PDB file with RDKit and derive its SMILES string.
# Both lists get exactly one entry per row of df.pocket_pdb; entries for which
# RDKit does not hand back a Mol are filled with NaN so positions stay aligned.
smiles_list = []
mol_list = []

for path in df.pocket_pdb:
    mol = Chem.MolFromPDBFile(path)
    if isinstance(mol, Chem.rdchem.Mol):
        mol_list.append(mol)
        smiles_list.append(Chem.MolToSmiles(mol))
    else:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        mol_list.append(np.nan)
        smiles_list.append(np.nan)

[14:18:19] Explicit valence for atom # 64 O, 3, is greater than permitted
[14:18:23] Explicit valence for atom # 143 O, 3, is greater than permitted
[14:18:46] Explicit valence for atom # 297 O, 3, is greater than permitted
[14:18:50] Explicit valence for atom # 214 O, 4, is greater than permitted
[14:19:06] Explicit valence for atom # 31 O, 3, is greater than permitted
[14:19:22] Explicit valence for atom # 444 O, 3, is greater than permitted


In [29]:
# Attach the SMILES strings; the Series is matched to df by index label.
smiles_column = pd.Series(smiles_list)
df['pocket_smiles'] = smiles_column

### fingerprints

In [30]:
# Compute four fingerprints per molecule: the RDKit fingerprint and 2048-bit
# Morgan fingerprints with radius 2, 3 and 4, each stored as a list of bits.
# Entries of mol_list that are not RDKit Mol objects get NaN in every list so
# all four lists keep one entry per molecule.
fp_rdk = []
fp_morgan2 = []
fp_morgan3 = []
fp_morgan4 = []

for mol in mol_list:
    if isinstance(mol, Chem.rdchem.Mol):
        fp_rdk.append(Chem.RDKFingerprint(mol).ToList())
        fp_morgan2.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048).ToList())
        fp_morgan3.append(AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=2048).ToList())
        fp_morgan4.append(AllChem.GetMorganFingerprintAsBitVect(mol, 4, nBits=2048).ToList())
    else:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        for fp_store in (fp_rdk, fp_morgan2, fp_morgan3, fp_morgan4):
            fp_store.append(np.nan)

In [31]:
# Attach each fingerprint list as its own column (matched to df by index label).
fingerprint_columns = {
    'rdk_fp': fp_rdk,
    'morgan_fp2': fp_morgan2,
    'morgan_fp3': fp_morgan3,
    'morgan_fp4': fp_morgan4,
}
for column_name, fingerprints in fingerprint_columns.items():
    df[column_name] = pd.Series(fingerprints)

df.head()

Unnamed: 0,pdbcode,affinity,path,pocket_pdb,pocket_smiles,rdk_fp,morgan_fp2,morgan_fp3,morgan_fp4
0,2r58,2.0,./data/2020/sets/refined-set/2r58,./data/2020/sets/refined-set/2r58/2r58_pocket.pdb,CC(C)[C@H](NC(=O)[C@H](C)N)C(=O)N[C@@H](CC(=O)...,"[1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,3c2f,2.0,./data/2020/sets/refined-set/3c2f,./data/2020/sets/refined-set/3c2f/3c2f_pocket.pdb,CC(C)C[C@@H](C=O)NC(=O)CNC(=O)CNC(=O)[C@H](CO)...,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
2,3g2y,2.0,./data/2020/sets/refined-set/3g2y,./data/2020/sets/refined-set/3g2y/3g2y_pocket.pdb,CC(C)C[C@@H](C=O)NC(=O)[C@H](C)N.CC(C)C[C@H](N...,"[0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
3,3pce,2.0,./data/2020/sets/refined-set/3pce,./data/2020/sets/refined-set/3pce/3pce_pocket.pdb,CC(C)[C@@H](C=O)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O...,"[1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
4,4qsu,2.0,./data/2020/sets/refined-set/4qsu,./data/2020/sets/refined-set/4qsu/4qsu_pocket.pdb,CC(C)C[C@H](NC(=O)[C@H](C)NC(=O)[C@@H](N)CC(N)...,"[0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


In [32]:
# Collect the index labels of rows whose SMILES is a float (i.e. NaN), meaning
# no molecule could be built for them. Labels, not positional counters, are
# gathered because the list is later passed to df.drop(index=...), which
# removes rows by label.
drop_list = [
    label
    for label, sm in zip(df.index, df.pocket_smiles)
    if type(sm) == type(0.1)
]

In [33]:
# Remove the rows listed in drop_list (matched by index label).
df = df.drop(index=drop_list)
df.head()

Unnamed: 0,pdbcode,affinity,path,pocket_pdb,pocket_smiles,rdk_fp,morgan_fp2,morgan_fp3,morgan_fp4
0,2r58,2.0,./data/2020/sets/refined-set/2r58,./data/2020/sets/refined-set/2r58/2r58_pocket.pdb,CC(C)[C@H](NC(=O)[C@H](C)N)C(=O)N[C@@H](CC(=O)...,"[1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,3c2f,2.0,./data/2020/sets/refined-set/3c2f,./data/2020/sets/refined-set/3c2f/3c2f_pocket.pdb,CC(C)C[C@@H](C=O)NC(=O)CNC(=O)CNC(=O)[C@H](CO)...,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
2,3g2y,2.0,./data/2020/sets/refined-set/3g2y,./data/2020/sets/refined-set/3g2y/3g2y_pocket.pdb,CC(C)C[C@@H](C=O)NC(=O)[C@H](C)N.CC(C)C[C@H](N...,"[0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
3,3pce,2.0,./data/2020/sets/refined-set/3pce,./data/2020/sets/refined-set/3pce/3pce_pocket.pdb,CC(C)[C@@H](C=O)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O...,"[1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
4,4qsu,2.0,./data/2020/sets/refined-set/4qsu,./data/2020/sets/refined-set/4qsu/4qsu_pocket.pdb,CC(C)C[C@H](NC(=O)[C@H](C)NC(=O)[C@@H](N)CC(N)...,"[0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


In [34]:
# The file locations are not needed in the exported table.
df = df.drop(columns=['path', 'pocket_pdb'])

In [35]:
# Write the feature table to disk; the row index is written as the first column.
df.to_csv('./data/2020/refined_df.csv', index=True)