rcsbsearchapi >>> functions for searching the Protein Data Bank based on the mmCIF dictionary,
os >>> operating system functions - handling file paths and directories,
requests >>> access APIs for databases,
rdkit >>> an open-source cheminformatics toolkit (developed on GitHub),
rdkit.Chem (Chem) >>> a subset of rdkit that supports converting file strings into chemical structures,
rdkit.Chem.AllChem (AllChem) >>> a subset of rdkit.Chem that supports energy optimization,
rdkit.Chem.Draw (Draw) >>> a subset of rdkit.Chem that supports chemical drawing in Python,
vina >>> AutoDock Vina software for Python and Jupyter notebooks

In [1]:
# Import the components of rcsbsearchapi needed for this search
from rcsbsearchapi import rcsb_attributes as attrs

# Making queries

In [2]:
# The search is built from three conditions (q1, q2 and q3) that are then merged into one query
ECnumber = "3.4.21.4"      # EC number to search for, stored as a string

# q1: structures whose EC lineage id equals ECnumber (trypsin, EC = 3.4.21.4)
q1 = attrs.rcsb_polymer_entity.rcsb_ec_lineage.id == ECnumber
# q2 / q3: lower (300) and upper (800) limits on the ligand formula weight, both inclusive
q2 = attrs.chem_comp.formula_weight >= 300
q3 = attrs.chem_comp.formula_weight <= 800

# "&" joins the conditions, so a hit has to satisfy all three
query = q1 & q2 & q3

# calling the query runs the search; unpack everything it yields into a list
resultL = [*query()]

print(resultL)

['1AQ7', '1AUJ', '1AZ8', '1BJV', '1BTW', '1BTX', '1BTZ', '1C1S', '1C1T', '1C2D', '1C2E', '1C2F', '1C2G', '1C2H', '1C2I', '1C2J', '1C5Q', '1C5R', '1EB2', '1F0T', '1F0U', '1FXY', '1G36', '1GJ6', '1J17', '1JRS', '1JRT', '1K1I', '1K1J', '1K1L', '1K1M', '1K1N', '1K1O', '1K1P', '1LQE', '1MAX', '1MAY', '1MTS', '1MTU', '1MTV', '1MTW', '1NC6', '1O2H', '1O2I', '1O2J', '1O2K', '1O2L', '1O2M', '1O2N', '1O2O', '1O2P', '1O2Q', '1O2R', '1O2T', '1O2U', '1O2V', '1O2W', '1O2X', '1O2Y', '1O2Z', '1O30', '1O36', '1O37', '1O38', '1O39', '1O3A', '1O3B', '1O3C', '1O3D', '1O3E', '1O3F', '1O3G', '1O3H', '1O3I', '1O3J', '1O3K', '1O3L', '1O3M', '1O3N', '1O3O', '1OYQ', '1PPC', '1PPH', '1QB1', '1QB6', '1QB9', '1QBN', '1QBO', '1QCP', '1QL7', '1QL8', '1QL9', '1RXP', '1SFI', '1TYN', '1V2K', '1V2N', '1V2O', '1V2P', '1V2Q', '1V2R', '1V2T', '1V2W', '1XUF', '1XUG', '1XUH', '1XUI', '1XUJ', '1XUK', '1Y3U', '1Y3V', '1Y3W', '1Y3X', '1Y3Y', '1Y59', '1Y5A', '1Y5B', '1Y5U', '1YP9', '1YYY', '1ZZZ', '2AGI', '2AYW', '2D8W', '2ZDK',

In [3]:
# Report how many structure IDs the query returned
print(f"There are {len(resultL)} trypsin structures that contain ligands in the RCSB PDB.")

There are 187 trypsin structures that contain ligands in the RCSB PDB.


In [4]:
# A slice with no start value begins at index 0, so this shows the first 10 results
print(resultL[:10])

['1AQ7', '1AUJ', '1AZ8', '1BJV', '1BTW', '1BTX', '1BTZ', '1C1S', '1C1T', '1C2D']


# Finding the ligands

In [5]:
# Run the same query again, this time asking for "mol_definition" results;
# the IDs that come back are the ligand IDs used in the rest of the notebook
molResultL = [*query("mol_definition")]
print(molResultL)

['0CA', '0CB', '0G6', '0IV', '0KV', '0ZG', '0ZW', '0ZX', '0ZY', '10U', '11U', '12U', '132', '13U', '169', '1NJ', '312', '32U', '334', '3YH', '45U', '46U', '49U', '50U', '607', '623', '653', '655', '656', '678', '688', '693', '696', '6VZ', '6W4', '6WH', '711', '762', '780', '783', '785', '806', '847', '907', '950', '972', '974', '991', 'A2C', 'ABB', 'ANH', 'BAB', 'BAH', 'BAK', 'BAO', 'BAZ', 'BBA', 'BOZ', 'BPO', 'BR6', 'BRV', 'BX3', 'BZY', 'CCR', 'CR3', 'CR9', 'CTA', 'DJY', 'DX9', 'E64', 'ESI', 'FD1', 'FD2', 'FD3', 'FD4', 'FO9', 'GOZ', 'GP8', 'GZC', 'I4Q', 'IGN', 'IMA', 'IN4', 'IYR', 'J3I', 'J5K', 'K73', 'LXW', 'M35', 'M6Q', 'MEL', 'MID', 'MXH', 'MZE', 'PMJ', 'PNT', 'PPB', 'PR1', 'PRD_000216', 'PRD_000556', 'R11', 'RPR', 'RWJ', 'T87', 'TFN', 'TL1', 'TL2', 'TL3', 'TL4', 'TYI', 'UIB', 'UIP', 'UIQ', 'UIR', 'UIZ', 'VN1', 'XPE', 'ZAP', 'ZEN']


In [6]:
# Say how many ligand IDs were found for this EC number ...
print(f"There are {len(molResultL)} ligands for EC number {ECnumber} in this list. Here is a list of the first ten ligands.")
# ... and leave the first ten as the last expression so the notebook displays them
molResultL[:10]

There are 119 ligands for EC number 3.4.21.4 in this list. Here is a list of the first ten ligands.


['0CA', '0CB', '0G6', '0IV', '0KV', '0ZG', '0ZW', '0ZX', '0ZY', '10U']

# How do we download the ligand files?

In [7]:
import requests       # to enable us to pull files from the PDB
import os             # to enable us to create a directory to store the files

In [10]:
# Download one of the files from our list: 11U_ideal.sdf
# The timeout (in seconds) makes the request stop with an error instead of
# waiting forever if the server never answers.

RES_11U_SDF = requests.get('https://files.rcsb.org/ligands/download/11U_ideal.sdf', timeout=30)

In [11]:
# Check that the file downloaded properly by looking at the status code of the response.
# A status code of 200 means everything is okay.

RES_11U_SDF.status_code               # last expression in the cell, so the notebook displays its value

200

In [20]:
# Create the folder that will hold our ligand files; exist_ok=True means
# re-running this cell is fine when the folder is already there
ligand_dir = "rough"
os.makedirs(ligand_dir, exist_ok=True)

# Save the text of the downloaded response as an .sdf file in that folder
with open("rough/res_11U.sdf", "w+") as sdf_out:
    sdf_out.write(RES_11U_SDF.text)

In [13]:
# Using "+" between two strings joins them end to end
"a"+"b"

'ab'

In [14]:
# Two pieces of a file path, glued together into one string before printing
x, y = 'download/', 'pq_ideal.sdf'
print(f"{x}{y}")

download/pq_ideal.sdf


In [15]:
# Keep only the first ten ligand IDs as a small list to practise with
lliigg = molResultL[:10]
print(lliigg)

['0CA', '0CB', '0G6', '0IV', '0KV', '0ZG', '0ZW', '0ZX', '0ZY', '10U']


In [23]:
# Turn each ligand ID into the name of its SDF file and show it
for ChemID in lliigg:
    cFile = "{}_ideal.sdf".format(ChemID)
    print(cFile)

0CA_ideal.sdf
0CB_ideal.sdf
0G6_ideal.sdf
0IV_ideal.sdf
0KV_ideal.sdf
0ZG_ideal.sdf
0ZW_ideal.sdf
0ZX_ideal.sdf
0ZY_ideal.sdf
10U_ideal.sdf


In [24]:
baselink = "https://files.rcsb.org/ligands/download/"

# Put the base address in front of each file name to get the full download link
for ChemID in lliigg:
    cFile = "{}_ideal.sdf".format(ChemID)
    cFilelink = f"{baselink}{cFile}"
    print(cFilelink)

https://files.rcsb.org/ligands/download/0CA_ideal.sdf
https://files.rcsb.org/ligands/download/0CB_ideal.sdf
https://files.rcsb.org/ligands/download/0G6_ideal.sdf
https://files.rcsb.org/ligands/download/0IV_ideal.sdf
https://files.rcsb.org/ligands/download/0KV_ideal.sdf
https://files.rcsb.org/ligands/download/0ZG_ideal.sdf
https://files.rcsb.org/ligands/download/0ZW_ideal.sdf
https://files.rcsb.org/ligands/download/0ZX_ideal.sdf
https://files.rcsb.org/ligands/download/0ZY_ideal.sdf
https://files.rcsb.org/ligands/download/10U_ideal.sdf


In [25]:
baselink = "https://files.rcsb.org/ligands/download/"

# For each ligand, work out the download link and the path inside the
# "rough" folder where its file will be saved; show the local path
for ChemID in lliigg:
    cFile = "{}_ideal.sdf".format(ChemID)
    cFilelink = f"{baselink}{cFile}"
    cFileLocalpath = f"rough/{cFile}"
    print(cFileLocalpath)

rough/0CA_ideal.sdf
rough/0CB_ideal.sdf
rough/0G6_ideal.sdf
rough/0IV_ideal.sdf
rough/0KV_ideal.sdf
rough/0ZG_ideal.sdf
rough/0ZW_ideal.sdf
rough/0ZX_ideal.sdf
rough/0ZY_ideal.sdf
rough/10U_ideal.sdf


In [26]:
# Download the SDF file for each of the first ten ligands into the "rough" folder.
baseLink = "https://files.rcsb.org/ligands/download/"

# Make sure the target folder exists so this cell also works when run on its own
os.makedirs("rough", exist_ok=True)

for ChemID in lliigg:
    cFile = f'{ChemID}_ideal.sdf'
    cFileLink = baseLink + cFile
    cFileLocalPath = "rough/" + cFile
    # timeout (seconds): stop with an error instead of waiting forever on a silent server
    response = requests.get(cFileLink, timeout=30)
    # Only a status code of 200 means the file really came back; anything else
    # would otherwise be saved as if it were an SDF file
    if response.status_code != 200:
        print(f"Skipping {ChemID}: download failed with status code {response.status_code}")
        continue
    # "w" is enough here: the file is only written, never read back
    with open(cFileLocalPath, 'w') as f:
        f.write(response.text)

# Downloading all of the ligands

In [28]:
# Download the SDF file for every ligand ID in molResultL into the "ligands" folder.
os.makedirs("ligands", exist_ok = True)

baseUrl = "https://files.rcsb.org/ligands/download/"

for ChemID in molResultL:
    cFile = f"{ChemID}_ideal.sdf"
    cFileUrl = baseUrl + cFile
    cFileLocalPath = "ligands/" + cFile
    # timeout (seconds): stop with an error instead of waiting forever on a silent server
    response = requests.get(cFileUrl, timeout=30)
    # Not every ID is guaranteed to have a file at this address
    # (NOTE(review): the PRD_... IDs look like candidates - confirm), so a failed
    # download is reported and skipped rather than saved as a bogus .sdf file
    if response.status_code != 200:
        print(f"Skipping {ChemID}: download failed with status code {response.status_code}")
        continue
    # "w" is enough here: the file is only written, never read back
    with open(cFileLocalPath, "w") as file:
        file.write(response.text)