In [3]:
! pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-24.1.2-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl.metadata (3.1 kB)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cattrs-24.1.2-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━

In [4]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

In [5]:
# find target protein for acetylcholinesterase

target = new_client.target
target_query = target.search('acetylcholinesterase')
targets = pd.DataFrame.from_dict(target_query)
print(targets)

   cross_references                   organism  \
0                []    Drosophila melanogaster   
1                []               Homo sapiens   
2                []        Torpedo californica   
3                []               Mus musculus   
4                []          Rattus norvegicus   
5                []   Electrophorus electricus   
6                []                 Bos taurus   
7                []             Bemisia tabaci   
8                []  Leptinotarsa decemlineata   
9                []     Nephotettix cincticeps   
10               []              Gallus gallus   
11               []                Danio rerio   
12               []            Musca domestica   
13               []          Anopheles gambiae   
14               []               Homo sapiens   
15               []        Plutella xylostella   
16               []            Musca domestica   
17               []  Pediculus humanus capitis   
18               []          Spodoptera litura   


In [6]:
# selecting the first target protein
selected_target = targets.target_chembl_id[0]

# Here, we will retrieve only bioactivity data for Human Acetylcholinesterase (CHEMBL220) that are reported as pChEMBL values.
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

df = pd.DataFrame.from_dict(res)


In [7]:
df.to_csv('acetylcholinesterase_01_bioactivity_data_raw.csv', index=False)

Handling missing data

In [8]:
df2 = df[df.standard_value.notna()]
df2 = df2[df.canonical_smiles.notna()]

In [9]:
# drop duplicates, smiles is for chemical compound
df2_nr = df2.drop_duplicates(['canonical_smiles'])

In [10]:
selection = ['molecule_chembl_id','canonical_smiles','standard_value']
df3 = df2_nr[selection]

In [11]:
df3.to_csv('acetylcholinesterase_02_bioactivity_data_preprocessed.csv', index=False)

Labelling compund as active/inactive

In [12]:
df4 = pd.read_csv('acetylcholinesterase_02_bioactivity_data_preprocessed.csv')


bioactivity_threshold = []
for i in df4.standard_value:
  if float(i) >= 10000:
    bioactivity_threshold.append("inactive")
  elif float(i) <= 1000:
    bioactivity_threshold.append("active")
  else:
    bioactivity_threshold.append("intermediate")


bioactivity_class = pd.Series(bioactivity_threshold, name='class')
df5 = pd.concat([df4, bioactivity_class], axis=1)
df5

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL463210,CCOP(=S)(OCC)Oc1nc(Cl)c(Cl)cc1Cl,1830.0,intermediate
1,CHEMBL2252723,CCOP(=O)(OCC)SCCCCCCCCCCN1C(=O)c2ccccc2C1=O,112720.0,inactive
2,CHEMBL2252722,CCOP(=O)(OCC)SCCCCCCCCCN1C(=O)c2ccccc2C1=O,37500.0,inactive
3,CHEMBL2252721,CCOP(=O)(OCC)SCCCCCCCCN1C(=O)c2ccccc2C1=O,4500.0,intermediate
4,CHEMBL2252851,CCOP(=O)(OCC)SCCCCCCCN1C(=O)c2ccccc2C1=O,1840.0,intermediate
5,CHEMBL2252850,CCOP(=O)(OCC)SCCCCCCN1C(=O)c2ccccc2C1=O,3810.0,intermediate
6,CHEMBL2252849,CCOP(=O)(OCC)SCCCCCN1C(=O)c2ccccc2C1=O,12340.0,inactive
7,CHEMBL2252848,CCOP(=O)(OCC)SCCCCN1C(=O)c2ccccc2C1=O,9920.0,intermediate
8,CHEMBL2252847,CCOP(=O)(OCC)SCCCN1C(=O)c2ccccc2C1=O,8290.0,intermediate
9,CHEMBL2252846,CCOP(=O)(OCC)SCCCCCCCCCCSP(=O)(OCC)OCC,1760.0,intermediate


In [13]:
df5.to_csv('acetylcholinesterase_03_bioactivity_data_curated.csv', index=False)


! zip acetylcholinesterase.zip *.csv

  adding: acetylcholinesterase_01_bioactivity_data_raw.csv (deflated 85%)
  adding: acetylcholinesterase_02_bioactivity_data_preprocessed.csv (deflated 75%)
  adding: acetylcholinesterase_03_bioactivity_data_curated.csv (deflated 76%)
