In [4]:
! pip install chembl_webresource_client



### import libraries

In [5]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

### search for target protein 

In [43]:
target = new_client.target
target_query = target.search('aromatase')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P11511', 'xref_name': None, 'xre...",Homo sapiens,Cytochrome P450 19A1,20.0,False,CHEMBL1978,"[{'accession': 'P11511', 'component_descriptio...",SINGLE PROTEIN,9606
1,"[{'xref_id': 'P22443', 'xref_name': None, 'xre...",Rattus norvegicus,Cytochrome P450 19A1,20.0,False,CHEMBL3859,"[{'accession': 'P22443', 'component_descriptio...",SINGLE PROTEIN,10116


### select bioactivity data for 'SARS coronavirus 3C-like proteinase' (index=6)

In [7]:
selected_target = targets.target_chembl_id[6]
selected_target

'CHEMBL3927'

In [11]:
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type='IC50')
res[0]

{'action_type': None,
 'activity_comment': None,
 'activity_id': 1480935,
 'activity_properties': [],
 'assay_chembl_id': 'CHEMBL829584',
 'assay_description': 'In vitro inhibitory concentration against SARS coronavirus main protease (SARS CoV 3C-like protease)',
 'assay_type': 'B',
 'assay_variant_accession': None,
 'assay_variant_mutation': None,
 'bao_endpoint': 'BAO_0000190',
 'bao_format': 'BAO_0000357',
 'bao_label': 'single protein format',
 'canonical_smiles': 'Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21',
 'data_validity_comment': None,
 'data_validity_description': None,
 'document_chembl_id': 'CHEMBL1139624',
 'document_journal': 'Bioorg Med Chem Lett',
 'document_year': 2005,
 'ligand_efficiency': {'bei': '18.28',
  'le': '0.33',
  'lle': '3.25',
  'sei': '5.90'},
 'molecule_chembl_id': 'CHEMBL187579',
 'molecule_pref_name': None,
 'parent_molecule_chembl_id': 'CHEMBL187579',
 'pchembl_value': '5.14',
 'potential_duplicate': 0,
 'qudt_units': 'http://www.openphacts.org/units/Nan

In [9]:
df = pd.DataFrame.from_dict(res)

In [17]:
df.head(3)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,7.2
1,,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,9.4
2,,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.5


In [24]:
df['standard_value']
# standard_value: drug potency

0       7200.0
1       9400.0
2      13500.0
3      13110.0
4       2000.0
        ...   
128    10600.0
129    10100.0
130    11500.0
131    10700.0
132    78900.0
Name: standard_value, Length: 133, dtype: object

In [13]:
df.standard_type.unique() #check elements of specific column

array(['IC50'], dtype=object)

In [18]:
df.to_csv('bioactivity_data.csv', index=False)

### handling missing data

In [21]:
all(df.standard_value.notna())

True

In [22]:
df2 = df[df.standard_value.notna()]

no missing data

### data preprocessing  
- data is in IC50 unit.
- value less than 1000nM: active
- value greater than 10000 nM: inactive
- between 1000 nM and 10000 nM:  intermediate

In [26]:
bioactivity_class = []
for value in df2.standard_value:
    value = float(value)
    if value <= 1000:
        bioactivity_class.append('active')
    elif value >= 10000:
        bioactivity_class.append('inactive')
    else:
        bioactivity_class.append('intermediate')

In [39]:
ids = [id for id in df2.molecule_chembl_id]
smiles = [smile for smile in df2.canonical_smiles]
values = [value for value in df2.standard_value]

construct new dataframe for analysis

In [41]:
datas = {'molecule_chembl_id': ids,
         'canonical_smiles': smiles,
         'bioactivity_class': bioactivity_class,
         'standard_value': values} 
    
df3 = pd.DataFrame(datas)
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,standard_value
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,intermediate,7200.0
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,intermediate,9400.0
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,inactive,13500.0
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,inactive,13110.0
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],intermediate,2000.0
...,...,...,...,...
128,CHEMBL2146517,COC(=O)[C@@]1(C)CCCc2c1ccc1c2C(=O)C(=O)c2c(C)c...,inactive,10600.0
129,CHEMBL187460,C[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)C,inactive,10100.0
130,CHEMBL363535,Cc1coc2c1C(=O)C(=O)c1c-2ccc2c(C)cccc12,inactive,11500.0
131,CHEMBL227075,Cc1cccc2c3c(ccc12)C1=C(C(=O)C3=O)[C@@H](C)CO1,inactive,10700.0


In [42]:
df3.to_csv('bioactivity_preprocessed_data.csv', index=False)