# Data acquisition and cleaning - Dopamine D2 receptor

In [8]:
import os
import sys
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
# Raise pandas' display limits so the wide ChEMBL activity tables are easier
# to inspect in notebook output.
for option_name, option_value in (
    ('display.max_columns', 500),
    ('display.max_rows', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [43]:
# Load data
from chembl_webresource_client.new_client import new_client

# Search ChEMBL for the Dopamine D2 receptor (gene symbol DRD2), restricted to
# human, single-protein targets.
target = new_client.target.search("DRD2").filter(organism='Homo sapiens').filter(target_type='SINGLE PROTEIN')

# Fail with a clear message instead of an opaque indexing error when the
# search comes back empty (service unavailable, query changed, ...).
if not target:
    raise LookupError(
        "ChEMBL target search for 'DRD2' returned no Homo sapiens SINGLE PROTEIN target"
    )

# The first search hit is taken as the receptor of interest.
target_id = target[0]['target_chembl_id']  # Extract ChEMBL ID

# Fetch bioactivity data for this target.
# NOTE(review): the client's query sets appear to be lazy, so the records are
# presumably only downloaded once `bioactivities` is iterated — confirm.
bioactivities = new_client.activity.filter(target_chembl_id=target_id)


In [90]:
# Materialize the ChEMBL activity records into a DataFrame and display it.
# NOTE(review): `bioactivities` looks like a lazy query set, so this is
# presumably the step that actually downloads the records and may be slow —
# confirm, and consider caching the result to disk.
data = pd.DataFrame.from_dict(bioactivities)
data

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,32111,[],CHEMBL671073,Binding affinity to cloned human Dopamine rece...,B,,,BAO_0000190,BAO_0000219,cell-based format,c1cnc(N2CCN(Cc3cccc4c3Cc3ccccc3-4)CC2)nc1,,,CHEMBL1130912,Bioorg Med Chem Lett,1998.0,"{'bei': '14.63', 'le': '0.26', 'lle': '1.64', ...",CHEMBL303519,,CHEMBL303519,5.01,0,http://www.openphacts.org/units/Nanomolar,108659,=,1,1,=,,IC50,nM,,9800.0,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,IC50,nM,UO_0000065,,9800.0
1,,,33282,[],CHEMBL671073,Binding affinity to cloned human Dopamine rece...,B,,,BAO_0000190,BAO_0000219,cell-based format,COc1ccc(-c2cccc(CN3CCN(c4ncccn4)CC3)c2)cc1,,,CHEMBL1130912,Bioorg Med Chem Lett,1998.0,"{'bei': '20.25', 'le': '0.37', 'lle': '3.83', ...",CHEMBL292943,,CHEMBL292943,7.30,0,http://www.openphacts.org/units/Nanomolar,108665,=,1,1,=,,IC50,nM,,50.0,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,IC50,nM,UO_0000065,,50.0
2,,,33812,[],CHEMBL670962,Inhibitor constant of compound for high affini...,B,,,BAO_0000192,BAO_0000357,single protein format,NC(=O)[C@H]1CS[C@@H]2CC[C@]3(CCCN3C(=O)[C@@H]3...,,,CHEMBL1132286,J Med Chem,1999.0,"{'bei': '28.87', 'le': '0.58', 'lle': '10.52',...",CHEMBL156651,,CHEMBL156651,10.17,0,http://www.openphacts.org/units/Nanomolar,309100,=,1,1,=,,Ki,nM,,0.067,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,Ki,nM,UO_0000065,,0.067
3,,,33813,[],CHEMBL666181,Inhibitor constant of compound for low affinit...,B,,,BAO_0000192,BAO_0000357,single protein format,NC(=O)[C@H]1CS[C@@H]2CC[C@]3(CCCN3C(=O)[C@@H]3...,,,CHEMBL1132286,J Med Chem,1999.0,"{'bei': '19.07', 'le': '0.38', 'lle': '7.07', ...",CHEMBL156651,,CHEMBL156651,6.72,0,http://www.openphacts.org/units/Nanomolar,309100,=,1,1,=,,Ki,nM,,190.0,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,Ki,nM,UO_0000065,,190.0
4,,,33814,[],CHEMBL666184,Percent of Dopamine receptor D2 in high affini...,B,,,BAO_0000179,BAO_0000357,single protein format,NC(=O)[C@H]1CS[C@@H]2CC[C@]3(CCCN3C(=O)[C@@H]3...,,,CHEMBL1132286,J Med Chem,1999.0,,CHEMBL156651,,CHEMBL156651,,0,http://qudt.org/vocab/unit#Percent,309100,=,1,0,=,,RH,%,,53.0,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,RH,%,UO_0000187,,53.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24624,,% Of Inhibition,25788253,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5477509,Selectivity interaction (GPCR panel (PDSP scre...,B,,,BAO_0000179,BAO_0000357,single protein format,Nc1cnc(N2CCc3cc(S(=O)(=O)Nc4c(F)cc(Cl)cc4F)cc(...,,,CHEMBL5465560,,2024.0,,CHEMBL4585920,TP-020n,CHEMBL4585920,,0,http://qudt.org/vocab/unit#Percent,4018673,=,65,0,=,,% of inhibition,%,,-0.42,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,% of inhibition,%,UO_0000187,,-0.42
24625,,,25790573,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5479826,Selectivity interaction (GPCR panel (PDSP scre...,B,,,BAO_0000192,BAO_0000357,single protein format,CCC(F)(F)c1cc(N2CCC(S(C)(=O)=O)CC2)nc2sc(C(N)=...,,,CHEMBL5465560,,2024.0,"{'bei': '14.95', 'le': '0.32', 'lle': '3.98', ...",CHEMBL4522930,BI-605906,CHEMBL4522930,6.47,0,http://www.openphacts.org/units/Nanomolar,4018703,=,65,1,=,,Ki,nM,,341.44,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,Ki,nM,UO_0000065,,341.44
24626,,% Of Inhibition,25790574,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5479827,Selectivity interaction (GPCR panel (PDSP scre...,B,,,BAO_0000179,BAO_0000357,single protein format,CCC(F)(F)c1cc(N2CCC(S(C)(=O)=O)CC2)nc2sc(C(N)=...,,,CHEMBL5465560,,2024.0,,CHEMBL4522930,BI-605906,CHEMBL4522930,,0,http://qudt.org/vocab/unit#Percent,4018703,=,65,0,=,,% of inhibition,%,,69.76,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,% of inhibition,%,UO_0000187,,69.76
24627,,% Of Inhibition,25792085,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5481012,Selectivity interaction (CEREP panel (binding ...,B,,,BAO_0000179,BAO_0000357,single protein format,Cc1n[nH]c2ncc(C(=O)N3CCC[C@H]3c3ccc(Cl)cc3)cc12,,,CHEMBL5465560,,2024.0,,CHEMBL3903492,,CHEMBL3903492,,0,http://qudt.org/vocab/unit#Percent,4018708,=,65,0,=,,% of inhibition,%,,54.0,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,% of inhibition,%,UO_0000187,,54.0


In [45]:
# List the available columns to decide which fields to keep for modelling.
data.columns

Index(['action_type', 'activity_comment', 'activity_id', 'activity_properties', 'assay_chembl_id', 'assay_description', 'assay_type', 'assay_variant_accession', 'assay_variant_mutation', 'bao_endpoint', 'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment', 'data_validity_description', 'document_chembl_id', 'document_journal', 'document_year', 'ligand_efficiency', 'molecule_chembl_id', 'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value', 'potential_duplicate', 'qudt_units', 'record_id', 'relation', 'src_id', 'standard_flag', 'standard_relation', 'standard_text_value', 'standard_type', 'standard_units', 'standard_upper_value', 'standard_value', 'target_chembl_id', 'target_organism', 'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type', 'units', 'uo_units', 'upper_value', 'value'], dtype='object')

In [91]:
# Drop records that are unusable downstream: a row is removed only when it is
# missing `standard_value` (the measurement) or `canonical_smiles` (the
# structure). Nulls in any other column are intentionally kept.
# The frame is reassigned rather than mutated in place, and the number of
# dropped rows is computed so it stays correct when the data is re-downloaded.
n_rows_before = len(data)
data = data.dropna(subset=['standard_value', 'canonical_smiles'])
print(f"Dropped {n_rows_before - len(data)} of {n_rows_before} rows with missing value or SMILES")

In [109]:
# Preview the first rows of the table (head() shows five rows by default).
data.head()

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,32111,[],CHEMBL671073,Binding affinity to cloned human Dopamine rece...,B,,,BAO_0000190,BAO_0000219,cell-based format,c1cnc(N2CCN(Cc3cccc4c3Cc3ccccc3-4)CC2)nc1,,,CHEMBL1130912,Bioorg Med Chem Lett,1998.0,"{'bei': '14.63', 'le': '0.26', 'lle': '1.64', ...",CHEMBL303519,,CHEMBL303519,5.01,0,http://www.openphacts.org/units/Nanomolar,108659,=,1,1,=,,IC50,nM,,9800.0,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,IC50,nM,UO_0000065,,9800.0
1,,,33282,[],CHEMBL671073,Binding affinity to cloned human Dopamine rece...,B,,,BAO_0000190,BAO_0000219,cell-based format,COc1ccc(-c2cccc(CN3CCN(c4ncccn4)CC3)c2)cc1,,,CHEMBL1130912,Bioorg Med Chem Lett,1998.0,"{'bei': '20.25', 'le': '0.37', 'lle': '3.83', ...",CHEMBL292943,,CHEMBL292943,7.3,0,http://www.openphacts.org/units/Nanomolar,108665,=,1,1,=,,IC50,nM,,50.0,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,IC50,nM,UO_0000065,,50.0
2,,,33812,[],CHEMBL670962,Inhibitor constant of compound for high affini...,B,,,BAO_0000192,BAO_0000357,single protein format,NC(=O)[C@H]1CS[C@@H]2CC[C@]3(CCCN3C(=O)[C@@H]3...,,,CHEMBL1132286,J Med Chem,1999.0,"{'bei': '28.87', 'le': '0.58', 'lle': '10.52',...",CHEMBL156651,,CHEMBL156651,10.17,0,http://www.openphacts.org/units/Nanomolar,309100,=,1,1,=,,Ki,nM,,0.067,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,Ki,nM,UO_0000065,,0.067
3,,,33813,[],CHEMBL666181,Inhibitor constant of compound for low affinit...,B,,,BAO_0000192,BAO_0000357,single protein format,NC(=O)[C@H]1CS[C@@H]2CC[C@]3(CCCN3C(=O)[C@@H]3...,,,CHEMBL1132286,J Med Chem,1999.0,"{'bei': '19.07', 'le': '0.38', 'lle': '7.07', ...",CHEMBL156651,,CHEMBL156651,6.72,0,http://www.openphacts.org/units/Nanomolar,309100,=,1,1,=,,Ki,nM,,190.0,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,Ki,nM,UO_0000065,,190.0
4,,,33814,[],CHEMBL666184,Percent of Dopamine receptor D2 in high affini...,B,,,BAO_0000179,BAO_0000357,single protein format,NC(=O)[C@H]1CS[C@@H]2CC[C@]3(CCCN3C(=O)[C@@H]3...,,,CHEMBL1132286,J Med Chem,1999.0,,CHEMBL156651,,CHEMBL156651,,0,http://qudt.org/vocab/unit#Percent,309100,=,1,0,=,,RH,%,,53.0,CHEMBL217,Homo sapiens,Dopamine D2 receptor,9606,,,RH,%,UO_0000187,,53.0


In [93]:
## Saving to csv
# `to_csv` does not create missing parent folders and raises OSError when the
# directory is absent, so make sure `./data` exists before writing.
os.makedirs('./data', exist_ok=True)
data.to_csv('./data/Dopamine_D2_receptor.csv', index=False)

In [112]:
# Load in dataset
# Re-read the activity table from the CSV path written earlier in this notebook.
# low_memory=False asks pandas to process the file as a whole instead of in
# chunks, which avoids mixed-type inference within a column.
df = pd.read_csv('./data/Dopamine_D2_receptor.csv', low_memory=False)

In [113]:
# Restrict the table to the bioactivity measurements of interest, reported in nM.
valid_units = ["nM"]  # only nanomolar-denominated rows are kept
valid_types = ["IC50", "Ki", "Kd", "pChEMBL Value"]
# NOTE(review): pChEMBL values live in their own `pchembl_value` column in this
# table; confirm "pChEMBL Value" ever occurs as a `standard_type`.

# Build one boolean mask per criterion and keep the rows that satisfy both.
type_is_valid = df['standard_type'].isin(valid_types)
unit_is_valid = df['standard_units'].isin(valid_units)
df_filtered = df[type_is_valid & unit_is_valid]

In [114]:
# Keep only the fields needed for the RDKit descriptor step: compound id,
# structure, measurement type and pChEMBL value.
# NOTE(review): the displayed output shows rows with a missing `pchembl_value`
# are still present here — confirm that is intended for downstream use.
rdkit_columns = [
    'molecule_chembl_id',
    'canonical_smiles',
    'standard_type',
    'pchembl_value',
]
df_rdkit = df_filtered[rdkit_columns]
df_rdkit

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_type,pchembl_value
0,CHEMBL303519,c1cnc(N2CCN(Cc3cccc4c3Cc3ccccc3-4)CC2)nc1,IC50,5.01
1,CHEMBL292943,COc1ccc(-c2cccc(CN3CCN(c4ncccn4)CC3)c2)cc1,IC50,7.30
2,CHEMBL156651,NC(=O)[C@H]1CS[C@@H]2CC[C@]3(CCCN3C(=O)[C@@H]3...,Ki,10.17
3,CHEMBL156651,NC(=O)[C@H]1CS[C@@H]2CC[C@]3(CCCN3C(=O)[C@@H]3...,Ki,6.72
6,CHEMBL156651,NC(=O)[C@H]1CS[C@@H]2CC[C@]3(CCCN3C(=O)[C@@H]3...,Ki,9.71
...,...,...,...,...
21072,CHEMBL5465167,Cc1noc(C)c1-c1cnc2cc(C(=O)NCCC[n+]3ccc(N(C)C)c...,IC50,
21077,CHEMBL2017291,COc1cc2c(cc1-c1c(C)noc1C)ncc1[nH]c(=O)n([C@H](...,IC50,
21085,CHEMBL3643413,CCC(=O)N1CC[C@H](Nc2ncnc3c2CN(c2cnc(OC)c(C(F)(...,IC50,
21087,CHEMBL4522930,CCC(F)(F)c1cc(N2CCC(S(C)(=O)=O)CC2)nc2sc(C(N)=...,Ki,6.47


In [None]:
# ## (Disabled) Convert the standard values into activity classes for a classification model.
# bioactivity_class =[]
# for x in data.standard_value:
#     if float(x) >= 10000:
#         bioactivity_class.append('inactive')
#     elif float(x) < 1000:
#         bioactivity_class.append('active')
#     else:
#         bioactivity_class.append('min effect')

In [115]:
## Saving to csv
# `to_csv` does not create missing parent folders and raises OSError when the
# directory is absent, so make sure `./data` exists before writing.
os.makedirs('./data', exist_ok=True)
df_rdkit.to_csv('./data/Dopamine_D2_rdkit.csv', index=False)

## Generate molecular descriptors with RDKit

In [117]:
from rdkit import Chem
import rdkit