# Computational Drug Discovery 

In [1]:
#Importing the necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

## Data Collection

### Searching for target protein (Dengue Virus)

In [2]:
# Display settings: lift the pandas limits on printed rows and columns
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

In [3]:
#Target search for dengue virus
# `new_client.target` is the target endpoint of the ChEMBL web-resource client;
# `.search(...)` queries it with the given text.
target = new_client.target
target_query = target.search("dengue virus")
# Tabulate the query results; each row of the output below is one matching target.
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Dengue virus,Dengue virus,24.0,False,CHEMBL613757,[],ORGANISM,12637
1,[],Dengue virus 1,Dengue virus 1,21.0,False,CHEMBL613360,[],ORGANISM,11053
2,[],Dengue virus 2,Dengue virus 2,21.0,False,CHEMBL613966,[],ORGANISM,11060
3,[],Dengue virus 3,Dengue virus 3,21.0,False,CHEMBL612717,[],ORGANISM,11069
4,[],Dengue virus 4,Dengue virus type 4,18.0,False,CHEMBL613728,[],ORGANISM,11070
5,"[{'xref_id': 'P29990', 'xref_name': None, 'xre...",Dengue virus type 2 (strain Thailand/16681/198...,Dengue virus type 2 NS3 protein,15.0,False,CHEMBL5980,"[{'accession': 'P29990', 'component_descriptio...",SINGLE PROTEIN,31634
6,[],Dengue virus,Nonstructural protein 5,14.0,False,CHEMBL4295629,"[{'accession': 'V5TFZ2', 'component_descriptio...",SINGLE PROTEIN,12637
7,[],Homo sapiens,Hepatitis A virus cellular receptor 2,14.0,False,CHEMBL4630879,"[{'accession': 'Q8TDQ0', 'component_descriptio...",SINGLE PROTEIN,9606
8,[],Homo sapiens,Sodium-dependent phosphate transporter 1,12.0,False,CHEMBL4295909,"[{'accession': 'Q8WUM9', 'component_descriptio...",SINGLE PROTEIN,9606
9,"[{'xref_id': 'P05412', 'xref_name': None, 'xre...",Homo sapiens,Proto-oncogene c-JUN,10.0,False,CHEMBL4977,"[{'accession': 'P05412', 'component_descriptio...",SINGLE PROTEIN,9606


### Selecting and retrieving bioactivity data of nonstructural protein 5 (seventh entry)

[Nonstructural protein 5](https://www.sciencedirect.com/topics/medicine-and-dentistry/nonstructural-protein-5) (NS5) is a protein encoded by the dengue virus RNA genome. Its N-terminal domain is a methyltransferase (MTase), while its C-terminal domain is the RNA-dependent RNA polymerase (RdRp).

In [4]:
#selecting the target protein to the variable "selected_target"
# Pick the target by name and organism instead of by row position: the rows
# come from a live search, so a hard-coded row number can silently point at a
# different target if the result order ever changes.
ns5_hits = targets.loc[
    (targets.pref_name == "Nonstructural protein 5")
    & (targets.organism == "Dengue virus"),
    "target_chembl_id",
]
# .iloc[0] raises IndexError if the search no longer returns this target,
# which is preferable to continuing with the wrong one.
selected_target = ns5_hits.iloc[0]
selected_target

'CHEMBL4295629'

We will only focus on the bioactivity, reported as the half maximal inhibitory concentration (IC50). IC50 indicates how much of a particular inhibitory substance (e.g. drug) is needed to inhibit, in vitro, a given biological process or biological component by 50%.
IC50 is given as nanomolar (nM) units.

In [5]:
#getting IC50 data of the target protein
# Restrict the activity endpoint to records for the selected target, then to
# measurements whose standard_type is "IC50".
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [6]:
#creating a dataframe from the filtered data + general information
df = pd.DataFrame.from_dict(res)  # tabulate the filtered activity records
df.info()  # column names, non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65 entries, 0 to 64
Data columns (total 45 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   activity_comment           2 non-null      object
 1   activity_id                65 non-null     int64 
 2   activity_properties        65 non-null     object
 3   assay_chembl_id            65 non-null     object
 4   assay_description          65 non-null     object
 5   assay_type                 65 non-null     object
 6   assay_variant_accession    0 non-null      object
 7   assay_variant_mutation     0 non-null      object
 8   bao_endpoint               65 non-null     object
 9   bao_format                 65 non-null     object
 10  bao_label                  65 non-null     object
 11  canonical_smiles           65 non-null     object
 12  data_validity_comment      17 non-null     object
 13  data_validity_description  17 non-null     object
 14  document_che

In [7]:
# Peek at the first five records to get a feel for the data
df.head(n=5)

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,18297472,[],CHEMBL4123978,Inhibition of Dengue virus 4 NS5 full length R...,B,,,BAO_0000190,BAO_0000357,single protein format,N#C[C@]1(O)C(n2ccc3c(N)ncnc32)O[C@H](COP(=O)(O...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '12.18', 'le': '0.27', 'lle': '7.60', ...",CHEMBL4126343,,CHEMBL4126343,6.47,False,http://www.openphacts.org/units/Nanomolar,3037407,=,1,True,=,,IC50,nM,,340.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,0.34
1,,18297475,[],CHEMBL4123975,Inhibition of Dengue virus 2 NS5 full length R...,B,,,BAO_0000190,BAO_0000357,single protein format,[N-]=[N+]=N[C@]1(COP(=O)(O)OP(=O)(O)OP(=O)(O)O...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '12.29', 'le': '0.28', 'lle': '8.01', ...",CHEMBL3417270,,CHEMBL3417270,6.44,False,http://www.openphacts.org/units/Nanomolar,3037406,=,1,True,=,,IC50,nM,,360.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,0.36
2,,18297488,[],CHEMBL4123982,Inhibition of Dengue virus RdRp activity using...,B,,,BAO_0000190,BAO_0000357,single protein format,C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '10.64', 'le': '0.24', 'lle': '7.41', ...",CHEMBL521487,,CHEMBL521487,5.3,False,http://www.openphacts.org/units/Nanomolar,3037397,=,1,True,=,,IC50,nM,,5000.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,5.0
3,,18297489,[],CHEMBL4123982,Inhibition of Dengue virus RdRp activity using...,B,,,BAO_0000190,BAO_0000357,single protein format,C#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '11.41', 'le': '0.26', 'lle': '8.30', ...",CHEMBL4127092,,CHEMBL4127092,5.8,False,http://www.openphacts.org/units/Nanomolar,3037398,=,1,True,=,,IC50,nM,,1600.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,1.6
4,,18297490,[],CHEMBL4123982,Inhibition of Dengue virus RdRp activity using...,B,,,BAO_0000190,BAO_0000357,single protein format,CC#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '10.91', 'le': '0.24', 'lle': '7.81', ...",CHEMBL4127819,,CHEMBL4127819,5.7,False,http://www.openphacts.org/units/Nanomolar,3037399,=,1,True,=,,IC50,nM,,2000.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,2.0


In [8]:
# Persist the unprocessed bioactivity records (row index not written)
df.to_csv(path_or_buf="bioactivity_data_raw.csv", index=False)

### Correcting missing data

In [9]:
# Keep only the records that report a standard_value.
# The row labels of df are preserved, so the index of df2 has gaps wherever a
# record was filtered out.
has_value = df["standard_value"].notna()
df2 = df.loc[has_value]
df2

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,18297472,[],CHEMBL4123978,Inhibition of Dengue virus 4 NS5 full length R...,B,,,BAO_0000190,BAO_0000357,single protein format,N#C[C@]1(O)C(n2ccc3c(N)ncnc32)O[C@H](COP(=O)(O...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '12.18', 'le': '0.27', 'lle': '7.60', ...",CHEMBL4126343,,CHEMBL4126343,6.47,False,http://www.openphacts.org/units/Nanomolar,3037407,=,1,True,=,,IC50,nM,,340.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,0.34
1,,18297475,[],CHEMBL4123975,Inhibition of Dengue virus 2 NS5 full length R...,B,,,BAO_0000190,BAO_0000357,single protein format,[N-]=[N+]=N[C@]1(COP(=O)(O)OP(=O)(O)OP(=O)(O)O...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '12.29', 'le': '0.28', 'lle': '8.01', ...",CHEMBL3417270,,CHEMBL3417270,6.44,False,http://www.openphacts.org/units/Nanomolar,3037406,=,1,True,=,,IC50,nM,,360.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,0.36
2,,18297488,[],CHEMBL4123982,Inhibition of Dengue virus RdRp activity using...,B,,,BAO_0000190,BAO_0000357,single protein format,C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '10.64', 'le': '0.24', 'lle': '7.41', ...",CHEMBL521487,,CHEMBL521487,5.3,False,http://www.openphacts.org/units/Nanomolar,3037397,=,1,True,=,,IC50,nM,,5000.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,5.0
3,,18297489,[],CHEMBL4123982,Inhibition of Dengue virus RdRp activity using...,B,,,BAO_0000190,BAO_0000357,single protein format,C#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '11.41', 'le': '0.26', 'lle': '8.30', ...",CHEMBL4127092,,CHEMBL4127092,5.8,False,http://www.openphacts.org/units/Nanomolar,3037398,=,1,True,=,,IC50,nM,,1600.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,1.6
4,,18297490,[],CHEMBL4123982,Inhibition of Dengue virus RdRp activity using...,B,,,BAO_0000190,BAO_0000357,single protein format,CC#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '10.91', 'le': '0.24', 'lle': '7.81', ...",CHEMBL4127819,,CHEMBL4127819,5.7,False,http://www.openphacts.org/units/Nanomolar,3037399,=,1,True,=,,IC50,nM,,2000.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,2.0
5,,18297491,[],CHEMBL4123982,Inhibition of Dengue virus RdRp activity using...,B,,,BAO_0000190,BAO_0000357,single protein format,C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '9.49', 'le': '0.22', 'lle': '5.87', '...",CHEMBL486231,GS-461203,CHEMBL486231,4.75,False,http://www.openphacts.org/units/Nanomolar,3037400,=,1,True,=,,IC50,nM,,18000.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,18.0
6,,18297492,[],CHEMBL4123982,Inhibition of Dengue virus RdRp activity using...,B,,,BAO_0000190,BAO_0000357,single protein format,C#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '9.41', 'le': '0.21', 'lle': '6.32', '...",CHEMBL4127921,,CHEMBL4127921,4.8,False,http://www.openphacts.org/units/Nanomolar,3037401,=,1,True,=,,IC50,nM,,15900.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,15.9
7,,18297493,[],CHEMBL4123982,Inhibition of Dengue virus RdRp activity using...,B,,,BAO_0000190,BAO_0000357,single protein format,CC#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,,CHEMBL4129313,,CHEMBL4129313,,False,http://www.openphacts.org/units/Nanomolar,3037402,>,1,True,>,,IC50,nM,,20000.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,20.0
8,,18297494,[],CHEMBL4123982,Inhibition of Dengue virus RdRp activity using...,B,,,BAO_0000190,BAO_0000357,single protein format,C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(O)...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '10.04', 'le': '0.23', 'lle': '6.99', ...",CHEMBL4127030,,CHEMBL4127030,5.18,False,http://www.openphacts.org/units/Nanomolar,3037403,=,1,True,=,,IC50,nM,,6600.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,6.6
9,,18297495,[],CHEMBL4123982,Inhibition of Dengue virus RdRp activity using...,B,,,BAO_0000190,BAO_0000357,single protein format,C#C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(...,,,CHEMBL4118212,Bioorg Med Chem Lett,2018,"{'bei': '11.76', 'le': '0.26', 'lle': '8.39', ...",CHEMBL4125966,,CHEMBL4125966,6.19,False,http://www.openphacts.org/units/Nanomolar,3037404,=,1,True,=,,IC50,nM,,650.0,CHEMBL4295629,Dengue virus,Nonstructural protein 5,12637,,,IC50,uM,UO_0000065,,0.65


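The rows displayed above all have a standard value. Note, however, that `df2` keeps the row labels of `df`: if the filter removes any row, the index of `df2` will contain gaps, so it is worth comparing `len(df)` with `len(df2)` before relying on row positions.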

###  Pre-processing of data

In this dataframe, bioactivity is represented as the standard value of IC50. A compound with a value of 1,000 nM or lower is considered **active**, while one with a value of 10,000 nM or higher is considered **inactive**. Values strictly between 1,000 and 10,000 nM are called **intermediate**.

In [10]:
# Assign a bioactivity label to each IC50 value in df2:
#   <= 1000 nM -> "active", >= 10000 nM -> "inactive", anything else -> "intermediate"
bioactivity_class = [
    "active" if float(ic50) <= 1000
    else "inactive" if float(ic50) >= 10000
    else "intermediate"
    for ic50 in df2.standard_value
]

In [11]:
# Keep just the compound id, its SMILES structure and the IC50 value
selection = ["molecule_chembl_id", "canonical_smiles", "standard_value"]
df3 = df2.loc[:, selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL4126343,N#C[C@]1(O)C(n2ccc3c(N)ncnc32)O[C@H](COP(=O)(O...,340.0
1,CHEMBL3417270,[N-]=[N+]=N[C@]1(COP(=O)(O)OP(=O)(O)OP(=O)(O)O...,360.0
2,CHEMBL521487,C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,5000.0
3,CHEMBL4127092,C#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,1600.0
4,CHEMBL4127819,CC#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,2000.0
5,CHEMBL486231,C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,18000.0
6,CHEMBL4127921,C#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,15900.0
7,CHEMBL4129313,CC#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,20000.0
8,CHEMBL4127030,C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(O)...,6600.0
9,CHEMBL4125966,C#C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(...,650.0


Before concatenating the bioactivity classes, it needs to be converted to a pandas series. It is currently a list, and the function pd.concat can only work with series and pandas dataframes.

In [12]:
#Converting the bioactivity class list to a pandas series
# The list holds one label per row of df2, so the series must carry df2's row
# labels. With a default 0..n-1 index, the index-aligned pd.concat further down
# pairs labels with the wrong molecules as soon as df2 has gaps in its index
# (i.e. whenever rows of df were filtered out).
# The series is then re-expanded to df's full index so the concatenated frame
# keeps one row per original record; records without a standard_value get NaN.
bioactivity_class = pd.Series(
    bioactivity_class, index=df2.index, name="bioactivity_class"
).reindex(df.index)

In [13]:
# Attach the class labels to df3 as an extra column (rows are matched on the index)
df4 = pd.concat(objs=[df3, bioactivity_class], axis="columns")
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL4126343,N#C[C@]1(O)C(n2ccc3c(N)ncnc32)O[C@H](COP(=O)(O...,340.0,active
1,CHEMBL3417270,[N-]=[N+]=N[C@]1(COP(=O)(O)OP(=O)(O)OP(=O)(O)O...,360.0,active
2,CHEMBL521487,C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,5000.0,intermediate
3,CHEMBL4127092,C#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,1600.0,intermediate
4,CHEMBL4127819,CC#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,2000.0,intermediate
5,CHEMBL486231,C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,18000.0,inactive
6,CHEMBL4127921,C#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,15900.0,inactive
7,CHEMBL4129313,CC#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,20000.0,inactive
8,CHEMBL4127030,C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(O)...,6600.0,intermediate
9,CHEMBL4125966,C#C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(...,650.0,active


In [14]:
# Distinct ChEMBL ids present in df4, used to spot duplicated molecules
number = df4["molecule_chembl_id"].unique()
number

array(['CHEMBL4126343', 'CHEMBL3417270', 'CHEMBL521487', 'CHEMBL4127092',
       'CHEMBL4127819', 'CHEMBL486231', 'CHEMBL4127921', 'CHEMBL4129313',
       'CHEMBL4127030', 'CHEMBL4125966', 'CHEMBL4127565', 'CHEMBL4282493',
       'CHEMBL4287909', 'CHEMBL4280007', 'CHEMBL4284511', 'CHEMBL4295172',
       'CHEMBL4281094', 'CHEMBL4291731', 'CHEMBL4277678', 'CHEMBL4288319',
       'CHEMBL4284513', 'CHEMBL4292407', 'CHEMBL4281097', 'CHEMBL4289005',
       'CHEMBL4291308', 'CHEMBL4280662', 'CHEMBL4287912', 'CHEMBL4277265',
       'CHEMBL4279581', 'CHEMBL4287491', 'CHEMBL2326938', 'CHEMBL3233235',
       'CHEMBL3233241', 'CHEMBL4289289', 'CHEMBL4281417', 'CHEMBL4522025',
       'CHEMBL4475329', nan, 'CHEMBL4578774', 'CHEMBL4454666',
       'CHEMBL4538799', 'CHEMBL4446745', 'CHEMBL4436568', 'CHEMBL4522746',
       'CHEMBL4437452', 'CHEMBL4547767', 'CHEMBL4465538', 'CHEMBL4436970',
       'CHEMBL4513879', 'CHEMBL4556728', 'CHEMBL3890313', 'CHEMBL3959384',
       'CHEMBL3899268', 'CHEMBL3976785'

There are currently 65 rows but only 62 unique ChEMBL IDs. Two of the rows have NaN instead of an ID, which leaves 63 rows with an ID, so exactly one ID must appear twice. This is CHEMBL1418094. We can therefore drop one of the two rows of CHEMBL1418094 (row 62).

In [15]:
# Remove the row labelled 62 from df4 (df4 itself is left unchanged)
df5 = df4.drop(index=[62])
df5

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL4126343,N#C[C@]1(O)C(n2ccc3c(N)ncnc32)O[C@H](COP(=O)(O...,340.0,active
1,CHEMBL3417270,[N-]=[N+]=N[C@]1(COP(=O)(O)OP(=O)(O)OP(=O)(O)O...,360.0,active
2,CHEMBL521487,C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,5000.0,intermediate
3,CHEMBL4127092,C#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,1600.0,intermediate
4,CHEMBL4127819,CC#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,2000.0,intermediate
5,CHEMBL486231,C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,18000.0,inactive
6,CHEMBL4127921,C#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,15900.0,inactive
7,CHEMBL4129313,CC#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,20000.0,inactive
8,CHEMBL4127030,C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(O)...,6600.0,intermediate
9,CHEMBL4125966,C#C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(...,650.0,active


We can additionally see that there are currently a few molecules with either NaN, or incorrect values for their bioactivity class. However, we can manually discern their bioactivity class from their standard values. We can proceed to add these in the dataframe. 

In [16]:
# Manually assigned classes, grouped by label and addressed by row label.
# The values are written through df5.loc[...] rather than
# df5.bioactivity_class[...]: the latter is chained assignment, which triggers
# SettingWithCopyWarning and leaves df5 unmodified once pandas Copy-on-Write
# is active.
manual_classes = {
    "inactive": [55, 56],
    "intermediate": [61],
    "active": [63, 64],
}
for class_name, row_labels in manual_classes.items():
    # A list of labels makes pandas raise KeyError if a row label is missing,
    # instead of silently adding a new row.
    df5.loc[row_labels, "bioactivity_class"] = class_name
df5

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL4126343,N#C[C@]1(O)C(n2ccc3c(N)ncnc32)O[C@H](COP(=O)(O...,340.0,active
1,CHEMBL3417270,[N-]=[N+]=N[C@]1(COP(=O)(O)OP(=O)(O)OP(=O)(O)O...,360.0,active
2,CHEMBL521487,C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,5000.0,intermediate
3,CHEMBL4127092,C#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,1600.0,intermediate
4,CHEMBL4127819,CC#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,2000.0,intermediate
5,CHEMBL486231,C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,18000.0,inactive
6,CHEMBL4127921,C#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,15900.0,inactive
7,CHEMBL4129313,CC#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,20000.0,inactive
8,CHEMBL4127030,C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(O)...,6600.0,intermediate
9,CHEMBL4125966,C#C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(...,650.0,active


In [17]:
#Removing missing data
# Build the mask from df5 itself. A mask taken from df4 still carries row
# label 62, which df5 no longer has, so pandas would have to re-align the mask
# (with a warning) before it can filter.
df6 = df5[df5.standard_value.notna()]
df6

  


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL4126343,N#C[C@]1(O)C(n2ccc3c(N)ncnc32)O[C@H](COP(=O)(O...,340.0,active
1,CHEMBL3417270,[N-]=[N+]=N[C@]1(COP(=O)(O)OP(=O)(O)OP(=O)(O)O...,360.0,active
2,CHEMBL521487,C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,5000.0,intermediate
3,CHEMBL4127092,C#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,1600.0,intermediate
4,CHEMBL4127819,CC#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,2000.0,intermediate
5,CHEMBL486231,C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,18000.0,inactive
6,CHEMBL4127921,C#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,15900.0,inactive
7,CHEMBL4129313,CC#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,20000.0,inactive
8,CHEMBL4127030,C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(O)...,6600.0,intermediate
9,CHEMBL4125966,C#C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(...,650.0,active


Finally, we can write the newly preprocessed data into a csv file.

In [18]:
# Persist the cleaned, labelled records (row index not written)
df6.to_csv(path_or_buf="bioactivity_data_preprocessed.csv", index=False)

In [19]:
# List the working directory to confirm that both CSV files were written.
! ls -l

total 696
-rwxrwxrwx 1 root root   6119 Nov 15 10:47 bioactivity_data_preprocessed.csv
-rwxrwxrwx 1 root root  39757 Nov 15 10:47 bioactivity_data_raw.csv
-rwxrwxrwx 1 root root 661884 Nov 15 10:44 Computational-Drug-Discovery.ipynb


We've now created the raw and preprocessed data of nonstructural protein 5.

## Exploratory Data Analysis

### Loading the preprocessed data

In [20]:
# Reload the cleaned records written by the data-collection part
new_df = pd.read_csv(filepath_or_buffer='bioactivity_data_preprocessed.csv')

### Installing conda and rdkit

[Conda](https://docs.conda.io/en/latest/) is an open-source package management system that assists in installing packages. [rdkit](https://github.com/rdkit/rdkit) is a collection of cheminformatics and machine-learning software written in C++ and Python; it allows you to compute molecular descriptors for the previously compiled data.

We will use these tools in this section.

- conda install -c conda-forge rdkit

### Calculating Lipinski descriptors

Christopher A. Lipinski devised a rule of thumb to evaluate the **druglikeness** of a chemical compound, i.e. whether the compound is likely to be a suitable, orally active drug in humans. This is known as [Lipinski's rule of five](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2728118/). The rule relates a few simple molecular properties to the pharmacokinetic profile of a compound, namely its absorption, distribution, metabolism, and excretion ("ADME"). It is called the rule of *five* because all of its cut-off values are multiples of five.

The Lipinski's Rule states the following:

- Molecular weight < 500 Dalton
- Octanol-water partition coefficient (LogP) ≤ 5
- Hydrogen bond donors ≤ 5
- Hydrogen bond acceptors ≤ 10

In [21]:
#Importing the required libraries
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

In [22]:
#Compute descriptors
def lipinski(smiles, verbose=False):
    """Compute the four Lipinski descriptors for each SMILES string.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per molecule (e.g. a pandas Series).
    verbose : bool, default False
        When True, print a message for every entry that could not be turned
        into a molecule.

    Returns
    -------
    pandas.DataFrame
        One row per input entry, in input order, with the float columns
        ``MW``, ``LogP``, ``NumHDonors`` and ``NumHAcceptors`` and a fresh
        0..n-1 index. Entries that are not strings, or that RDKit cannot
        parse, produce a row of NaN so that row positions stay aligned with
        the input. Empty input gives an empty frame with the four columns.
    """
    columnNames = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]

    # Collect plain rows and build the frame once. This also handles zero or
    # one molecule, where stacking arrays row by row yields the wrong shape.
    rows = []
    for position, elem in enumerate(smiles):
        # Chem.MolFromSmiles returns None for unparsable SMILES; missing
        # values (None/NaN) are not strings and are skipped the same way.
        mol = Chem.MolFromSmiles(elem) if isinstance(elem, str) else None
        if mol is None:
            if verbose:
                print(f"lipinski: entry {position} ({elem!r}) could not be parsed; "
                      "descriptors set to NaN")
            rows.append([np.nan] * len(columnNames))
            continue

        rows.append([Descriptors.MolWt(mol),
                     Descriptors.MolLogP(mol),
                     Lipinski.NumHDonors(mol),
                     Lipinski.NumHAcceptors(mol)])

    descriptors = pd.DataFrame(rows, columns=columnNames, dtype=float)

    return descriptors

In [23]:
# Compute the Lipinski descriptors for every SMILES string in df3.
# NOTE(review): the descriptors are computed from df3, but they are later
# concatenated column-wise with new_df (re-loaded from the preprocessed CSV).
# If df3 and new_df differ in row count or row order, descriptors end up next
# to the wrong molecule - TODO confirm, or compute from new_df.canonical_smiles.
df_lipinski = lipinski(df3.canonical_smiles)
df_lipinski

Unnamed: 0,MW,LogP,NumHDonors,NumHAcceptors
0,531.204,-1.13022,7.0,14.0
1,524.169,-1.574,7.0,14.0
2,498.167,-2.1108,7.0,12.0
3,508.162,-2.4975,7.0,12.0
4,522.189,-2.1074,7.0,12.0
5,500.158,-1.1336,6.0,11.0
6,510.153,-1.5203,6.0,11.0
7,524.18,-1.1302,6.0,11.0
8,516.157,-1.8136,7.0,12.0
9,526.152,-2.2003,7.0,12.0


We can now proceed with combining the two dataframes "df_lipinski" and "new_df" to obtain a better overview of the molecular data.

In [24]:
# Place the descriptor columns next to the molecule data (rows matched on the index)
df_combined = pd.concat(objs=[new_df, df_lipinski], axis="columns")

In [25]:
# Display the combined frame (molecule data plus Lipinski descriptors).
df_combined

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class,MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL4126343,N#C[C@]1(O)C(n2ccc3c(N)ncnc32)O[C@H](COP(=O)(O...,340.0,active,531.204,-1.13022,7.0,14.0
1,CHEMBL3417270,[N-]=[N+]=N[C@]1(COP(=O)(O)OP(=O)(O)OP(=O)(O)O...,360.0,active,524.169,-1.574,7.0,14.0
2,CHEMBL521487,C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,5000.0,intermediate,498.167,-2.1108,7.0,12.0
3,CHEMBL4127092,C#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,1600.0,intermediate,508.162,-2.4975,7.0,12.0
4,CHEMBL4127819,CC#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,2000.0,intermediate,522.189,-2.1074,7.0,12.0
5,CHEMBL486231,C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,18000.0,inactive,500.158,-1.1336,6.0,11.0
6,CHEMBL4127921,C#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,15900.0,inactive,510.153,-1.5203,6.0,11.0
7,CHEMBL4129313,CC#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,20000.0,inactive,524.18,-1.1302,6.0,11.0
8,CHEMBL4127030,C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(O)...,6600.0,intermediate,516.157,-1.8136,7.0,12.0
9,CHEMBL4125966,C#C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(...,650.0,active,526.152,-2.2003,7.0,12.0


In [26]:
# Remove the row labelled 62, described by the author as a copy of CHEMBL418052.
# NOTE(review): the duplicate named earlier in the notebook is CHEMBL1418094 -
# confirm which id is meant.
df_combined2 = df_combined.drop(index=[62])
df_combined2

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class,MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL4126343,N#C[C@]1(O)C(n2ccc3c(N)ncnc32)O[C@H](COP(=O)(O...,340.0,active,531.204,-1.13022,7.0,14.0
1,CHEMBL3417270,[N-]=[N+]=N[C@]1(COP(=O)(O)OP(=O)(O)OP(=O)(O)O...,360.0,active,524.169,-1.574,7.0,14.0
2,CHEMBL521487,C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,5000.0,intermediate,498.167,-2.1108,7.0,12.0
3,CHEMBL4127092,C#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,1600.0,intermediate,508.162,-2.4975,7.0,12.0
4,CHEMBL4127819,CC#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,2000.0,intermediate,522.189,-2.1074,7.0,12.0
5,CHEMBL486231,C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,18000.0,inactive,500.158,-1.1336,6.0,11.0
6,CHEMBL4127921,C#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,15900.0,inactive,510.153,-1.5203,6.0,11.0
7,CHEMBL4129313,CC#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,20000.0,inactive,524.18,-1.1302,6.0,11.0
8,CHEMBL4127030,C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(O)...,6600.0,intermediate,516.157,-1.8136,7.0,12.0
9,CHEMBL4125966,C#C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(...,650.0,active,526.152,-2.2003,7.0,12.0


### Converting IC50 to pIC50

[pIC50](https://www.collaborativedrug.com/what-is-pic50-2/) is the negative base-10 logarithm of the IC50 expressed in molar (M) units: $\mathrm{pIC50} = -\log_{10}(\mathrm{IC50}_{\mathrm{M}})$. We perform this conversion in order to make the IC50 data more uniformly distributed.

Here, we will utilize the custom function pIC50(). It  will accept a DataFrame as input, followed by:
- Taking the IC50 values from the standard_value column and converting it from nM to M by multiplying the value by 10^-9
- Taking the molar value and applying -log10
- Deleting the standard_value column and creating a new pIC50 column

In [27]:
def pIC50(input):
    """Convert the IC50 values in ``standard_value`` (nM) into a ``pIC50`` column.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame with a ``standard_value`` column holding IC50 values in
        nanomolar. Numeric strings are accepted and converted to numbers.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``standard_value`` and with a ``pIC50`` column
        (``-log10`` of the IC50 in molar units). The frame passed in is not
        modified.

    Raises
    ------
    KeyError
        If ``input`` has no ``standard_value`` column.
    ValueError
        If ``standard_value`` contains entries that cannot be parsed as numbers.
    """
    ic50_nM = pd.to_numeric(input['standard_value'])
    molar = ic50_nM * (10**-9)  # Converts nM to M

    # assign() returns a copy, so the caller's frame is left untouched (writing
    # into a filtered frame is what raises SettingWithCopyWarning).
    x = input.assign(pIC50=-np.log10(molar))

    # `columns=` instead of a positional axis: pandas 2.0 made every argument
    # of DataFrame.drop except `labels` keyword-only.
    return x.drop(columns='standard_value')

One thing to note, however: since pIC50 = 9 − log10(IC50 in nM), an IC50 greater than 1,000,000,000 nM (i.e. 1 M) would make the pIC50 negative. To stay well clear of this, values are commonly capped at 100,000,000 nM (pIC50 = 1) before the conversion.

In [28]:
# Summary statistics of the IC50 values, to check their range before converting
df_combined2["standard_value"].describe()

count        62.000000
mean      94504.790323
std      146546.895941
min          16.000000
25%        2650.000000
50%       14950.000000
75%      135500.000000
max      452000.000000
Name: standard_value, dtype: float64

This dataset seems to lack any values above 100,000,000. We can therefore omit the capping process, and proceed with the conversion of IC50 to pIC50.

In [29]:
# Replace the standard_value column by a pIC50 column using the pIC50() helper.
df_final = pIC50(df_combined2)
df_final

  if __name__ == '__main__':


Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL4126343,N#C[C@]1(O)C(n2ccc3c(N)ncnc32)O[C@H](COP(=O)(O...,active,531.204,-1.13022,7.0,14.0,6.468521
1,CHEMBL3417270,[N-]=[N+]=N[C@]1(COP(=O)(O)OP(=O)(O)OP(=O)(O)O...,active,524.169,-1.574,7.0,14.0,6.443697
2,CHEMBL521487,C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,intermediate,498.167,-2.1108,7.0,12.0,5.30103
3,CHEMBL4127092,C#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,intermediate,508.162,-2.4975,7.0,12.0,5.79588
4,CHEMBL4127819,CC#C[C@@]1(O)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,intermediate,522.189,-2.1074,7.0,12.0,5.69897
5,CHEMBL486231,C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,inactive,500.158,-1.1336,6.0,11.0,4.744727
6,CHEMBL4127921,C#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,inactive,510.153,-1.5203,6.0,11.0,4.798603
7,CHEMBL4129313,CC#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,inactive,524.18,-1.1302,6.0,11.0,4.69897
8,CHEMBL4127030,C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(O)...,intermediate,516.157,-1.8136,7.0,12.0,5.180456
9,CHEMBL4125966,C#C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(...,active,526.152,-2.2003,7.0,12.0,6.187087


In [30]:
# Summary statistics of the numeric columns of df_final.
df_final.describe()

Unnamed: 0,MW,LogP,NumHDonors,NumHAcceptors,pIC50
count,62.0,62.0,62.0,62.0,62.0
mean,491.807984,3.157621,3.564516,7.016129,4.843798
std,51.044148,2.640248,2.280502,2.877225,1.052163
min,338.341,-2.4975,0.0,2.0,3.344862
25%,461.883,3.13144,1.0,5.0,3.87088
50%,500.507,4.1083,4.0,6.0,4.826062
75%,523.674,4.8991,5.0,8.0,5.57698
max,643.596,7.0895,7.0,14.0,7.79588


In [31]:
# Column dtypes and non-null counts of df_final.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62 entries, 0 to 61
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   molecule_chembl_id  62 non-null     object 
 1   canonical_smiles    62 non-null     object 
 2   bioactivity_class   62 non-null     object 
 3   MW                  62 non-null     float64
 4   LogP                62 non-null     float64
 5   NumHDonors          62 non-null     float64
 6   NumHAcceptors       62 non-null     float64
 7   pIC50               62 non-null     float64
dtypes: float64(5), object(3)
memory usage: 4.4+ KB


To allow a more simple comparison between active and inactive compounds, we are going to remove the "intermediate" bioactivity class. 

In [32]:
# Keep every compound whose class is not "intermediate"
is_intermediate = df_final["bioactivity_class"] == 'intermediate'
df_2class = df_final.loc[~is_intermediate]
df_2class

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL4126343,N#C[C@]1(O)C(n2ccc3c(N)ncnc32)O[C@H](COP(=O)(O...,active,531.204,-1.13022,7.0,14.0,6.468521
1,CHEMBL3417270,[N-]=[N+]=N[C@]1(COP(=O)(O)OP(=O)(O)OP(=O)(O)O...,active,524.169,-1.574,7.0,14.0,6.443697
5,CHEMBL486231,C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)OP...,inactive,500.158,-1.1336,6.0,11.0,4.744727
6,CHEMBL4127921,C#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O)...,inactive,510.153,-1.5203,6.0,11.0,4.798603
7,CHEMBL4129313,CC#C[C@@]1(F)[C@H](O)[C@@H](COP(=O)(O)OP(=O)(O...,inactive,524.18,-1.1302,6.0,11.0,4.69897
9,CHEMBL4125966,C#C[C@@]1(O)[C@H](O)[C@@](F)(COP(=O)(O)OP(=O)(...,active,526.152,-2.2003,7.0,12.0,6.187087
11,CHEMBL4282493,COc1cc(C)c(C(=O)NS(=O)(=O)c2cccc3cccnc23)cc1-c...,active,492.578,3.74282,2.0,7.0,7.79588
12,CHEMBL4287909,O=C(OC1=CS(=O)(=O)Nc2ccc(Oc3ccccc3Br)cc21)c1cc...,inactive,472.316,5.1521,1.0,5.0,4.511449
17,CHEMBL4291731,COc1cccc(Oc2ccc3c(c2)C(OC(=O)c2ccccc2)=CS(=O)(...,inactive,423.446,4.3982,1.0,6.0,4.692504
18,CHEMBL4277678,CC(C)(C)c1cccc(Oc2ccc3c(c2)C(OC(=O)c2ccccc2)=C...,inactive,449.528,5.6871,1.0,5.0,4.801343
