## **Installing libraries**

In [1]:
! pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.7-py3-none-any.whl (55 kB)
[?25l[K     |██████                          | 10 kB 16.4 MB/s eta 0:00:01[K     |███████████▉                    | 20 kB 15.0 MB/s eta 0:00:01[K     |█████████████████▊              | 30 kB 10.2 MB/s eta 0:00:01[K     |███████████████████████▋        | 40 kB 8.8 MB/s eta 0:00:01[K     |█████████████████████████████▌  | 51 kB 5.4 MB/s eta 0:00:01[K     |████████████████████████████████| 55 kB 2.1 MB/s 
Collecting requests-cache~=0.7.0
  Downloading requests_cache-0.7.4-py3-none-any.whl (38 kB)
Collecting pyyaml>=5.4
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 11.2 MB/s 
[?25hCollecting url-normalize<2.0,>=1.4
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Collecting itsdangerous>=2.0.1
  Downloading itsdangerous-2.0.1-py3-none-any.whl (18 kB)
Installing collected packages: u

## **Importing libraries**

In [2]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Search for Target protein**

### **Target search for PD-L1**

In [3]:
# Target search for PD-L1 protein
# Handle for the ChEMBL client's target resource.
target = new_client.target
# Free-text search: the hits listed below also include unrelated entries
# (e.g. "Major capsid protein L1", "Cathepsin L1"), so inspect the table
# before choosing a row.
target_query = target.search('PD-L1')
# Turn the search results into a DataFrame, one row per matching target.
targets = pd.DataFrame.from_dict(target_query)
# Last expression of the cell: render the table of candidate targets.
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,Programmed cell death 1 ligand 1,21.0,False,CHEMBL3580522,"[{'accession': 'Q9NZQ7', 'component_descriptio...",SINGLE PROTEIN,9606.0
1,[],Homo sapiens,Programmed cell death protein 1/Programmed cel...,21.0,False,CHEMBL4523993,"[{'accession': 'Q15116', 'component_descriptio...",PROTEIN COMPLEX,9606.0
2,[],Mus musculus,Programmed cell death protein 1/Programmed cel...,19.0,False,CHEMBL4630756,"[{'accession': 'Q9EP73', 'component_descriptio...",PROTEIN COMPLEX,10090.0
3,[],Mus musculus,Programmed cell death 1 ligand 1,18.0,False,CHEMBL4523448,"[{'accession': 'Q9EP73', 'component_descriptio...",SINGLE PROTEIN,10090.0
4,[],,PD-145,17.0,False,CHEMBL614845,[],CELL-LINE,
5,[],Human papillomavirus type 16,Major capsid protein L1,15.0,False,CHEMBL3562172,"[{'accession': 'P03101', 'component_descriptio...",SINGLE PROTEIN,333760.0
6,[],Human papillomavirus type 58,Major capsid protein L1,15.0,False,CHEMBL3562173,"[{'accession': 'P26535', 'component_descriptio...",SINGLE PROTEIN,10598.0
7,[],Human papillomavirus type 18,Major capsid protein L1,15.0,False,CHEMBL3562174,"[{'accession': 'P06794', 'component_descriptio...",SINGLE PROTEIN,333761.0
8,[],Mus musculus,3T3-L1,14.0,False,CHEMBL614510,[],CELL-LINE,10090.0
9,"[{'xref_id': 'P06797', 'xref_name': None, 'xre...",Mus musculus,Cathepsin L1,14.0,False,CHEMBL5291,"[{'accession': 'P06797', 'component_descriptio...",SINGLE PROTEIN,10090.0


### **Select and retrieve bioactivity data for *Programmed Death 1 Ligand 1* in Homo sapiens (first entry)**

We will assign the first entry (which corresponds to the target protein, *PD-L1 in Homo sapiens*) to the ***selected_target*** variable

In [4]:
# Take the ChEMBL ID stored in the row labelled 0 of the search results
# (the human "Programmed cell death 1 ligand 1" entry in the table above).
selected_target = targets.loc[0, 'target_chembl_id']
selected_target

'CHEMBL3580522'

Here, we will retrieve only the bioactivity data for *PD-L1* (CHEMBL3580522) that are reported as IC$_{50}$ values in nanomolar (nM) units. IC$_{50}$ is a measure of antagonist drug potency used in pharmacological research.

In [5]:
# Handle for the ChEMBL client's activity resource.
activity = new_client.activity
# Keep only activities recorded against the selected target whose standard_type is IC50.
# NOTE(review): nothing here filters on units; the nM values seen below come from the
# standard_units column of the returned records — confirm if other units can appear.
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [6]:
# Turn the filtered activity records into a DataFrame, one row per activity.
# NOTE(review): presumably this is the step that actually pulls the records from ChEMBL — confirm.
df = pd.DataFrame.from_dict(res)

In [7]:
# Preview the first three activity records to check the available columns.
df.head(3)

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,15614399,[],CHEMBL3583018,Binding affinity to His-tagged PD-L1 (unknown ...,B,,,BAO_0000190,BAO_0000357,single protein format,COc1cc(OCc2cccc(-c3ccccc3)c2C)cc(OC)c1CNC(CO)(...,,,CHEMBL3580542,ACS Med. Chem. Lett.,2015,"{'bei': '17.58', 'le': '0.33', 'lle': '5.16', ...",CHEMBL3582257,,CHEMBL3582257,8.22,False,http://www.openphacts.org/units/Nanomolar,2483175,=,1,True,=,,IC50,nM,,6.0,CHEMBL3580522,Homo sapiens,Programmed cell death 1 ligand 1,9606,,,IC50,uM,UO_0000065,,0.006
1,,15614400,[],CHEMBL3583018,Binding affinity to His-tagged PD-L1 (unknown ...,B,,,BAO_0000190,BAO_0000357,single protein format,COc1cc(OCc2cccc(-c3ccccc3)c2C#N)cc(OC)c1CN1CCC...,,,CHEMBL3580542,ACS Med. Chem. Lett.,2015,"{'bei': '16.90', 'le': '0.31', 'lle': '2.96', ...",CHEMBL3582256,,CHEMBL3582256,8.22,False,http://www.openphacts.org/units/Nanomolar,2483174,=,1,True,=,,IC50,nM,,6.0,CHEMBL3580522,Homo sapiens,Programmed cell death 1 ligand 1,9606,,,IC50,uM,UO_0000065,,0.006
2,,15614401,[],CHEMBL3583018,Binding affinity to His-tagged PD-L1 (unknown ...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1c(COc2ccc(CNCC3CCC(=O)N3)cc2)cccc1-c1ccccc1,,,CHEMBL3580542,ACS Med. Chem. Lett.,2015,"{'bei': '14.97', 'le': '0.27', 'lle': '1.39', ...",CHEMBL3582255,,CHEMBL3582255,6.0,False,http://www.openphacts.org/units/Nanomolar,2483173,=,1,True,=,,IC50,nM,,1010.0,CHEMBL3580522,Homo sapiens,Programmed cell death 1 ligand 1,9606,,,IC50,uM,UO_0000065,,1.01


We want the `standard_value` (here, the IC$_{50}$) to be as low as possible, because a lower value indicates a more potent compound; this lets us see which compounds or pharmacological drugs interact most strongly with PD-L1. Note that a positive PD-L1 IHC assay can result from a genetic process (mutations) in cancer cells or from reactivity to a T cell infiltrate (immune response).

Using the `unique` function, we can see that only IC$_{50}$ is present in the DataFrame.

In [8]:
# List the distinct values of standard_type to confirm the IC50 filter took effect.
df.standard_type.unique()

array(['IC50'], dtype=object)

Finally we will save the resulting bioactivity data to a CSV file **pdl1_bioactivity_data.csv**.

In [9]:
# Write the unprocessed activity table to the working directory;
# index=False keeps the DataFrame's row index out of the file.
df.to_csv('pdl1_bioactivity_data.csv', index=False)

## **Copying files to Google Drive**

First, we need to mount Google Drive in Colab so that we can access our Google Drive files from within Colab.

In [11]:
# Colab-only import: this module is not available outside a Google Colab runtime.
from google.colab import drive
# Mount Google Drive under /content/gdrive/.
# NOTE(review): force_remount=True presumably re-mounts even when Drive is already
# mounted, which keeps the cell re-runnable — confirm against the Colab docs.
drive.mount('/content/gdrive/', force_remount=True)


Mounted at /content/gdrive/


Next, we create a **pdl1_data** folder in our **Colab Notebooks** folder on Google Drive.

In [12]:
! mkdir "/content/gdrive/My Drive/Colab Notebooks/pdl1_data"

mkdir: cannot create directory ‘/content/gdrive/My Drive/Colab Notebooks/pdl1_data’: File exists


In [13]:
# Copy the raw bioactivity CSV from the Colab working directory into the Drive folder.
! cp pdl1_bioactivity_data.csv "/content/gdrive/My Drive/Colab Notebooks/pdl1_data"

In [14]:
# Long listing of the Drive folder to confirm the CSV was copied.
! ls -l "/content/gdrive/My Drive/Colab Notebooks/pdl1_data"

total 117
-rw------- 1 root root  14553 Aug 22 09:37 pdl1_bioactivity_data.csv
-rw------- 1 root root 102061 Aug 22 09:36 PDL1_bioactivity_data.ipynb
-rw------- 1 root root   2349 Aug 21 20:41 pdl1_bioactivity_preprocessed_data.csv


In [15]:
# Show the first lines of the raw CSV (header row plus the first records).
! head pdl1_bioactivity_data.csv

activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
,15614399,[],CHEMBL3583018,Binding affinity to His-tagged PD-L1 (unknown origin) assessed as inhibition of interaction with PD1 preincubated for 15 mins followed by PD1 addition measured after 15 mins by HTRF assay,B,,,BAO_0000190,BAO_0000357,single protein format,COc1cc(OCc2cccc(-c3ccccc3)c2C)cc(OC)c1CNC(CO)(CO)CO,,,CHE

## **Handling missing data**
If any compound has a missing value in the **standard_value** column, drop it.

In [16]:
# Discard every record that has no standard_value; the remaining rows keep
# their original index labels and `df` itself is left untouched.
df2 = df.dropna(subset=['standard_value'])
df2

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,15614399,[],CHEMBL3583018,Binding affinity to His-tagged PD-L1 (unknown ...,B,,,BAO_0000190,BAO_0000357,single protein format,COc1cc(OCc2cccc(-c3ccccc3)c2C)cc(OC)c1CNC(CO)(...,,,CHEMBL3580542,ACS Med. Chem. Lett.,2015,"{'bei': '17.58', 'le': '0.33', 'lle': '5.16', ...",CHEMBL3582257,,CHEMBL3582257,8.22,False,http://www.openphacts.org/units/Nanomolar,2483175,=,1,True,=,,IC50,nM,,6.0,CHEMBL3580522,Homo sapiens,Programmed cell death 1 ligand 1,9606,,,IC50,uM,UO_0000065,,0.006
1,,15614400,[],CHEMBL3583018,Binding affinity to His-tagged PD-L1 (unknown ...,B,,,BAO_0000190,BAO_0000357,single protein format,COc1cc(OCc2cccc(-c3ccccc3)c2C#N)cc(OC)c1CN1CCC...,,,CHEMBL3580542,ACS Med. Chem. Lett.,2015,"{'bei': '16.90', 'le': '0.31', 'lle': '2.96', ...",CHEMBL3582256,,CHEMBL3582256,8.22,False,http://www.openphacts.org/units/Nanomolar,2483174,=,1,True,=,,IC50,nM,,6.0,CHEMBL3580522,Homo sapiens,Programmed cell death 1 ligand 1,9606,,,IC50,uM,UO_0000065,,0.006
2,,15614401,[],CHEMBL3583018,Binding affinity to His-tagged PD-L1 (unknown ...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1c(COc2ccc(CNCC3CCC(=O)N3)cc2)cccc1-c1ccccc1,,,CHEMBL3580542,ACS Med. Chem. Lett.,2015,"{'bei': '14.97', 'le': '0.27', 'lle': '1.39', ...",CHEMBL3582255,,CHEMBL3582255,6.0,False,http://www.openphacts.org/units/Nanomolar,2483173,=,1,True,=,,IC50,nM,,1010.0,CHEMBL3580522,Homo sapiens,Programmed cell death 1 ligand 1,9606,,,IC50,uM,UO_0000065,,1.01
3,,15614402,[],CHEMBL3583018,Binding affinity to His-tagged PD-L1 (unknown ...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1c(COc2ccc(CN[C@H](C)C(=O)O)cc2Cl)cccc1-c1cc...,,,CHEMBL3580542,ACS Med. Chem. Lett.,2015,"{'bei': '20.06', 'le': '0.39', 'lle': '2.76', ...",CHEMBL3582254,,CHEMBL3582254,8.22,False,http://www.openphacts.org/units/Nanomolar,2483172,=,1,True,=,,IC50,nM,,6.0,CHEMBL3580522,Homo sapiens,Programmed cell death 1 ligand 1,9606,,,IC50,uM,UO_0000065,,0.006
4,,15614403,[],CHEMBL3583018,Binding affinity to His-tagged PD-L1 (unknown ...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1cc(CN2CCC(C(=O)NCCO)CC2)ccc1OCc1cccc(-c2ccc...,,,CHEMBL3580542,ACS Med. Chem. Lett.,2015,"{'bei': '14.72', 'le': '0.27', 'lle': '2.09', ...",CHEMBL3582253,,CHEMBL3582253,6.96,False,http://www.openphacts.org/units/Nanomolar,2483171,=,1,True,=,,IC50,nM,,110.0,CHEMBL3580522,Homo sapiens,Programmed cell death 1 ligand 1,9606,,,IC50,uM,UO_0000065,,0.11
5,,15614404,[],CHEMBL3583018,Binding affinity to His-tagged PD-L1 (unknown ...,B,,,BAO_0000190,BAO_0000357,single protein format,COc1cc(OCc2cccc(-c3ccccc3)c2C)cc(OC)c1CN1CCCC[...,,,CHEMBL3580542,ACS Med. Chem. Lett.,2015,"{'bei': '17.29', 'le': '0.32', 'lle': '2.52', ...",CHEMBL3582252,,CHEMBL3582252,8.22,False,http://www.openphacts.org/units/Nanomolar,2483170,=,1,True,=,,IC50,nM,,6.0,CHEMBL3580522,Homo sapiens,Programmed cell death 1 ligand 1,9606,,,IC50,uM,UO_0000065,,0.006
6,,18080412,[],CHEMBL4017391,Binding affinity to His-tagged PD-L1 (unknown ...,B,,,BAO_0000190,BAO_0000357,single protein format,COc1cc(OCc2cccc(-c3ccccc3)c2C)cc(OC)c1CNCCNC(C)=O,,,CHEMBL4014356,J Med Chem,2017,"{'bei': '18.33', 'le': '0.34', 'lle': '3.74', ...",CHEMBL4081869,,CHEMBL4081869,8.22,False,http://www.openphacts.org/units/Nanomolar,2997001,=,1,True,=,,IC50,nM,,6.0,CHEMBL3580522,Homo sapiens,Programmed cell death 1 ligand 1,9606,,,IC50,nM,UO_0000065,100.0,6.0
7,,18080413,[],CHEMBL4017391,Binding affinity to His-tagged PD-L1 (unknown ...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1c(COc2ccc(CN3CCCCC3C(=O)O)cc2Br)cccc1-c1ccccc1,,,CHEMBL4014356,J Med Chem,2017,"{'bei': '13.83', 'le': '0.29', 'lle': '0.40', ...",CHEMBL4099869,,CHEMBL4099869,6.84,True,http://www.openphacts.org/units/Nanomolar,2997002,=,1,True,=,,IC50,nM,,146.0,CHEMBL3580522,Homo sapiens,Programmed cell death 1 ligand 1,9606,,,IC50,nM,UO_0000065,,146.0
8,,18080414,[],CHEMBL4017391,Binding affinity to His-tagged PD-L1 (unknown ...,B,,,BAO_0000190,BAO_0000357,single protein format,COc1nc(OCc2cccc(-c3ccccc3)c2C)ccc1CNCCNC(C)=O,,,CHEMBL4014356,J Med Chem,2017,"{'bei': '18.46', 'le': '0.34', 'lle': '3.87', ...",CHEMBL4089730,,CHEMBL4089730,7.75,True,http://www.openphacts.org/units/Nanomolar,2997003,=,1,True,=,,IC50,nM,,18.0,CHEMBL3580522,Homo sapiens,Programmed cell death 1 ligand 1,9606,,,IC50,nM,UO_0000065,,18.0
9,,18080415,[],CHEMBL4017391,Binding affinity to His-tagged PD-L1 (unknown ...,B,,,BAO_0000190,BAO_0000357,single protein format,COc1cc(OCc2cccc(-c3ccccc3)c2C)cc(OC)c1CN[C@@H]...,,,CHEMBL4014356,J Med Chem,2017,"{'bei': '18.29', 'le': '0.34', 'lle': '2.86', ...",CHEMBL4061613,,CHEMBL4061613,8.22,False,http://www.openphacts.org/units/Nanomolar,2997004,=,1,True,=,,IC50,nM,,6.0,CHEMBL3580522,Homo sapiens,Programmed cell death 1 ligand 1,9606,,,IC50,nM,UO_0000065,100.0,6.0


## **Data pre-processing of the bioactivity data**

### **Labeling compounds as either being active, inactive, or intermediate**
The bioactivity data are reported as IC$_{50}$ values. Compounds with values of 1,000 nM or less will be considered **active**, while those with values of 10,000 nM or greater will be considered **inactive**. Values between 1,000 and 10,000 nM will be referred to as **intermediate**.

In [17]:
def _classify_ic50(raw_value):
  """Return the activity label for one standard_value entry.

  The entry is converted with float() and compared against the two
  cut-offs used by this notebook (which treats the values as nM):
  10000 or more -> "inactive", 1000 or less -> "active",
  anything in between -> "intermediate".
  """
  potency = float(raw_value)
  if potency >= 10000:
    return "inactive"
  if potency <= 1000:
    return "active"
  return "intermediate"

# One label per row of df2, in row order, so it lines up with the other per-row lists.
bioactivity_class = [_classify_ic50(entry) for entry in df2.standard_value]

### **Collect the *molecule_chembl_id* column into a list (note: copying to a list keeps any duplicates)**

In [18]:
# Plain-list copy of the molecule IDs, in df2 row order (duplicates are kept).
mol_cid = list(df2.molecule_chembl_id)

### **Collect the *canonical_smiles* column into a list (note: duplicates are kept here as well)**

In [22]:
# Plain-list copy of the SMILES strings, in df2 row order (duplicates are kept).
canonical_smiles = list(df2.canonical_smiles)

### **Collect the *standard_value* column into a list (note: duplicates are kept here as well)**

In [23]:
# Plain-list copy of the standard_value entries, in df2 row order (duplicates are kept).
standard_value = list(df2.standard_value)

### **Combine the 4 lists into 1 dataframe**

In [24]:
# Pair up the four parallel lists row by row (zip stops at the shortest list).
data_tuples = list(
  zip(mol_cid, canonical_smiles, bioactivity_class, standard_value)
)
# Column names are given in the same order as the lists were zipped.
df3 = pd.DataFrame(
  data_tuples,
  columns=[
    'molecule_chembl_id',
    'canonical_smiles',
    'bioactivity_class',
    'standard_value',
  ],
)

In [25]:
# Render the assembled four-column table.
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,standard_value
0,CHEMBL3582257,COc1cc(OCc2cccc(-c3ccccc3)c2C)cc(OC)c1CNC(CO)(...,active,6.0
1,CHEMBL3582256,COc1cc(OCc2cccc(-c3ccccc3)c2C#N)cc(OC)c1CN1CCC...,active,6.0
2,CHEMBL3582255,Cc1c(COc2ccc(CNCC3CCC(=O)N3)cc2)cccc1-c1ccccc1,intermediate,1010.0
3,CHEMBL3582254,Cc1c(COc2ccc(CN[C@H](C)C(=O)O)cc2Cl)cccc1-c1cc...,active,6.0
4,CHEMBL3582253,Cc1cc(CN2CCC(C(=O)NCCO)CC2)ccc1OCc1cccc(-c2ccc...,active,110.0
5,CHEMBL3582252,COc1cc(OCc2cccc(-c3ccccc3)c2C)cc(OC)c1CN1CCCC[...,active,6.0
6,CHEMBL4081869,COc1cc(OCc2cccc(-c3ccccc3)c2C)cc(OC)c1CNCCNC(C)=O,active,6.0
7,CHEMBL4099869,Cc1c(COc2ccc(CN3CCCCC3C(=O)O)cc2Br)cccc1-c1ccccc1,active,146.0
8,CHEMBL4089730,COc1nc(OCc2cccc(-c3ccccc3)c2C)ccc1CNCCNC(C)=O,active,18.0
9,CHEMBL4061613,COc1cc(OCc2cccc(-c3ccccc3)c2C)cc(OC)c1CN[C@@H]...,active,6.0


Save the new DataFrame to a pre-processed CSV file.

In [26]:
# Write the four-column table to the working directory;
# index=False keeps the DataFrame's row index out of the file.
df3.to_csv('pdl1_bioactivity_preprocessed_data.csv', index=False)

In [27]:
# Long listing of the current working directory to confirm both CSV files exist.
! ls -l

total 28
drwx------ 5 root root  4096 Aug 22 09:36 gdrive
-rw-r--r-- 1 root root 14553 Aug 22 09:36 pdl1_bioactivity_data.csv
-rw-r--r-- 1 root root  2349 Aug 22 09:39 pdl1_bioactivity_preprocessed_data.csv
drwxr-xr-x 1 root root  4096 Aug 13 13:35 sample_data


In [28]:
# Copy the pre-processed CSV from the Colab working directory into the Drive folder.
! cp pdl1_bioactivity_preprocessed_data.csv "/content/gdrive/My Drive/Colab Notebooks/pdl1_data"

In [29]:
# List the Drive folder to confirm both CSV files are now stored there.
! ls "/content/gdrive/My Drive/Colab Notebooks/pdl1_data"

pdl1_bioactivity_data.csv    pdl1_bioactivity_preprocessed_data.csv
PDL1_bioactivity_data.ipynb
