#### **Importing Libraries**

In [None]:
# pip install -r requirements.txt

In [2]:
import pandas as pd
from chembl_webresource_client.new_client import new_client 

### **Query Targets - EGFR Protein Family**

In [3]:
# Search the ChEMBL target endpoint for the ID 'CHEMBL2363049'
target = new_client.target
target_query = target.search('CHEMBL2363049')

# Tabulate the search hits and preview the first three rows
targets = pd.DataFrame.from_dict(target_query)
targets.iloc[:3]

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,Epidermal growth factor receptor,12.0,False,CHEMBL2363049,"[{'accession': 'P04626', 'component_descriptio...",PROTEIN FAMILY,9606
1,"[{'xref_id': 'Q15303', 'xref_name': None, 'xre...",Homo sapiens,Receptor protein-tyrosine kinase erbB-4,11.0,False,CHEMBL3009,"[{'accession': 'Q15303', 'component_descriptio...",SINGLE PROTEIN,9606
2,"[{'xref_id': 'P21860', 'xref_name': None, 'xre...",Homo sapiens,Receptor tyrosine-protein kinase erbB-3,10.0,False,CHEMBL5838,"[{'accession': 'P21860', 'component_descriptio...",SINGLE PROTEIN,9606


In [4]:
# Take the ChEMBL ID of the first hit (row label 0) as the target to analyse
selected_target = targets.loc[0, 'target_chembl_id']
selected_target

'CHEMBL2363049'

#### *Retrieve bioactivity data for EGFR (CHEMBL2363049) reported as IC50 values in nanomolar (nM)*

In [5]:
# Pull the activity records for the selected target, restricted to
# IC50 measurements whose units are nM
activity = new_client.activity
res = (
    activity
    .filter(target_chembl_id=selected_target)
    .filter(standard_type="IC50", units='nM')
)

# Materialise the query result as a DataFrame and preview the first five rows
df = pd.DataFrame.from_dict(res)
df.head()
# Optional: keep a local copy of the raw records
# df.to_csv('bioactivity_of_EGFR.csv', index=False)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,3261308,[],CHEMBL1105218,Inhibition of EGFR Leu858Arg and Thr790Met mut...,B,P00533,"L858R,T790M",BAO_0000190,...,Homo sapiens,Epidermal growth factor receptor,9606,,,IC50,nM,UO_0000065,,140.0
1,,,3261309,[],CHEMBL1105218,Inhibition of EGFR Leu858Arg and Thr790Met mut...,B,P00533,"L858R,T790M",BAO_0000190,...,Homo sapiens,Epidermal growth factor receptor,9606,,,IC50,nM,UO_0000065,,1500.0
2,,,3261310,[],CHEMBL1105218,Inhibition of EGFR Leu858Arg and Thr790Met mut...,B,P00533,"L858R,T790M",BAO_0000190,...,Homo sapiens,Epidermal growth factor receptor,9606,,,IC50,nM,UO_0000065,,1000.0
3,,,3261311,[],CHEMBL1105218,Inhibition of EGFR Leu858Arg and Thr790Met mut...,B,P00533,"L858R,T790M",BAO_0000190,...,Homo sapiens,Epidermal growth factor receptor,9606,,,IC50,nM,UO_0000065,,1500.0
4,,,3261312,[],CHEMBL1105218,Inhibition of EGFR Leu858Arg and Thr790Met mut...,B,P00533,"L858R,T790M",BAO_0000190,...,Homo sapiens,Epidermal growth factor receptor,9606,,,IC50,nM,UO_0000065,,190.0


#### **Handling missing data**

In [6]:
# Keep only the records that carry a standard_value. The index is reset so the
# surviving rows are labelled 0..n-1 again: the column-wise pd.concat further
# down aligns on index labels, and a gap left by a dropped row would otherwise
# shift the bioactivity labels onto the wrong compounds.
df2 = df[df.standard_value.notna()].reset_index(drop=True)
# No data lost (for the query result recorded in this notebook)

### **Data pre-processing of bioactivity data**

#### Labeling compounds as either active, inactive or intermediate

Each compound is labelled according to its IC50 standard value in nanomolar (nM). Active: < 10 nM; Inactive: > 100 nM; Intermediate: everything in between (10–100 nM, inclusive).

In [7]:
# IC50 cut-offs: below active_thres -> 'active',
# above inactive_thres -> 'inactive', anything else -> 'intermediate'
active_thres = 10
inactive_thres = 100


def _classify_ic50(val):
    """Return the bioactivity label for a single IC50 standard value.

    `val` may be a number or a numeric string; it is converted with float()
    before being compared against the two thresholds. Values equal to either
    threshold fall into the 'intermediate' class.
    """
    ic50 = float(val)
    if ic50 > inactive_thres:
        return 'inactive'
    if ic50 < active_thres:
        return 'active'
    return 'intermediate'


# Build the label column on df2's own index. A bare pd.Series(list) would be
# labelled 0..n-1, and the later column-wise pd.concat aligns on index labels,
# so any gap in df2's index would attach labels to the wrong rows.
bioactivity_class = pd.Series(
    [_classify_ic50(val) for val in df2.standard_value],
    index=df2.index,
    name='bioactivity',
)

#### Create dataframe for compound analysis 
molecule_chembl_id: ChEMBL identifier of the molecule tested against EGFR

canonical_smiles: Simplified Molecular Input Line Entry System (SMILES) string, a text representation of the chemical structure that computers can process

standard_value: IC50 value in nM

bioactivity_class (stored in the `bioactivity` column): activity class of the compound — active, intermediate, or inactive

In [8]:
# Columns carried over from the filtered activity table
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
compound_columns = df2[selection]

# Append the bioactivity labels as an extra column, then export to CSV
# without writing the index
df3 = pd.concat([compound_columns, bioactivity_class], axis='columns')
df3.to_csv('bioactivity_of_EGFR_inhibitors.csv', index=False)