In [1]:
import pandas as pd # uses pandas python module to view and analyse data
import requests # this is used to access json files
from chembl_webresource_client import *
f2=open('output.txt', 'w+')
from os import environ


#====================================================================#

# using a list of known targets, find compounds that are active on these targets:

def find_bioactivities_for_targets(targets):

    targets = ",".join(targets) # join the targets into a suitable string to fulfil the search conditions of the ChEMBL api
    assay_type = 'B' # only look for binding assays
    pchembl_value = 5 # xxx specify a minimum threshold of the pCHEMBL activity value. Greater than or equal to 5 (10 microM) is a typical minimum rule of thumb for binding activity between a compound and a protein target
    limit = 100 # limit the number of records pulled back for each url call xxx

    # set up the call to the ChEMBL 'activity' API.
    # Remember that there is a limit to the number of records returned in any one API call (default is 20 records, maximum is 1000 records).
    # So we need to iterate over several pages of results to gather all relevant information together.
    url_stem = "https://www.ebi.ac.uk" #This is the stem of the url
    url_full_string = url_stem + "/chembl/api/data/activity.json?target_chembl_id__in={}&assay_type={}&pchembl_value__gte={}&limit={}".format(targets
, assay_type, pchembl_value, limit) #This is the full url with the specified input parameters
    url_full = requests.get( url_full_string ).json() 
    url_activities = url_full['activities'] #This is a list of the results for activities

    # This 'while' loop iterates over several pages of records (if required), and collates the list of results
    while url_full['page_meta']['next']:
        url_full = requests.get(url_stem + url_full['page_meta']['next']).json()
        url_activities = url_activities + url_full['activities'] #Add result (as a list) to previous list of results

    # Convert the list of results into a Pandas dataframe:
    act_df = pd.DataFrame(url_activities)

   

    #Specify which columns to keep so that the size of the dataframe becomes more manageable:
    act_df = act_df[[  'target_chembl_id','target_organism', 'target_pref_name'
                     , 'parent_molecule_chembl_id','molecule_chembl_id','molecule_pref_name'
                     , 'pchembl_value', 'standard_type','standard_relation', 'standard_value', 'standard_units'
                     , 'assay_chembl_id','document_chembl_id','src_id']]

    return act_df


ModuleNotFoundError: No module named 'chembl_webresource_client'

In [2]:
# Extract the list of compounds from the previous dataframe ('act_df'), and call the 'molecule' API to find their molecular properties etc, so that this list can be refined

def find_properties_of_compounds(act_df):

    #First find the list of compounds that are within the act_df dataframe:
    
    cmpd_chembl_ids = list(set(act_df['molecule_chembl_id']))
    print("There are {} compounds initially identified as active on the known targets. e.g.".format(len(cmpd_chembl_ids)))
    print(cmpd_chembl_ids[0:2])

    #For the identified compounds, extract their molecular properties and other information from the 'molecule' ChEMBL API
    #Specify the input parameters:
    cmpd_chembl_ids = ",".join(cmpd_chembl_ids[0:]) #Amend the format of the text string of compounds so that it is suitable for the API call
    limit = 100 #Limit the number of records pulled back for each url call

    # Set up the call to the ChEMBL 'molecule' API
    # Remember that there is a limit to the number of records returned in any one API call (default is 20 records, maximum is 1000 records)
    # So need to iterate over several pages of records to gather all relevant information together!
    url_stem = "https://www.ebi.ac.uk" #This is the stem of the url
    url_full_string = url_stem + "/chembl/api/data/molecule.json?molecule_chembl_id__in={}&limit={}".format(cmpd_chembl_ids, limit) #This is the full

    url_full = requests.get( url_full_string ).json() #This calls the information back from the API using the 'requests' module, and converts it to j

    url_molecules = url_full['molecules'] #This is a list of the results for activities

    # This 'while' loop iterates over several pages of records (if required), and collates the list of results
    while url_full['page_meta']['next']:
        url_full = requests.get(url_stem + url_full['page_meta']['next']).json()
        url_molecules = url_molecules + url_full['molecules'] #Add result (as a list) to previous list of results

    #Convert the list of results into a Pandas dataframe:
    mol_df = pd.DataFrame(url_molecules)

   
    # Select only relevant columns:
    mol_df = mol_df[[ 'molecule_chembl_id']]
    
    
   
    return mol_df

#====================================================================#


In [4]:
from chembl_webresource_client.new_client import new_client

def main(target):

    # using a list of known targets, find compounds that are active on these targets:
    targets = [target] # xxx
    act_df = find_bioactivities_for_targets(targets)

    # extract the list of compounds, and find these compounds' properties:
    mol_df = find_properties_of_compounds(act_df)
    chembl_list = mol_df.values.tolist()
    flattened = [val for sublist in chembl_list for val in sublist]
    chemblid=[]
    smilesid=[]
    for x in flattened:
        chemblid.append(x)
        chembl_id=str(x)
        records = new_client.activity.filter(molecule_chembl_id=chembl_id)
        molecule=records[0]['canonical_smiles']
        smilesid.append(molecule)
    df = pd.DataFrame(list(zip(chemblid, smilesid)), 
               columns =['ChemblID', 'smiles'])
    print(df)

ModuleNotFoundError: No module named 'chembl_webresource_client'

In [4]:
#====================================================================#

if __name__=="__main__":
    main(mytarget)

#====================================================================#


There are 40 compounds initially identified as active on the known targets. e.g.
['CHEMBL344614', 'CHEMBL137897']
         ChemblID                                             smiles
0        CHEMBL67             COc1ccc(\C=C/c2cc(OC)c(OC)c(OC)c2)cc1O
1       CHEMBL107  COC1=CC=C2C(=CC1=O)[C@H](CCc3cc(OC)c(OC)c(OC)c...
2     CHEMBL20705         COc1ccc(cc1)S(=O)(=O)Nc2cc(Cl)cc3cc[nH]c23
3     CHEMBL44918   Oc1ccc(O)c(CNc2ccc(O)c(c2)C(=O)NCCc3ccc(F)cc3)c1
4    CHEMBL299613  COc1cc2[C@H]3CC[C@]4(C)[C@@H](O)CC[C@H]4[C@@H]...
5     CHEMBL79720               COC1=CC=C(C=CC1=O)c2ccc(OC)c(OC)c2OC
6     CHEMBL84903                   COc1ccc(\C=C/c2ccc3ccccc3c2)cc1O
7     CHEMBL85065              COc1cc(\C=C/c2ccc3ccccc3c2)cc(OC)c1OC
8    CHEMBL122397              CCCCCCNC(=O)c1cc(NCc2cc(O)ccc2O)ccc1O
9    CHEMBL436000  C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)C=CC=C34)[C@@...
10   CHEMBL136780   COc1cc(cc(OC)c1OC)C2OC(=NN2C(=O)C)c3ccc(N)c(C)c3
11   CHEMBL422643        Oc1ccc(O)c(CNc2ccc(cc2)C(=O)NCCCc

In [5]:
records = new_client.target.all()

In [6]:
records[0]

{'cross_references': [{'xref_id': 'O43451',
   'xref_name': None,
   'xref_src': 'canSAR-Target'},
  {'xref_id': 'Maltase-glucoamylase',
   'xref_name': None,
   'xref_src': 'Wikipedia'}],
 'organism': 'Homo sapiens',
 'pref_name': 'Maltase-glucoamylase',
 'species_group_flag': False,
 'target_chembl_id': 'CHEMBL2074',
 'target_components': [{'accession': 'O43451',
   'component_description': 'Maltase-glucoamylase, intestinal',
   'component_id': 434,
   'component_type': 'PROTEIN',
   'relationship': 'SINGLE PROTEIN',
   'target_component_synonyms': [{'component_synonym': '3.2.1.20',
     'syn_type': 'EC_NUMBER'},
    {'component_synonym': '3.2.1.3', 'syn_type': 'EC_NUMBER'},
    {'component_synonym': 'Alpha-glucosidase', 'syn_type': 'UNIPROT'},
    {'component_synonym': 'Glucan 1,4-alpha-glucosidase',
     'syn_type': 'UNIPROT'},
    {'component_synonym': 'Glucoamylase', 'syn_type': 'UNIPROT'},
    {'component_synonym': 'Maltase', 'syn_type': 'UNIPROT'},
    {'component_synonym': 'Ma

In [None]:
targets=[]
organisms=[]
prefname=[]
length=len(records)
print(records[0]['organism'])

Homo sapiens


In [None]:
print(length)

12482


In [None]:
i=0
while i <= 12481:
    targets.append(records[i]['target_chembl_id'])
    organisms.append(records[i]['organism'])
    prefname.append(records[i]['pref_name'])
    
    i+=1
    

In [None]:
chemblframe=pd.DataFrame(list(zip(targets, organisms, prefname)))

In [None]:
chemblframe

In [None]:
f2=open('output.txt', 'w+')
print(chemblframe[chemblframe[2].str.match('CDK4')], file=f2)
f2.close()

In [25]:
chemblframe.to_csv('chembl.csv')