In [2]:
!pip install chembl_webresource_client


Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-24.1.3-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-2.2.0-py3-none-any.whl.metadata (4.9 kB)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cattrs-24.1.3-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━

In [None]:
'''1. Retrieve all approved drugs from the ChEMBL database, sort them by approval year and name'''

In [15]:
from chembl_webresource_client.new_client import new_client
import pandas as pd


molecule = new_client.molecule
approved_drugs = molecule.filter(max_phase=4).order_by('first_approval','pref_name')
approved_drugs



In [None]:
import pandas as pd

# Assuming `approved_drugs` contains the data fetched from the ChEMBL API
# Convert the approved_drugs data to a list of dictionaries if it's not already in that format
# The ChEMBL Webresource Client typically handles this, but let's ensure the format

# Fetch all results if not already done; this might take some time depending on the data size
approved_drugs_list = approved_drugs.all()

# Now, convert this list of dictionaries into a pandas DataFrame
df_approved_drugs = pd.DataFrame(approved_drugs_list)

# Display the first few rows of the DataFrame to verify
print(df_approved_drugs.head())



In [18]:
# If you only want specific columns, you can specify them during the DataFrame creation
# For example, to only include 'molecule_chembl_id', 'pref_name', and 'first_approval':
df_approved_drugs = pd.DataFrame(approved_drugs_list, columns=['molecule_chembl_id', 'pref_name', 'first_approval'])
df_approved_drugs

Unnamed: 0,molecule_chembl_id,pref_name,first_approval
0,CHEMBL449,BUTABARBITAL,1939.0
1,CHEMBL1200982,BUTABARBITAL SODIUM,1939.0
2,CHEMBL1200542,DESOXYCORTICOSTERONE ACETATE,1939.0
3,CHEMBL821,GUANIDINE,1939.0
4,CHEMBL1200728,GUANIDINE HYDROCHLORIDE,1939.0
...,...,...,...
4384,CHEMBL37744,ZIMELDINE,
4385,CHEMBL2355333,ZIMELDINE HYDROCHLORIDE,
4386,CHEMBL3833362,ZINC OLEATE,
4387,CHEMBL135400,ZOPICLONE,


In [None]:
'''2. For each approved drug since 2019 that you identified in step (1), retrieve a list
of UniProt accession numbers, namely protein targets associated with the drug'''

In [21]:
# Ensure the 'first_approval' column is of integer type for comparison
df_approved_drugs['first_approval'] = pd.to_numeric(df_approved_drugs['first_approval'], errors='coerce')

# Now filter the DataFrame to only include drugs approved until the year 2019
df_approved_drugs_until_2019 = df_approved_drugs[df_approved_drugs['first_approval'] >= 2019]
df_approved_drugs_until_2019

Unnamed: 0,molecule_chembl_id,pref_name,first_approval
3144,CHEMBL4594264,AIR POLYMER-TYPE A,2019.0
3145,CHEMBL2396661,ALPELISIB,2019.0
3146,CHEMBL4594262,AMLODIPINE BENZOATE,2019.0
3147,CHEMBL3301583,ANDEXANET ALFA,2019.0
3148,CHEMBL4594311,BETIBEGLOGENE AUTOTEMCEL,2019.0
...,...,...,...
3496,CHEMBL3707311,VAMOROLONE,2023.0
3497,CHEMBL2397415,ZAVEGEPANT,2023.0
3498,CHEMBL4650220,ZAVEGEPANT HYDROCHLORIDE,2023.0
3499,CHEMBL5315048,ZILUCOPLAN SODIUM,2023.0


In [22]:
from chembl_webresource_client.new_client import new_client
import pandas as pd

# Initialize the necessary clients
molecule = new_client.molecule
activity = new_client.activity

# Assuming df_approved_drugs_until_2019 is your DataFrame with the approved drugs until 2019
chembl_ids = df_approved_drugs_until_2019['molecule_chembl_id'].unique()

# Initialize a dictionary to hold the mapping of ChEMBL IDs to UniProt accession numbers
chembl_to_uniprot = {}

# Loop through each ChEMBL ID and fetch the associated activities
for chembl_id in chembl_ids:
    activities = activity.filter(molecule_chembl_id=chembl_id)
    for activity_entry in activities:
        target = activity_entry.get('target_chembl_id')
        if target:
            # Fetch the target information
            target_info = new_client.target.filter(target_chembl_id=target)
            for ti in target_info:
                # Extract UniProt accession numbers from the target information
                uniprot_accessions = [component['accession'] for component in ti['target_components'] if 'accession' in component]
                # Update the mapping dictionary
                chembl_to_uniprot[chembl_id] = uniprot_accessions

# Now, create a DataFrame from the mapping dictionary
df_chembl_to_uniprot = pd.DataFrame(list(chembl_to_uniprot.items()), columns=['molecule_chembl_id', 'uniprot_accessions'])
df_chembl_to_uniprot

Unnamed: 0,molecule_chembl_id,uniprot_accessions
0,CHEMBL2396661,[P42336]
1,CHEMBL2070241,[]
2,CHEMBL4297533,[]
3,CHEMBL207538,"[O09028, P15431, P18506, P18508, P19969, P2023..."
4,CHEMBL4173394,[Q9Z1M0]
...,...,...
159,CHEMBL539423,[]
160,CHEMBL2105420,[P11229]
161,CHEMBL197084,[Q9UBN7]
162,CHEMBL2397415,[Q16602]


In [24]:
pip install requests



In [None]:
'''For each protein with a UniProt accession number that you identified in step (2), retrieve UniProt keywords associated with it'''

In [27]:
import requests
import pandas as pd

# Example initialization of your dataframe, replace it with your actual dataframe
# chembl_to_uniprot = {'CHEMBL1': ['P12345', 'Q67890'], 'CHEMBL2': ['P09876']}
# df_chembl_to_uniprot = pd.DataFrame(list(chembl_to_uniprot.items()), columns=['molecule_chembl_id', 'uniprot_accessions'])

def get_uniprot_keywords(uniprot_accessions):
    """
    For a given list of UniProt accession numbers, retrieve the associated keywords by querying the UniProt API.
    """
    keywords = []
    for uniprot_id in uniprot_accessions:
        url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.txt"
        response = requests.get(url)
        if response.status_code == 200:
            for line in response.text.split('\n'):
                if line.startswith('KW'):
                    # Extracting keywords after 'KW' and removing the trailing period if present
                    keyword_line = line[5:].rstrip('.')
                    keywords.extend(keyword_line.split('; '))
    # Remove potential duplicates by converting the list to a set and back to a list
    return list(set(keywords))

# Apply the function to each row in the dataframe and store the results in a new column
df_chembl_to_uniprot['keywords'] = df_chembl_to_uniprot['uniprot_accessions'].apply(get_uniprot_keywords)

In [28]:
df_chembl_to_uniprot

Unnamed: 0,molecule_chembl_id,uniprot_accessions,keywords
0,CHEMBL2396661,[P42336],"[Proteomics identification, Angiogenesis, Phag..."
1,CHEMBL2070241,[],[]
2,CHEMBL4297533,[],[]
3,CHEMBL207538,"[O09028, P15431, P18506, P18508, P19969, P2023...","[Ion channel, Signal {ECO:0000256|ARBA:ARBA000..."
4,CHEMBL4173394,[Q9Z1M0],"[Cell membrane, Membrane, Receptor, Ion transp..."
...,...,...,...
159,CHEMBL539423,[],[]
160,CHEMBL2105420,[P11229],"[Phosphoprotein;, Postsynaptic cell membrane, ..."
161,CHEMBL197084,[Q9UBN7],"[Zinc;, Chromatin regulator, Cytoskeleton, Alt..."
162,CHEMBL2397415,[Q16602],"[Glycoprotein, Cell membrane, Proteomics ident..."
