In [1]:
from pathlib import Path
import urllib.request

import pandas as pd

In [2]:
# `_dh` is the directory-history list that IPython injects into the notebook
# namespace; its last entry is taken as the starting point, and the repository
# root is its parent directory.
REPO = Path(_dh[-1]).joinpath("..").resolve()
# All input/output tables of this notebook live under <repo>/data.
DATA = REPO.joinpath('data')

# Find UniProt IDs in ChEMBL targets

Get the UniProt-ChEMBL mapping from the EBI FTP server. Update `CHEMBL_VERSION` below to reflect new ChEMBL releases!

In [3]:
# ChEMBL release to query; this value is interpolated into the download URL
# below and reused later in the name of the exported CSV.
CHEMBL_VERSION = "chembl_27"
url = (
    "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/"
    f"{CHEMBL_VERSION}/chembl_uniprot_mapping.txt"
)

In [4]:
# Stream the tab-separated mapping file straight from the server into pandas.
# The first line of the file is skipped and explicit column names are supplied.
with urllib.request.urlopen(url) as mapping_file:
    uniprot_map = pd.read_csv(
        mapping_file,
        sep="\t",
        skiprows=[0],
        names=["UniprotID", "chembl_targets", "description", "type"],
    )
uniprot_map

Unnamed: 0,UniprotID,chembl_targets,description,type
0,P21266,CHEMBL2242,Glutathione S-transferase Mu 3,SINGLE PROTEIN
1,O00519,CHEMBL2243,Anandamide amidohydrolase,SINGLE PROTEIN
2,P19217,CHEMBL2244,Estrogen sulfotransferase,SINGLE PROTEIN
3,P97292,CHEMBL2245,Histamine H2 receptor,SINGLE PROTEIN
4,P17342,CHEMBL2247,Atrial natriuretic peptide receptor C,SINGLE PROTEIN
...,...,...,...,...
11779,Q91ZR5,CHEMBL3886121,Cation channel sperm-associated protein 1,SINGLE PROTEIN
11780,P48763,CHEMBL3886122,Sodium/hydrogen exchanger 2,SINGLE PROTEIN
11781,Q9UKU6,CHEMBL3886123,Thyrotropin-releasing hormone-degrading ectoen...,SINGLE PROTEIN
11782,Q9JJH7,CHEMBL3886124,Transient receptor potential cation channel su...,SINGLE PROTEIN


We join this new information to the aggregated list of human kinases from `human-kinases` (all of them, regardless of the source):

In [6]:
# Load the aggregated kinase list from the data directory.
kinases = pd.read_csv(DATA.joinpath("human_kinases.aggregated.csv"))
kinases

Unnamed: 0,UniprotID,kinhub,klifs,pkinfam
0,A0A0B4J2F2,False,False,True
1,A4QPH2,False,True,False
2,B5MCJ9,True,False,False
3,O00141,True,True,True
4,O00238,True,True,True
...,...,...,...,...
547,Q9Y616,True,True,True
548,Q9Y6E0,True,True,True
549,Q9Y6M4,True,True,True
550,Q9Y6R4,True,True,True


In [7]:
# Inner join on UniprotID: only kinases that appear in the ChEMBL mapping are
# kept. The trailing column selection drops every other column of `kinases`.
merged = kinases.merge(
    uniprot_map[["UniprotID", "chembl_targets", "type"]],
    how="inner",
    on='UniprotID',
)[["UniprotID", "chembl_targets", "type"]]
merged

Unnamed: 0,UniprotID,chembl_targets,type
0,A4QPH2,CHEMBL4105789,SINGLE PROTEIN
1,A4QPH2,CHEMBL3038509,PROTEIN COMPLEX
2,O00141,CHEMBL2343,SINGLE PROTEIN
3,O00238,CHEMBL5476,SINGLE PROTEIN
4,O00311,CHEMBL2111377,PROTEIN COMPLEX
...,...,...,...
873,Q9Y5S2,CHEMBL5052,SINGLE PROTEIN
874,Q9Y616,CHEMBL5081,SINGLE PROTEIN
875,Q9Y6E0,CHEMBL5082,SINGLE PROTEIN
876,Q9Y6M4,CHEMBL5084,SINGLE PROTEIN


In [8]:
# Number of ChEMBL target rows per UniProt ID.
merged["UniprotID"].value_counts()

P11802    12
P24941    12
P35968    10
P54646     9
P06493     9
          ..
Q8NEB9     1
Q96PF2     1
Q9NQU5     1
Q9H422     1
Q6P3R8     1
Name: UniprotID, Length: 488, dtype: int64

And save as CSV for easy reuse in other notebooks.

In [9]:
# The ChEMBL release is embedded in the file name; the row index is not written.
merged.to_csv(
    DATA.joinpath(f"human_kinases_and_chembl_targets.{CHEMBL_VERSION}.csv"),
    index=False,
)

Note that there is no 1:1 correspondence between UniprotID and ChEMBL ID! Some kinases are included in several ChEMBL targets:

In [10]:
# All ChEMBL targets that list the kinase P11802.
merged.loc[merged["UniprotID"] == "P11802"]

Unnamed: 0,UniprotID,chembl_targets,type
181,P11802,CHEMBL331,SINGLE PROTEIN
182,P11802,CHEMBL2095942,PROTEIN COMPLEX
183,P11802,CHEMBL2111326,SELECTIVITY GROUP
184,P11802,CHEMBL1907601,PROTEIN COMPLEX
185,P11802,CHEMBL3301385,PROTEIN COMPLEX
186,P11802,CHEMBL3038472,PROTEIN COMPLEX
187,P11802,CHEMBL3559691,PROTEIN FAMILY
188,P11802,CHEMBL3038517,PROTEIN FAMILY
189,P11802,CHEMBL4106184,PROTEIN FAMILY
190,P11802,CHEMBL3885548,PROTEIN COMPLEX


... and some ChEMBL targets include several kinases (e.g. chimeric proteins):

In [11]:
# All kinases that are mapped to the ChEMBL target CHEMBL2096618.
merged.loc[merged["chembl_targets"] == "CHEMBL2096618"]

Unnamed: 0,UniprotID,chembl_targets,type
98,P00519,CHEMBL2096618,CHIMERIC PROTEIN
171,P11274,CHEMBL2096618,CHIMERIC PROTEIN


This is due to the different `type` values:

In [12]:
# Row count per ChEMBL target type.
merged["type"].value_counts()

SINGLE PROTEIN                 485
PROTEIN FAMILY                 215
PROTEIN COMPLEX                107
PROTEIN-PROTEIN INTERACTION     33
SELECTIVITY GROUP               16
CHIMERIC PROTEIN                11
PROTEIN COMPLEX GROUP           11
Name: type, dtype: int64

If we focus on `SINGLE PROTEIN` types:

In [13]:
# Keep only the rows whose target type is exactly "SINGLE PROTEIN".
merged.loc[merged["type"] == "SINGLE PROTEIN"]

Unnamed: 0,UniprotID,chembl_targets,type
0,A4QPH2,CHEMBL4105789,SINGLE PROTEIN
2,O00141,CHEMBL2343,SINGLE PROTEIN
3,O00238,CHEMBL5476,SINGLE PROTEIN
5,O00311,CHEMBL5443,SINGLE PROTEIN
6,O00329,CHEMBL3130,SINGLE PROTEIN
...,...,...,...
873,Q9Y5S2,CHEMBL5052,SINGLE PROTEIN
874,Q9Y616,CHEMBL5081,SINGLE PROTEIN
875,Q9Y6E0,CHEMBL5082,SINGLE PROTEIN
876,Q9Y6M4,CHEMBL5084,SINGLE PROTEIN
