In [None]:
!pip install psycopg2-binary pandas tqdm

In [1]:
from AlzheimerTargets import targets
from database_connection_info import user, password, host, port

In [2]:
import requests
import time

# We maintain a global or external dictionary so if multiple targets
# share the same UniProt ID, we only fetch once.
uniprot_cache = {}

def fetch_uniprot_json(accession, max_retries=5):
    """
    Fetch a UniProt entry in JSON format, with caching and simple rate-limit handling.
    
    :param accession: UniProt Accession (e.g. "P05067")
    :param max_retries: how many times to try if we get a rate-limit (429) or server error
    :return: Python dict or None
    """
    # 1) Check our cache first
    if accession in uniprot_cache:
        return uniprot_cache[accession]
    
    base_url = "https://rest.uniprot.org/uniprotkb"
    query_url = f"{base_url}/{accession}.json"

    attempts = 0
    while attempts < max_retries:
        attempts += 1
        try:
            # 2) Make the request
            response = requests.get(query_url)
            
            # 3) Handle the response
            if response.status_code == 200:
                data = response.json()
                # Cache the result
                uniprot_cache[accession] = data
                return data
            elif response.status_code == 429:
                # Rate limited - "Too Many Requests"
                print(f"[WARN] 429 Too Many Requests for {accession}. "
                      f"Pausing, then retrying (attempt {attempts}/{max_retries})...")
                time.sleep(10)  # wait longer if 429
            else:
                print(f"[WARN] Failed to fetch {accession}, status {response.status_code}")
                # Might be 404 (not found) or 500 (server error)
                break  # no need to retry 404
        except Exception as e:
            print(f"[ERROR] Exception while fetching {accession}: {e}")
            time.sleep(5)
    
    # If we get here, we never returned successfully
    uniprot_cache[accession] = None
    return None


def parse_uniprot_json(data):
    """
    Given UniProt JSON data, extract:
      - UniProt Accession
      - UniProt GeneNames
      - UniProt Organism
      - UniProt Function
      - UniProt Disease
      - UniProt SubcellularLocation
    Returns a dict (or None if data is invalid).
    """
    if not data:
        return None
    
    record = {
        "UniProt Accession": data.get("primaryAccession"),
        "UniProt GeneNames": None,
        "UniProt Organism": None,
        "UniProt Function": [],
        "UniProt Disease": [],
        "UniProt SubcellularLocation": []
    }

    # Genes
    genes_block = data.get("genes", [])
    gene_names = []
    for g in genes_block:
        gene_val = g.get("geneName", {}).get("value")
        if gene_val:
            gene_names.append(gene_val)
    if gene_names:
        record["UniProt GeneNames"] = "; ".join(gene_names)

    # Organism
    organism_info = data.get("organism", {})
    sci_name = organism_info.get("scientificName")
    if sci_name:
        record["UniProt Organism"] = sci_name

    # Comments
    for c in data.get("comments", []):
        ctype = c.get("commentType")
        if ctype == "FUNCTION":
            for t in c.get("texts", []):
                record["UniProt Function"].append(t["value"])
        elif ctype == "DISEASE":
            disease_id = c.get("disease", {}).get("diseaseId")
            disease_texts = " ".join(t["value"] for t in c.get("texts", []))
            combined = disease_id + ": " + disease_texts if disease_id else disease_texts
            if combined.strip():
                record["UniProt Disease"].append(combined)
        elif ctype == "SUBCELLULAR LOCATION":
            for sloc in c.get("subcellularLocations", []):
                loc_val = sloc.get("location", {}).get("value")
                if loc_val:
                    record["UniProt SubcellularLocation"].append(loc_val)
    
    # Convert lists to single strings
    if record["UniProt Function"]:
        record["UniProt Function"] = " ".join(record["UniProt Function"])
    else:
        record["UniProt Function"] = None
    
    if record["UniProt Disease"]:
        record["UniProt Disease"] = " | ".join(record["UniProt Disease"])
    else:
        record["UniProt Disease"] = None
    
    if record["UniProt SubcellularLocation"]:
        record["UniProt SubcellularLocation"] = "; ".join(record["UniProt SubcellularLocation"])
    else:
        record["UniProt SubcellularLocation"] = None

    return record


import psycopg2
import pandas as pd
from tqdm import tqdm

# Include the two UniProt helper functions here or import them
# e.g. from uniprot_helpers import fetch_uniprot_json, parse_uniprot_json

def get_chembl_ids_and_fetch_data(target_names, user, password, host, port):
    # Connect to PostgreSQL (ChEMBL)
    conn = psycopg2.connect(
        dbname="chembl_35",
        user=user,
        password=password,
        host=host,
        port=port
    )
    chembl_ids = {}

    # Step 1: Get ChEMBL IDs for target names (using SQL in a loop)
    with conn.cursor() as cursor:
        for target_name in target_names:
            cursor.execute("""
                SELECT chembl_id, target_type, organism
                FROM target_dictionary
                WHERE pref_name ILIKE %s
            """, (target_name,))
            
            results = cursor.fetchall()
            if results:
                chembl_ids[target_name] = [{
                    "target_chembl_id": row[0],
                    "target_type": row[1],
                    "organism": row[2]
                } for row in results]
            else:
                print(f"No exact match found for target: {target_name}")

    all_data = []

    # Step 2: For each target_chembl_id, fetch the UniProt ID(s)
    for target_name, target_info_list in chembl_ids.items():
        for target_info in target_info_list:
            target_id = target_info["target_chembl_id"]
            print(f"Fetching data for target {target_id} ({target_name})...")

            # 2a) Retrieve the UniProt accession(s) for that target
            # Some targets have multiple components => multiple accessions
            # We'll gather them in a list
            uniprot_accessions = []
            with conn.cursor() as cur_u:
                cur_u.execute("""
                    SELECT cs.accession
                    FROM target_dictionary td
                    JOIN target_components tc ON td.tid = tc.tid
                    JOIN component_sequences cs ON tc.component_id = cs.component_id
                    WHERE td.chembl_id = %s
                """, (target_id,))
                up_results = cur_u.fetchall()
                for row_up in up_results:
                    if row_up[0]:
                        uniprot_accessions.append(row_up[0])
            
            # If there's more than one, you can decide to keep just the first
            # or combine them. Here, let's just keep the first for illustration.
            uniprot_id = uniprot_accessions[0] if uniprot_accessions else None

            # 2b) If we have a UniProt ID, fetch + parse from UniProt
            uniprot_data = {}
            if uniprot_id:
                json_data = fetch_uniprot_json(uniprot_id)
                parsed_data = parse_uniprot_json(json_data)
                if parsed_data:
                    uniprot_data = parsed_data
                else:
                    uniprot_data = {
                        "UniProt Accession": uniprot_id,
                        "UniProt GeneNames": None,
                        "UniProt Organism": None,
                        "UniProt Function": None,
                        "UniProt Disease": None,
                        "UniProt SubcellularLocation": None
                    }
            else:
                # No UniProt found
                uniprot_data = {
                    "UniProt Accession": None,
                    "UniProt GeneNames": None,
                    "UniProt Organism": None,
                    "UniProt Function": None,
                    "UniProt Disease": None,
                    "UniProt SubcellularLocation": None
                }
            
            try:
                # 2c) Now do your single query to get the activity/compound data
                query = """
                    SELECT 
                        a.standard_value,
                        a.standard_type AS activity_type,
                        m.chembl_id AS molecule_chembl_id,
                        m.max_phase,
                        m.molecule_type,
                        cs.canonical_smiles,
                        cs.standard_inchi,
                        cs.standard_inchi_key,
                        cp.full_molformula,
                        cp.full_mwt,
                        cp.alogp,
                        cp.aromatic_rings,
                        cp.hba,
                        cp.hbd,
                        cp.rtb,
                        cp.psa,
                        cp.qed_weighted
                    FROM activities a
                    JOIN assays ass ON a.assay_id = ass.assay_id
                    JOIN target_dictionary td ON ass.tid = td.tid
                    JOIN molecule_dictionary m ON a.molregno = m.molregno
                    JOIN compound_structures cs ON m.molregno = cs.molregno
                    JOIN compound_properties cp ON m.molregno = cp.molregno
                    WHERE td.chembl_id = %s
                    AND a.standard_value IS NOT NULL
                """
                
                with conn.cursor(name="fetch_activities") as cursor2:
                    cursor2.itersize = 1000
                    cursor2.execute(query, (target_id,))
                    
                    for row in tqdm(cursor2, desc=f"Processing {target_id}"):
                        info = {
                            "Target Name": target_name,
                            "Target ChEMBL ID": target_id,
                            "Target Type": target_info["target_type"],
                            "Molecule ChEMBL ID": row[2],
                            "Activity Type": row[1],
                            "Activity Value": row[0],
                            "Canonical SMILES": row[5],
                            "Standard InChI": row[6],
                            "Standard InChI Key": row[7],
                            "Max Phase": row[3],
                            "Molecular Formula": row[8],
                            "Molecular Weight": row[9],
                            "AlogP": row[10],
                            "Aromatic Rings": row[11],
                            "HBA": row[12],
                            "HBD": row[13],
                            "Rotatable Bonds": row[14],
                            "Polar Surface Area (PSA)": row[15],
                            "QED Weighted": row[16]
                        }
                        # Merge the UniProt data
                        info.update(uniprot_data)
                        all_data.append(info)

            except Exception as e:
                print(f"Error retrieving data for target {target_id}: {e}")
                conn.rollback()
            else:
                conn.commit()

    conn.close()
    return pd.DataFrame(all_data)


# Example usage
if __name__ == "__main__":
    # Suppose you define 'targets' somewhere above
    # e.g. targets = ["BACE1", "Cyclooxygenase-2", "Beta-secretase", ...]
    df = get_chembl_ids_and_fetch_data(
        target_names=targets,
        user=user,
        password=password,
        host=host,
        port=port
    )
    

Fetching data for target CHEMBL1978 (Cytochrome P450 19A1)...


Processing CHEMBL1978: 6630it [00:02, 2864.78it/s]


Fetching data for target CHEMBL3859 (Cytochrome P450 19A1)...


Processing CHEMBL3859: 263it [00:00, 425.24it/s]


Fetching data for target CHEMBL211 (Muscarinic acetylcholine receptor M2)...


Processing CHEMBL211: 8829it [00:02, 3175.90it/s]


Fetching data for target CHEMBL4781 (Muscarinic acetylcholine receptor M2)...


Processing CHEMBL4781: 447it [00:00, 768.82it/s]


Fetching data for target CHEMBL3197 (Muscarinic acetylcholine receptor M2)...


Processing CHEMBL3197: 85it [00:00, 144.66it/s]


Fetching data for target CHEMBL309 (Muscarinic acetylcholine receptor M2)...


Processing CHEMBL309: 969it [00:00, 1322.59it/s]


Fetching data for target CHEMBL2095187 (Muscarinic acetylcholine receptors; M2 & M3)...


Processing CHEMBL2095187: 652it [00:00, 1124.49it/s]


Fetching data for target CHEMBL234 (Dopamine D3 receptor)...


Processing CHEMBL234: 12791it [00:02, 4675.82it/s]


Fetching data for target CHEMBL3441 (Dopamine D3 receptor)...


Processing CHEMBL3441: 6it [00:00, 11.68it/s]


Fetching data for target CHEMBL3138 (Dopamine D3 receptor)...


Processing CHEMBL3138: 1005it [00:00, 1493.79it/s]


Fetching data for target CHEMBL2304406 (Dopamine D3 receptor)...


Processing CHEMBL2304406: 16it [00:00, 29.59it/s]


Fetching data for target CHEMBL219 (Dopamine D4 receptor)...


Processing CHEMBL219: 6126it [00:00, 6882.35it/s]


Fetching data for target CHEMBL2574 (Dopamine D4 receptor)...


Processing CHEMBL2574: 19it [00:00, 32.16it/s]


Fetching data for target CHEMBL3361 (Dopamine D4 receptor)...


Processing CHEMBL3361: 266it [00:00, 482.98it/s]


Fetching data for target CHEMBL1944 (Neprilysin)...


Processing CHEMBL1944: 782it [00:00, 1097.51it/s]


Fetching data for target CHEMBL3768 (Neprilysin)...


Processing CHEMBL3768: 341it [00:00, 553.79it/s]


Fetching data for target CHEMBL2642 (Neprilysin)...


Processing CHEMBL2642: 36it [00:00, 63.58it/s]


Fetching data for target CHEMBL3369 (Neprilysin)...


Processing CHEMBL3369: 521it [00:00, 708.78it/s]


Fetching data for target CHEMBL6107 (Neprilysin)...


Processing CHEMBL6107: 9it [00:00, 16.02it/s]


Fetching data for target CHEMBL1801 (Plasminogen)...


Processing CHEMBL1801: 2147it [00:01, 1738.57it/s]


Fetching data for target CHEMBL2957 (Plasminogen)...


Processing CHEMBL2957: 14it [00:00, 18.13it/s]


Fetching data for target CHEMBL3204 (Plasminogen)...


Processing CHEMBL3204: 115it [00:00, 145.43it/s]


Fetching data for target CHEMBL1075299 (Plasminogen)...


Processing CHEMBL1075299: 11it [00:00, 17.85it/s]


Fetching data for target CHEMBL216 (Muscarinic acetylcholine receptor M1)...


Processing CHEMBL216: 10658it [00:01, 7557.60it/s] 


Fetching data for target CHEMBL276 (Muscarinic acetylcholine receptor M1)...


Processing CHEMBL276: 3357it [00:01, 3110.45it/s]


Fetching data for target CHEMBL3733 (Muscarinic acetylcholine receptor M1)...


Processing CHEMBL3733: 123it [00:00, 180.23it/s]


Fetching data for target CHEMBL2672 (Muscarinic acetylcholine receptor M1)...


Processing CHEMBL2672: 0it [00:00, ?it/s]


KeyboardInterrupt: 

In [None]:
df

Unnamed: 0,Target Name,Target ChEMBL ID,Target Type,Molecule ChEMBL ID,Activity Type,Activity Value,Canonical SMILES,Standard InChI,Standard InChI Key,Max Phase,...,HBD,Rotatable Bonds,Polar Surface Area (PSA),QED Weighted,UniProt Accession,UniProt GeneNames,UniProt Organism,UniProt Function,UniProt Disease,UniProt SubcellularLocation
0,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,CHEMBL63297,IC50,190.0,Fc1ccc(C(C2Cc3ccccc3O2)n2cncn2)cc1,InChI=1S/C17H14FN3O/c18-14-7-5-12(6-8-14)17(21...,RVFPOHIDBASNMF-UHFFFAOYSA-N,,...,0.0,3.0,39.94,0.75,P11511,CYP19A1,Homo sapiens,A cytochrome P450 monooxygenase that catalyzes...,Aromatase excess syndrome: | Aromatase defici...,Endoplasmic reticulum membrane; Microsome memb...
1,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,CHEMBL65328,IC50,590.0,Cc1ccc(C(C2Cc3ccccc3O2)n2cncn2)cc1,InChI=1S/C18H17N3O/c1-13-6-8-14(9-7-13)18(21-1...,OZEGINYEQGNONL-UHFFFAOYSA-N,,...,0.0,3.0,39.94,0.74,P11511,CYP19A1,Homo sapiens,A cytochrome P450 monooxygenase that catalyzes...,Aromatase excess syndrome: | Aromatase defici...,Endoplasmic reticulum membrane; Microsome memb...
2,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,CHEMBL488,IC50,18500.0,CCC1(c2ccc(N)cc2)CCC(=O)NC1=O,InChI=1S/C13H16N2O2/c1-2-13(8-7-11(16)15-12(13...,ROBVIMPUHSLWNV-UHFFFAOYSA-N,4.0,...,2.0,2.0,72.19,0.60,P11511,CYP19A1,Homo sapiens,A cytochrome P450 monooxygenase that catalyzes...,Aromatase excess syndrome: | Aromatase defici...,Endoplasmic reticulum membrane; Microsome memb...
3,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,CHEMBL291646,IC50,200.0,Clc1ccc(C(C2Cc3ccccc3O2)n2cncn2)cc1,InChI=1S/C17H14ClN3O/c18-14-7-5-12(6-8-14)17(2...,UVOSSULRORMWHE-UHFFFAOYSA-N,,...,0.0,3.0,39.94,0.74,P11511,CYP19A1,Homo sapiens,A cytochrome P450 monooxygenase that catalyzes...,Aromatase excess syndrome: | Aromatase defici...,Endoplasmic reticulum membrane; Microsome memb...
4,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,CHEMBL304788,IC50,880.0,Clc1ccc(C(C2Cc3ccccc3O2)n2cncn2)c(Cl)c1,InChI=1S/C17H13Cl2N3O/c18-12-5-6-13(14(19)8-12...,ZSINOYZZEZDTLK-UHFFFAOYSA-N,,...,0.0,3.0,39.94,0.71,P11511,CYP19A1,Homo sapiens,A cytochrome P450 monooxygenase that catalyzes...,Aromatase excess syndrome: | Aromatase defici...,Endoplasmic reticulum membrane; Microsome memb...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1463272,Neuronal acetylcholine receptor; alpha3/alpha6...,CHEMBL2109233,PROTEIN COMPLEX,CHEMBL3329544,Emax,64.0,C[C@H]1C[C@@H]1C(=O)N1CC2CC(C1)N2,InChI=1S/C10H16N2O/c1-6-2-9(6)10(13)12-4-7-3-8...,INYNWEUTASDACG-KRTGUUSXSA-N,,...,1.0,1.0,32.34,0.63,P17787,CHRNB2,Homo sapiens,"After binding acetylcholine, the AChR responds...","Epilepsy, nocturnal frontal lobe, 3:",Postsynaptic cell membrane; Cell membrane
1463273,Neuronal acetylcholine receptor; alpha3/alpha6...,CHEMBL2109233,PROTEIN COMPLEX,CHEMBL3329545,Emax,73.0,CC1(C)CC1C(=O)N1CC2CC(C1)N2,InChI=1S/C11H18N2O/c1-11(2)4-9(11)10(14)13-5-7...,LJEPPQATHWQMBV-UHFFFAOYSA-N,,...,1.0,1.0,32.34,0.66,P17787,CHRNB2,Homo sapiens,"After binding acetylcholine, the AChR responds...","Epilepsy, nocturnal frontal lobe, 3:",Postsynaptic cell membrane; Cell membrane
1463274,Neuronal acetylcholine receptor; alpha3/alpha6...,CHEMBL2109233,PROTEIN COMPLEX,CHEMBL3329546,Emax,110.0,O=C(C1CC1(F)F)N1CC2CC(C1)N2,InChI=1S/C9H12F2N2O/c10-9(11)2-7(9)8(14)13-3-5...,VXQLPXYMYHKASH-UHFFFAOYSA-N,,...,1.0,1.0,32.34,0.66,P17787,CHRNB2,Homo sapiens,"After binding acetylcholine, the AChR responds...","Epilepsy, nocturnal frontal lobe, 3:",Postsynaptic cell membrane; Cell membrane
1463275,Neuronal acetylcholine receptor; alpha3/alpha6...,CHEMBL2109233,PROTEIN COMPLEX,CHEMBL4869892,IC50,135.0,CC[C@H](C)[C@@H]1NC(=O)[C@H](CCC(=O)O)NC(=O)[C...,InChI=1S/C68H103N23O21S4/c1-7-32(6)53-66(110)8...,ATDHPNPDBDEQAK-BCTYQQKGSA-N,,...,,,,,P17787,CHRNB2,Homo sapiens,"After binding acetylcholine, the AChR responds...","Epilepsy, nocturnal frontal lobe, 3:",Postsynaptic cell membrane; Cell membrane


In [7]:
#df.to_csv('Alzheimer_CheMBLv35_Uniprot.csv', index=False) 

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("Alzheimer_CheMBLv35_Uniprot.csv")
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Target Name,Target ChEMBL ID,Target Type,Molecule ChEMBL ID,Activity Type,Activity Value,Canonical SMILES,Standard InChI,Standard InChI Key,Max Phase,...,HBD,Rotatable Bonds,Polar Surface Area (PSA),QED Weighted,UniProt Accession,UniProt GeneNames,UniProt Organism,UniProt Function,UniProt Disease,UniProt SubcellularLocation
0,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,CHEMBL63297,IC50,190.00,Fc1ccc(C(C2Cc3ccccc3O2)n2cncn2)cc1,InChI=1S/C17H14FN3O/c18-14-7-5-12(6-8-14)17(21...,RVFPOHIDBASNMF-UHFFFAOYSA-N,,...,0.0,3.0,39.94,0.75,P11511,CYP19A1,Homo sapiens,A cytochrome P450 monooxygenase that catalyzes...,Aromatase excess syndrome: | Aromatase defici...,Endoplasmic reticulum membrane; Microsome memb...
1,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,CHEMBL65328,IC50,590.00,Cc1ccc(C(C2Cc3ccccc3O2)n2cncn2)cc1,InChI=1S/C18H17N3O/c1-13-6-8-14(9-7-13)18(21-1...,OZEGINYEQGNONL-UHFFFAOYSA-N,,...,0.0,3.0,39.94,0.74,P11511,CYP19A1,Homo sapiens,A cytochrome P450 monooxygenase that catalyzes...,Aromatase excess syndrome: | Aromatase defici...,Endoplasmic reticulum membrane; Microsome memb...
2,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,CHEMBL488,IC50,18500.00,CCC1(c2ccc(N)cc2)CCC(=O)NC1=O,InChI=1S/C13H16N2O2/c1-2-13(8-7-11(16)15-12(13...,ROBVIMPUHSLWNV-UHFFFAOYSA-N,4.0,...,2.0,2.0,72.19,0.60,P11511,CYP19A1,Homo sapiens,A cytochrome P450 monooxygenase that catalyzes...,Aromatase excess syndrome: | Aromatase defici...,Endoplasmic reticulum membrane; Microsome memb...
3,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,CHEMBL291646,IC50,200.00,Clc1ccc(C(C2Cc3ccccc3O2)n2cncn2)cc1,InChI=1S/C17H14ClN3O/c18-14-7-5-12(6-8-14)17(2...,UVOSSULRORMWHE-UHFFFAOYSA-N,,...,0.0,3.0,39.94,0.74,P11511,CYP19A1,Homo sapiens,A cytochrome P450 monooxygenase that catalyzes...,Aromatase excess syndrome: | Aromatase defici...,Endoplasmic reticulum membrane; Microsome memb...
4,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,CHEMBL304788,IC50,880.00,Clc1ccc(C(C2Cc3ccccc3O2)n2cncn2)c(Cl)c1,InChI=1S/C17H13Cl2N3O/c18-12-5-6-13(14(19)8-12...,ZSINOYZZEZDTLK-UHFFFAOYSA-N,,...,0.0,3.0,39.94,0.71,P11511,CYP19A1,Homo sapiens,A cytochrome P450 monooxygenase that catalyzes...,Aromatase excess syndrome: | Aromatase defici...,Endoplasmic reticulum membrane; Microsome memb...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1463272,Neuronal acetylcholine receptor; alpha3/alpha6...,CHEMBL2109233,PROTEIN COMPLEX,CHEMBL3329544,Emax,64.00,C[C@H]1C[C@@H]1C(=O)N1CC2CC(C1)N2,InChI=1S/C10H16N2O/c1-6-2-9(6)10(13)12-4-7-3-8...,INYNWEUTASDACG-KRTGUUSXSA-N,,...,1.0,1.0,32.34,0.63,P17787,CHRNB2,Homo sapiens,"After binding acetylcholine, the AChR responds...","Epilepsy, nocturnal frontal lobe, 3:",Postsynaptic cell membrane; Cell membrane
1463273,Neuronal acetylcholine receptor; alpha3/alpha6...,CHEMBL2109233,PROTEIN COMPLEX,CHEMBL3329545,Emax,73.00,CC1(C)CC1C(=O)N1CC2CC(C1)N2,InChI=1S/C11H18N2O/c1-11(2)4-9(11)10(14)13-5-7...,LJEPPQATHWQMBV-UHFFFAOYSA-N,,...,1.0,1.0,32.34,0.66,P17787,CHRNB2,Homo sapiens,"After binding acetylcholine, the AChR responds...","Epilepsy, nocturnal frontal lobe, 3:",Postsynaptic cell membrane; Cell membrane
1463274,Neuronal acetylcholine receptor; alpha3/alpha6...,CHEMBL2109233,PROTEIN COMPLEX,CHEMBL3329546,Emax,110.00,O=C(C1CC1(F)F)N1CC2CC(C1)N2,InChI=1S/C9H12F2N2O/c10-9(11)2-7(9)8(14)13-3-5...,VXQLPXYMYHKASH-UHFFFAOYSA-N,,...,1.0,1.0,32.34,0.66,P17787,CHRNB2,Homo sapiens,"After binding acetylcholine, the AChR responds...","Epilepsy, nocturnal frontal lobe, 3:",Postsynaptic cell membrane; Cell membrane
1463275,Neuronal acetylcholine receptor; alpha3/alpha6...,CHEMBL2109233,PROTEIN COMPLEX,CHEMBL4869892,IC50,135.00,CC[C@H](C)[C@@H]1NC(=O)[C@H](CCC(=O)O)NC(=O)[C...,InChI=1S/C68H103N23O21S4/c1-7-32(6)53-66(110)8...,ATDHPNPDBDEQAK-BCTYQQKGSA-N,,...,,,,,P17787,CHRNB2,Homo sapiens,"After binding acetylcholine, the AChR responds...","Epilepsy, nocturnal frontal lobe, 3:",Postsynaptic cell membrane; Cell membrane


In [5]:
df.columns

Index(['Target Name', 'Target ChEMBL ID', 'Target Type', 'Molecule ChEMBL ID',
       'Activity Type', 'Activity Value', 'Canonical SMILES', 'Standard InChI',
       'Standard InChI Key', 'Max Phase', 'Molecular Formula',
       'Molecular Weight', 'AlogP', 'Aromatic Rings', 'HBA', 'HBD',
       'Rotatable Bonds', 'Polar Surface Area (PSA)', 'QED Weighted',
       'UniProt Accession', 'UniProt GeneNames', 'UniProt Organism',
       'UniProt Function', 'UniProt Disease', 'UniProt SubcellularLocation'],
      dtype='object')

In [6]:
set(list(df["Activity Type"]))

{'% Activity remaining',
 '% Control',
 '% Ctrl',
 '% Enzyme Activity (relative to DMSO controls, average n=2)',
 '% Inhibition of Control Agonist Response (Mean n=2)',
 '% Inhibition of Control Specific Binding',
 '% Inhibition of Control Specific Binding (Mean n=2)',
 '% Inhibition of Control Values',
 '% Inhibition of Control Values (Mean n=2)',
 '% Residual activity with Skepinone-L',
 '% binding',
 '% inhibition',
 '% maximum response',
 '% of Control Agonist Response (Mean n=2)',
 '% of activity',
 '% of control',
 '% of effect',
 '% of engagement',
 '% of inhibition',
 '% of inhibition or stimulation',
 '% of residual activity',
 '% residual kinase activity',
 '%Bound_Albumin',
 '%Inhib (Mean)',
 '%Max (Mean)',
 '%max',
 '-Delta G',
 '-Log ED50',
 '-Log K0.5',
 '-Log KB',
 '-Log KD',
 '-TdeltaS',
 '-log(RatioIC50)',
 '-logKa',
 '1/kmax',
 '1/kobsd-ka',
 'A/Ad',
 'A1 selectivity',
 'A2',
 'A2 selectivity',
 'A50',
 'AC50',
 'ACAT activity',
 'ACT',
 'AE activity',
 'AIDH activity