In [1]:
!pip install psycopg2-binary pandas tqdm



In [4]:
from AlzheimerTargets import targets
from database_connection_info import user, password, host, port

In [2]:
# Display the list of target names imported from AlzheimerTargets.
targets

['Cytochrome P450 19A1',
 'Muscarinic acetylcholine receptor M2',
 'Muscarinic acetylcholine receptors; M2 & M3',
 'Dopamine D3 receptor',
 'Dopamine D4 receptor',
 'Neprilysin',
 'Plasminogen',
 'Muscarinic acetylcholine receptor M1',
 'Angiotensin II receptor',
 'Serotonin 1a (5-HT1a) receptor',
 'Matrix metalloproteinase 3',
 'Matrix metalloproteinase-1',
 'Matrix metalloproteinase 9',
 'Matrix metalloproteinase-2',
 'ADAM17',
 'Peroxisome proliferator-activated receptor gamma',
 'Receptor protein-tyrosine kinase erbB-2',
 'Tyrosine-protein kinase LCK',
 'Angiotensin II type 2 (AT-2) receptor',
 'Serum albumin',
 'Adenosine A2a receptor',
 'Intercellular adhesion molecule-1',
 'Androgen Receptor',
 'Estrogen receptor',
 'Purinergic receptor P2Y2',
 'Excitatory amino acid transporter 1',
 'Excitatory amino acid transporter 2',
 'Excitatory amino acid transporter 3',
 'Caspase-1',
 'Caspase-3',
 'Caspase-6',
 'Caspase-8',
 'Renin',
 'Sigma opioid receptor',
 'Nitric oxide synthase, in

In [5]:
import psycopg2
import pandas as pd
from tqdm import tqdm 

def get_chembl_ids_and_fetch_data(target_names):
    """Look up ChEMBL targets by preferred name and collect their activities.

    Step 1 searches ``target_dictionary`` with ``pref_name ILIKE <name>`` for
    every entry of ``target_names`` (case-insensitive; a name is used as an
    ILIKE pattern, so ``%`` / ``_`` inside it would act as wildcards).
    Step 2 streams, for every matched target, all activities that have a
    non-NULL ``standard_value`` joined with molecule, structure and property
    tables, and flattens each row into one record.

    Args:
        target_names: Iterable of target preferred names to look up.

    Returns:
        pandas.DataFrame with one row per activity record. The frame always
        carries the 20 columns listed in ``columns`` below, even when no rows
        were found.

    Raises:
        Any exception raised while connecting or during the name lookup
        propagates to the caller. Exceptions raised while fetching a single
        target are printed and that target's transaction is rolled back; rows
        already collected for it are kept and the remaining targets are still
        processed.

    Notes:
        Connection settings come from the module-level names ``user``,
        ``password``, ``host`` and ``port``. The connection is closed in a
        ``finally`` block so that an interrupted run (e.g. KeyboardInterrupt)
        does not leave a session open on the server.
    """
    # Output column order; also used so an empty result keeps its schema.
    columns = [
        "Target Name",
        "Target ChEMBL ID",
        "Target Type",
        "Organism",
        "Molecule ChEMBL ID",
        "Activity Type",
        "Activity Value",
        "Canonical SMILES",
        "Standard InChI",
        "Standard InChI Key",
        "Max Phase",
        "Molecular Formula",
        "Molecular Weight",
        "AlogP",
        "Aromatic Rings",
        "HBA",
        "HBD",
        "Rotatable Bonds",
        "Polar Surface Area (PSA)",
        "QED Weighted",
    ]

    # Single query to get all related data using joins. It does not depend on
    # the target, so it is built once instead of once per loop iteration.
    query = """
                    SELECT 
                        a.standard_value,
                        a.standard_type AS activity_type,
                        m.chembl_id AS molecule_chembl_id,
                        m.max_phase,
                        m.molecule_type,
                        cs.canonical_smiles,
                        cs.standard_inchi,
                        cs.standard_inchi_key,
                        cp.full_molformula,
                        cp.full_mwt,
                        cp.alogp,
                        cp.aromatic_rings,
                        cp.hba,
                        cp.hbd,
                        cp.rtb,
                        cp.psa,
                        cp.qed_weighted
                    FROM activities a
                    JOIN assays ass ON a.assay_id = ass.assay_id
                    JOIN target_dictionary td ON ass.tid = td.tid
                    JOIN molecule_dictionary m ON a.molregno = m.molregno
                    JOIN compound_structures cs ON m.molregno = cs.molregno
                    JOIN compound_properties cp ON m.molregno = cp.molregno
                    WHERE td.chembl_id = %s
                    AND a.standard_value IS NOT NULL
                """

    # Connect to PostgreSQL
    conn = psycopg2.connect(
        dbname="chembl_35",
        user=user,
        password=password,
        host=host,
        port=port
    )

    all_data = []
    try:
        chembl_ids = {}

        # Step 1: Get ChEMBL IDs for target names (using SQL)
        with conn.cursor() as cursor:
            for target_name in target_names:
                cursor.execute("""
                SELECT chembl_id, target_type, organism
                FROM target_dictionary
                WHERE pref_name ILIKE %s
            """, (target_name,))

                results = cursor.fetchall()
                if results:
                    chembl_ids[target_name] = [{
                        "target_chembl_id": row[0],
                        "target_type": row[1],
                        "organism": row[2]
                    } for row in results]
                else:
                    print(f"No exact match found for target: {target_name}")

        # Step 2: Fetch data using optimized SQL joins
        for target_name, target_info_list in chembl_ids.items():
            for target_info in target_info_list:
                target_id = target_info["target_chembl_id"]
                print(f"Fetching data for target {target_id} ({target_name})...")

                try:
                    # A named cursor is server-side: rows are streamed in
                    # batches instead of being loaded into memory at once.
                    with conn.cursor(name="fetch_activities") as cursor:
                        cursor.itersize = 1000  # Batch size for server-side cursor
                        cursor.execute(query, (target_id,))

                        for row in tqdm(cursor, desc=f"Processing {target_id}"):
                            # Indices follow the SELECT column order above;
                            # row[4] (molecule_type) is selected but not kept.
                            all_data.append({
                                "Target Name": target_name,
                                "Target ChEMBL ID": target_id,
                                "Target Type": target_info["target_type"],
                                "Organism": target_info["organism"],
                                "Molecule ChEMBL ID": row[2],
                                "Activity Type": row[1],
                                "Activity Value": row[0],
                                "Canonical SMILES": row[5],
                                "Standard InChI": row[6],
                                "Standard InChI Key": row[7],
                                "Max Phase": row[3],
                                "Molecular Formula": row[8],
                                "Molecular Weight": row[9],
                                "AlogP": row[10],
                                "Aromatic Rings": row[11],
                                "HBA": row[12],
                                "HBD": row[13],
                                "Rotatable Bonds": row[14],
                                "Polar Surface Area (PSA)": row[15],
                                "QED Weighted": row[16]
                            })

                except Exception as e:
                    # Best effort: report, reset the transaction, go on with
                    # the next target.
                    print(f"Error retrieving data for target {target_id}: {e}")
                    conn.rollback()
                else:
                    conn.commit()
    finally:
        # Always release the connection, including on KeyboardInterrupt,
        # which `except Exception` above does not catch.
        conn.close()

    return pd.DataFrame(all_data, columns=columns)


# Run the extraction for every imported target name. The guard is true when the
# cell is executed in the notebook kernel (the captured output below shows the
# fetch starting), and the result is bound to the module-level name `df`.
if __name__ == "__main__":
    target_names = targets
    # Long-running: one streamed query per matched ChEMBL target.
    df = get_chembl_ids_and_fetch_data(target_names)

No exact match found for target: Neuronal acetylcholine receptor subunit alpha-2/beta-2
Fetching data for target CHEMBL1978 (Cytochrome P450 19A1)...


Processing CHEMBL1978: 5000it [00:01, 4486.88it/s]


KeyboardInterrupt: 

In [4]:
# Display the activity table (the rendered output elides the middle rows).
# NOTE(review): the output shows an 'Unnamed: 0' column that
# get_chembl_ids_and_fetch_data never produces, and the fetch cell above was
# interrupted -- `df` was presumably re-read from a saved CSV in a cell that is
# not part of this notebook as shown. Confirm and add that load step so the
# notebook survives Restart Kernel -> Run All.
df

Unnamed: 0,Target Name,Target ChEMBL ID,Target Type,Organism,Molecule ChEMBL ID,Activity Type,Activity Value,Canonical SMILES,Standard InChI,Standard InChI Key,Max Phase,Molecular Formula,Molecular Weight,AlogP,Aromatic Rings,HBA,HBD,Rotatable Bonds,Polar Surface Area (PSA),QED Weighted
0,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,Homo sapiens,CHEMBL63297,IC50,190.0,Fc1ccc(C(C2Cc3ccccc3O2)n2cncn2)cc1,InChI=1S/C17H14FN3O/c18-14-7-5-12(6-8-14)17(21...,RVFPOHIDBASNMF-UHFFFAOYSA-N,,C17H14FN3O,295.32,3.01,3.0,4.0,0.0,3.0,39.94,0.75
1,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,Homo sapiens,CHEMBL65328,IC50,590.0,Cc1ccc(C(C2Cc3ccccc3O2)n2cncn2)cc1,InChI=1S/C18H17N3O/c1-13-6-8-14(9-7-13)18(21-1...,OZEGINYEQGNONL-UHFFFAOYSA-N,,C18H17N3O,291.35,3.18,3.0,4.0,0.0,3.0,39.94,0.74
2,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,Homo sapiens,CHEMBL488,IC50,18500.0,CCC1(c2ccc(N)cc2)CCC(=O)NC1=O,InChI=1S/C13H16N2O2/c1-2-13(8-7-11(16)15-12(13...,ROBVIMPUHSLWNV-UHFFFAOYSA-N,4.0,C13H16N2O2,232.28,1.35,1.0,3.0,2.0,2.0,72.19,0.60
3,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,Homo sapiens,CHEMBL291646,IC50,200.0,Clc1ccc(C(C2Cc3ccccc3O2)n2cncn2)cc1,InChI=1S/C17H14ClN3O/c18-14-7-5-12(6-8-14)17(2...,UVOSSULRORMWHE-UHFFFAOYSA-N,,C17H14ClN3O,311.77,3.52,3.0,4.0,0.0,3.0,39.94,0.74
4,Cytochrome P450 19A1,CHEMBL1978,SINGLE PROTEIN,Homo sapiens,CHEMBL304788,IC50,880.0,Clc1ccc(C(C2Cc3ccccc3O2)n2cncn2)c(Cl)c1,InChI=1S/C17H13Cl2N3O/c18-12-5-6-13(14(19)8-12...,ZSINOYZZEZDTLK-UHFFFAOYSA-N,,C17H13Cl2N3O,346.22,4.18,3.0,4.0,0.0,3.0,39.94,0.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1448774,Neuronal acetylcholine receptor; alpha3/alpha6...,CHEMBL2109233,PROTEIN COMPLEX,Homo sapiens,CHEMBL3329544,Emax,64.0,C[C@H]1C[C@@H]1C(=O)N1CC2CC(C1)N2,InChI=1S/C10H16N2O/c1-6-2-9(6)10(13)12-4-7-3-8...,INYNWEUTASDACG-KRTGUUSXSA-N,,C10H16N2O,180.25,0.22,0.0,2.0,1.0,1.0,32.34,0.63
1448775,Neuronal acetylcholine receptor; alpha3/alpha6...,CHEMBL2109233,PROTEIN COMPLEX,Homo sapiens,CHEMBL3329545,Emax,73.0,CC1(C)CC1C(=O)N1CC2CC(C1)N2,InChI=1S/C11H18N2O/c1-11(2)4-9(11)10(14)13-5-7...,LJEPPQATHWQMBV-UHFFFAOYSA-N,,C11H18N2O,194.28,0.61,0.0,2.0,1.0,1.0,32.34,0.66
1448776,Neuronal acetylcholine receptor; alpha3/alpha6...,CHEMBL2109233,PROTEIN COMPLEX,Homo sapiens,CHEMBL3329546,Emax,110.0,O=C(C1CC1(F)F)N1CC2CC(C1)N2,InChI=1S/C9H12F2N2O/c10-9(11)2-7(9)8(14)13-3-5...,VXQLPXYMYHKASH-UHFFFAOYSA-N,,C9H12F2N2O,202.20,0.21,0.0,2.0,1.0,1.0,32.34,0.66
1448777,Neuronal acetylcholine receptor; alpha3/alpha6...,CHEMBL2109233,PROTEIN COMPLEX,Homo sapiens,CHEMBL4869892,IC50,135.0,CC[C@H](C)[C@@H]1NC(=O)[C@H](CCC(=O)O)NC(=O)[C...,InChI=1S/C68H103N23O21S4/c1-7-32(6)53-66(110)8...,ATDHPNPDBDEQAK-BCTYQQKGSA-N,,C68H103N23O21S4,1706.98,,,,,,,


In [5]:
# Number of distinct target ChEMBL IDs present in the table.
len(set(df["Target ChEMBL ID"]))

782

In [6]:
# Distinct activity type labels present in the table.
set(df["Activity Type"])

{'% Activity remaining',
 '% Control',
 '% Ctrl',
 '% Enzyme Activity (relative to DMSO controls, average n=2)',
 '% Inhibition of Control Agonist Response (Mean n=2)',
 '% Inhibition of Control Specific Binding',
 '% Inhibition of Control Specific Binding (Mean n=2)',
 '% Inhibition of Control Values',
 '% Inhibition of Control Values (Mean n=2)',
 '% Residual activity with Skepinone-L',
 '% binding',
 '% inhibition',
 '% maximum response',
 '% of Control Agonist Response (Mean n=2)',
 '% of activity',
 '% of control',
 '% of effect',
 '% of engagement',
 '% of inhibition',
 '% of inhibition or stimulation',
 '% of residual activity',
 '% residual kinase activity',
 '%Bound_Albumin',
 '%Inhib (Mean)',
 '%Max (Mean)',
 '%max',
 '-Delta G',
 '-Log ED50',
 '-Log K0.5',
 '-Log KB',
 '-Log KD',
 '-TdeltaS',
 '-log(RatioIC50)',
 '-logKa',
 '1/kmax',
 '1/kobsd-ka',
 'A/Ad',
 'A1 selectivity',
 'A2',
 'A2 selectivity',
 'A50',
 'AC50',
 'ACAT activity',
 'ACT',
 'AE activity',
 'AIDH activity

In [1]:
# Load the tokenizer and BERT encoder for the 'unikei/bert-base-smiles'
# checkpoint (the progress bars below show the files being downloaded).
from transformers import BertTokenizerFast, BertModel
checkpoint = 'unikei/bert-base-smiles'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

# Tokenize a single example SMILES string and run one forward pass.
example = 'O=C([C@@H](c1ccc(cc1)O)N)N[C@@H]1C(=O)N2[C@@H]1SC([C@@H]2C(=O)O)(C)C'
tokens = tokenizer(example, return_tensors='pt')  # 'pt' -> PyTorch tensors
# NOTE(review): the forward pass is not wrapped in torch.no_grad(); the output
# below carries a grad_fn, so autograd state is tracked. If this is only used
# to extract embeddings, consider disabling gradients -- confirm intent.
predictions = model(**tokens)

tokenizer_config.json:   0%|          | 0.00/315 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/306k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [2]:
# Inspect the raw model output (last_hidden_state and pooler_output tensors).
predictions

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-1.3821,  0.0593, -0.1914,  ..., -2.0613, -0.4603, -1.1894],
         [-0.9418, -0.1656,  0.2671,  ..., -0.9770, -0.0633, -1.9062],
         [ 1.1226, -0.4002,  0.3521,  ...,  0.1618, -0.1386,  1.8086],
         ...,
         [ 0.6186,  0.3017,  0.1241,  ..., -0.1342,  0.0974, -0.0602],
         [ 0.4130, -1.1119,  0.2806,  ...,  0.0944, -0.5532, -0.3501],
         [ 0.5375,  0.7749,  1.0559,  ..., -1.0184, -0.0626, -0.3657]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.7910,  0.9982,  0.5162,  0.5645,  0.9931, -1.0000, -0.9679,  0.9552,
         -0.9853,  0.9589, -0.9807,  0.9999,  0.5608, -0.9023,  0.3254,  0.9963,
         -0.9993, -0.9995, -0.9986,  0.9893, -0.9805, -0.9412,  1.0000,  0.9999,
         -0.9650,  0.9021, -0.9906,  0.9998,  0.9825, -0.9002, -0.0313,  0.9987,
         -0.9999,  0.9993, -0.9979, -0.6498, -0.9986,  0.9922, -1.0000,  0.6821,
          0.1280, -0.9983, -0.99