In [1]:
import json
import pandas as pd

In [2]:
js = json.load(open("/Mounts/rbg-storage1/datasets/Metabo/antibiotics/stokes2019_dataset.json", "rb"))

In [2]:
df = pd.read_json(open("/Mounts/rbg-storage1/datasets/Metabo/antibiotics/stokes2019_dataset.json", "rb"))

In [4]:
# df.to_csv("/Mounts/rbg-storage1/datasets/Metabo/antibiotics/stokes2019_dataset.csv")

In [3]:
df.head()

Unnamed: 0,mean_inhibition,smiles,name,activity
0,0.041572,Cc1cc(O)c(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(C...,CEFPIRAMIDE,Active
1,0.041876,CON=C1CN(c2nc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)CC...,GEMIFLOXACIN MESYLATE,Active
2,0.041916,CCC(C)CCCCC(=O)NC(CCN)C(=O)NC(C(=O)NC(CCN)C(=O...,POLYMYXIN B SULFATE,Active
3,0.041964,Cl.N=C(N)n1cccn1,PRAXADINE HYDROCHLORIDE,Active
4,0.042295,Cl.Cl.N=C(NCCCCCCNC(=N)NC(=N)Nc1ccc(Cl)cc1)NC(...,CHLORHEXIDINE DIHYDROCHLORIDE,Active


In [10]:
from multiprocessing import Pool
# from pathlib import Path
from typing import Optional

from chembl_webresource_client.new_client import new_client
import numpy as np
# from tap import Tap
from tqdm import tqdm

In [13]:
def get_chembl_id(smiles: str) -> Optional[str]:
    """Gets a ChEMBL ID for a SMILES."""
    try:
        records = new_client.molecule.filter(
            molecule_structures__canonical_smiles__flexmatch=smiles
        ).only(['molecule_chembl_id'])

        if len(records) > 0:
            return records[0]['molecule_chembl_id']
    except Exception as e:
        print(f'Exception for {smiles}')
        print(e)

    return None

def get_chembl_ids(data):
    """Gets ChEMBL IDs from SMILES."""
    # Get ChEMBL IDs
    smiles_list = sorted(set(data[SMILES_COLUMN].dropna().unique()))
    smiles_to_chembl_ids = {}
    with Pool() as pool:
        for smiles, chembl_id in tqdm(zip(smiles_list, pool.imap(get_chembl_id, smiles_list)), total=len(smiles_list)):
            if chembl_id is not None:
                smiles_to_chembl_ids[smiles] = chembl_id

    # Add ChEMBL IDs to data
    data[CHEMBL_COMPOUND_ID_COLUMN] = [smiles_to_chembl_ids.get(smiles, np.nan) for smiles in data[SMILES_COLUMN]]

    print(f'Number of unique ChEMBL IDs = '
          f'{len({chembl_id for chembl_ids in smiles_to_chembl_ids.values() for chembl_id in chembl_ids}):,}')

    return data

In [14]:
CHEMBL_COMPOUND_ID_COLUMN = "chembl_ids"
SMILES_COLUMN = "smiles"

In [16]:
data = get_chembl_ids(df)

 19%|████████████████████████████████▎                                                                                                                                           | 438/2335 [00:32<04:35,  6.89it/s]

Exception for CC1(C)CCCC2(C)C1CCC1(C)OCCC12
database is locked


 44%|███████████████████████████████████████████████████████████████████████████▍                                                                                               | 1030/2335 [01:06<01:28, 14.83it/s]

Exception for CN1CCC(=C2c3ccccc3CCc3sccc32)CC1.O=C(O)CC(O)C(=O)O
database is locked


 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 2056/2335 [02:08<00:24, 11.22it/s]

Exception for O=C(OC1Cc2c(O)cc(O)cc2OC1c1cc(O)c(O)c(O)c1-c1c(C2Oc3cc(O)cc(O)c3CC2OC(=O)c2cc(O)c(O)c(O)c2)cc(O)c(O)c1O)c1cc(O)c(O)c(O)c1
database is locked
Exception for O=C(O)COc1nn(Cc2ccccc2)c2ccccc12
database is locked


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2335/2335 [02:19<00:00, 16.73it/s]

Number of unique ChEMBL IDs = 16





In [29]:
len(set(df["smiles"].values))

2335

In [27]:
len(set([i for i in data['chembl_ids'].values if i is not np.nan]))

1621

In [30]:
data = df

In [31]:
"""Gets ChEMBL and UniProt targets from ChEMBL compound IDs."""
# Load data
# data = pd.read_csv(args.data_path)

'Gets ChEMBL and UniProt targets from ChEMBL compound IDs.'

In [32]:
# Drop data without ChEMBL compound ID or SMILES
data.dropna(subset=[CHEMBL_COMPOUND_ID_COLUMN, SMILES_COLUMN], inplace=True)

# Sort data by SMILES for canonical selection between SMILES with same ChEMBL ID
data.sort_values(by=SMILES_COLUMN, inplace=True)

# Map ChEMBL compound ID to SMILES
# NOTE: By default, uses the last SMILES that matches the ChEMBL compound ID
chembl_compound_id_to_smiles = dict(zip(data[CHEMBL_COMPOUND_ID_COLUMN], data[SMILES_COLUMN]))

In [34]:
from collections import defaultdict

In [36]:
from tqdm import trange

In [None]:
# Get ChEMBL targets
chembl_compound_ids = sorted(chembl_compound_id_to_smiles)
chembl_compound_id_to_chembl_target_ids = defaultdict(set)
chunk_size = 50
for i in trange(0, len(chembl_compound_ids), chunk_size):
    # Use activities to get target from compound
    activities = new_client.activity.filter(
        molecule_chembl_id__in=chembl_compound_ids[i:i + chunk_size]
    ).only(['molecule_chembl_id', 'target_chembl_id'])

    # Extract target ChEMBL IDs from activities:
    for activity in tqdm(activities):
        chembl_compound_id_to_chembl_target_ids[activity['molecule_chembl_id']].add(activity['target_chembl_id'])

  0%|                                                                                                                                                                                        | 0/33 [00:00<?, ?it/s]
  0%|                                                                                                                                                                                     | 0/32991 [00:00<?, ?it/s][A
  0%|                                                                                                                                                                            | 21/32991 [00:01<33:59, 16.16it/s][A
  0%|▏                                                                                                                                                                           | 41/32991 [00:01<24:16, 22.62it/s][A
  0%|▎                                                                                                                                     

In [None]:
chembl_compound_id_to_chembl_target_ids = dict(chembl_compound_id_to_chembl_target_ids)

In [None]:
chembl_compound_id_to_chembl_target_ids

In [None]:
# Create DataFrame with ChEMBL targets
chembl_target_ids = sorted(
    chembl_target_id
    for chembl_target_ids in chembl_compound_id_to_chembl_target_ids.values()
    for chembl_target_id in chembl_target_ids
)
chembl_target_data = pd.DataFrame(data={
    SMILES_COLUMN: [chembl_compound_id_to_smiles[chembl_compound_id] for chembl_compound_id in chembl_compound_ids],
    **{
        chembl_target_id: [
            1 if chembl_target_id in chembl_compound_id_to_chembl_target_ids[chembl_compound_id] else 0
            for chembl_compound_id in chembl_compound_ids
        ]
        for chembl_target_id in chembl_target_ids
    }
})

In [None]:
# Save ChEMBL target data
chembl_target_data.to_csv(args.chembl_save_path, index=False)

In [None]:
# Get UniProt targets
chembl_compound_id_to_uniprot_target_ids = defaultdict(set)
for chembl_compound_id, chembl_target_ids in tqdm(chembl_compound_id_to_chembl_target_ids.items()):
    chembl_target_ids = sorted(chembl_target_ids)

    for i in trange(0, len(chembl_target_ids), args.chunk_size):
        targets = new_client.target.filter(
            target_chembl_id__in=chembl_target_ids[i:i + args.chunk_size]
        ).only(['target_components'])
        chembl_compound_id_to_uniprot_target_ids[chembl_compound_id] |= {
            component['accession'] for target in targets for component in target['target_components']
        }

chembl_compound_id_to_uniprot_target_ids = dict(chembl_compound_id_to_uniprot_target_ids)

# Create DataFrame with UniProt targets
uniprot_target_ids = sorted(
    uniprot_target_id
    for uniprot_target_ids in chembl_compound_id_to_uniprot_target_ids.values()
    for uniprot_target_id in uniprot_target_ids)
uniprot_target_data = pd.DataFrame(data={
    SMILES_COLUMN: [chembl_compound_id_to_smiles[chembl_compound_id] for chembl_compound_id in chembl_compound_ids],
    **{
        uniprot_target_id: [
            1 if uniprot_target_id in chembl_compound_id_to_uniprot_target_ids[chembl_compound_id] else 0
            for chembl_compound_id in chembl_compound_ids
        ]
        for uniprot_target_id in uniprot_target_ids
    }
})

# Save UniProt target data
uniprot_target_data.to_csv(args.uniprot_save_path, index=False)

## StringDB E coli model

In [27]:
import pandas as pd
import networkx as nx
import numpy as np
import pickle

In [2]:
prots = pd.read_csv("/Mounts/rbg-storage1/datasets/STRING/e_coli/511145.protein.info.v11.5.txt", sep = "\t")
prots["#string_protein_id"] = prots["#string_protein_id"].str.replace("511145\.", "", regex=True)
links = pd.read_csv("/Mounts/rbg-storage1/datasets/STRING/e_coli/511145.protein.links.v11.5.txt", sep = " ")
links["protein1"] = links["protein1"].str.replace("511145\.", "", regex=True)
links["protein2"] = links["protein2"].str.replace("511145\.", "", regex=True)

In [None]:
# Makes a graph of the PPI network and runs Floyd-Warshall algorithm (All pairs shortest path)
# Path weights are -log(weight) because we want the maximum multiplication along the paths
# idx2prot = {prot_id: idx for idx, prot_id in \
#             zip(range(len(set(prots["#string_protein_id"].values))), \
#                 set(prots["#string_protein_id"].values))}
edgelist = [i.tolist()[1:] for i in links.to_records()]
edgelist = [(p1, p2, {"weight": -np.log(w / 1000)}) for (p1, p2, w) in edgelist]
# edgelist = [(idx2prot[p1], idx2prot[p2], {"weight": w / 1000}) for (p1, p2, w) in edgelist]
G = nx.Graph(edgelist)
predecessor, shortest_paths = nx.floyd_warshall_predecessor_and_distance(G)
clean_dict = {}
for k1 in shortest_paths:
    for k2 in shortest_paths[k1]:
        clean_dict.setdefault(k1, {})
        clean_dict[k1][k2] = shortest_paths[k1][k2]
        
paths = {}
for k1 in shortest_paths:
    for k2 in shortest_paths[k1]:
        paths.setdefault(k1, {})
        paths[k1][k2] = nx.reconstruct_path(k1, k2, predecessors)
        
pickle.dump(dict(clean_dict), open("predecessors.pkl", "wb"))
pickle.dump(dict(paths), open("predecessors.pkl", "wb"))

In [47]:
shortest_paths_logs = pickle.load(open("updated_paths.pkl", "rb"))

In [None]:
shortest_paths

In [44]:
max_paths = {}
for k1 in shortest_paths_logs:
    for k2 in shortest_paths_logs[k1]:
        max_paths[k1] = {k2: np.exp(-1*shortest_paths_logs[k1][k2])}

In [46]:
max_paths["b0001"]

{'b4706': 0.6296337599976092}