In [1]:
import BioSimSpace as BSS
import pandas as pd
from scipy import stats
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import minmax_scale
import glob
import csv
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import numpy as np
import os
import shutil
from functools import reduce
from rdkit import Chem

Process Process-147:
Traceback (most recent call last):
  File "/home/jscheen/miniconda3/envs/data_driven_fep_rel/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/jscheen/miniconda3/envs/data_driven_fep_rel/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/jscheen/miniconda3/envs/data_driven_fep_rel/lib/python3.7/site-packages/BioSimSpace/Align/_lomap/dbmol.py", line 567, in compute_mtx
    MC = mcs.MCS(moli, molj, options=self.options)
  File "/home/jscheen/miniconda3/envs/data_driven_fep_rel/lib/python3.7/site-packages/BioSimSpace/Align/_lomap/mcs.py", line 559, in __init__
    matchChiralTag=False)
KeyboardInterrupt
Process Process-150:
Traceback (most recent call last):
  File "/home/jscheen/miniconda3/envs/data_driven_fep_rel/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/jscheen/miniconda3/envs/data_driven_fep_rel/lib

In [2]:
def generateLinksFiles(preds_path):
    """Given a csv file with SEM predictions, write out links files for LOMAP.
    Because SEMs are the inverse of LOMAP (i.e. LOMAP-score 0.0 is poor, but SEM 0.0 is good), 
    we need to invert the predicted SEM values. Additionally, compress them to fall in the range 0-1 
    as is the case with LOMAP-scores."""
    
    preds_df = pd.read_csv(preds_path)
    pert_names = preds_df["pert_name"].values
    
    # for these, take inverse and scale to 0-1.
    pred_sems = minmax_scale(1 / preds_df["pred_sem_mean"].values, feature_range=(0,1))
    
    random_sems = preds_df["random_sem"].values
    return pert_names, pred_sems, random_sems
        
def writeLinksFile(pert_names, values, filename):
    with open(filename, "w") as writefile:
        writer = csv.writer(writefile, delimiter =" ")
        for pert_name, value in zip(pert_names, values):
            writer.writerow([pert_name, value])
        

In [26]:
def runLOMAP(tgt, plot_network=False, links_file=False):
    path_to_ligands = f"/home/jscheen/projects/FEPSPACE/fep_ref_ligands/{tgt}"
    ligand_files = glob.glob(f"{path_to_ligands}/*.sdf")

    ligands = []
    ligand_names = []

    for filepath in ligand_files:
        # for TNKS2, exclude +1 ligands.
        if tgt == "tnks2":
            if "8" in filepath.split("/")[-1]:
                continue # lignames with 8 in name are +1 ligands; exclude these.               
        
        
        # append the molecule object to a list.
        ligands.append(Chem.SDMolSupplier(filepath)[0])

        # append the molecule name to another list so that we can use the name of each molecule in our workflow.
        ligand_names.append(filepath.split("/")[-1].replace(".sdf",""))
    
    # now run LOMAP. Even if we have a links_file, we need to run a 'vanilla' LOMAP first to get the work dir.
    if links_file:
        tranformations, lomap_scores = BSS.Align.generateNetwork(ligands, plot_network=plot_network, names=ligand_names,
                                                            work_dir="tmp/lomap_workdir")       
    else:
        tranformations, lomap_scores = BSS.Align.generateNetwork(ligands, plot_network=plot_network, names=ligand_names,
                                                            work_dir="tmp/lomap_workdir")

    if links_file:
        links_file = f"tmp/lomap_ml_links_file_{tgt}.csv"
        # bit of a workaround, but we have to find the mol2 file names that LOMAP uses internally.  
        # write out a second linksfile that doesn't have the tilde to denote the perturbations. Also
        # use glob in the pre-generated LOMAP work folder to find what we should call our ligands (i.e.
        # refer to the internal LOMAP file name).
        links_file_contents = pd.read_csv(links_file, sep=" ", header=None)
        lomap_internal_files = glob.glob("tmp/lomap_workdir/inputs/*.sdf")
        internal_links_file_path = links_file.replace(".csv", "_internal.csv")

        with open(internal_links_file_path, "w") as writefile:
            writer = csv.writer(writefile, delimiter =" ")
            lig1, lig2 = None, None
            for pert_name, value in zip(links_file_contents[0].values, links_file_contents[1].values):
                # find the internal path.
                for lig in lomap_internal_files:
                    if pert_name.split("~")[0] in lig:
                        lig1 = lig.split("/")[-1]
                    elif pert_name.split("~")[1] in lig:
                        lig2 = lig.split("/")[-1]
                if lig1 and lig2:
                    writer.writerow([lig1, lig2, value])



        # now run LOMAP with the pre-specified edge scorings.
        tranformations, lomap_scores = BSS.Align.generateNetwork(ligands, plot_network=plot_network, names=ligand_names,
                                                                links_file=internal_links_file_path)    

    pert_network_dict = {}
    transformations_named = [(ligand_names[transf[0]], ligand_names[transf[1]]) for transf in tranformations]
    for transf, score in zip(transformations_named, lomap_scores):
        transf_tilde = "~".join(transf)
        pert_network_dict[transf_tilde] = score

    return tranformations, lomap_scores, pert_network_dict, ligand_names

In [27]:
def compareNetworks(tgt_to_do, print_overlapping_edges=False, plot_network=False):
    # clear all links files from ./tmp.
    [os.remove(linkspath) for linkspath in glob.glob("tmp/lomap*.csv")]
    shutil.rmtree("tmp/lomap_workdir", ignore_errors=True)
    
    
    # get the links files for this target.
    sem_preds_files = []
    
    for sem_preds_file in glob.glob(f"output/series_predictions/{tgt_to_do}_*csv"):
        if not "perts" in sem_preds_file and not "_networks" in sem_preds_file:
            sem_preds_files.append(sem_preds_file)
    pred_sems_coll = []
    for sem_preds_file in sem_preds_files:
        pert_names, pred_sems, _ = generateLinksFiles(sem_preds_file)
        pred_sems_coll.append(pred_sems)
        
    
    # compute mean pred_sems.
    pred_sems = np.mean(pred_sems_coll, axis=0)
                
                
    # generate networks.      
    writeLinksFile(pert_names, pred_sems, f"tmp/lomap_ml_links_file_{tgt_to_do}.csv")


    transformations_lomap, _, _, lig_names = runLOMAP(tgt_to_do, plot_network, links_file=False)
    transformations_fepnn, _, _, lig_names = runLOMAP(tgt_to_do, plot_network, links_file=True)
    
    if print_overlapping_edges:
        print("Overlapping edges:")
        for transf in transformations_lomap:
            inv_transf = (transf[1], transf[0])
            if transf in transformations_fepnn or inv_transf in transformations_fepnn:
                print(f"{lig_names[transf[0]]}~{lig_names[transf[1]]}")
    
    return transformations_lomap, transformations_fepnn, len(lig_names)


In [28]:
def computeNetworkOverlap(transformations_lomap, transformations_fepnn):
    overlap = 0

    for edge_lomap in transformations_lomap:
        
        inv_edge = (edge_lomap[1], edge_lomap[0])
        if edge_lomap in transformations_fepnn:

            overlap += 1
        elif inv_edge in transformations_fepnn:
            overlap += 1


    if overlap == 0:
        perc_overlap = 0
    else:
        perc_overlap_lomap = int(overlap/len(transformations_lomap)*100)
        perc_overlap_fepnn = int(overlap/len(transformations_fepnn)*100)
        perc_overlap = np.mean([perc_overlap_lomap, perc_overlap_fepnn])

    return len(transformations_lomap), int(perc_overlap), len(transformations_fepnn)


In [32]:
tgts = []
for tgt in glob.glob("/home/jscheen/projects/FEPSPACE/fep_ref_ligands/*"):
    exclude = ["protein_files", "readme.txt", "fep_benchmarking_perts.csv", "cats"]
    tgt = tgt.split("/")[-1]
    if not tgt in exclude:
        tgts.append(tgt)

        
# start with an empty df.
df = pd.DataFrame(columns=["Series size (n)", "LOMAP network (n)", "Network overlap (%)", "FEPNN network (n)"])

# populate the df with data.
for tgt in tgts:
    print(tgt)
    
    transformations_lomap, transformations_fepnn, num_ligs = compareNetworks(tgt, print_overlapping_edges=True)
    num_lomap, overlap, num_fepnn = computeNetworkOverlap(transformations_lomap, transformations_fepnn)
    row_df = pd.DataFrame([[num_ligs, num_lomap, overlap, num_fepnn]], columns=["Series size (n)", "LOMAP network (n)", "Network overlap (%)", "FEPNN network (n)"], index=[tgt])
    df = df.append(row_df)


df = df.sort_values(by="Series size (n)", ascending=False)
    
df.to_csv("output/series_predictions/network_overlaps.csv")

print(df)

cdk8
Overlapping edges:
32~33
32~28
32~20
32~26
32~29
44~18
44~45
35~36
35~34
14~16
30~24
30~38
36~34
28~23
20~19
16~15
26~21
26~19
43~42
37~31
38~31
pfkfb3
Overlapping edges:
33~24
33~19
35~65
35~55
35~52
30~23
48~47
48~46
41~20
36~29
26~37
26~34
43~38
43~42
37~34
39~38
58~70
58~60
47~46
31~23
70~60
shp2
Overlapping edges:
out1~out3
out1~out11
out1~11
out1~10
out1~out14
out1~out13
out1~out8
out1~out25
out1~out19
out16~out2
out11~6
out20~3
7~6
6~SHP836-2
out24~out26
4~SHP836-2
galectin
Overlapping edges:
lig_04_ligNMe2~lig_03_ligNHMe
lig_02_ligOMe~lig_07_ligOH
lig_06_ligPyr~lig_07_ligOH
lig_03_ligNHMe~lig_08_ligNH2
eg5
Overlapping edges:
CHEMBL1086409~CHEMBL1083517
CHEMBL1096003~CHEMBL1084677
CHEMBL1086410~CHEMBL1083836
CHEMBL1078691~CHEMBL1089056
CHEMBL1078691~CHEMBL1078774
CHEMBL1078691~CHEMBL1093088
CHEMBL1088740~CHEMBL1089056
CHEMBL1088740~CHEMBL1089393
CHEMBL1089056~CHEMBL1089393
CHEMBL1089056~CHEMBL1078774
CHEMBL1089056~CHEMBL1093088
hif2a
Overlapping edges:
266~25
35~41
30~31
25

In [25]:
# what's with EG5?
pd.read_csv("tmp/lomap_ml_links_file_eg5_internal.csv")

Unnamed: 0,000_CHEMBL1084143.sdf 001_CHEMBL1086409.sdf 0.0
0,001_CHEMBL1086409.sdf 000_CHEMBL1084143.sdf 0.0
1,000_CHEMBL1084143.sdf 002_CHEMBL1096003.sdf 0.0
2,002_CHEMBL1096003.sdf 000_CHEMBL1084143.sdf 0.0
3,000_CHEMBL1084143.sdf 003_CHEMBL1077204.sdf 0.0
4,003_CHEMBL1077204.sdf 000_CHEMBL1084143.sdf 0.0
...,...
750,026_CHEMBL1084935.sdf 025_CHEMBL1084676.sdf 0.0
751,025_CHEMBL1084676.sdf 027_CHEMBL1093088.sdf 0.0
752,027_CHEMBL1093088.sdf 025_CHEMBL1084676.sdf 0.0
753,026_CHEMBL1084935.sdf 027_CHEMBL1093088.sdf 0.0


In [None]:
"""FEPNN predicted SEM is 0.0 in most cases. Grafting failed for these perts due to overly complex perturbations. 
LOMAP algorithm excludes ligands from the network if all adjacent possible edges are 0.0, which is what's happening
in this case. Excluding EG5 from the comparison table."""