In [None]:
import warnings
from math import log10

import numpy as np
import pandas as pd
import tmap as tm

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.AtomPairs import Pairs
from mxfp import mxfp
from map4 import MAP4Calculator
from mhfp.encoder import MHFPEncoder
from drfp import DrfpEncoder
from sklearn.exceptions import DataConversionWarning
from sklearn.neighbors import NearestNeighbors

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from faerun import Faerun

from tqdm import tqdm
tqdm.pandas()

#### Clean dataset (filter IC50 records in nM, strip salts, kekulize, remove duplicates)

In [None]:
def SaltRemover(smiles):
    """Return the longest '.'-separated fragment of a SMILES string.

    The string is split on '.', and the fragment with the most characters is
    returned. When several fragments share the maximum length, the first one
    wins. A string without '.' is returned unchanged.
    """
    fragments = smiles.split('.')
    longest = fragments[0]
    for fragment in fragments[1:]:
        # Strict comparison keeps the earliest fragment on ties.
        if len(fragment) > len(longest):
            longest = fragment
    return longest

In [None]:
# Load the ';'-separated activity export and keep only the columns used below.
df = pd.read_csv('data/erb2.csv', sep=';')
df = df[['Smiles', 'Molecule ChEMBL ID', 'Standard Value', 'Standard Type', 'Standard Relation', 'Standard Units']]
df = df.dropna(subset=['Smiles', 'Standard Value', 'Standard Type', 'Standard Relation', 'Standard Units'])

# Keep exact ("'='") IC50 measurements reported in nM.
df = df[df['Standard Units'] == 'nM']
df = df[df['Standard Type'] == 'IC50']
df = df[df['Standard Relation'] == "'='"]

# log10 is applied to 'Standard Value' in the next cell and raises on values
# <= 0, so such records are removed here instead of crashing later.
# .copy() makes the column assignments below act on an independent frame.
df = df[df['Standard Value'] > 0].copy()

# Strip counter-ions (keep the longest fragment) and parse the structures.
df['Smiles'] = df.Smiles.apply(SaltRemover)
df['ROMol'] = df.Smiles.apply(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None for SMILES it cannot parse; passing None to
# Chem.MolToSmiles would raise, so unparsable rows are dropped first.
df = df[df.ROMol.notna()].copy()

# Canonical, kekulized, non-isomeric SMILES is the de-duplication key.
df['Smiles'] = df.ROMol.apply(lambda x: Chem.MolToSmiles(x, kekuleSmiles=True, isomericSmiles=False))
df = df.drop_duplicates(subset=['Smiles'])
df

Add molecular properties

In [None]:
# Activity on a log scale (base 10 of the 'Standard Value' column).
df['log(IC50)'] = df['Standard Value'].apply(log10)

# Structure-derived descriptors, each computed from the parsed molecule.
for column_name, descriptor in (
    ('MW', rdMolDescriptors.CalcExactMolWt),
    ('HAC', rdMolDescriptors.CalcNumHeavyAtoms),
    ('FCsp3', rdMolDescriptors.CalcFractionCSP3),
):
    df[column_name] = df.ROMol.apply(descriptor)
df

#### Select top 200 closest compounds to Afatinib

Calculate MAP4 fingerprint for library and Afatinib

In [None]:
# Single calculator instance shared by the reference compound and the library.
# NOTE(review): the same size (2048) is passed to tm.Minhash in the distance
# cell below — presumably the two values need to agree; confirm before changing.
MAP4 = MAP4Calculator(dimensions=2048)

# Reference compound: Afatinib, parsed and fingerprinted once.
afatinib_smiles = 'CN(C)C/C=C/C(=O)NC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC(=C(C=C3)F)Cl)O[C@H]4CCOC4'
afatinib_mol = Chem.MolFromSmiles(afatinib_smiles)
afatinib_map4 = MAP4.calculate(afatinib_mol)

# Fingerprint every library molecule; progress_apply is the tqdm-enabled
# variant of apply registered by tqdm.pandas() in the import cell.
df['MAP4'] = df.ROMol.progress_apply(MAP4.calculate)

Calculate the distance between every compound and Afatinib

In [None]:
# Encoder instance used here only for its fingerprint distance function.
ENC = tm.Minhash(2048)

# Distance of every library fingerprint to the Afatinib fingerprint.
df['Afatinib_dist'] = df.MAP4.apply(lambda x: ENC.get_distance(x, afatinib_map4))

# Keep the 200 compounds with the smallest distance and renumber rows 0..n-1.
df = (
    df.sort_values(by=['Afatinib_dist'])
      .head(200)
      .reset_index(drop=True)
)
df

#### TMAP visualization

Define a function that creates a square matrix of arbitrary size filled with empty strings

In [None]:
def EmptyStringMatrix(length):
    """Build a `length` x `length` nested list in which every entry is ''.

    Each row is a separate list object, so writing to one row never affects
    another. A length of 0 gives an empty list.
    """
    # The inner list is created anew for every row (no shared references).
    return [['' for _ in range(length)] for _ in range(length)]

Define a function that generates the reaction SMILES (`A>>B`) for every pair of molecules, stored as a symmetric N×N matrix

In [None]:
# Plain Python list of the cleaned SMILES, in DataFrame row order.
smiles_list = df['Smiles'].tolist()

def PairwiseReactionSMILES(smiles_list):
    """Build a symmetric matrix of 'A>>B' strings for all pairs of SMILES.

    For i <= j, both entry [i][j] and entry [j][i] hold
    f'{smiles_list[i]}>>{smiles_list[j]}', i.e. the element with the lower
    index is always written on the left of '>>'. The diagonal pairs each
    SMILES with itself.

    Returns a list of lists of strings of size len(smiles_list) squared.
    """
    size = len(smiles_list)
    matrix = EmptyStringMatrix(size)

    for row, left in enumerate(smiles_list):
        for col in range(row, size):
            reaction = f'{left}>>{smiles_list[col]}'
            # Mirror the upper-triangle value into the lower triangle.
            matrix[row][col] = reaction
            matrix[col][row] = reaction

    return matrix

# Row-major flattening of the N x N string matrix: element i * N + j is pair (i, j).
reaction_smiles = np.array(PairwiseReactionSMILES(smiles_list), dtype=object).flatten()

Calculate molecule pair properties

In [None]:
def MeanProperty(list_of_properties):
    """Return the matrix of pairwise arithmetic means of a list of numbers.

    Entry [i, j] is (list_of_properties[i] + list_of_properties[j]) / 2, so the
    matrix is symmetric and its diagonal holds the input values themselves.

    Returns a float numpy array of shape (n, n), where n is the input length.
    """
    values = np.asarray(list_of_properties, dtype=float)
    # Broadcasting a column against a row yields every (i, j) combination.
    pairwise_mean = (values[:, None] + values[None, :]) / 2
    return pairwise_mean

In [None]:
def DifferenceProperty(list_of_properties):
    """Return the matrix of pairwise absolute differences of a list of numbers.

    Entry [i, j] is abs(list_of_properties[i] - list_of_properties[j]), so the
    matrix is symmetric with zeros on the diagonal.

    Returns a float numpy array of shape (n, n), where n is the input length.
    """
    values = np.asarray(list_of_properties, dtype=float)
    # Column minus row covers every ordered pair; abs() makes it symmetric.
    return np.abs(values[:, None] - values[None, :])

In [None]:
# Both helpers return (N, N) float arrays; flatten() lays them out row-major,
# matching the order of the flattened reaction SMILES.
pairwise_mw = MeanProperty(df['MW'].tolist()).flatten()
pairwise_hac = MeanProperty(df['HAC'].tolist()).flatten()
pairwise_fcsp3 = MeanProperty(df['FCsp3'].tolist()).flatten()
diff_activity = DifferenceProperty(df['Standard Value'].tolist()).flatten()
diff_activity_log50 = DifferenceProperty(df['log(IC50)'].tolist()).flatten()

Define pairwise label

In [None]:
def PairwiseLabel(labels):
    """Build a symmetric matrix of 'a / b' strings for all pairs of labels.

    For i <= j, both entry [i][j] and entry [j][i] hold
    f'{labels[i]} / {labels[j]}', so the lower-index label is always written
    first. Labels are formatted with their default string conversion.

    Returns a list of lists of strings of size len(labels) squared.
    """
    size = len(labels)
    matrix = EmptyStringMatrix(size)

    for row, first in enumerate(labels):
        for col in range(row, size):
            text = f'{first} / {labels[col]}'
            # Same text on both sides of the diagonal.
            matrix[row][col] = text
            matrix[col][row] = text

    return matrix

In [None]:
# Flattened (row-major) label matrices: compound identifiers and log(IC50) values per pair.
pairwise_label = np.array(PairwiseLabel(df['Molecule ChEMBL ID'].tolist()), dtype=object).flatten()
pairwise_activity = np.array(PairwiseLabel(df['log(IC50)'].tolist()), dtype=object).flatten()

DRFP

In [None]:
# Encode every pairwise reaction SMILES with DRFP.
# NOTE(review): downstream cells index `drfps[i]` alongside row i of the
# pairwise arrays, so the output is assumed to keep the input order — confirm.
drfps = DrfpEncoder.encode(reaction_smiles)

In [None]:
# `warnings`, `NearestNeighbors` and `DataConversionWarning` are imported in
# the notebook's import cell so all dependencies are visible in one place.
# The warning is silenced for the neighbour search below (presumably raised
# when the fingerprints are converted for the Jaccard metric — confirm).
warnings.filterwarnings("ignore", category=DataConversionWarning)

# Number of neighbours stored per fingerprint in the edge list.
knn = 20

# One 2-D array (one row per reaction fingerprint) for both fit and query.
drfp_matrix = np.array(drfps)

knn_search = NearestNeighbors(n_neighbors=knn, radius=1.0, algorithm='auto', leaf_size=30, metric='jaccard', p=2, metric_params=None, n_jobs=None)
knn_search.fit(drfp_matrix)

# Query all fingerprints in a single call instead of one call per row; row i
# of `dists` / `idxs` holds what the per-row query for fingerprint i returned.
# The query points are the fitted points themselves, so a point may appear
# among its own neighbours, exactly as with the per-row queries.
dists, idxs = knn_search.kneighbors(drfp_matrix)

# Edge list entries are [source index, neighbour index, Jaccard distance].
edge_list = [
    [i, idxs[i, j], dists[i, j]]
    for i in range(len(drfps))
    for j in range(knn)
]

In [None]:
# Layout configuration for the DRFP tree built from the kNN edge list.
cfg = tm.LayoutConfiguration()

# Node size affects the magnitude of the repelling force between nodes.
cfg.node_size = 1 / 30
# Number of repeats of the per-level layout algorithm.
cfg.mmm_repeats = 2
# Number of repeats of the scaling step.
cfg.sl_extra_scaling_steps = 5
# Number of nearest neighbours for the kNN graph (same value as `knn` above).
cfg.k = 20
# Relative scale of the graph.
cfg.sl_scaling_type = tm.RelativeToAvgLength

# x_/y_ are node coordinates, s/t the tree edge endpoints; `gp` is not used
# in the cells shown here.
x_, y_, s, t, gp = tm.layout_from_edge_list(len(drfps), edge_list, cfg)
# Plain-list copies, so later reuse of the names `s` and `t` cannot alter them.
tm_layout_drfp = {'x': list(x_), 'y': list(y_), 's': list(s), 't': list(t)}

Prepare final dataframe for TMAP

In [None]:
# One row per molecule pair, in the row-major order shared by all flattened arrays.
df_tmap = pd.DataFrame(
    {
        'Label': pairwise_label,
        'ActLabel': pairwise_activity,
        'ReactionSMILES': reaction_smiles,
        'uMW': pairwise_mw,
        'uHAC': pairwise_hac,
        'uFCsp3': pairwise_fcsp3,
        'dIC50': diff_activity,
        'dlog(IC50)': diff_activity_log50,
    }
)

In [None]:
# Faerun label strings: fields are separated by "__"; the reaction SMILES is
# given twice (first and second field), followed by the descriptive fields.
labels = [
    "__".join(
        (
            rxn,
            f'{rxn}',
            f'Label: {pair_label}',
            f'Activities: {activity_label}',
            f'dlog(IC50): {dlog_ic50}',
        )
    )
    for rxn, pair_label, activity_label, dlog_ic50 in zip(
        df_tmap["ReactionSMILES"].tolist(),
        df_tmap["Label"].tolist(),
        df_tmap["ActLabel"].tolist(),
        df_tmap["dlog(IC50)"].tolist(),
    )
]

In [None]:
# Numeric columns of df_tmap exposed as colour series, in display order.
series_columns = ['uMW', 'uHAC', 'uFCsp3', 'dIC50', 'dlog(IC50)']
n_series = len(series_columns)

f = Faerun(view="front", coords=False, title="", clear_color='#FFFFFF')

# Coordinates come from the DRFP layout; one colour list per series.
scatter_data = {
    "x": tm.VectorFloat(tm_layout_drfp['x']),
    "y": tm.VectorFloat(tm_layout_drfp['y']),
    "c": [df_tmap[column].values.tolist() for column in series_columns],
    "labels": labels,
}

f.add_scatter(
    "DRFP_TMAP",
    scatter_data,
    shader="sphere",
    point_scale=2,
    max_point_size=20,
    # Every series is continuous, uses the same colormap and has no custom legend.
    legend_labels=[None] * n_series,
    categorical=[False] * n_series,
    colormap=['rainbow'] * n_series,
    series_title=list(series_columns),
    has_legend=True,
)

# Tree edges connect the scatter points by index.
tree_edges = {
    "from": tm.VectorUint(tm_layout_drfp['s']),
    "to": tm.VectorUint(tm_layout_drfp['t']),
}
f.add_tree("DRFP_TMAP_tree", tree_edges, point_helper="DRFP_TMAP")
f.plot('plots/Erb2_DRFP_TMAP', template='reaction_smiles')

Define a function that merges the MinHashed fingerprints of every pair of molecules (element-wise minimum)

In [None]:
def MergeMAP4(map4_list):
    """Combine every ordered pair of fingerprints by element-wise minimum.

    Entry [i][j] of the result is np.minimum(map4_list[i], map4_list[j]); the
    diagonal therefore pairs each fingerprint with itself.

    Returns a list of len(map4_list) rows, each a list of len(map4_list)
    results of np.minimum.
    """
    return [
        [np.minimum(first, second) for second in map4_list]
        for first in map4_list
    ]

Calculate merged MAP4 for all molecular pairs

In [None]:
# Merged fingerprint for every molecule pair, flattened to one entry per pair.
# NOTE(review): the DataFrame round trip appears to be what keeps each merged
# fingerprint as a single cell, so the flattened result has one fingerprint per
# element rather than one number per element — confirm before simplifying.
pairwise_map4 = pd.DataFrame(MergeMAP4(df.MAP4.values.tolist())).to_numpy().flatten()

TMAP

Layout

In [None]:
# NOTE(review): the forest is created with 1024 as its first argument while the
# MAP4 fingerprints were computed with dimensions=2048 — confirm this is intended.
lf = tm.LSHForest(1024, 64)

# df_tmap is built without a 'MAP4' column, so reading it raised a KeyError on
# a fresh kernel. Store the merged pair fingerprints (one per df_tmap row,
# same row-major order) before reading them back.
df_tmap['MAP4'] = pairwise_map4
merged_map4 = np.array(df_tmap['MAP4'])

# tmap expects its own unsigned-integer vector type for each fingerprint.
fps = [tm.VectorUint(fingerprint) for fingerprint in merged_map4]

lf.batch_add(fps)
lf.index()

# Configuration parameters for the TMAP layout (rebinds the `cfg` used for the
# DRFP layout above).
cfg = tm.LayoutConfiguration()
# Size of the nodes, which affects the magnitude of their repelling force.
# Decreasing this value generally resolves overlaps in a very crowded tree.
cfg.node_size = 1 / 30
# Number of repeats of the per-level layout algorithm.
cfg.mmm_repeats = 2
# Number of repeats of the scaling step.
cfg.sl_extra_scaling_steps = 5
# Number of nearest neighbours used to create the k-nearest-neighbour graph.
cfg.k = 45
# Defines the relative scale of the graph.
cfg.sl_scaling_type = tm.RelativeToAvgLength
# Rebinds `s` and `t` from the DRFP layout cell; tm_layout_drfp holds its own
# list copies of those. The fifth return value is discarded.
x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)