# Chemical Space Network 


## 1. Import RDKit, NetworkX, and other libraries

In [49]:
from rdkit import Chem
from rdkit import rdBase
from rdkit.Chem import Draw
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import rdDepictor
rdDepictor.SetPreferCoordGen(True)
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdFMCS
from rdkit import DataStructs
from rdkit.Chem import rdmolops

import numpy as np

import pandas as pd

import networkx as nx

import matplotlib as mpl
import matplotlib.pyplot as plt

In [50]:
# Print the versions of RDKit and NetworkX that this run of the notebook used
print('RDKit version: ',rdBase.rdkitVersion)
print('Networkx version',nx.__version__)


RDKit version:  2023.03.1
Networkx version 2.8.8


## 2. Load Data

In [51]:
# Load the virtual-predictions table from a relative path
# (resolved against the current working directory).
full_df = pd.read_csv("../Data/Virtualpredictions.csv")


## 3. Data Preparation and Checks


In [52]:
# Check for presence of disconnected SMILES notation via string matching.
# The "." is matched literally (regex=False) rather than through the pattern "\.",
# which is an invalid escape sequence inside a normal (non-raw) string literal.
df1 = full_df[~full_df['3,3 Catalyst Substituent '].str.contains(".", regex=False)]
len(full_df)==len(df1)

True

In [53]:

smis = df1['3,3 Catalyst Substituent '].tolist()

# Count fragments once per distinct SMILES string (the column repeats the same
# strings on many rows), then expand back to one count per row.
frags_by_smi = {}
for smi in set(smis):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles returns None when the string cannot be parsed; fail with a
        # readable message instead of passing None on to GetMolFrags.
        raise ValueError(f"Could not parse SMILES: {smi!r}")
    frags_by_smi[smi] = len(Chem.GetMolFrags(mol)) # number of fragments

num_frags = [frags_by_smi[smi] for smi in smis]

In [54]:
# Confirm that no molecule was split into more than (or fewer than) one fragment
not any(frag != 1 for frag in num_frags)

True

In [55]:
# The SMILES strings are intended to serve as dictionary keys, so compare the number
# of distinct strings with the total number of entries to see whether any repeat.
set_smis = set(smis)
len(smis) == len(set_smis)

False

## 4. Compile Node data

In [56]:
# Set the dataframe index to the substituent SMILES column.
# NOTE: the uniqueness check on these SMILES evaluated to False, so this index
# holds repeated labels rather than unique keys.
df2 = df1.set_index('3,3 Catalyst Substituent ')

In [57]:
# Display the re-indexed frame (the notebook shows a truncated preview)
df2

Unnamed: 0_level_0,Unnamed: 0,nucleophile SMILES,starting electrophile SMILES,N Catalyst Substituent,ddg,ee,cluster
"3,3 Catalyst Substituent",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c2ccc1ccccc1c2,0,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,NS(=O)(=O)C(F)(F)F,1.186380,0.910254,10
c2ccc1ccccc1c2,1,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,NS(=O)(=O)C(F)(F)C(F)(F)F,1.221371,0.917676,10
c2ccc1ccccc1c2,2,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,NS(=O)(=O)c1c(F)c(F)c(F)c(F)c1F,1.334461,0.937845,10
c2ccc1ccccc1c2,3,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,1.260528,0.925284,10
c2ccc1ccccc1c2,4,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C...,1.259334,0.925062,10
...,...,...,...,...,...,...,...
c3ccc2ccc1ccccc1c2c3,363995,C1=CCCC=C1,C=CC=O,NS(=O)(=O)C(F)(F)F,1.192303,0.911554,1
c3ccc2ccc1ccccc1c2c3,363996,C1=CCCC=C1,C=CC=O,NS(=O)(=O)C(F)(F)C(F)(F)F,1.261160,0.925401,1
c3ccc2ccc1ccccc1c2c3,363997,C1=CCCC=C1,C=CC=O,NS(=O)(=O)c1c(F)c(F)c(F)c(F)c1F,1.276828,0.928248,1
c3ccc2ccc1ccccc1c2c3,363998,C1=CCCC=C1,C=CC=O,NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,1.295856,0.931565,1


In [58]:
# Collect the distinct index labels (substituent SMILES) as a sorted array
node_data = np.unique(df2.index)
node_data


array(['CC(C)(C)c1ccccc1', 'CC(C)CCc1cccc(CCC(C)C)c1',
       'CC3(C)c1ccccc1c2ccccc23', 'CCCCCCc1cccc(CCCCCC)c1',
       'CCCc1cccc(CCC)c1', 'CCc1cccc(CC)c1',
       'Cc1cc(C)c2CCc4cccc3CCc1c2c34', 'Cc1cc(C)c2ccc4cccc3ccc1c2c34',
       'Cc1cc(C)ccc1', 'Cc1cccc(C)c1C', 'Cc1ccccc1',
       'FC(F)(F)C(F)(F)C(F)(F)c1cccc(C(F)(F)C(F)(F)C(F)(F)F)c1',
       'FC(F)(F)C(F)(c1cccc(C(F)(C(F)(F)F)C(F)(F)F)c1)C(F)(F)F',
       'FC(F)(F)c1cccc(C(F)(F)F)c1',
       'FS(F)(F)(F)(F)c1cccc(S(F)(F)(F)(F)F)c1', 'FS(F)(F)(F)(F)c1ccccc1',
       'c1ccc3c(c1)CCc2ccccc23', 'c1cccc2ccccc12', 'c1ccccc1',
       'c2ccc1ccccc1c2', 'c2cccc(c1ccccc1)c2', 'c3ccc2ccc1ccccc1c2c3',
       'c4ccc3c1ccccc1C2(CCC2)c3c4', 'c4ccc3c1ccccc1C2(CCCC2)c3c4',
       'c4ccc3c1ccccc1c2ccccc2c3c4'], dtype=object)

In [59]:
# Average the 'ee' column per substituent SMILES and keep it as {SMILES: mean ee}
ee_by_substituent = df2.groupby(['3,3 Catalyst Substituent '])['ee']
node_subset = ee_by_substituent.mean().to_dict()
node_subset['CC(C)(C)c1ccccc1']

0.899319951570055

## 5. Compute and Compile Edge Data

In [60]:
from itertools import combinations

# Build every unordered pair of distinct node SMILES
smis = list(node_data)
smis_subsets = list(combinations(smis, 2))
len(smis_subsets)

300

In [61]:
# Map a running pair index to a record holding the two SMILES of that pair
subsets = {
    idx: {"smi1": first, "smi2": second}
    for idx, (first, second) in enumerate(smis_subsets)
}
len(subsets)

300

In [62]:
# Parse both SMILES of every pair and store the resulting mol objects in its record
for record in subsets.values():
    record["mol1"] = Chem.MolFromSmiles(record['smi1'])
    record["mol2"] = Chem.MolFromSmiles(record['smi2'])

### Compute Tanimoto Similarity (RDKit fingerprint based)

In [63]:
# Tanimoto similarity between the default RDKit fingerprints of each pair,
# rounded to three decimals and stored on the pair's record
for record in subsets.values():
    fingerprints = [Chem.RDKFingerprint(record[name]) for name in ('mol1', 'mol2')]
    record["tan_similarity"] = round(DataStructs.TanimotoSimilarity(*fingerprints), 3)

### Compute MCS-based Tanimoto Coefficient (multiprocessing)

In [64]:
# get number of processors
import multiprocessing
print(multiprocessing.cpu_count())
# NOTE(review): num_cpus is not referenced by any other code visible in this notebook
num_cpus=2

8


In [65]:
# Add maximum common substructure (MCS)-based Tanimoto Coefficient


def tc_mcs(mol1,mol2,key,timeout=10):
    """Compute the MCS-based Tanimoto coefficient for a pair of molecules.

    The coefficient is ``mcs_bonds / (mol1_bonds + mol2_bonds - mcs_bonds)``,
    where ``mcs_bonds`` is the number of bonds in the maximum common
    substructure (MCS) reported by ``rdFMCS.FindMCS``.

    Parameters
    ----------
    mol1, mol2 : RDKit molecules
        The two molecules to compare.
    key : any
        Identifier of the pair; returned unchanged so the result can be
        matched back to its record.
    timeout : int, optional
        Timeout in seconds passed to ``rdFMCS.FindMCS`` (default 10).

    Returns
    -------
    tuple
        ``(key, tan_mcs)``. ``tan_mcs`` is ``0.0`` when neither molecule has
        any bonds, because the ratio is undefined in that case.
    """
    import warnings

    # get maximum common substructure instance
    mcs = rdFMCS.FindMCS([mol1,mol2],timeout=timeout)

    # a search stopped by the timeout only reports the best match found so far
    if getattr(mcs, "canceled", False):
        warnings.warn(
            f"MCS search for pair {key!r} hit the {timeout} s timeout; "
            "the coefficient is based on a partial result"
        )

    # get number of common bonds
    mcs_bonds = mcs.numBonds

    # get number of bonds for each
    # default is only heavy atom bonds
    mol1_bonds = mol1.GetNumBonds()
    mol2_bonds = mol2.GetNumBonds()

    # compute MCS-based Tanimoto, guarding against a zero denominator
    # (both molecules bond-free, e.g. single atoms)
    total_bonds = mol1_bonds + mol2_bonds - mcs_bonds
    if total_bonds == 0:
        return key, 0.0
    tan_mcs = mcs_bonds / total_bonds
    return key, tan_mcs

# Bundle each pair of mol objects with its dictionary key so every result can be
# written back to the matching record
mol_tuples = [(record['mol1'], record['mol2'], key) for key, record in subsets.items()]

# NOTE: Pool is imported here, but the loop below calls tc_mcs serially in this
# process; no worker pool is created.
from multiprocessing import Pool

if __name__ == '__main__':

    for mol1, mol2, key in mol_tuples:
        subsets[key]["tan_mcs"] = tc_mcs(mol1, mol2, key)[1]

In [66]:
# Preview the first five pair records
list(subsets.values())[:5]

[{'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CC(C)CCc1cccc(CCC(C)C)c1',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x177a50ba0>,
  'mol2': <rdkit.Chem.rdchem.Mol at 0x177a50b30>,
  'tan_similarity': 0.382,
  'tan_mcs': 0.4444444444444444},
 {'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CC3(C)c1ccccc1c2ccccc23',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x177a50d60>,
  'mol2': <rdkit.Chem.rdchem.Mol at 0x177a50dd0>,
  'tan_similarity': 0.186,
  'tan_mcs': 0.5882352941176471},
 {'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CCCCCCc1cccc(CCCCCC)c1',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x177a50e40>,
  'mol2': <rdkit.Chem.rdchem.Mol at 0x177a50eb0>,
  'tan_similarity': 0.361,
  'tan_mcs': 0.4},
 {'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CCCc1cccc(CCC)c1',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x177a50190>,
  'mol2': <rdkit.Chem.rdchem.Mol at 0x177a50f20>,
  'tan_similarity': 0.431,
  'tan_mcs': 0.5714285714285714},
 {'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CCc1cccc(CC)c1',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x177a51620>,
  

## 6. Save Data

Save the data so you don't have to recompute the Tanimoto and MCS similarities.

In [67]:
# Serialize the pair records (subsets) to a pickle file using the highest protocol
import pickle
with open('../Data/subsets_cata.pickle', 'wb') as handle:
    pickle.dump(subsets, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [68]:
# Serialize the per-node mean 'ee' dictionary (node_subset) to a pickle file
import pickle
with open('../Data/node_data_cata.pickle', 'wb') as handle:
    pickle.dump(node_subset, handle, protocol=pickle.HIGHEST_PROTOCOL)