# Chemical Space Network Calculations

**V.F. Scalfani, V.D. Patel, and A.M. Fernandez** \
v. October 18, 2022

*w/ Virtualpredictions.csv dataset*

## 1. Import RDKit, Networkx, and other libraries

In [290]:
# RDKit stuff
from rdkit import Chem
from rdkit import rdBase
from rdkit.Chem import Draw
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import rdDepictor
rdDepictor.SetPreferCoordGen(True)
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdFMCS
from rdkit import DataStructs
from rdkit.Chem import rdmolops

# numpy
import numpy as np

# pandas
import pandas as pd

# networkx
import networkx as nx

# matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt

In [291]:
# Report the version of each major library used in this notebook
for label, version in (
    ('RDKit version: ', rdBase.rdkitVersion),
    ('Numpy version:', np.__version__),
    ('Pandas version:', pd.__version__),
    ('Networkx version', nx.__version__),
    ('MatplotLib version:', mpl.__version__),
):
    print(label, version)

RDKit version:  2023.03.1
Numpy version: 1.24.3
Pandas version: 2.0.1
Networkx version 2.8.8
MatplotLib version: 3.7.1


## 2. Load Virtual Predictions Dataset

In [292]:
# Load the prediction table; the path is relative to the notebook's working directory.
# Optional: uncomment the next line to limit how many rows pandas displays.
# pd.options.display.max_rows = 30
df = pd.read_csv("Virtualpredictions.csv")
df

Unnamed: 0.1,Unnamed: 0,nucleophile SMILES,electrophile SMILES,"3,3â€™ Catalyst Substituent",N Catalyst Substituent,ddg,ee,cluster
0,0,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,c2ccc1ccccc1c2,NS(=O)(=O)C(F)(F)F,1.194218,0.813590,6
1,1,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,c2ccc1ccccc1c2,NS(=O)(=O)C(F)(F)C(F)(F)F,1.219084,0.822577,6
2,2,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,c2ccc1ccccc1c2,NS(=O)(=O)c1c(F)c(F)c(F)c(F)c1F,1.286832,0.844923,6
3,3,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,c2ccc1ccccc1c2,NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,1.191409,0.812546,6
4,4,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,c2ccc1ccccc1c2,NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C...,1.191409,0.812546,6
...,...,...,...,...,...,...,...,...
869755,869755,C1=CCCC=C1,C=CC=O,c3ccc2ccc1ccccc1c2c3,NS(=O)(=O)C(F)(F)F,0.805889,0.596744,32
869756,869756,C1=CCCC=C1,C=CC=O,c3ccc2ccc1ccccc1c2c3,NS(=O)(=O)C(F)(F)C(F)(F)F,0.819697,0.607657,32
869757,869757,C1=CCCC=C1,C=CC=O,c3ccc2ccc1ccccc1c2c3,NS(=O)(=O)c1c(F)c(F)c(F)c(F)c1F,1.021453,0.737241,32
869758,869758,C1=CCCC=C1,C=CC=O,c3ccc2ccc1ccccc1c2c3,NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.810616,0.600514,32


## 3. Data Preparation and Checks

Note that we are using the chemical structures as-is (i.e., using ChEMBL's standardization workflow). See supporting manuscript for more details. 

In [293]:
# The column selection / NaN-dropping step of the original ChEMBL workflow
# ('Molecule ChEMBL ID', 'Smiles', 'Standard Value') does not apply to this table,
# so the full prediction table is used as-is.
# Reuse the frame loaded above instead of parsing the CSV a second time;
# .copy() keeps full_df independent of df.
full_df = df.copy()

len(full_df)

869760

In [294]:
# Check for presence of disconnected SMILES notation via string matching.
# The "." is matched as a plain substring (regex=False), which avoids the invalid
# "\." escape sequence while selecting exactly the same rows.
df2 = full_df[~full_df['3,3â€™ Catalyst Substituent '].str.contains(".", regex=False)]
len(df2) # equals len(full_df) when no substituent SMILES contains a "."

869760

In [295]:
# We should double check for disconnected fragments in the event that
# the dot disconnect bond is used with ring-closures
# see: http://www.dalkescientific.com/writings/diary/archive/2004/12/12/library_generation_with_smiles.html

smis = df2['3,3â€™ Catalyst Substituent '].tolist()

# Parse each distinct SMILES only once; the column repeats the same strings many times.
frag_counts = {}
for smi in set(smis):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles returns None for a SMILES it cannot parse
        raise ValueError(f"RDKit could not parse SMILES: {smi!r}")
    frag_counts[smi] = len(Chem.GetMolFrags(mol)) # number of fragments

# one entry per row of df2, in row order
num_frags = [frag_counts[smi] for smi in smis]

In [296]:
# True when no molecule has a fragment count other than one
not any(count != 1 for count in num_frags)

True

In [297]:
# Display the filtered frame
df2

Unnamed: 0.1,Unnamed: 0,nucleophile SMILES,electrophile SMILES,"3,3â€™ Catalyst Substituent",N Catalyst Substituent,ddg,ee,cluster
0,0,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,c2ccc1ccccc1c2,NS(=O)(=O)C(F)(F)F,1.194218,0.813590,6
1,1,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,c2ccc1ccccc1c2,NS(=O)(=O)C(F)(F)C(F)(F)F,1.219084,0.822577,6
2,2,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,c2ccc1ccccc1c2,NS(=O)(=O)c1c(F)c(F)c(F)c(F)c1F,1.286832,0.844923,6
3,3,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,c2ccc1ccccc1c2,NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,1.191409,0.812546,6
4,4,C=CC[Si](C)(C)C,O=Cc2ccc1ccccc1c2,c2ccc1ccccc1c2,NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C...,1.191409,0.812546,6
...,...,...,...,...,...,...,...,...
869755,869755,C1=CCCC=C1,C=CC=O,c3ccc2ccc1ccccc1c2c3,NS(=O)(=O)C(F)(F)F,0.805889,0.596744,32
869756,869756,C1=CCCC=C1,C=CC=O,c3ccc2ccc1ccccc1c2c3,NS(=O)(=O)C(F)(F)C(F)(F)F,0.819697,0.607657,32
869757,869757,C1=CCCC=C1,C=CC=O,c3ccc2ccc1ccccc1c2c3,NS(=O)(=O)c1c(F)c(F)c(F)c(F)c1F,1.021453,0.737241,32
869758,869758,C1=CCCC=C1,C=CC=O,c3ccc2ccc1ccccc1c2c3,NS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,0.810616,0.600514,32


In [298]:
# Average the ee over all rows that share a catalyst substituent, so that each
# substituent appears exactly once (duplicate handling by taking the mean).
# See: Zhang, B. et al. J Comput Aided Mol Des 2015, 29 (10), 937–950.
# https://doi.org/10.1007/s10822-015-9872-1.
df3 = (
    df2.groupby("3,3â€™ Catalyst Substituent ")['ee']
    .mean()
    .reset_index()
    # the mean-ee column is named 'Ki' because the following cells read it under that name
    .rename(columns={'ee': 'Ki'})
)
df3

Unnamed: 0,"3,3â€™ Catalyst Substituent",Ki
0,CC(C)(C)c1ccccc1,0.835703
1,CC(C)CCc1cccc(CCC(C)C)c1,0.884771
2,CC3(C)c1ccccc1c2ccccc23,0.897272
3,CCCCCCc1cccc(CCCCCC)c1,0.888566
4,CCCc1cccc(CCC)c1,0.88277
5,CCc1cccc(CC)c1,0.884102
6,Cc1cc(C)c2CCc4cccc3CCc1c2c34,0.864035
7,Cc1cc(C)c2ccc4cccc3ccc1c2c34,0.86678
8,Cc1cc(C)ccc1,0.711824
9,Cc1cccc(C)c1C,0.792059


In [299]:
# Check whether every SMILES is a different compound: round-trip each string
# through an RDKit mol object and compare the canonical SMILES that come back.
# NOTE(review): the recorded output of this cell is False, i.e. some input strings
# share a canonical SMILES; nothing downstream acts on that result - confirm intended.

smis = df3['3,3â€™ Catalyst Substituent '].tolist()
rdkit_can_smiles = [
    Chem.MolToSmiles(Chem.MolFromSmiles(smi), canonical=True) # canonical=True is the default
    for smi in smis
]

set_rdkit_can_smiles = set(rdkit_can_smiles)
len(set_rdkit_can_smiles) == len(rdkit_can_smiles)

False

In [300]:
# The original SMILES strings are used as dictionary keys later on, so verify that
# the strings themselves are unique.
set_smis = set(smis)
len(set_smis) == len(smis)

True

## 4. Compile Node data

In [301]:
# Node color value. The original workflow converted Ki (nM) to pKi = -log10(Ki in M);
# here the 'Ki' column holds the mean ee, so it is copied unchanged into the
# 'pKi' column that the following cells read.
from math import log10  # not used in this cell; kept in case other cells rely on it

df3["pKi"] = df3["Ki"]

In [302]:
# Drop the 'Ki' column now that its values are available as 'pKi'.
# errors="ignore" lets this cell be re-run after the column is already gone.
df3 = df3.drop(columns=["Ki"], errors="ignore")
df3

Unnamed: 0,"3,3â€™ Catalyst Substituent",pKi
0,CC(C)(C)c1ccccc1,0.835703
1,CC(C)CCc1cccc(CCC(C)C)c1,0.884771
2,CC3(C)c1ccccc1c2ccccc23,0.897272
3,CCCCCCc1cccc(CCCCCC)c1,0.888566
4,CCCc1cccc(CCC)c1,0.88277
5,CCc1cccc(CC)c1,0.884102
6,Cc1cc(C)c2CCc4cccc3CCc1c2c34,0.864035
7,Cc1cc(C)c2ccc4cccc3ccc1c2c34,0.86678
8,Cc1cc(C)ccc1,0.711824
9,Cc1cccc(C)c1C,0.792059


In [303]:
# Print the largest, then the smallest, 'pKi' value (intended for a color bar later)
for extreme in (df3["pKi"].max(), df3["pKi"].min()):
    print(extreme)

0.8972723117853201
0.5763813942963912


In [304]:
# Index the frame by the substituent SMILES (the strings were checked for uniqueness above)
df4 = df3.set_index('3,3â€™ Catalyst Substituent ')

In [305]:
# Keep wide frames on a single line when printed, then show the first rows
pd.set_option("expand_frame_repr", False)
print(df4.head(10)) # view first 10

                                   pKi
3,3â€™ Catalyst Substituent           
CC(C)(C)c1ccccc1              0.835703
CC(C)CCc1cccc(CCC(C)C)c1      0.884771
CC3(C)c1ccccc1c2ccccc23       0.897272
CCCCCCc1cccc(CCCCCC)c1        0.888566
CCCc1cccc(CCC)c1              0.882770
CCc1cccc(CC)c1                0.884102
Cc1cc(C)c2CCc4cccc3CCc1c2c34  0.864035
Cc1cc(C)c2ccc4cccc3ccc1c2c34  0.866780
Cc1cc(C)ccc1                  0.711824
Cc1cccc(C)c1C                 0.792059


In [306]:
# Convert to a dictionary of the form {SMILES: {column name: value}}
node_data = df4.to_dict('index')

In [307]:
# SMILES are the keys; show the first one
list(node_data.keys())[0]

'CC(C)(C)c1ccccc1'

In [308]:
# Each value is a dict of node attributes; here it holds only 'pKi'
list(node_data.values())[0]

{'pKi': 0.8357030625011037}

In [309]:
# print(node_data)

## 5. Compute and Compile Edge Data

In [310]:
# Enumerate every unordered pair of node SMILES (the keys of node_data)
from itertools import combinations

smis = list(node_data)
smis_subsets = list(combinations(smis, 2))
len(smis_subsets)

276

In [311]:
# View the first 5 SMILES pairs
smis_subsets[0:5]

[('CC(C)(C)c1ccccc1', 'CC(C)CCc1cccc(CCC(C)C)c1'),
 ('CC(C)(C)c1ccccc1', 'CC3(C)c1ccccc1c2ccccc23'),
 ('CC(C)(C)c1ccccc1', 'CCCCCCc1cccc(CCCCCC)c1'),
 ('CC(C)(C)c1ccccc1', 'CCCc1cccc(CCC)c1'),
 ('CC(C)(C)c1ccccc1', 'CCc1cccc(CC)c1')]

In [312]:
# Build the subsets dictionary: one entry per pair, keyed by the pair's position
# in smis_subsets, holding both SMILES strings
subsets = {
    pair_id: {"smi1": first, "smi2": second}
    for pair_id, (first, second) in enumerate(smis_subsets)
}
len(subsets)

276

In [313]:
# get first key (iterating a dict yields its keys)
list(subsets)[0]

0

In [314]:
# get first value (the dict holding the two SMILES of the first pair)
list(subsets.values())[0]

{'smi1': 'CC(C)(C)c1ccccc1', 'smi2': 'CC(C)CCc1cccc(CCC(C)C)c1'}

In [315]:
# get smi1 of the first pair
list(subsets.values())[0]['smi1']

'CC(C)(C)c1ccccc1'

In [316]:
# Parse both SMILES of every pair and store the RDKit mol objects alongside them
for pair in subsets.values():
    pair["mol1"] = Chem.MolFromSmiles(pair["smi1"])
    pair["mol2"] = Chem.MolFromSmiles(pair["smi2"])

In [317]:
# First key of the subsets dictionary
list(subsets.keys())[0]

0

In [318]:
# First value, which now also holds the two RDKit mol objects
list(subsets.values())[0]

{'smi1': 'CC(C)(C)c1ccccc1',
 'smi2': 'CC(C)CCc1cccc(CCC(C)C)c1',
 'mol1': <rdkit.Chem.rdchem.Mol at 0x7f1e9119dee0>,
 'mol2': <rdkit.Chem.rdchem.Mol at 0x7f1e9119df50>}

### Compute Tanimoto Similarity (RDKit fingerprint based)

In [319]:
# For every pair, fingerprint both molecules with the default RDKit fingerprint
# and store their Tanimoto similarity rounded to 3 decimals
for pair in subsets.values():
    fingerprints = [Chem.RDKFingerprint(pair[name]) for name in ("mol1", "mol2")]
    pair["tan_similarity"] = round(DataStructs.TanimotoSimilarity(*fingerprints), 3)

In [320]:
# First value, which now also holds 'tan_similarity'
list(subsets.values())[0]

{'smi1': 'CC(C)(C)c1ccccc1',
 'smi2': 'CC(C)CCc1cccc(CCC(C)C)c1',
 'mol1': <rdkit.Chem.rdchem.Mol at 0x7f1e9119dee0>,
 'mol2': <rdkit.Chem.rdchem.Mol at 0x7f1e9119df50>,
 'tan_similarity': 0.382}

### Compute MCS-based Tanimoto Coefficient (single processor)

In [321]:
# Add maximum common substructure (MCS)-based Tanimoto Coefficient
# See Vogt, M. et al. J Comput Aided Mol Des 2016, 30 (3), 191–208. 
# https://doi.org/10.1007/s10822-016-9906-3.

############
############

# Compute and add MCS-based similarity
# This may take a very long time...

#############
#############

#def tc_mcs(mol1,mol2):
#    # get maximum common substructure instance
#    mcs = rdFMCS.FindMCS([mol1,mol2],timeout=10) # adding a 10 second timeout for now
    
#    # get number of common bonds
#    mcs_bonds = mcs.numBonds
    
#    # get number of bonds for each
#    # default is only heavy atom bonds
#    mol1_bonds = mol1.GetNumBonds()
#    mol2_bonds = mol2.GetNumBonds()
    
#    # compute MCS-based Tanimoto  
#    tan_mcs = mcs_bonds / (mol1_bonds + mol2_bonds - mcs_bonds)
#    return tan_mcs


# loop through subsets and compute tc_mcs

#for key,value in subsets.items():
#    tan_mcs_value = round(tc_mcs(value['mol1'], value['mol2']), 3)
#    print(key) # to watch progress
#    subsets[key].update({"tan_mcs": tan_mcs_value})

### Compute MCS-based Tanimoto Coefficient (multiprocessing)

In [322]:
# Print the number of CPUs reported by multiprocessing.cpu_count()
import multiprocessing
print(multiprocessing.cpu_count())

24


In [323]:
# Number of worker processes for the multiprocessing step below.
# os.sched_getaffinity(0) gives the CPUs this process may use, but it only exists on
# some platforms (Unix/Linux); elsewhere fall back to the total CPU count.
# https://docs.python.org/3/library/multiprocessing.html
import os

try:
    usable_cpus = len(os.sched_getaffinity(0))
except AttributeError:
    usable_cpus = os.cpu_count() or 1  # os.cpu_count() may return None

# Leave 2 CPUs free so the computer stays usable for other tasks,
# but never drop below one worker (a pool needs at least 1 process).
num_cpus = max(1, usable_cpus - 2)
num_cpus

22

In [324]:
# Add maximum common substructure (MCS)-based Tanimoto Coefficient
# See Vogt, M. et al. J Comput Aided Mol Des 2016, 30 (3), 191–208. 
# https://doi.org/10.1007/s10822-016-9906-3.

############
############

# Compute and add MCS-based similarity

# Here are benchmark times with the 10 second timeout in FindMCS and the 81,000 compound gluco pairs:
    # Intel Core i9-9980HK (2.4 GHz x 16) Ubuntu laptop using 14 of the 16 CPUs: ~ 1 hour
    # Intel Core i5-2520M (2.5 GHz x 4) Ubuntu laptop using 3 of the 4 cores: ~ 4 hours

#############
#############

def tc_mcs(mol1,mol2,key,timeout=10):
    """Compute the MCS-based Tanimoto coefficient for one pair of molecules.

    The coefficient is bonds(MCS) / (bonds(mol1) + bonds(mol2) - bonds(MCS)).

    Parameters:
        mol1, mol2: the two molecules to compare; each is passed to rdFMCS.FindMCS
            and must provide GetNumBonds().
        key: passed through unchanged so the result can be matched back to its pair
            after multiprocessing.
        timeout: value forwarded to rdFMCS.FindMCS as its timeout (default 10,
            documented by the original code as seconds).

    Returns:
        (key, tan_mcs) where tan_mcs is the unrounded coefficient, or 0.0 when
        neither molecule has any bonds.
    """
    # get maximum common substructure instance
    mcs = rdFMCS.FindMCS([mol1,mol2],timeout=timeout)

    # get number of common bonds
    mcs_bonds = mcs.numBonds

    # get number of bonds for each
    # default is only heavy atom bonds
    mol1_bonds = mol1.GetNumBonds()
    mol2_bonds = mol2.GetNumBonds()

    # compute MCS-based Tanimoto
    union_bonds = mol1_bonds + mol2_bonds - mcs_bonds
    if union_bonds == 0:
        # both molecules are bond-less, so the bond-based ratio is undefined;
        # report 0.0 instead of raising ZeroDivisionError
        return key, 0.0
    tan_mcs = mcs_bonds / union_bonds
    return key, tan_mcs

# Argument tuples for the worker: Pool.starmap unpacks each (mol1, mol2, key)
# tuple into the parameters of tc_mcs
mol_tuples = [(pair['mol1'], pair['mol2'], pair_key) for pair_key, pair in subsets.items()]

# run multiprocessing on the tc_mcs function
from multiprocessing import Pool

if __name__ == '__main__':
    with Pool(num_cpus) as pool: # one worker process per entry of num_cpus
        star_map = pool.starmap(tc_mcs, mol_tuples)
    # store each rounded coefficient on the pair it belongs to
    for pair_key, score in star_map:
        subsets[pair_key]["tan_mcs"] = round(score, 3)

In [325]:
# View the first five entries, which now also hold 'tan_mcs'
list(subsets.values())[0:5]

[{'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CC(C)CCc1cccc(CCC(C)C)c1',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x7f1e9119dee0>,
  'mol2': <rdkit.Chem.rdchem.Mol at 0x7f1e9119df50>,
  'tan_similarity': 0.382,
  'tan_mcs': 0.444},
 {'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CC3(C)c1ccccc1c2ccccc23',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x7f1e9119dfc0>,
  'mol2': <rdkit.Chem.rdchem.Mol at 0x7f1e9119e9d0>,
  'tan_similarity': 0.186,
  'tan_mcs': 0.588},
 {'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CCCCCCc1cccc(CCCCCC)c1',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x7f1e9119d150>,
  'mol2': <rdkit.Chem.rdchem.Mol at 0x7f1e9119d460>,
  'tan_similarity': 0.361,
  'tan_mcs': 0.4},
 {'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CCCc1cccc(CCC)c1',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x7f1e9119d380>,
  'mol2': <rdkit.Chem.rdchem.Mol at 0x7f1e9119e030>,
  'tan_similarity': 0.431,
  'tan_mcs': 0.571},
 {'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CCc1cccc(CC)c1',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x7f1e9119e0a0>,
  'mol2': <rdk

In [326]:
# Keys are integers (the position of each pair in smis_subsets)
list(subsets.keys())[0:5]

[0, 1, 2, 3, 4]

In [327]:
# Here is what the star_map variable looks like: (key, unrounded tan_mcs) tuples
star_map[0:5]

[(0, 0.4444444444444444),
 (1, 0.5882352941176471),
 (2, 0.4),
 (3, 0.5714285714285714),
 (4, 0.6666666666666666)]

## 6. Save Data

Save the data so that the Tanimoto and MCS similarities do not have to be recomputed.

In [328]:
# Save the subsets data (including the RDKit mol objects stored in it) as a pickle
import pickle
with open('subsets_cata.pickle', 'wb') as outfile:
    pickle.dump(subsets, outfile, pickle.HIGHEST_PROTOCOL)

In [329]:
# Save the node data ({SMILES: {'pKi': value}}) as a pickle
with open('node_data_cata.pickle', 'wb') as outfile:
    pickle.dump(node_data, outfile, pickle.HIGHEST_PROTOCOL)

In [330]:
# Display the whole pair dictionary (long output)
subsets

{0: {'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CC(C)CCc1cccc(CCC(C)C)c1',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x7f1e9119dee0>,
  'mol2': <rdkit.Chem.rdchem.Mol at 0x7f1e9119df50>,
  'tan_similarity': 0.382,
  'tan_mcs': 0.444},
 1: {'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CC3(C)c1ccccc1c2ccccc23',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x7f1e9119dfc0>,
  'mol2': <rdkit.Chem.rdchem.Mol at 0x7f1e9119e9d0>,
  'tan_similarity': 0.186,
  'tan_mcs': 0.588},
 2: {'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CCCCCCc1cccc(CCCCCC)c1',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x7f1e9119d150>,
  'mol2': <rdkit.Chem.rdchem.Mol at 0x7f1e9119d460>,
  'tan_similarity': 0.361,
  'tan_mcs': 0.4},
 3: {'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CCCc1cccc(CCC)c1',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x7f1e9119d380>,
  'mol2': <rdkit.Chem.rdchem.Mol at 0x7f1e9119e030>,
  'tan_similarity': 0.431,
  'tan_mcs': 0.571},
 4: {'smi1': 'CC(C)(C)c1ccccc1',
  'smi2': 'CCc1cccc(CC)c1',
  'mol1': <rdkit.Chem.rdchem.Mol at 0x7f1e9119e0a0>,

In [331]:
# Mean ee for every (catalyst substituent, nucleophile) and
# (catalyst substituent, electrophile) combination in the full table df.
# Each result is a Series indexed by those two levels.
mean_ee_nuc = df.groupby(['3,3â€™ Catalyst Substituent ','nucleophile SMILES'])['ee'].mean()
mean_ee_ele = df.groupby(['3,3â€™ Catalyst Substituent ','electrophile SMILES'])['ee'].mean()

In [332]:
# Mean ee per nucleophile for one catalyst substituent
mean_ee_nuc['CC(C)(C)c1ccccc1']

nucleophile SMILES
C/C(=C\CCCCCl)C(=O)/C=C/C(C)C            0.831315
C/C(=C\CCc1ccccc1)C(=O)/C=C/C(C)C        0.819915
C/C=C(C)/C(=O)/C=C/C(C)(C)C              0.842640
C/C=C(C)/C(=O)/C=C/C(C)C                 0.836371
C/C=C(C)/C=C/CCO                         0.850077
C1=CCC=C1                                0.838227
C1=CCCC=C1                               0.861372
C=C(/C=C/C(C)(C)C)C(C)(C)C               0.831474
C=C(/C=C/c1ccccc1)O[Si](C)(C)C(C)(C)C    0.835625
C=C(C)/C=C(C)/C                          0.842165
C=C(C)/C=C/C                             0.819481
C=C(C)/C=C/CCOCc1ccccc1                  0.841064
C=C(C)C(=C)C                             0.850839
C=C(OC)O[Si](C)(C)C                      0.851807
C=C(OC)O[Si](C)(C)C(C)(C)C               0.872003
C=C(OC)O[Si](CC)(CC)CC                   0.853280
C=C(OC)O[Si]1(C(C)C)C(C)CC1C             0.808796
C=C(OC1CCCCC1)O[Si](C)(C)C(C)(C)C        0.825969
C=C(OCc1ccccc1)O[Si](C)(C)C              0.827597
C=C(OCc1ccccc1)O[Si](C)(C)C(C)(

In [333]:
# Flatten the grouped means into (catalyst, partner SMILES, mean ee) tuples and save them.
#
# The labels of each tuple are taken from the grouped Series' own index, so every
# mean stays attached to the combination it was computed for. Looking values up by
# position against separately built label arrays would mislabel them (or raise)
# whenever a catalyst lacks one of the partners, and integer-as-position indexing
# on a label-indexed Series is deprecated in pandas.

# unique labels, kept under their original names for any later use
cats = np.unique(df['3,3â€™ Catalyst Substituent '])
nucs = np.unique(df['nucleophile SMILES'])
eles = np.unique(df['electrophile SMILES'])

nuc_tuples = [(cat, nuc, mean_ee) for (cat, nuc), mean_ee in mean_ee_nuc.items()]
ele_tuples = [(cat, ele, mean_ee) for (cat, ele), mean_ee in mean_ee_ele.items()]


with open('subsets_cata_nuc.pickle', 'wb') as outfile:
    pickle.dump(nuc_tuples, outfile, pickle.HIGHEST_PROTOCOL)
with open('subsets_cata_ele.pickle', 'wb') as outfile:
    pickle.dump(ele_tuples, outfile, pickle.HIGHEST_PROTOCOL)