In [1]:
# RDkit
from rdkit import Chem
from rdkit.Chem import AllChem

# H2O
import h2o
from h2o.automl import H2OAutoML
h2o.init()

import pandas as pd
from sklearn import preprocessing
from tqdm import tqdm, trange

# utility functions
def get_ECFP4(row, radius=4, n_bits=2048):
    """Convert one SMILES string into a Morgan fingerprint bit list.

    Parameters
    ----------
    row : str or missing value
        SMILES string. Blank / whitespace-only strings and non-string
        values (``None``, ``NaN``) are treated as "no molecule".
    radius : int, default 4
        Morgan radius passed to RDKit.
    n_bits : int, default 2048
        Length of the folded bit vector.

    Returns
    -------
    list of int
        ``n_bits`` zeros/ones; all zeros when there is no molecule.

    Raises
    ------
    ValueError
        If RDKit cannot parse a non-blank SMILES string.

    NOTE(review): "ECFP4" conventionally denotes diameter 4, i.e. Morgan
    radius 2; radius 4 corresponds to ECFP8. The default is kept at 4 to
    preserve existing results -- confirm which one is intended.
    """
    # Missing reagents arrive as blank strings (or NaN if not yet filled).
    if not isinstance(row, str) or not row.strip():
        return [0] * n_bits

    mol = Chem.MolFromSmiles(row)
    if mol is None:
        # Fail loudly with the offending input instead of an opaque RDKit
        # argument error (or a silently zeroed feature vector).
        raise ValueError(f'RDKit could not parse SMILES: {row!r}')

    return AllChem.GetMorganFingerprintAsBitVect(mol, radius,
                                                 nBits=n_bits).ToList()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,18 mins 48 secs
H2O_cluster_timezone:,Europe/Kiev
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.3
H2O_cluster_version_age:,1 month and 3 days
H2O_cluster_name:,H2O_from_python_ogurb_h485ji
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,10
H2O_cluster_allowed_cores:,10


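# EDA

Note: the cells in this section use `df`, which is loaded in the "Load and preprocess data" section below (that cell also drops the `REAGENTID*` columns queried here), so this section does not run top-to-bottom on a fresh kernel.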

In [5]:
# Number of distinct first-reagent IDs (missing values are not counted).
df['REAGENTID1'].nunique()

15489

In [7]:
# Number of distinct preparatory_1 values, counting NaN as its own value.
df['preparatory_1'].nunique(dropna=False)

5

In [12]:
# Number of distinct preparatory_2 values, counting NaN as its own value.
df['preparatory_2'].nunique(dropna=False)

4

In [13]:
# Number of distinct preparatory_3 values, counting NaN as its own value.
df['preparatory_3'].nunique(dropna=False)

2

In [8]:
# Number of distinct second-reagent IDs, counting NaN as its own value.
df['REAGENTID2'].nunique(dropna=False)

13600

In [9]:
# Number of distinct third-reagent IDs, counting NaN as its own value.
df['REAGENTID3'].nunique(dropna=False)

336

In [10]:
# Distinct SMILES strings of the third reagent (NaN included if present).
df['REAGENTSMI3'].unique()

array([nan, 'NNC(=O)C1CCN1C2CCCC2', 'COC(=O)c1nc(Br)n[nH]1',
       'NNC(=O)c1cn[nH]c1', 'NNC(=O)C1CCN1Cc2ccccc2',
       'CC(C)(C)N1CCC1C(=O)NN', 'CCOC(=O)c1cnc(N)s1',
       'COC(=O)c1cc(O)n(C)n1', 'CCOC(=O)c1cn[nH]c1C#N',
       'COC(=O)c1[nH]c(=O)[nH]c1C', 'COC(=O)c1c[nH]nn1',
       'CCOC(=O)C(C)C(C)(C)O', 'CCOC(=O)C1CCCN1S(=O)(=O)N',
       'CS(=O)(=O)N1CCC1C(=O)NN', 'Cn1cccc1C(=O)NN',
       'NNC(=O)Cc1cn2ccccc2n1', 'COC(=O)CCc1nn[nH]n1',
       'Cc1cc(C(=O)NN)c(C)o1', 'NNC(=O)c1ccccn1', 'NNC(=O)c1cccnc1',
       'CC(=O)NN', 'NNC(=O)C(=O)NC1CC1', 'NNC(=O)C1CCC1',
       'Cc1ccc(o1)C(=O)NN', 'CCOC(=O)c1ccc2n(CC)c(=O)[nH]c2c1',
       'Cc1csc(CC(=O)NN)n1', 'COc1ccccc1C(=O)NN',
       'C[C@@H]1CC[C@@H](O1)C(=O)NN', 'NNC(=O)Cc1ccccc1O',
       'NNC(=O)c1cccc2cccnc12', 'Cc1cc(C)n(CC(=O)NN)n1',
       'CCOC(=O)c1nnn[nH]1', 'CCOC(=O)c1cnn[nH]1', 'COC(=O)c1ncon1',
       'Cl.COC(=O)c1ncc[nH]1', 'CS(=O)(=O)CC(=O)NN',
       'NNC(=O)C1CCc2nncn2C1', 'CCN1CC(CC1=O)C(=O)OC',
       'COC(=O)c

In [11]:
# Number of distinct contributors, counting NaN as its own value.
df['contributor'].nunique(dropna=False)

4

In [18]:
# Class balance of the yield tiers (NaN would be listed as its own row).
df['YIELD_tier'].value_counts(dropna=False)

low       39137
0         20723
medium    12950
high       7204
Name: YIELD_tier, dtype: int64

In [21]:
# Row count per reaction ID (NaN would be listed as its own row).
df['REACTIONID'].value_counts(dropna=False)

527       27880
207       18353
34         8449
20         7212
271570     4344
512        4127
270942     3009
38         2364
2714       2217
274090     2059
Name: REACTIONID, dtype: int64

In [22]:
# Missing-value count per column.
df.isnull().sum()

SMILES               0
YIELD_tier           0
YIELD_numeric        0
REACTIONID           0
REAGENTSMI1          0
preparatory_1        0
REAGENTSMI2          0
preparatory_2        0
REAGENTSMI3      77005
preparatory_3        0
contributor          0
catalyst_1           0
catalyst_2           0
catalyst_3           0
dtype: int64

# Load and preprocess data

In [2]:
# Identifier columns that are dropped right after loading.
ID_COLS = ['InChI_key', 'REAGENTID1', 'REAGENTID2', 'REAGENTID3']

# Categorical columns that get NaN-filled and label-encoded below.
# REACTIONID is categorical as well, but it has no NaNs and is left as-is.
CATEGORICAL_COLS = ['contributor',
                    'preparatory_1', 'preparatory_2', 'preparatory_3',
                    'catalyst_1', 'catalyst_2', 'catalyst_3']

df = pd.read_csv('2020-06_Enamine_experimental_data_corrected.txt',
                 sep='\t',
                 low_memory=False).drop(columns=ID_COLS)

# Replace missing categories with the literal 'NaN' label, then integer-encode.
# Assigning the result back (instead of `df.col.fillna(..., inplace=True)`)
# avoids chained in-place modification, which pandas does not guarantee to
# write through to `df` and which is a no-op under Copy-on-Write.
for col in CATEGORICAL_COLS:
    df[col] = preprocessing.LabelEncoder().fit_transform(df[col].fillna('NaN'))

# A missing third reagent becomes a blank SMILES string.
df['REAGENTSMI3'] = df['REAGENTSMI3'].fillna(' ')

print(df.shape)
df.head()

(80014, 14)


Unnamed: 0,SMILES,YIELD_tier,YIELD_numeric,REACTIONID,REAGENTSMI1,preparatory_1,REAGENTSMI2,preparatory_2,REAGENTSMI3,preparatory_3,contributor,catalyst_1,catalyst_2,catalyst_3
0,CC1=CC(CS(=O)(=O)NC2=CC=C(SC(F)F)C=C2)=NO1,low,30.4,20,Nc1ccc(SC(F)F)cc1,4,Cc1cc(CS(=O)(=O)Cl)no1,3,,1,0,5,2,0
1,CC(=O)NC1=CC=C(NS(=O)(=O)CC2CCN(C2)C(=O)OCC2=C...,0,0.0,20,CC(=O)Nc1ccc(N)cc1,4,ClS(=O)(=O)CC1CCN(C1)C(=O)OCc2ccccc2,3,,1,0,5,2,0
2,COC(=O)C1=CC=CC=C1NS(=O)(=O)C1=CC=C2NC(=O)OC2=C1,high,80.0,20,COC(=O)c1ccccc1N,4,ClS(=O)(=O)c1ccc2[nH]c(=O)oc2c1,3,,1,0,5,2,0
3,COC(=O)C1=CC=CC=C1NS(=O)(=O)C1CCOC1=O,0,0.0,20,COC(=O)c1ccccc1N,4,ClS(=O)(=O)C1CCOC1=O,3,,1,0,5,2,0
4,FC(F)(F)C1=CC=C(CCS(=O)(=O)NC2=CC=C3OCCOC3=C2)...,low,16.7,20,Nc1ccc2OCCOc2c1,4,FC(F)(F)c1ccc(CCS(=O)(=O)Cl)cc1,3,,1,0,5,2,0


# SMILES to Morgan circular fingerprints with radius 4

In [3]:
%%time

SMILES_cols = ['SMILES', 'REAGENTSMI1', 'REAGENTSMI2', 'REAGENTSMI3']
SMILES_fps = dict()

for col in tqdm(SMILES_cols):
    ecfp_df = pd.DataFrame(df[col].apply(get_ECFP4).tolist())
    ecfp_df.columns = list(map(lambda n: f'{col}-ECFP4-{n}', ecfp_df.columns))
    df = pd.concat([df, ecfp_df], axis='columns')
    df = df.drop(columns=[col])
    del ecfp_df

df['YIELD_bin'] = 1
df.loc[df.YIELD_tier=='0', 'YIELD_bin'] = 0
clf_bin_df = df.drop(columns=['YIELD_numeric', 'YIELD_tier'])
clf_mult_df = df[df.YIELD_bin!=0].drop(columns=['YIELD_numeric', 'YIELD_bin'])
reg_df = df[df.YIELD_bin!=0].drop(columns=['YIELD_tier', 'YIELD_bin'])
del df

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:53<00:00, 43.43s/it]

CPU times: user 2min 32s, sys: 14.7 s, total: 2min 46s
Wall time: 2min 53s





In [13]:
%%time

clf_bin_df.to_csv('yield_clf_bin_df.csv', index=False)
clf_mult_df.to_csv('yield_clf_mult_df.csv', index=False)
reg_df.to_csv('yield_reg_df.csv', index=False)

CPU times: user 3min 52s, sys: 12.9 s, total: 4min 5s
Wall time: 4min 8s
