In [1]:
import numpy as np
import pandas as pd
import urllib.request 
from rdkit import Chem, rdBase
from rdkit.Chem import AllChem, Draw, Descriptors, PandasTools
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.model_selection import train_test_split

# Download the SMILES / CAS number / activity table and store it locally as
# 'ames.txt'; the file is tab-separated and has no header row.
url = 'https://raw.githubusercontent.com/onecoinbuybus/Database_chemoinformatics/master/smiles_cas_N6512.smi'
urllib.request.urlretrieve(url, 'ames.txt')
df = pd.read_csv('ames.txt', header=None, sep='\t')
df.columns = ['smiles', 'CAS_NO', 'activity']
PandasTools.AddMoleculeColumnToFrame(frame=df, smilesCol='smiles')

# Parse every SMILES exactly once. MolFromSmiles returns None for strings RDKit
# cannot parse, so the same pass yields both the rows to drop and the molecules
# to keep (the previous version parsed each string twice).
parsed_mols = [Chem.MolFromSmiles(smile) for smile in df['smiles']]

# Row labels of the unparsable entries; df still has its default integer index
# here, so labels and positions coincide.
none_list = [label for label, mol in zip(df.index, parsed_mols) if mol is None]

df = df.drop(none_list)
mols = [mol for mol in parsed_mols if mol is not None]

# One MACCS-keys fingerprint per valid molecule, expanded into a plain list of
# bit values so it can be turned into a numeric array later.
maccskeys = [list(AllChem.GetMACCSKeysFingerprint(m)) for m in mols]

RDKit ERROR: [20:24:22] SMILES Parse Error: syntax error while parsing: NNC(=O)CNC(=O)\C=N\#N
RDKit ERROR: [20:24:22] SMILES Parse Error: Failed parsing SMILES 'NNC(=O)CNC(=O)\C=N\#N' for input: 'NNC(=O)CNC(=O)\C=N\#N'
RDKit ERROR: [20:24:22] SMILES Parse Error: syntax error while parsing: O=C1NC(=O)\C(=N/#N)\C=N1
RDKit ERROR: [20:24:22] SMILES Parse Error: Failed parsing SMILES 'O=C1NC(=O)\C(=N/#N)\C=N1' for input: 'O=C1NC(=O)\C(=N/#N)\C=N1'
RDKit ERROR: [20:24:22] SMILES Parse Error: syntax error while parsing: NC(=O)CNC(=O)\C=N\#N
RDKit ERROR: [20:24:22] SMILES Parse Error: Failed parsing SMILES 'NC(=O)CNC(=O)\C=N\#N' for input: 'NC(=O)CNC(=O)\C=N\#N'
RDKit ERROR: [20:24:22] SMILES Parse Error: syntax error while parsing: CCCCN(CC(O)C1=C\C(=N/#N)\C(=O)C=C1)N=O
RDKit ERROR: [20:24:22] SMILES Parse Error: Failed parsing SMILES 'CCCCN(CC(O)C1=C\C(=N/#N)\C(=O)C=C1)N=O' for input: 'CCCCN(CC(O)C1=C\C(=N/#N)\C(=O)C=C1)N=O'
RDKit ERROR: [20:24:22] SMILES Parse Error: syntax error while pars

In [2]:
# Compute the two target properties ('qed' and 'MolLogP') for the first 110
# molecules only, one row per molecule.
descriptor_names = ['qed', 'MolLogP']
descriptor_calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
descriptors = pd.DataFrame(
    list(map(descriptor_calculator.CalcDescriptors, mols[:110])),
    columns=descriptor_names,
)

In [3]:
# Property matrix as an independent numpy array: one row per molecule, columns
# in the order of `descriptor_names`.
properties = descriptors.to_numpy(copy=True)
properties[:10]

array([[ 0.18458152,  6.3494    ],
       [ 0.64928404,  3.9456    ],
       [ 0.39401407, -1.3818    ],
       [ 0.41063321,  5.17742   ],
       [ 0.88511218,  3.2809    ],
       [ 0.45584159,  2.21164   ],
       [ 0.30569748,  6.0138    ],
       [ 0.3683784 ,  0.6731    ],
       [ 0.61264255,  1.8412    ],
       [ 0.60254391,  4.1766    ]])

In [4]:
# Feature matrix: MACCS-key bit lists of the same first 110 molecules that the
# descriptor table was computed for.
data = np.asarray(maccskeys[:110])

In [5]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from multiprocessing import Pool
import copy as cp
import random
from sklearn.model_selection import GridSearchCV
import sys, csv, time, argparse

In [6]:
# x
features_observed = data[:10]
features_unchecked = data[10:]

#y
properties_observed = properties[:10]
properties_unchecked = properties[10:]

In [7]:
# Feature scaler: fitted on the observed rows only, then applied to both the
# observed rows and the unchecked candidates.
sc = StandardScaler().fit(features_observed)
sc_features_observed = sc.transform(features_observed)
sc_features_unchecked = sc.transform(features_unchecked)

# Property scaler: likewise fitted on the observed properties only.
sc_property = StandardScaler().fit(properties_observed)
sc_properties_observed = sc_property.transform(properties_observed)

In [8]:
def build_model(prediction_model, x_train, y_train, n_jobs=None, random_state=None):
    """Fit a regressor for a single target property, tuned by grid search.

    Parameters
    ----------
    prediction_model : str
        Model family to build. Only 'RF' (random forest regression) is
        implemented.
    x_train : array-like
        Training features, passed unchanged to ``GridSearchCV.fit``.
    y_train : array-like
        Training targets for one property.
    n_jobs : int or None, optional
        Number of parallel jobs for the grid search. When None (the default)
        the notebook-level ``parallel`` variable is used if it exists,
        otherwise 1, so existing three-argument calls keep working.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestRegressor``. None (the default) keeps
        the original unseeded behaviour.

    Returns
    -------
    RandomForestRegressor
        Forest fitted on the full training data with the ``n_estimators``
        value (10, 50 or 100) that scored best on 3-fold cross-validated R^2.

    Raises
    ------
    ValueError
        If ``prediction_model`` is not a supported model name. Previously the
        function silently returned None in that case.
    """
    if prediction_model != 'RF':
        raise ValueError(
            "Unsupported prediction_model %r; only 'RF' is implemented" % (prediction_model,)
        )

    if n_jobs is None:
        # Backward compatibility: older cells configure parallelism through a
        # module-level `parallel` variable instead of an argument.
        n_jobs = globals().get('parallel', 1)

    params = {'n_estimators': [10, 50, 100]}
    gridsearch = GridSearchCV(
        RandomForestRegressor(random_state=random_state),
        param_grid=params,
        cv=3,
        scoring="r2",
        n_jobs=n_jobs,
        refit=True,
        verbose=1,
    )
    gridsearch.fit(x_train, y_train)

    # With refit=True the search has already refitted the winning configuration
    # on all of (x_train, y_train), so a second manual fit is unnecessary.
    return gridsearch.best_estimator_

In [9]:
# Number of parallel jobs; build_model reads this for its grid search.
parallel = 1

# Build a separate model for each of the two properties, trained on the scaled
# features and the corresponding (unscaled) property column.
model_list = [
    build_model('RF', sc_features_observed, properties_observed[:, d])
    for d in range(2)
]

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [10]:
# Predict both property values for every unchecked candidate.
# Each model returns one value per candidate; stacking the two result vectors
# and transposing gives one row per candidate and one column per property.
predicted_properties_list = np.array(
    [model_list[d].predict(sc_features_unchecked) for d in range(2)]
).T

In [11]:
# Stein Novelty Calculation

In [12]:
def hesgau(x, y, sigma):
    """Return (dim/sigma - d/sigma**2) * exp(-d / (2*sigma)) with d = ||x - y||^2.

    This equals the trace of the mixed second derivative (d^2 k / dx dy) of the
    Gaussian kernel k(x, y) = exp(-||x - y||^2 / (2*sigma)). Note that `sigma`
    enters un-squared, i.e. it plays the role of a variance-like bandwidth.

    Parameters
    ----------
    x, y : array-like of equal length
        The two points to compare; `len(x)` is taken as the dimension.
    sigma : float
        Kernel bandwidth.
    """
    dim = len(x)
    diff = x - y
    sq_dist = np.sum(np.power(diff, 2))
    prefactor = dim/sigma - sq_dist/sigma**2
    return prefactor*np.exp(-sq_dist/(2*sigma))

def stein_novelty(point, data_list, sigma):
    """Score how novel `point` is relative to the rows of `data_list`.

    The score is the negated sum of `hesgau(point, row, sigma)` over all rows,
    divided by n*(n+1)/2 where n is the number of rows.

    Parameters
    ----------
    point : 1-D array
        Candidate point to score.
    data_list : 2-D array
        Reference points, one per row (indexed as `data_list[k, :]`).
    sigma : float
        Bandwidth forwarded to `hesgau`.
    """
    n = len(data_list)
    kernel_terms = [hesgau(point, data_list[k, :], sigma) for k in range(n)]
    normaliser = n*(n+1)/2
    return -(np.sum(kernel_terms)/normaliser)

In [13]:
# Put the predicted properties on the same standardized scale as
# sc_properties_observed, using the scaler fitted on the observed properties.
sc_predicted_properties_list = sc_property.transform(predicted_properties_list) 

In [14]:
# Example: Stein novelty of the first candidate's predicted (scaled) properties
# relative to the scaled observed properties; the value is shown as cell output.
stein_novelty(sc_predicted_properties_list[0], sc_properties_observed, sigma=1)

-0.05654122932088566

In [15]:
# Finding the next candidate: recommendation function and exploration loop

In [16]:
def recommend_next(prediction_model, features_observed, features_unchecked, properties_observed, sigma=1):
    """Pick the unchecked candidate whose predicted properties are most novel.

    Steps: standardize features and properties using the observed data only,
    fit one model per property column with `build_model`, predict every
    property for all unchecked candidates, score each prediction with
    `stein_novelty` against the observed properties, and return the candidate
    with the highest score.

    Parameters
    ----------
    prediction_model : str
        Model name forwarded to `build_model`.
    features_observed : 2-D array
        Features of the already observed samples.
    features_unchecked : 2-D array
        Features of the remaining candidates.
    properties_observed : 2-D array
        Observed properties, one column per property. The number of columns
        determines how many models are built (previously hard-coded to 2).
    sigma : float, optional
        Bandwidth forwarded to `stein_novelty`; defaults to 1, the value that
        was previously hard-coded.

    Returns
    -------
    tuple
        (index of the chosen candidate within `features_unchecked`,
         its predicted properties in original units,
         its Stein novelty score).

    Raises
    ------
    ValueError
        If there are no unchecked candidates left to choose from.
    """
    if len(features_unchecked) == 0:
        raise ValueError("features_unchecked is empty: no candidates left to recommend")

    # Scalers are fitted on observed data only and then applied to candidates.
    feature_scaler = StandardScaler().fit(features_observed)
    sc_features_observed = feature_scaler.transform(features_observed)
    sc_features_unchecked = feature_scaler.transform(features_unchecked)

    sc_property = StandardScaler().fit(properties_observed)
    sc_properties_observed = sc_property.transform(properties_observed)

    # Build a separate model for each property; targets are left unscaled so
    # predictions come out in original units.
    n_properties = properties_observed.shape[1]
    model_list = [
        build_model(prediction_model, sc_features_observed, properties_observed[:, d])
        for d in range(n_properties)
    ]

    # One row per candidate, one column per property.
    predicted_properties_list = np.array(
        [model.predict(sc_features_unchecked) for model in model_list]
    ).T

    # Calc. Stein novelty in the standardized property space.
    sc_predicted_properties_list = sc_property.transform(predicted_properties_list)
    sn_data = [
        stein_novelty(point, sc_properties_observed, sigma=sigma)
        for point in sc_predicted_properties_list
    ]

    # Select the candidate with the largest novelty score.
    maximum_index = np.argmax(sn_data)

    return maximum_index, predicted_properties_list[maximum_index], sn_data[maximum_index]

In [17]:
# Number of exploration rounds; each round moves one candidate from the
# unchecked pool into the observed set.
num_loop = 10

for step in range(num_loop):
    print('Exploration:', step)
    recommended_index, predicted_properties, SN = recommend_next(
        'RF', features_observed, features_unchecked, properties_observed
    )
    print('Recommended_index', recommended_index, 'predicted_properties', predicted_properties, 'Stein novelty', SN)

    # "Run the experiment" for the recommended candidate: here its true
    # properties are simply looked up in properties_unchecked.
    chosen_features = features_unchecked[recommended_index]
    chosen_properties = properties_unchecked[recommended_index]

    # Add the recommended data to the observed set.
    features_observed = np.append(features_observed, [chosen_features], axis=0)
    properties_observed = np.append(properties_observed, [chosen_properties], axis=0)

    # Remove the recommended data from the unchecked pool.
    features_unchecked = np.delete(features_unchecked, recommended_index, axis=0)
    properties_unchecked = np.delete(properties_unchecked, recommended_index, axis=0)

Exploration: 0
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Recommended_index 54 predicted_properties [0.59782408 0.903816  ] Stein novelty -0.03669726271314275
Exploration: 1
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Recommended_index 12 predicted_properties [ 0.44592185 -0.031954  ] Stein novelty -0.034621463301366825
Exploration: 2
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Recommended_index 11 predicted_properties [0.45004822 0.6821412 ] Stein novelty -0.03998524519295293
Exploration: 3
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Recommended_index 10 predicted_properties [0.41952621 5.3883508 ] Stein novelty -0.0369188367749818
Exploration: 4
Fitting 3 folds for each of 3 candidates, total