## script that build qsar models.

At first, calculate molecular descriptors.
The code is follows

In [1]:
import sys
import pickle as cPickle
import numpy as np
from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn import preprocessing
 
min_max_scaler = preprocessing.MinMaxScaler()

trainset = [mol for mol in Chem.SDMolSupplier("solubility.train.sdf") if mol is not None]
testset = [mol for mol in Chem.SDMolSupplier("solubility.test.sdf") if mol is not None]
 
nms = [x[0] for x in Descriptors._descList]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms)
print(len(nms))

trainDescrs = [calc.CalcDescriptors(x) for x in trainset]
testDescrs  = [calc.CalcDescriptors(x) for x in testset]
trainDescrs = np.array(trainDescrs)
testDescrs = np.array(testDescrs)
 
x_train_minmax = min_max_scaler.fit_transform( trainDescrs )
x_test_minmax = min_max_scaler.fit_transform( testDescrs )

208


In [2]:
classes={'(A) low':0,'(B) medium':1,'(C) high':1}

train_acts = np.array([classes[mol.GetProp("SOL_classification")] for mol in trainset], dtype="int")
test_acts = np.array([classes[mol.GetProp("SOL_classification")] for mol in testset], dtype="int")

dataset = ( (x_train_minmax, train_acts),(x_train_minmax, train_acts), (x_test_minmax, test_acts) )
#tmp_dataset = np.array(dataset)
#print (np.any(np.isnan(tmp_dataset)))

f = open("rdk_sol_set_norm_descs.pkl", "wb")
cPickle.dump(dataset,f)
f.close()

Now I could get train and test data set as pkl file.
Next, build the model using scikit-learn
The code build the model using RANDOMFOREST, SVM, Naive Bayes, Ristrict Bollzmann-SVM classifiler(RBS).
Scikit-learn can join RBM-SVM using pipeline method.
Model can save as pkl file using cPicke. (following code print results only. ;-) )
Scikit-learn is very simple to use, and powerful.

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
#from sklearn import cross_validation
from sklearn import metrics
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline


f = open("rdk_sol_set_norm_descs.pkl", "rb")
train, valid, test = cPickle.load(f)

train_x, train_y = train
test_x, test_y = test

print ("RANDPMFOREST")
nclf = RandomForestClassifier( n_estimators=100, max_depth=5, random_state=0, n_jobs=1 )
nclf = nclf.fit( train_x, train_y )
preds = nclf.predict( test_x )
print (metrics.confusion_matrix(test_y, preds))
print (metrics.classification_report(test_y, preds))
accuracy = nclf.score(test_x, test_y)
print (accuracy)

RANDPMFOREST
[[101   1]
 [ 50 105]]
              precision    recall  f1-score   support

           0       0.67      0.99      0.80       102
           1       0.99      0.68      0.80       155

    accuracy                           0.80       257
   macro avg       0.83      0.83      0.80       257
weighted avg       0.86      0.80      0.80       257

0.8015564202334631


In [4]:
print ("SVM")
clf_svm = svm.SVC( gamma=0.001, C=100. )
clf_svm = clf_svm.fit( train_x, train_y )
preds_SVM = clf_svm.predict( test_x )
print (metrics.confusion_matrix( test_y, preds_SVM ))
print (metrics.classification_report( test_y, preds_SVM ))
accuracy = clf_svm.score( test_x, test_y )
print (accuracy)

SVM
[[ 97   5]
 [ 29 126]]
              precision    recall  f1-score   support

           0       0.77      0.95      0.85       102
           1       0.96      0.81      0.88       155

    accuracy                           0.87       257
   macro avg       0.87      0.88      0.87       257
weighted avg       0.89      0.87      0.87       257

0.867704280155642


In [5]:
print ("NB")
gnb = GaussianNB()
clf_NB = gnb.fit( train_x, train_y )
preds_NB = clf_NB.predict( test_x )
print (metrics.confusion_matrix( test_y, preds_NB ) )
print (metrics.classification_report( test_y, preds_NB ) )
accuracy = clf_NB.score( test_x, test_y )
print (accuracy)

NB
[[ 42  60]
 [ 16 139]]
              precision    recall  f1-score   support

           0       0.72      0.41      0.53       102
           1       0.70      0.90      0.79       155

    accuracy                           0.70       257
   macro avg       0.71      0.65      0.66       257
weighted avg       0.71      0.70      0.68       257

0.7042801556420234


In [6]:
print ("RBM")
cls_svm2 = svm.SVC( gamma=0.001, C=100. )
rbm = BernoulliRBM(random_state = 0, verbose = True)
classifier = Pipeline( steps=[("rbm", rbm), ("cls_svm2", cls_svm2)] )
rbm.learning_rate = 0.06
rbm.n_iter = 20
rbm.n_compornents = 1000
classifier.fit(train_x, train_y)
pred_RBM = classifier.predict(test_x)
print (metrics.confusion_matrix(test_y, pred_RBM) )
print (metrics.classification_report(test_y, pred_RBM) )
accuracy = classifier.score( test_x, test_y )
print (accuracy)

RBM
[BernoulliRBM] Iteration 1, pseudo-likelihood = -54.15, time = 0.24s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -54.07, time = 0.27s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -53.57, time = 0.25s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -50.35, time = 0.33s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -48.29, time = 0.32s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -49.30, time = 0.31s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -47.74, time = 0.28s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -48.52, time = 0.39s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -47.15, time = 0.30s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -49.40, time = 0.30s
[BernoulliRBM] Iteration 11, pseudo-likelihood = -48.34, time = 0.31s
[BernoulliRBM] Iteration 12, pseudo-likelihood = -47.81, time = 0.62s
[BernoulliRBM] Iteration 13, pseudo-likelihood = -48.61, time = 0.32s
[BernoulliRBM] Iteration 14, pseudo-likelihood = -46.24, time = 0.28s
[BernoulliRBM] Iteration 