In [1]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

In [2]:
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef

#from sklearn.externals import joblib
import joblib

#### Reading molecules and activity from SDF

In [3]:
fname = "data/logBB.sdf"

mols = []
y = []
for mol in Chem.SDMolSupplier(fname):
    if mol is not None:
        mols.append(mol)
        y.append(mol.GetIntProp("logBB_class"))

nmol = len(mols)
print ("The number of molecule is ", nmol)

The number of molecule is  321


#### Calculate descriptors (fingerprints) and convert them into numpy array

In [4]:
# check wether the data set is balanced
sum(y) / len(y)

0.5545171339563862

In [5]:
def rdkit_numpy_convert(fp):
    output = []
    for f in fp:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [6]:
# generate binary Morgan fingerprint with radius 2
fp = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in mols]

In [7]:
x = rdkit_numpy_convert(fp)

# Set random seed to make all further calculations reproducible
seed = 42
# randomly select 20% of compounds as test set
indices = np.arange(len(y))

# Split the whole set on training and test sets
x_tr, x_ts, y_tr, y_ts, idx1, idx2 = train_test_split(x, y, indices, test_size=0.20, random_state=seed)

In [8]:
def predict_(similarity, fp, nmol, idx1, idx2, y_ts):
    sim_dist = np.zeros((nmol,nmol))
    for imol1 in range(nmol):
        for imol2 in range(imol1+1,nmol):
            if (similarity == "DiceSimilarity"):
                sim_dist[imol1,imol2] = DataStructs.DiceSimilarity(fp[imol1],fp[imol2])
            elif (similarity == "FingerprintSimilarity"):
                sim_dist[imol1,imol2] = DataStructs.FingerprintSimilarity(fp[imol1],fp[imol2])
            sim_dist[imol2,imol1] = sim_dist[imol1,imol2]


    y_pred_ts = []
    for idtest in idx2:
        numerator_   = y_tr[:] * sim_dist[idtest,idx1]
        denominator_ = sim_dist[idtest,idx1]
        y_pred_ts.append( int(round(sum(numerator_)/sum(denominator_))) )
    
    # calc statistics
    print(similarity)
    print("Accuracy = ", accuracy_score(y_ts, y_pred_ts))
    print("MCC = ", matthews_corrcoef(y_ts, y_pred_ts))
    print("Kappa = ", cohen_kappa_score(y_ts, y_pred_ts))
    print("\n")

In [9]:
predict_("DiceSimilarity", fp, nmol, idx1, idx2, y_ts)
predict_("FingerprintSimilarity", fp, nmol, idx1, idx2, y_ts)

DiceSimilarity
Accuracy =  0.7230769230769231
MCC =  0.47559486560567094
Kappa =  0.3689320388349514


FingerprintSimilarity
Accuracy =  0.7230769230769231
MCC =  0.47559486560567094
Kappa =  0.3689320388349514




In [10]:
fp = [Chem.RDKFingerprint(m) for m in mols]
predict_("DiceSimilarity", fp, nmol, idx1, idx2, y_ts)
predict_("FingerprintSimilarity", fp, nmol, idx1, idx2, y_ts)

DiceSimilarity
Accuracy =  0.6615384615384615
MCC =  0.36961423790257175
Kappa =  0.34523809523809523


FingerprintSimilarity
Accuracy =  0.6923076923076923
MCC =  0.43334083064439444
Kappa =  0.40476190476190477




In [25]:
from rdkit.Chem import MACCSkeys
fp = [MACCSkeys.GenMACCSKeys(m) for m in mols]
predict_("DiceSimilarity", fp, nmol, idx1, idx2, y_ts)
predict_("FingerprintSimilarity", fp, nmol, idx1, idx2, y_ts)

DiceSimilarity
Accuracy =  0.8
MCC =  0.5973424234499494
Kappa =  0.5947242206235013


FingerprintSimilarity
Accuracy =  0.7846153846153846
MCC =  0.5702439778492803
Kappa =  0.5658396946564885




In [26]:
from rdkit.Chem.AtomPairs import Pairs
fp = [Pairs.GetAtomPairFingerprint(m) for m in mols]
predict_("DiceSimilarity", fp, nmol, idx1, idx2, y_ts)
predict_("FingerprintSimilarity", fp, nmol, idx1, idx2, y_ts)

DiceSimilarity
Accuracy =  0.8
MCC =  0.621568283815359
Kappa =  0.5573598742797277




AttributeError: 'IntSparseIntVect' object has no attribute 'GetNumBits'

In [27]:
fp = [Pairs.GetAtomPairFingerprintAsBitVect(m) for m in mols]
predict_("DiceSimilarity", fp, nmol, idx1, idx2, y_ts)
predict_("FingerprintSimilarity", fp, nmol, idx1, idx2, y_ts)

DiceSimilarity
Accuracy =  0.7846153846153846
MCC =  0.5756878531144476
Kappa =  0.5260416666666667


FingerprintSimilarity
Accuracy =  0.8
MCC =  0.621568283815359
Kappa =  0.5573598742797277




In [28]:
fp = [AllChem.GetMorganFingerprint(m,2) for m in mols]
predict_("DiceSimilarity", fp, nmol, idx1, idx2, y_ts)
predict_("FingerprintSimilarity", fp, nmol, idx1, idx2, y_ts)

DiceSimilarity
Accuracy =  0.676923076923077
MCC =  0.34941520467836257
Kappa =  0.2593597395550733




AttributeError: 'UIntSparseIntVect' object has no attribute 'GetNumBits'

#### Create folds for cross-validation

In [11]:
cv = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
#cv = StratifiedKFold(n_splits=5, random_state=None)

#### Scale X

This step may be crucial for certain modeling approaches lke SVM.
In the case of binary fingerprints it may be less useful.

In [12]:
# obtain scale object which can be further applied to scale any data to fit the training set
scale = StandardScaler().fit(x_tr)
x_tr = scale.transform(x_tr)

In [13]:
# it is a good idea to save it for future use
joblib.dump(scale, "data/logBB_scale.pkl", compress=3)

['data/logBB_scale.pkl']

#### Search for optimal tuning parameters and build the model

In [14]:
# create grid search dictionary
param_grid = {"max_features": [x_tr.shape[1] // 10, x_tr.shape[1] // 7, x_tr.shape[1] // 5, x_tr.shape[1] // 3], 
              "n_estimators": [100, 250, 500]}

In [15]:
# setup model building
m = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=2, cv=cv, verbose=1)

In [16]:
# run model building
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=RandomForestClassifier(), n_jobs=2,
             param_grid={'max_features': [204, 292, 409, 682],
                         'n_estimators': [100, 250, 500]},
             verbose=1)

In [17]:
m.best_params_

{'max_features': 409, 'n_estimators': 250}

In [18]:
m.best_score_

0.7890648567119156

In [19]:
m.cv_results_['mean_test_score']

array([0.78129713, 0.773454  , 0.78129713, 0.78137255, 0.78514329,
       0.77737557, 0.78129713, 0.78906486, 0.78514329, 0.78137255,
       0.7852187 , 0.7852187 ])

In [20]:
m.cv_results_['params']

[{'max_features': 204, 'n_estimators': 100},
 {'max_features': 204, 'n_estimators': 250},
 {'max_features': 204, 'n_estimators': 500},
 {'max_features': 292, 'n_estimators': 100},
 {'max_features': 292, 'n_estimators': 250},
 {'max_features': 292, 'n_estimators': 500},
 {'max_features': 409, 'n_estimators': 100},
 {'max_features': 409, 'n_estimators': 250},
 {'max_features': 409, 'n_estimators': 500},
 {'max_features': 682, 'n_estimators': 100},
 {'max_features': 682, 'n_estimators': 250},
 {'max_features': 682, 'n_estimators': 500}]

#### Save model

In [21]:
joblib.dump(m, "data/logBB_rf_morgan.pkl", compress=3)

['data/logBB_rf_morgan.pkl']

#### Predict test set compounds

In [22]:
# load scale if necessary
scale = joblib.load("data/logBB_scale.pkl")

# scale descriptors of the test set compounds
x_ts = scale.transform(x_ts)

# predict logBB class
pred_rf = m.predict(x_ts)

In [23]:
pred_rf

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1])

In [24]:
# calc statistics
print("Accuracy = ", accuracy_score(y_ts, pred_rf))
print("MCC = ", matthews_corrcoef(y_ts, pred_rf))
print("Kappa = ", cohen_kappa_score(y_ts, pred_rf))

Accuracy =  0.7692307692307693
MCC =  0.5276492699681888
Kappa =  0.527387300048473
