In [1]:
import numpy as np
import pandas as pd

# Load the Delaney (ESOL) solubility table; the path is relative to the notebook's working directory.
database = pd.read_csv('delaney.csv')

In [2]:
# Preview the first rows: compound ID, measured and ESOL-predicted log solubility, and SMILES.
database.head()

Unnamed: 0,Compound ID,measured log(solubility:mol/L),ESOL predicted log(solubility:mol/L),SMILES
0,"1,1,1,2-Tetrachloroethane",-2.18,-2.794,ClCC(Cl)(Cl)Cl
1,"1,1,1-Trichloroethane",-2.0,-2.232,CC(Cl)(Cl)Cl
2,"1,1,2,2-Tetrachloroethane",-1.74,-2.549,ClC(Cl)C(Cl)Cl
3,"1,1,2-Trichloroethane",-1.48,-1.961,ClCC(Cl)Cl
4,"1,1,2-Trichlorotrifluoroethane",-3.04,-3.077,FC(F)(Cl)C(F)(Cl)Cl


In [3]:
def AromaticAtoms(m):
    """Count the aromatic atoms in a molecule.

    Parameters
    ----------
    m : molecule object
        Must provide ``GetNumAtoms()`` and ``GetAtomWithIdx(i)``; each atom
        returned must provide ``GetIsAromatic()`` (e.g. an RDKit ``Mol``).

    Returns
    -------
    int
        Number of atoms whose ``GetIsAromatic()`` flag is set; 0 for a
        molecule with no atoms.
    """
    # Count the flagged atoms directly instead of building a list of ones
    # and comparing each flag against True.
    return sum(
        1
        for idx in range(m.GetNumAtoms())
        if m.GetAtomWithIdx(idx).GetIsAromatic()
    )

In [4]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def generate_Delaney_desc(smiles):
    '''
    Compute the four Delaney (ESOL) descriptors for a collection of SMILES strings.

    To predict LogS (log of the aqueous solubility), the study by Delaney makes use of 4 molecular descriptors:
    cLogP (Octanol-water partition coefficient)
    MW (Molecular weight)
    RB (Number of rotatable bonds)
    AP (Aromatic proportion = number of aromatic atoms / number of heavy atoms)

    Parameters
    ----------
    smiles : iterable of str, or str
        SMILES strings, one per molecule (e.g. a pandas Series). A single
        string is treated as one molecule rather than iterated per character.

    Returns
    -------
    pandas.DataFrame
        One row per input SMILES, in input order, with float columns
        "MolLogP", "MolWt", "NumRotatableBonds" and "AromaticProportion"
        and a default integer index. Empty input yields an empty frame
        with those four columns.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed (``Chem.MolFromSmiles`` returned
        ``None``).
    '''
    columnNames = ["MolLogP", "MolWt", "NumRotatableBonds", "AromaticProportion"]

    # A bare string would otherwise be walked character by character.
    if isinstance(smiles, str):
        smiles = [smiles]

    rows = []
    for position, elem in enumerate(smiles):
        mol = Chem.MolFromSmiles(elem)
        if mol is None:
            # Fail early with the offending input instead of an opaque error
            # from the first descriptor call on None.
            raise ValueError(
                "Could not parse SMILES at position %d: %r" % (position, elem)
            )

        heavy_atoms = Descriptors.HeavyAtomCount(mol)
        # A molecule without heavy atoms has no aromatic atoms either; report
        # a proportion of 0 rather than dividing by zero.
        if heavy_atoms:
            desc_AromaticProportion = AromaticAtoms(mol) / heavy_atoms
        else:
            desc_AromaticProportion = 0.0

        rows.append([Descriptors.MolLogP(mol),
                     Descriptors.MolWt(mol),
                     Descriptors.NumRotatableBonds(mol),
                     desc_AromaticProportion])

    # Build the array once (no repeated vstack). The explicit reshape keeps
    # the data 2-D for zero or one molecule, which the DataFrame constructor
    # needs in order to match the four column names.
    baseData = np.array(rows, dtype=float).reshape(len(rows), len(columnNames))
    descriptors = pd.DataFrame(data=baseData, columns=columnNames)

    return descriptors

In [5]:
# Feature matrix: the four Delaney descriptors computed from each SMILES string.
X = generate_Delaney_desc(database["SMILES"])

# Target: the measured log solubility. Selected by column name rather than by
# position so a reordered or extra CSV column cannot silently change the target.
Y = database["measured log(solubility:mol/L)"]

In [8]:
# Preview the computed descriptor table (four descriptor columns per molecule).
X.head()

Unnamed: 0,MolLogP,MolWt,NumRotatableBonds,AromaticProportion
0,2.5954,167.85,0.0,0.0
1,2.3765,133.405,0.0,0.0
2,2.5938,167.85,1.0,0.0
3,2.0289,133.405,1.0,0.0
4,2.9189,187.375,1.0,0.0


In [7]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split, and therefore every number printed
# below, is reproducible across Restart & Run All.
SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=SEED
)

model = linear_model.LinearRegression()
model.fit(X_train, Y_train)


def report_fit(label, y_true, y_pred):
    """Print the fitted model's parameters and the MSE / R^2 for one data split.

    The coefficients and intercept belong to the fitted model, so they are
    identical in every report; only the two metrics depend on the split.
    """
    print ("\n" + label)
    print('Coefficients:', model.coef_)
    print('Intercept:', model.intercept_)
    print('Mean squared error (MSE): %.2f'
          % mean_squared_error(y_true, y_pred))
    print('Coefficient of determination (R^2): %.2f'
          % r2_score(y_true, y_pred))


Y_pred_train = model.predict(X_train)
report_fit("Training set", Y_train, Y_pred_train)

Y_pred_test = model.predict(X_test)
report_fit("Test set", Y_test, Y_pred_test)


Training set
Coefficients: [-0.74800644 -0.00661186  0.01688302 -0.3756643 ]
Intercept: 0.23066830457999998
Mean squared error (MSE): 1.00
Coefficient of determination (R^2): 0.77

Test set
Coefficients: [-0.74800644 -0.00661186  0.01688302 -0.3756643 ]
Intercept: 0.23066830457999998
Mean squared error (MSE): 1.06
Coefficient of determination (R^2): 0.78
