In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem

In [3]:
# Load the training split. Each non-empty line is expected to hold
# "<SMILES>\t<target value>"; the first column goes to `smiles_train`
# and the second, parsed as float, to `y_train`.
smiles_train = []
y_train = []
with open('./Dataset/basic_train_0.70_smiles.txt') as f:
    # Iterate the file object directly so the whole file is never held
    # in memory at once (readlines() would materialise every line).
    for line in f:
        # Skip blank lines (e.g. an empty line at the end of the file);
        # they would otherwise raise IndexError on the column lookup.
        if not line.strip():
            continue
        fields = line.rstrip('\n').split('\t')
        smiles_train.append(fields[0])
        y_train.append(float(fields[1]))

In [4]:
# Load the validation split. Each non-empty line is expected to hold
# "<SMILES>\t<target value>"; the first column goes to `smiles_val`
# and the second, parsed as float, to `y_val`.
smiles_val = []
y_val = []
with open('./Dataset/basic_val_0.15_smiles.txt') as f:
    # Iterate the file object directly so the whole file is never held
    # in memory at once (readlines() would materialise every line).
    for line in f:
        # Skip blank lines (e.g. an empty line at the end of the file);
        # they would otherwise raise IndexError on the column lookup.
        if not line.strip():
            continue
        fields = line.rstrip('\n').split('\t')
        smiles_val.append(fields[0])
        y_val.append(float(fields[1]))

In [5]:
def _morgan_fingerprints(smiles_list, radius=2, n_bits=2048):
    """Convert SMILES strings into Morgan fingerprint bit lists.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings, one per molecule.
    radius : int, default 2
        Morgan radius passed to RDKit.
    n_bits : int, default 2048
        Length of each fingerprint bit vector.

    Returns
    -------
    list of list of int
        One list of 0/1 integers per input SMILES, in input order, so
        the result stays aligned with the matching target list.

    Raises
    ------
    ValueError
        If RDKit cannot parse a SMILES string. Silently dropping the
        molecule would misalign features and targets, so fail loudly.
    """
    fingerprints = []
    for idx, smiles in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None.
        if mol is None:
            raise ValueError(
                'Could not parse SMILES at index %d: %r' % (idx, smiles)
            )
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        fingerprints.append([int(bit) for bit in bit_vect])
    return fingerprints


# Featurise both splits with identical fingerprint settings.
fp_train = _morgan_fingerprints(smiles_train)
fp_val = _morgan_fingerprints(smiles_val)

In [6]:
# Turn the per-molecule fingerprint lists into NumPy arrays and print
# each array's shape as a quick sanity check.
X_train = np.array(fp_train)
print(np.shape(X_train))

X_val = np.array(fp_val)
print(np.shape(X_val))

(5905, 2048)
(1265, 2048)


In [7]:
# Hyper-parameter grid for the XGBoost sweep: every tree count below is
# combined with every learning rate.
n_estimators_list = [
    50, 100, 300, 500,
    1000, 1500, 2000,
]
learning_rate_list = [
    0.05, 0.1, 0.2, 0.5,
]

In [14]:
# Grid search: fit one XGBoost regressor per (n_estimators, learning_rate)
# pair on the training split and score it on the validation split.
# For every combination the loop prints, in order: MAE, RMSE, R^2.
# The same numbers are also collected in `xgb_grid_results` so the sweep
# can be ranked afterwards, e.g.
# pd.DataFrame(xgb_grid_results).sort_values('rmse').
xgb_grid_results = []
for n_estimators in n_estimators_list:
    for learning_rate in learning_rate_list:
        model = XGBRegressor(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=0,
            # `n_jobs` replaces the deprecated `nthread` alias of the
            # scikit-learn wrapper; same thread count as before.
            n_jobs=20,
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        mae = mean_absolute_error(y_val, y_pred)
        # Square root of the MSE gives the RMSE.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        r2 = r2_score(y_val, y_pred)

        print('n_estimators: ' + str(n_estimators))
        print('learning_rate: ' + str(learning_rate))
        print(mae)
        print(rmse)
        print(r2)
        print('\n')

        xgb_grid_results.append({
            'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
        })

n_estimators: 50
learning_rate: 0.05
1.5638923944898993
2.0177221634149745
0.5884703910748506


n_estimators: 50
learning_rate: 0.1
1.2982313621001482
1.7985067789407738
0.6730340583949888


n_estimators: 50
learning_rate: 0.2
1.19463723272465
1.6977107818715365
0.7086562055702337


n_estimators: 50
learning_rate: 0.5
1.106494960217517
1.6196227704463158
0.7348411624551089


n_estimators: 100
learning_rate: 0.05
1.3025336371027343
1.7981060462897678
0.6731797473997683


n_estimators: 100
learning_rate: 0.1
1.1810959699220016
1.6886495901323728
0.7117578845884389


n_estimators: 100
learning_rate: 0.2
1.1109003049206465
1.608857744926847
0.7383542711899156


n_estimators: 100
learning_rate: 0.5
1.0567723696322469
1.5845929944651702
0.7461870237870643


n_estimators: 300
learning_rate: 0.05
1.130185227167521
1.6356237092555646
0.7295760494019135


n_estimators: 300
learning_rate: 0.1
1.05394655406157
1.5646054189355156
0.752549680620072


n_estimators: 300
learning_rate: 0.2
1.0081672313

In [10]:
# Baseline: random forest with scikit-learn's default hyper-parameters.
# Forest training is randomised (e.g. bootstrap sampling), so pin the
# seed to make the validation scores reproducible across runs; 0 matches
# the seed used for the XGBoost models in this notebook.
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

In [13]:
# Report validation metrics for the current `y_pred`, one per line:
# mean absolute error, root mean squared error, then R^2.
val_mae = mean_absolute_error(y_val, y_pred)
print(val_mae)

val_mse = mean_squared_error(y_val, y_pred)
print(val_mse**0.5)  # RMSE is the square root of the MSE

val_r2 = r2_score(y_val, y_pred)
print(val_r2)

0.9589733094710295
1.5497186180516194
0.7572361257424128
