In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem as ch
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import pickle as pkl
import seaborn as sns



In [2]:
def split_data(X_new, random_state=None):
    """Split a feature table into train, test and hold-out validation sets.

    The rows are shuffled, the last 20 % are set aside as the validation set,
    and the remaining 80 % are divided 80/20 into train and test sets with
    ``train_test_split`` (fixed ``random_state=42``).

    Parameters
    ----------
    X_new : pandas.DataFrame
        Feature columns plus the target column ``'pChemBL'``.
    random_state : int, optional
        Seed for the initial shuffle.  ``None`` (the default) keeps the
        previous behaviour: a different shuffle on every call.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_validate, y_train, y_test, y_validate)``.
    """
    shuffled = X_new.sample(frac=1, random_state=random_state)
    cut = int(.8 * len(X_new))
    # Positional slicing instead of np.split(): on a DataFrame np.split relies
    # on DataFrame.swapaxes, which pandas has deprecated.
    X_final, validate = shuffled.iloc[:cut], shuffled.iloc[cut:]
    X_train, X_test, y_train, y_test = train_test_split(
        X_final.drop(columns=['pChemBL']),
        X_final['pChemBL'],
        test_size=0.2,
        random_state=42,
    )
    # drop() already returns a new frame, so no extra copy is needed.
    X_validate = validate.drop(columns='pChemBL')
    y_validate = validate['pChemBL']
    return X_train, X_test, X_validate, y_train, y_test, y_validate
def train_RFR(X_train, X_test, X_validate, y_train, y_test, y_validate,
              n_estimators=150, random_state=None):
    """Fit a random-forest regressor and print RMSE and R^2 for each split.

    Parameters
    ----------
    X_train, y_train
        Data the forest is fitted on; also reported as the "Training set".
    X_test, y_test
        Reported as the "Test set".
    X_validate, y_validate
        Reported as the "Validation set".
    n_estimators : int, default 150
        Number of trees passed to ``RandomForestRegressor``.
    random_state : int, optional
        Seed passed to ``RandomForestRegressor``.  ``None`` (the default)
        keeps the previous, unseeded behaviour.

    Returns
    -------
    The fitted ``RandomForestRegressor``.
    """
    model2 = RandomForestRegressor(n_estimators=n_estimators,
                                   random_state=random_state)
    model2.fit(X_train, y_train)

    def _report(template, features, target):
        # One-line purpose: print RMSE and the estimator's score for one split.
        score = model2.score(features, target)
        # Arguments are given as (y_true, y_pred) for every split.
        rmse = np.sqrt(mean_squared_error(target, model2.predict(features)))
        print(template.format(rmse, score))

    _report("Training set results:\nRMSE = {}\t R^2 = {}", X_train, y_train)
    _report("Test set results\nRMSE = {}\t Q^2 = {}", X_test, y_test)
    _report("Validation set results\nRMSE = {}\t Q^2 = {}", X_validate, y_validate)
    return model2

In [3]:
# Read the table, then detach the 'pChemBL' column as the target vector.
# pop() returns the column and removes it from X in place, so X keeps only
# the remaining columns.
X = pd.read_csv('no_zeros_no_chembl.csv')
y = X.pop('pChemBL')

In [5]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

# Fit an extra-trees ensemble on every column of X.  The fitted `reg` is the
# importance source for the SelectFromModel / RFECV selections further down,
# so the seed is pinned: without it the importances, and therefore the set of
# selected features, change on every run.
reg = ExtraTreesRegressor(n_estimators=150, random_state=42)
reg = reg.fit(X, y)

In [None]:
# Score of the fitted ensemble on the very data it was trained on
# (an in-sample figure, not an estimate of predictive performance).
reg.score(X=X, y=y)

In [38]:
# Keep only the columns whose importance in the already-fitted `reg` passes
# SelectFromModel's default threshold, then show the shape of the result.
model = SelectFromModel(estimator=reg, prefit=True)
X_new = model.transform(X)
X_new.shape


(2861, 279)

In [39]:
# Label the selected columns comp1..compN and wrap the array in a DataFrame.
cols = ['comp{}'.format(idx) for idx in range(1, X_new.shape[1] + 1)]
X_new = pd.DataFrame(X_new, columns=cols)

In [40]:
# Re-attach the target so split_data can carve all three subsets out of one
# frame, then fit a random forest and print its scores.
X_new = X_new.assign(pChemBL=y)
splits = split_data(X_new)
X_train, X_test, X_validate, y_train, y_test, y_validate = splits
model = train_RFR(*splits)

Training set results:
RMSE = 0.2612342814942656	 R^2 = 0.951231462226615
Test set results
RMSE = 0.6711355166778638	 Q^2 = 0.6546759899619428
Validation set results
RMSE = 0.7139772320201792	 Q^2 = 0.5980952575220048


In [None]:
# Importance-based selection from the fitted `reg`, limited to at most 25
# features, followed by the usual split / random-forest fit.
model = SelectFromModel(estimator=reg, prefit=True, max_features=25)
X_new = model.transform(X)
cols = ['comp{}'.format(idx) for idx in range(1, X_new.shape[1] + 1)]
X_new = pd.DataFrame(X_new, columns=cols).assign(pChemBL=y)
splits = split_data(X_new)
X_train, X_test, X_validate, y_train, y_test, y_validate = splits
model = train_RFR(*splits)

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

# Univariate selection: keep the 50 columns of X that score highest on
# mutual information with y, and label them comp1..comp50.
selector = SelectKBest(score_func=mutual_info_regression, k=50)
X_new = selector.fit_transform(X, y)

cols = ['comp{}'.format(idx) for idx in range(1, 51)]
X_red = pd.DataFrame(X_new, columns=cols)

In [33]:
# Attach the target to the reduced frame, split it, and fit / report the forest.
X_red = X_red.assign(pChemBL=y)
splits = split_data(X_red)
X_train, X_test, X_validate, y_train, y_test, y_validate = splits
RFR = train_RFR(*splits)

Training set results:
RMSE = 0.2739322287215124	 R^2 = 0.9468244640055198
Test set results
RMSE = 0.6161916091171696	 Q^2 = 0.7009729340649702
Validation set results
RMSE = 0.6717152615156313	 Q^2 = 0.6397030563793147


In [41]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

# Repeat the mutual-information selection with a smaller budget of `ko`
# features, then split and retrain.
ko = 15
X_new = SelectKBest(score_func=mutual_info_regression, k=ko).fit_transform(X, y)

cols = ['comp{}'.format(idx) for idx in range(1, ko + 1)]
X_red = pd.DataFrame(X_new, columns=cols).assign(pChemBL=y)
splits = split_data(X_red)
X_train, X_test, X_validate, y_train, y_test, y_validate = splits
RFR = train_RFR(*splits)

Training set results:
RMSE = 0.2835793651287902	 R^2 = 0.9410312390992518
Test set results
RMSE = 0.6982204063849987	 Q^2 = 0.6342943954025049
Validation set results
RMSE = 0.7285472901564857	 Q^2 = 0.6090261968313742


In [6]:
from sklearn.feature_selection import RFECV

# Recursive feature elimination with cross-validation around `reg`:
# 200 features are removed per step and at least 50 are always kept.
select = RFECV(
    estimator=reg,
    step=200,
    min_features_to_select=50,
    n_jobs=-1,
    verbose=1,
)
X_new = select.fit(X, y).transform(X)

Fitting estimator with 1208 features.
Fitting estimator with 1008 features.
Fitting estimator with 808 features.


In [10]:
# Label the RFECV-selected columns comp1..compN, attach the target,
# then split and fit / report the random forest.
cols = ['comp{}'.format(idx) for idx in range(1, X_new.shape[1] + 1)]
X_red = pd.DataFrame(X_new, columns=cols).assign(pChemBL=y)
splits = split_data(X_red)
X_train, X_test, X_validate, y_train, y_test, y_validate = splits
RFR = train_RFR(*splits)

Training set results:
RMSE = 0.26038643091486924	 R^2 = 0.9506658314566686
Test set results
RMSE = 0.6892970161527208	 Q^2 = 0.6311874891385347
Validation set results
RMSE = 0.7153796013506576	 Q^2 = 0.624192456639348


In [21]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

# NOTE(review): this cell used to carry a disabled in-place drop of the
# 'pChemBL' target column, so X_red keeps the target after this cell runs --
# confirm that is intended before X_red is handed to any feature selector.
# NOTE(review): the reason for discarding 'comp50' is not recorded -- confirm.
X_red = X_red.drop(columns='comp50')

In [None]:
# Mutual-information selection on the already reduced frame.
# X_red can still hold the target at this point: the training cell adds
# 'pChemBL' and nothing removes it again.  Scoring the target against itself
# would leak it into the feature set, so it is stripped here when present.
features = X_red.drop(columns='pChemBL', errors='ignore')
# Do not request more columns than exist; depending on the scikit-learn
# version, k > n_features is rejected or only warned about.
k_best = min(50, features.shape[1])
X_new = SelectKBest(mutual_info_regression, k=k_best).fit_transform(features, y)
cols = ['comp' + str(i + 1) for i in range(X_new.shape[1])]
X_red = pd.DataFrame(data=X_new, columns=cols)

In [22]:
# Attach the target to the reduced frame, split it, and fit / report the forest.
X_red = X_red.assign(pChemBL=y)
splits = split_data(X_red)
X_train, X_test, X_validate, y_train, y_test, y_validate = splits
RFR = train_RFR(*splits)

Training set results:
RMSE = 0.25776643379761544	 R^2 = 0.9506208177595016
Test set results
RMSE = 0.6484785432255382	 Q^2 = 0.6888873283556138
Validation set results
RMSE = 0.6953832131541388	 Q^2 = 0.6545151705403043


In [25]:
# Narrow the current feature set to the 25 columns with the highest mutual
# information with y (the target is removed before scoring), then retrain.
features = X_red.drop(columns='pChemBL')
X_new = SelectKBest(score_func=mutual_info_regression, k=25).fit_transform(features, y)
cols = ['comp{}'.format(idx) for idx in range(1, 26)]
X_red = pd.DataFrame(X_new, columns=cols).assign(pChemBL=y)
splits = split_data(X_red)
X_train, X_test, X_validate, y_train, y_test, y_validate = splits
RFR = train_RFR(*splits)

Training set results:
RMSE = 0.2769289461389025	 R^2 = 0.9442085567589638
Test set results
RMSE = 0.7155156440570092	 Q^2 = 0.6291208962414494
Validation set results
RMSE = 0.6721193455727127	 Q^2 = 0.6449727110165622
