In [104]:
import pandas as pd
import numpy as np
import os

from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem
#import deepchem as dc

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR, SVC

In [105]:
# --- Featurisation: which representation Load_downloaded_CSV builds ---------
use_descriptors = True      # Mordred descriptors (CalculateDescriptors)
use_fingerprints = False    # Morgan fingerprints (CalculateMorganFingerprint)

# --- Task --------------------------------------------------------------------
regression = True           # True: regressors + regression metrics; False: classifiers
threshold = 7               # NOTE(review): not read by any code visible in this notebook

# --- Splitting ---------------------------------------------------------------
scaffold_split = True       # NOTE(review): not read by any visible code; the split used is random

# All hyperparameters need to be supplied through a function.
# Each entry is [model key, hyperparameter dict]; keys left out of the dict are
# filled in with defaults by model_builder.
models_with_parameters = [
    # Example rf overrides: {"bootstrap": True, "criterion": "squared_error",
    #                        "min_samples_split": 32, "n_estimators": 30}
    ['rf', {}],
    ['lr', {}],
    ['nn', {}],
    ['gb', {}],
    ['sv', {}],
]

# Names of the metrics to report; an empty list selects every metric of the task.
metrics = []


### Retrieving data the old way:

In [106]:
### Input standard SMILES column
def CalculateMorganFingerprint(mol):
    """Compute 2048-position Morgan fingerprints (radius 2) from SMILES.

    Parameters
    ----------
    mol : pandas.Series
        SMILES strings, one molecule per entry.

    Returns
    -------
    pandas.DataFrame
        One row per input molecule, carrying the index of ``mol`` so that
        index-aligned assignments against the source table stay correct,
        and one column per fingerprint position named ``mfp0`` ... ``mfp2047``.

    Raises
    ------
    ValueError
        If at least one SMILES string cannot be parsed.
    """
    n_bits = 2048
    molecules = mol.apply(Chem.MolFromSmiles)

    # MolFromSmiles yields None for unparsable input; report the offending
    # rows here instead of failing obscurely inside the fingerprint generator.
    unparsed = molecules.isna()
    if unparsed.any():
        bad_rows = list(mol.index[unparsed.to_numpy()])
        raise ValueError(f"Could not parse SMILES at index {bad_rows}")

    mfpgen = AllChem.GetMorganGenerator(radius=2, fpSize=n_bits)
    bits = np.array([mfpgen.GetFingerprintAsNumPy(molecule) for molecule in molecules])
    # No-op for non-empty input; gives an empty input the right column count.
    bits = bits.reshape(len(molecules), n_bits)

    column_names = ['mfp' + str(position) for position in range(n_bits)]
    return pd.DataFrame(bits, columns=column_names, index=mol.index)

### Input standard SMILES column
def CalculateDescriptors(mol):
    """Compute min-max scaled Mordred descriptors from SMILES.

    Parameters
    ----------
    mol : pandas.Series
        SMILES strings, one molecule per entry.

    Returns
    -------
    pandas.DataFrame
        The numeric descriptor columns, each rescaled with the minimum and
        maximum taken over all rows passed in, restricted to the columns
        whose standard deviation after scaling exceeds 0.01.
    """
    parsed = mol.apply(Chem.MolFromSmiles)
    descriptor_table = Calculator(descriptors, ignore_3D=False).pandas(parsed, nproc=1)

    # Keep numeric columns only.
    numeric = descriptor_table.select_dtypes(['number'])

    # Min-max normalisation, column by column.
    lowest = numeric.min()
    span = numeric.max() - lowest
    scaled = (numeric - lowest) / span

    # Drop columns with low std. A constant column scales to 0/0 = NaN, its
    # std is NaN, and the comparison below is False, so it is dropped as well.
    informative = scaled.std() > 0.01
    return scaled.loc[:, informative]

In [108]:
def Load_downloaded_CSV(path, regression = False, calculate_pIC50 = False, activity_threshold = 7):
    """Read an activity CSV and return a feature table with a ``Target`` column.

    Column handling (each step only applies when the column exists):
    ``target`` is moved to ``Target``; in regression mode ``pIC50`` (preferred)
    or ``IC50`` is moved to ``Target``; in classification mode ``Class`` is
    moved to ``Target``; ``SMILES`` is moved to ``mol``.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to ``pandas.read_csv``.
    regression : bool
        Selects which source column supplies ``Target`` (see above).
    calculate_pIC50 : bool
        When True, ``Target`` is converted with ``-log10(Target * 10**-9)``.
        It is switched on automatically in regression mode when the target
        had to be taken from an ``IC50`` column.
    activity_threshold : float
        Classification only, and only after a pIC50 conversion: values above
        this threshold become 1, everything else 0.

    Returns
    -------
    pandas.DataFrame
        The features built by ``CalculateMorganFingerprint`` (when the global
        ``use_fingerprints`` is set) or by ``CalculateDescriptors`` (when the
        global ``use_descriptors`` is set), plus a ``Target`` column.

    Raises
    ------
    ValueError
        If neither ``use_descriptors`` nor ``use_fingerprints`` is enabled.
    KeyError
        If no ``mol`` or ``Target`` column can be derived from the file.
    """
    if not (use_descriptors or use_fingerprints):
        raise ValueError("Enable use_descriptors or use_fingerprints before loading data")

    def _move_column(frame, source, dest):
        """Return a copy of ``frame`` with ``source`` stored as ``dest`` and ``source`` removed."""
        return frame.assign(**{dest: frame[source]}).drop(columns=source)

    table = pd.read_csv(path)

    ### Replace with standardizing molecules and then dropping duplicates

    if 'target' in table.columns:
        table = _move_column(table, 'target', 'Target')

    if regression:
        # Prefer a ready-made pIC50 column. Only fall back to IC50 (which must
        # then be converted) when pIC50 is absent; otherwise the pIC50 values
        # would be log-transformed a second time.
        if 'pIC50' in table.columns:
            table = _move_column(table, 'pIC50', 'Target')
        elif 'IC50' in table.columns:
            calculate_pIC50 = True
            table = _move_column(table, 'IC50', 'Target')
    elif 'Class' in table.columns:
        table = _move_column(table, 'Class', 'Target')

    if 'SMILES' in table.columns:
        table = _move_column(table, 'SMILES', 'mol')

    missing = [name for name in ('mol', 'Target') if name not in table.columns]
    if missing:
        raise KeyError(f"{path}: could not derive column(s) {missing}; found {list(table.columns)}")

    if calculate_pIC50:
        # NOTE(review): the 10**-9 factor suggests IC50 is expected in nM - confirm.
        converted = -np.log10(table['Target'] * 10**(-9))
        if not regression:
            converted = (converted > activity_threshold).astype(int)
        table = table.assign(Target=converted)

    table = table[['mol', 'Target']]

    # Fingerprints take precedence when both flags are set; checking them
    # first avoids computing descriptors that would be discarded anyway.
    if use_fingerprints:
        features = CalculateMorganFingerprint(table['mol'])
    else:
        features = CalculateDescriptors(table['mol'])

    # assign() returns a new frame, so nothing is written into the column
    # slice that CalculateDescriptors returns (no SettingWithCopyWarning).
    return features.assign(Target=table['Target'])

def Split_downloaded_CSV(df, test_size=0.1, valid_size=0.111, random_state=42):
    """Randomly split a feature table into train, test and validation parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a ``Target`` column.
    test_size : float
        Passed to the first ``train_test_split`` call (share held out as test set).
    valid_size : float
        Passed to the second ``train_test_split`` call, i.e. it is a share of
        the rows that remain after the test set was removed. The default
        0.111 of the remaining 90 % is roughly 10 % of all rows.
    random_state : int
        Seed used for both splits.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, X_valid, y_valid)`` - note that
        the test pair comes before the validation pair. The ``y_*`` parts are
        one-column DataFrames.
    """
    features = df.drop(columns=['Target'])
    target = df[['Target']]

    # First carve out the test set, then take the validation set from the rest.
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_rest, y_rest, test_size=valid_size, random_state=random_state)

    return X_train, y_train, X_test, y_test, X_valid, y_valid

In [109]:
# Root folder of the experiment data sets. Set the EXPERIMENTS_DIR environment
# variable to point at your own checkout; the fallback is the path this
# notebook was originally run with.
experiments_dir = os.environ.get(
    "EXPERIMENTS_DIR",
    r"C:\Users\admin\Documents\GitHub\czasteczkowa-inzynierka\experiments",
)

### ror-gamma
#csv_path = os.path.join(experiments_dir, "ROR-gamma", "ROR_data_1.csv")
#df_loaded = Load_downloaded_CSV(csv_path, regression=regression, calculate_pIC50=True)
### bace
csv_path = os.path.join(experiments_dir, "BACE", "bace.csv")
df_loaded = Load_downloaded_CSV(csv_path, regression=regression)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  new_df['Target'] = df['Target']


In [110]:
# Note the order returned by Split_downloaded_CSV: train, test, then validation.
(X_train, y_train,
 X_test, y_test,
 X_valid, y_valid) = Split_downloaded_CSV(df_loaded)

In [111]:
# (rows, feature columns) of the train, test and validation features, one per line.
print(X_train.shape, X_test.shape, X_valid.shape, sep="\n")

(1209, 1220)
(152, 1220)
(152, 1220)


### Configurations

In [112]:
def model_builder(model_name, hyperparams, regression):
    """Create an unfitted scikit-learn estimator for a short model key.

    Parameters
    ----------
    model_name : str
        One of ``'rf'`` (random forest), ``'lr'`` (linear / logistic
        regression), ``'nn'`` (multi-layer perceptron), ``'gb'`` (gradient
        boosting) or ``'sv'`` (support vector machine).
    hyperparams : dict or None
        Overrides for the defaults listed in the function body. Keys that the
        chosen model does not use are ignored. The dictionary is only read,
        never modified, so one configuration can be reused for regression and
        classification without defaults of one task leaking into the other.
    regression : bool
        True builds the regressor variant, False the classifier variant.
        ``LinearRegression`` takes no hyperparameters here.

    Returns
    -------
    An unfitted estimator instance.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported keys.
    """
    # Defaults per model key. "random_state" (None = unseeded) can be supplied
    # for rf, nn and gb to make a run repeatable.
    defaults_by_model = {
        'rf': {
            "n_estimators": 100,
            "min_samples_split": 2,
            "criterion": "squared_error" if regression else "gini",
            "bootstrap": True,
            "random_state": None,
        },
        'lr': {} if regression else {
            "C": 1,
            "penalty": "l2",
            "solver": "liblinear",
        },
        'nn': {
            "hidden_layer_sizes": (100,),
            "activation": "relu",
            "alpha": 0.0001,
            "max_iter": 500,
            "random_state": None,
        },
        'gb': {
            "n_estimators": 100,
            "learning_rate": 0.1,
            "random_state": None,
        },
        'sv': {
            "C": 1,
            "degree": 3,
            "kernel": "rbf",
        },
    }
    if regression:
        # epsilon only exists for the regressor (SVR).
        defaults_by_model['sv']["epsilon"] = 0.1

    if model_name not in defaults_by_model:
        raise ValueError(
            f"Unknown model name {model_name!r}; expected one of {sorted(defaults_by_model)}")

    supplied = hyperparams if hyperparams is not None else {}
    params = {key: supplied.get(key, default)
              for key, default in defaults_by_model[model_name].items()}

    # The estimator class is looked up only for the branch that is taken.
    if model_name == 'rf':
        estimator = RandomForestRegressor if regression else RandomForestClassifier
    elif model_name == 'lr':
        estimator = LinearRegression if regression else LogisticRegression
    elif model_name == 'nn':
        estimator = MLPRegressor if regression else MLPClassifier
    elif model_name == 'gb':
        estimator = GradientBoostingRegressor if regression else GradientBoostingClassifier
    else:
        estimator = SVR if regression else SVC

    return estimator(**params)
    

### Train and test

In [113]:
def train_and_test(model, X_train, y_train, X_test, y_test, X_valid, y_valid, regression, metrics=None, iterations=1):
    """Fit ``model`` on the training data and score it on test and validation data.

    Parameters
    ----------
    model : estimator
        Object with ``fit`` and ``predict``. For classification ROC AUC,
        ``predict_proba`` or ``decision_function`` is used when available.
    X_train, y_train, X_test, y_test, X_valid, y_valid
        Feature and target parts of the three splits. ``y_train`` is flattened
        to one dimension before fitting.
    regression : bool
        True reports ``rmse``, ``mse``, ``mae`` and ``r2``; False reports
        ``roc_auc``, ``accuracy``, ``precision``, ``recall`` and ``f1``.
    metrics : sequence of str or None
        Names of the metrics to report. ``None`` or an empty sequence selects
        all metrics of the task; names that do not belong to the task are
        ignored.
    iterations : int
        Number of times ``fit`` is called. Only the final fit is evaluated.

    Returns
    -------
    tuple of dict
        ``(results_test, results_valid)``, each mapping metric name to value.

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")

    requested = list(metrics) if metrics is not None else []

    def wanted(name):
        return not requested or name in requested

    # Estimators expect a 1-D target (y_train may be a one-column DataFrame).
    y_fit = np.asarray(y_train).reshape(-1)
    for _ in range(iterations):
        model.fit(X_train, y_fit)

    y_test_predicted = model.predict(X_test)
    y_valid_predicted = model.predict(X_valid)

    # Metrics whose input differs from the plain predictions: name -> (test, valid).
    special_inputs = {}

    if regression:
        scorers = {
            # sqrt(MSE) instead of mean_squared_error(squared=False): the
            # ``squared`` argument is deprecated/removed in recent scikit-learn.
            # The two agree for a single target column.
            "rmse": lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
            "mse": mean_squared_error,
            "mae": mean_absolute_error,
            "r2": r2_score,
        }
    else:
        scorers = {
            "roc_auc": roc_auc_score,
            "accuracy": accuracy_score,
            "precision": precision_score,
            "recall": recall_score,
            "f1": f1_score,
        }
        if wanted("roc_auc"):
            # ROC AUC ranks samples, so feed it continuous scores, not hard labels.
            special_inputs["roc_auc"] = (
                _ranking_scores(model, X_test, y_test_predicted),
                _ranking_scores(model, X_valid, y_valid_predicted),
            )

    results_test = {}
    results_valid = {}
    for name, scorer in scorers.items():
        if not wanted(name):
            continue
        test_input, valid_input = special_inputs.get(name, (y_test_predicted, y_valid_predicted))
        results_test[name] = scorer(y_test, test_input)
        results_valid[name] = scorer(y_valid, valid_input)

    return results_test, results_valid


def _ranking_scores(model, X, fallback):
    """Return continuous binary-classification scores for ``X``, or ``fallback`` if the model offers none."""
    if hasattr(model, "predict_proba"):
        probabilities = np.asarray(model.predict_proba(X))
        if probabilities.ndim == 2 and probabilities.shape[1] == 2:
            return probabilities[:, 1]  # second column: last entry of classes_
    if hasattr(model, "decision_function"):
        decision = np.asarray(model.decision_function(X))
        if decision.ndim == 1:
            return decision
    return fallback

In [114]:
# Build, fit and score every configured model; print its key followed by the
# test metrics and the validation metrics.
for model_name, hyperparams in models_with_parameters:
    print(model_name)
    model = model_builder(model_name, hyperparams, regression)
    results_test, results_valid = train_and_test(
        model,
        X_train, y_train,
        X_test, y_test,
        X_valid, y_valid,
        regression=regression,
        metrics=metrics,
    )
    print(results_test, results_valid, sep="\n")
    
    

rf
{'rmse': 0.7245011008975704, 'mse': 0.5249018452017914, 'mae': 0.5580280639451204, 'r2': 0.7120181662438041}
{'rmse': 0.6943787540083267, 'mse': 0.4821618540181563, 'mae': 0.5259046646528743, 'r2': 0.7148523137660241}
lr
{'rmse': 3.5797579415337184, 'mse': 12.814666919973726, 'mae': 1.8354871385690792, 'r2': -6.030631178616216}
{'rmse': 19.952064602563127, 'mse': 398.0848819048525, 'mae': 4.1860628997481495, 'r2': -234.42505914542923}
nn
{'rmse': 0.9488780459383465, 'mse': 0.9003695460637747, 'mae': 0.7660887487655016, 'r2': 0.5060217918761574}
{'rmse': 0.8427382666162169, 'mse': 0.7102077860193059, 'mae': 0.7149876502464926, 'r2': 0.5799872900747268}
gb
{'rmse': 0.6993894585993036, 'mse': 0.48914561479982693, 'mae': 0.5433892482426249, 'r2': 0.731635443061355}
{'rmse': 0.7003646091970713, 'mse': 0.4905105858157665, 'mae': 0.53146507710448, 'r2': 0.7099149228562344}
sv
{'rmse': 0.7673992774203482, 'mse': 0.5889016509852725, 'mae': 0.5861397563961007, 'r2': 0.676905351156477}
{'rmse'

### Creating a results dataframe

In [115]:
# Short model key -> estimator class name shown in the results table (regression).
model_name_dict_reg = {
    "rf": "RandomForestRegressor",
    "lr": "LinearRegression",
    "nn": "MLPRegressor",
    "gb": "GradientBoostingRegressor",
    "sv": "SVR",
}
# Short model key -> estimator class name shown in the results table (classification).
model_name_dict_class = {
    "rf": "RandomForestClassifier",
    "lr": "LogisticRegression",
    "nn": "MLPClassifier",
    "gb": "GradientBoostingClassifier",
    "sv": "SVC",
}

In [116]:
### Collect one record per (model, data set) and build the table in one go
if regression:
    metric_columns = ["rmse", "mse", "mae", "r2"]
    display_names = model_name_dict_reg
else:
    metric_columns = ["roc_auc", "accuracy", "precision", "recall", "f1"]
    display_names = model_name_dict_class

result_rows = []
for model_name, hyperparams in models_with_parameters:

    model = model_builder(model_name, hyperparams, regression)
    results_test, results_valid = train_and_test(model, X_train, y_train, X_test, y_test, X_valid, y_valid, regression=regression, metrics=metrics)
    print(results_test)
    print(results_valid)

    # Same bookkeeping for both evaluation sets; the metric dicts stay untouched.
    for set_name, set_results in (("test", results_test), ("valid", results_valid)):
        result_rows.append({"model": display_names[model_name], "set": set_name, **set_results})

# Fixing the columns keeps their order stable and yields NaN for metrics that
# were not requested (and an empty, correctly labelled table for no models).
results_df = pd.DataFrame(result_rows, columns=["model", "set", *metric_columns])

{'rmse': 0.7222102843383881, 'mse': 0.5215876948041354, 'mae': 0.5579373519986839, 'r2': 0.7138364397316671}
{'rmse': 0.7043392967840308, 'mse': 0.4960938449942231, 'mae': 0.531224259782378, 'r2': 0.7066130161974766}
{'rmse': 3.5797579415337184, 'mse': 12.814666919973726, 'mae': 1.8354871385690792, 'r2': -6.030631178616216}
{'rmse': 19.952064602563127, 'mse': 398.0848819048525, 'mae': 4.1860628997481495, 'r2': -234.42505914542923}
{'rmse': 0.8080527634823731, 'mse': 0.6529492685714999, 'mae': 0.6204444133739621, 'r2': 0.6417663046303468}
{'rmse': 0.6903936710127397, 'mse': 0.4766434209744472, 'mae': 0.5278811525425215, 'r2': 0.7181158826297516}
{'rmse': 0.6976516326761565, 'mse': 0.48671780057570685, 'mae': 0.5429715945500043, 'r2': 0.7329674376021871}
{'rmse': 0.7008886948052593, 'mse': 0.4912449625058199, 'mae': 0.527794848248927, 'r2': 0.7094806167985324}
{'rmse': 0.7673992774203482, 'mse': 0.5889016509852725, 'mae': 0.5861397563961007, 'r2': 0.676905351156477}
{'rmse': 0.6510818713

In [117]:
# Display the collected metrics: one row per model and evaluation set ("test" / "valid").
results_df

Unnamed: 0,model,set,rmse,mse,mae,r2
0,RandomForestRegressor,test,0.72221,0.521588,0.557937,0.713836
1,RandomForestRegressor,valid,0.704339,0.496094,0.531224,0.706613
2,LinearRegression,test,3.579758,12.814667,1.835487,-6.030631
3,LinearRegression,valid,19.952065,398.084882,4.186063,-234.425059
4,MLPRegressor,test,0.808053,0.652949,0.620444,0.641766
5,MLPRegressor,valid,0.690394,0.476643,0.527881,0.718116
6,GradientBoostingRegressor,test,0.697652,0.486718,0.542972,0.732967
7,GradientBoostingRegressor,valid,0.700889,0.491245,0.527795,0.709481
8,SVR,test,0.767399,0.588902,0.58614,0.676905
9,SVR,valid,0.651082,0.423908,0.502574,0.749304


In [118]:
# Write the metrics table, index column included, to a path relative to the
# current working directory. Note that this re-binds csv_path, which earlier
# held the input data path.
csv_path = "results.csv"
results_df.to_csv(path_or_buf=csv_path)