In [97]:
import warnings
from itertools import product

import os
import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import (StratifiedKFold, KFold,
                                     cross_val_score, train_test_split)

from sklearn.metrics import r2_score
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.svm import SVC

warnings.filterwarnings("ignore", category=Warning)

N_SPLITS = 2
RANDOM_STATE = 148260

### Preprocessing

In [98]:
def LoadCSV(path):
    df = pd.read_csv(path)
    return df

def LoadCSV_BACE(path, regression = False):
    df = pd.read_csv(path)
    df.drop_duplicates('mol')
    df = df.dropna()
    df.drop(['CID', 'canvasUID'], axis=1, inplace=True)
    if regression:
        df['Target'] = df['pIC50']
        df.drop('Class', axis=1, inplace=True)
        df.drop('pIC50', axis=1, inplace=True)
    else:
        df['Target'] = df['Class']
        df.drop('Class', axis=1, inplace=True)
        df.drop('pIC50', axis=1, inplace=True)
    return df

def split_data_BACE(df, scaffold=True):
    X = df.drop(['Target', 'mol'], axis=1)

    if not scaffold:
        y = df[['Target']]
        X = X.drop(['Model'], axis=1)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.111, random_state=42)
        return X_train, y_train, X_test, y_test, X_valid, y_valid

    #dummy = list(X['Model'])
    #
    #sc = StandardScaler()
    #X = sc.fit_transform(X.drop(['Model'], axis=1))
    #X = pd.DataFrame(X)
    #
    #X['Model'] = dummy

    X_train = X[X['Model'] == 'Train']
    X_test = X[X['Model'] == 'Test']
    X_valid = X[X['Model'] == 'Valid']

    y = df[['Target', 'Model']]

    y_train = y[y['Model'] == 'Train']
    y_test = y[y['Model'] == 'Test']
    y_valid = y[y['Model'] == 'Valid']
    
    X_train.drop('Model', axis=1, inplace=True)
    X_test.drop('Model', axis=1, inplace=True)
    X_valid.drop('Model', axis=1, inplace=True)
    y_train.drop('Model', axis=1, inplace=True)
    y_test.drop('Model', axis=1, inplace=True)
    y_valid.drop('Model', axis=1, inplace=True)
    
    return X_train, y_train, X_test, y_test, X_valid, y_valid

In [99]:
dir_path = os.path.join(os.path.dirname(os.getcwd()), "BACE", "bace.csv")
print(dir_path)

df_regression = LoadCSV_BACE(dir_path, regression=True)
df_classification = LoadCSV_BACE(dir_path)

c:\Users\admin\Documents\GitHub\czasteczkowa-inzynierka\experiments\BACE\bace.csv


In [100]:
scaffold = False

In [101]:
X_train_class, y_train_class, X_test_class, y_test_class, X_valid_class, y_valid_class = split_data_BACE(df_classification, scaffold=scaffold)

In [102]:
X_train_regre, y_train_regre, X_test_regre, y_test_regre, X_valid_regre, y_valid_regre = split_data_BACE(df_regression, scaffold=scaffold)

In [103]:
df_classification.head()

Unnamed: 0,mol,Model,MW,AlogP,HBA,HBD,RB,HeavyAtomCount,ChiralCenterCount,ChiralCenterCountAllPossible,...,PEOE6 (PEOE6),PEOE7 (PEOE7),PEOE8 (PEOE8),PEOE9 (PEOE9),PEOE10 (PEOE10),PEOE11 (PEOE11),PEOE12 (PEOE12),PEOE13 (PEOE13),PEOE14 (PEOE14),Target
0,O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2c...,Train,431.56979,4.4014,3,2,5,32,2,2,...,53.205711,78.640335,226.85541,107.43491,37.133846,0.0,7.98017,0.0,0.0,1
1,Fc1cc(cc(F)c1)C[C@H](NC(=O)[C@@H](N1CC[C@](NC(...,Train,657.81073,2.6412,5,4,16,47,6,6,...,73.817162,47.1716,365.67694,174.07675,34.923889,7.98017,24.148668,0.0,24.663788,1
2,S1(=O)(=O)N(c2cc(cc3c2n(cc3CC)CC1)C(=O)N[C@H](...,Train,591.74091,2.5499,4,3,11,42,2,3,...,70.365707,47.941147,192.40652,255.75255,23.654478,0.230159,15.87979,0.0,24.663788,1
3,S1(=O)(=O)C[C@@H](Cc2cc(O[C@H](COCC)C(F)(F)F)c...,Train,591.67828,3.168,4,3,12,40,4,5,...,56.657166,37.954151,194.35304,202.76335,36.498634,0.980913,8.188327,0.0,26.385181,1
4,S1(=O)(=O)N(c2cc(cc3c2n(cc3CC)CC1)C(=O)N[C@H](...,Train,629.71283,3.5086,3,3,11,44,2,3,...,78.945702,39.361153,179.71288,220.4613,23.654478,0.230159,15.87979,0.0,26.100143,1


In [104]:
y_train_regre

Unnamed: 0,Target
1435,4.356547
1153,5.769551
918,6.580000
1191,5.638272
521,7.585027
...,...
152,5.494850
1497,3.488117
34,7.920819
383,8.000000


In [105]:
print(X_train_class.shape)
print(X_test_class.shape)
print(X_valid_class.shape)
print(f"{round(X_train_class.shape[0] / df_classification.shape[0], 2)}")
print(f"{round(X_test_class.shape[0] / df_classification.shape[0], 2)}")
print(f"{round(X_valid_class.shape[0] / df_classification.shape[0], 2)}")

(1209, 589)
(152, 589)
(152, 589)
0.8
0.1
0.1


In [106]:
df_classification.describe()

Unnamed: 0,MW,AlogP,HBA,HBD,RB,HeavyAtomCount,ChiralCenterCount,ChiralCenterCountAllPossible,RingCount,PSA,...,PEOE6 (PEOE6),PEOE7 (PEOE7),PEOE8 (PEOE8),PEOE9 (PEOE9),PEOE10 (PEOE10),PEOE11 (PEOE11),PEOE12 (PEOE12),PEOE13 (PEOE13),PEOE14 (PEOE14),Target
count,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,...,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0
mean,479.661988,3.17708,3.732981,2.001322,8.04957,34.089227,0.522802,2.31725,3.769993,99.842829,...,52.348846,48.76374,181.83558,148.442348,30.371697,3.48865,11.74056,1.239762,14.387597,0.456709
std,122.083053,1.396633,1.444778,1.629343,4.741135,8.520088,1.162539,1.612558,0.87739,34.973718,...,25.9938,18.201519,99.717702,60.548833,12.162452,5.148336,9.073406,3.293804,13.32989,0.498287
min,138.187,-4.3611,0.0,0.0,0.0,10.0,0.0,0.0,0.0,16.610001,...,0.0,-3.551821,1.91697,-5.536391,-2.216191,-7.286308,-6.106466,-7.379991,-1.273524,0.0
25%,389.3313,2.3355,3.0,0.0,4.0,28.0,0.0,1.0,3.0,77.050003,...,34.319988,36.54715,102.23377,102.51045,20.13299,0.0,7.98017,0.0,0.0,0.0
50%,463.6283,3.1713,4.0,2.0,7.0,33.0,0.0,2.0,4.0,95.040001,...,51.479984,47.624382,171.91722,140.68362,30.107586,0.55013,8.188327,0.0,21.710098,0.0
75%,564.63953,4.0155,4.0,3.0,11.0,40.0,1.0,3.0,4.0,116.63,...,66.553795,58.844093,253.67908,185.65926,37.133846,7.98017,15.87979,0.0,24.663788,1.0
max,1350.4733,7.6174,12.0,15.0,40.0,97.0,10.0,12.0,7.0,525.06,...,161.34286,124.27273,865.47333,378.51627,121.6719,29.823961,80.218018,16.681131,61.65947,1.0


### Run configurations

In [107]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
def results_metrics(y_true, y_pred, regression=False):
    if regression:
        mae = mean_absolute_error(y_true, y_pred)
        rmse = mean_squared_error(y_true, y_pred, squared=False)
        mse = mean_squared_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)
        return [mae, rmse, mse, r2]
    else:
        accuracy = accuracy_score(y_true, y_pred)
        roc_auc = roc_auc_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        return [accuracy, roc_auc, precision, recall, f1]

In [108]:
def run_rf(X_train, X_test, X_valid, y_train, y_test, y_valid, n_estimators, max_depth, min_samples_split, min_samples_leaf, regression=False):
    
    if regression:
        name = "RandomForestRegressor"
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
    else:
        name = "RandomForestClassifier"
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    results = results_metrics(y_test, y_predicted, regression)

    output_str = f"Model: {name}-{n_estimators}-{max_depth}-{min_samples_split}-{min_samples_leaf} | "
    if regression: output_str += f"MAE: {results[0]} | RMSE: {results[1]} | MSE: {results[2]} | R2: {results[3]}"
    else: output_str += f"Accuracy: {results[0]} | ROC-AUC: {results[1]} | Precision: {results[2]} | Recall: {results[3]} | F1: {results[4]}"
    return (output_str)

def run_lr(X_train, X_test, X_valid, y_train, y_test, y_valid, C, penalty, solver, regression=False):
    if regression:
        name = "LinearRegression"
        model = LinearRegression()
    else:
        name = "LogisticRegression"
        model = LogisticRegression(C=C, penalty=penalty, solver=solver)
        
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    results = results_metrics(y_test, y_predicted, regression)

    output_str = f"Model: {name}-{C}-{penalty}-{solver} | "
    if regression: output_str += f"MAE: {results[0]} | RMSE: {results[1]} | MSE: {results[2]} | R2: {results[3]}"
    else: output_str += f"Accuracy: {results[0]} | ROC-AUC: {results[1]} | Precision: {results[2]} | Recall: {results[3]} | F1: {results[4]}"
    return (output_str)

def run_nn(X_train, X_test, X_valid, y_train, y_test, y_valid, hidden_layer_sizes, activation, alpha, max_iter, regression=False):
    if regression:
        name = "MLPRegressor"
        model = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes, activation=activation, alpha=alpha, max_iter=max_iter)
    else:
        name = "MLPClassifier"
        model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, alpha=alpha, max_iter=max_iter)
         
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    results = results_metrics(y_test, y_predicted, regression)

    output_str = f"Model: {name}-{hidden_layer_sizes}-{activation}-{alpha}-{max_iter} | "
    if regression: output_str += f"MAE: {results[0]} | RMSE: {results[1]} | MSE: {results[2]} | R2: {results[3]}"
    else: output_str += f"Accuracy: {results[0]} | ROC-AUC: {results[1]} | Precision: {results[2]} | Recall: {results[3]} | F1: {results[4]}"
    return (output_str)

def run_gb(X_train, X_test, X_valid, y_train, y_test, y_valid, n_estimators, learning_rate, regression=False):
    if regression:
        name = "GradientBoostingRegressor"
        model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
    else:
        name = "GradientBoostingClassifier"
        model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)

    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    results = results_metrics(y_test, y_predicted, regression)
    
    output_str = f"Model: {name}-{n_estimators}-{learning_rate} | "
    if regression: output_str += f"MAE: {results[0]} | RMSE: {results[1]} | MSE: {results[2]} | R2: {results[3]}"
    else: output_str += f"Accuracy: {results[0]} | ROC-AUC: {results[1]} | Precision: {results[2]} | Recall: {results[3]} | F1: {results[4]}"
    return output_str

def run_svm(X_train, X_test, X_valid, y_train, y_test, y_valid, c, d, e, regression=False):
    if regression:
        name = "SVR"
        model = SVR(C=c, degree=d, epsilon=e, kernel="poly")
    else:
        name = "SVC"
        model = SVC(C=c, degree=d, kernel="poly") ### Epsilon is ignored

    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    results = results_metrics(y_test, y_predicted, regression)
    
    output_str = f"Model: {name}-{c}-{d}-{e} | "
    if regression: output_str += f"MAE: {results[0]} | RMSE: {results[1]} | MSE: {results[2]} | R2: {results[3]}"
    else: output_str += f"Accuracy: {results[0]} | ROC-AUC: {results[1]} | Precision: {results[2]} | Recall: {results[3]} | F1: {results[4]}"
    return output_str

In [109]:
def run_all(X_train, X_test, X_valid, y_train, y_test, y_valid, regression=False):
    results = []

    print("Run")

    #### -----

    param_grid_rf={
        'n_estimators': [50, 100],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    param_combinations = list(product(*param_grid_rf.values()))
    for combination in param_combinations:
        n, m, s, l = combination
        results.append(run_rf(X_train, X_test, X_valid, y_train, y_test, y_valid, n, m, s, l, regression))
        print(results[-1])
    ### -----

    param_grid_lr = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    }
    if regression:
        param_grid_lr = {
            'C': [0.001],
            'penalty': ['l1'],
            'solver': ['liblinear']
        }
    param_combinations = list(product(*param_grid_lr.values()))
    for combination in param_combinations:
        C, p, s = combination
        results.append(run_lr(X_train, X_test, X_valid, y_train, y_test, y_valid, C, p, s, regression))
        print(results[-1])
    ### -----

    param_grid_mlp = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [200, 500, 1000]
    }
    param_combinations = list(product(*param_grid_mlp.values()))
    for combination in param_combinations:
        h, ac, a, i = combination
        results.append(run_nn(X_train, X_test, X_valid, y_train, y_test, y_valid, h, ac, a, i, regression))
        print(results[-1])
    ### -----

    param_grid_gb={
        'n_estimators': [10, 100, 200], 
        'learning_rate': [0.1,0.5,1.0,2.0]
    }
    param_combinations = list(product(*param_grid_gb.values()))
    for combination in param_combinations:
        n, lr = combination
        results.append(run_gb(X_train, X_test, X_valid, y_train, y_test, y_valid, n, lr, regression))
        print(results[-1])
    ### -----
    
    param_grid_svm = {
        'C': [0.01, 0.1, 1, 10, 100, 1000],
        'degree': [2, 3, 4, 5],
        'epsilon': ["no epsilon"]
    }
    
    if regression:
        param_grid_svm = {
            'C': [0.01, 0.1, 1, 10, 100, 1000],
            'degree': [2, 3, 4],
            'epsilon': [0.01, 0.1, 1]
        }
        param_combinations = list(product(*param_grid_svm.values()))
        for combination in param_combinations:
            c, d, e = combination
            results.append(run_svm(X_train, X_test, X_valid, y_train, y_test, y_valid, c, d, e, regression))
            print(results[-1])
    
    return results

In [110]:
def run_configured(regression=False, pca=False):
    sc = StandardScaler()

    if regression:
        X_train = X_train_regre
        X_test =  X_test_regre
        X_valid = X_valid_regre
        y_train = y_train_regre
        y_test =  y_test_regre
        y_valid = y_valid_regre
    else:
        X_train = X_train_class
        X_test =  X_test_class
        X_valid = X_valid_class
        y_train = y_train_class
        y_test =  y_test_class
        y_valid = y_valid_class

    if pca:
        pca = PCA(n_components=0.95)
        X_train = pca.fit_transform(X_train)
        X_test =  pca.fit_transform(X_test)
        X_valid = pca.fit_transform(X_valid)

    results = run_all(X_train, X_test, X_valid, y_train, y_test, y_valid, regression=regression)

    csv_path = "BACE_comparison_results_"
    if scaffold: csv_path = "Scaffold\\" + csv_path
    else: csv_path = "No_scaffold\\" + csv_path
    if regression: csv_path += "regression" 
    else: csv_path += "classification"
    if not scaffold: csv_path += "_no"
    csv_path += "_scaffold"
    if pca: csv_path += "_pca"
    csv_path += "_svm.csv"
    
    dir_path = os.path.join(os.path.dirname(os.getcwd()), "BACE", csv_path)
    print(dir_path)
    
    data = [[j.split(":")[1] for j in i.split(" | ")] for i in results]
    #for index, row in enumerate(data):
    #    data[index].insert(0, data_tuples[0][index])
        
    print(data[0])
    
    #data_tuples = [tuple(item.split('; ')) for item in results]
    
    column_names = [i.split(":")[0] for i in results[0].split(" | ")]
    print(column_names)
    df = pd.DataFrame(data, columns=column_names)
    
    df.to_csv(dir_path)

    return

In [111]:
run_grid = {
        'regression': [False, True],
        'pca': [False, True]
    }

run_param_combinations = list(product(*run_grid.values()))

for combination in run_param_combinations:
    r, p = combination
    run_configured(r, p)

Run
Model: RandomForestClassifier-50-None-2-1 | Accuracy: 0.7960526315789473 | ROC-AUC: 0.796969696969697 | Precision: 0.8484848484848485 | Recall: 0.7272727272727273 | F1: 0.7832167832167832
Model: RandomForestClassifier-50-None-2-2 | Accuracy: 0.7960526315789473 | ROC-AUC: 0.7967965367965367 | Precision: 0.8382352941176471 | Recall: 0.7402597402597403 | F1: 0.7862068965517242
Model: RandomForestClassifier-50-None-2-4 | Accuracy: 0.7960526315789473 | ROC-AUC: 0.796969696969697 | Precision: 0.8484848484848485 | Recall: 0.7272727272727273 | F1: 0.7832167832167832
Model: RandomForestClassifier-50-None-5-1 | Accuracy: 0.8157894736842105 | ROC-AUC: 0.8164502164502165 | Precision: 0.855072463768116 | Recall: 0.7662337662337663 | F1: 0.8082191780821918
Model: RandomForestClassifier-50-None-5-2 | Accuracy: 0.8223684210526315 | ROC-AUC: 0.822943722943723 | Precision: 0.8571428571428571 | Recall: 0.7792207792207793 | F1: 0.8163265306122449
Model: RandomForestClassifier-50-None-5-4 | Accuracy: 0