In [90]:
import warnings
from itertools import product

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import (StratifiedKFold, KFold,
                                     cross_val_score, train_test_split)
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.svm import SVC

# Suppress every warning category (Warning is the base class of all of them),
# so nothing raised through the warnings module is shown in later cells.
warnings.filterwarnings("ignore", category=Warning)

# Number of folds used by the KFold / StratifiedKFold splitters below.
N_SPLITS = 2
# Seed passed as random_state to the shuffled cross-validation splitters.
RANDOM_STATE = 148260

### Preprocessing

In [91]:
def LoadCSV(path):
    """Read the CSV at ``path`` and return it as a DataFrame, unmodified.

    Args:
        path: Anything ``pd.read_csv`` accepts (file path or file-like object).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(path)

def LoadCSV_BACE(path, regression = False):
    """Load the BACE CSV and prepare it for one of the two learning tasks.

    Rows with a repeated ``mol`` value are dropped (first occurrence kept),
    then rows containing any missing value are dropped. A ``Target`` column
    is appended, copied from ``pIC50`` when ``regression`` is true and from
    ``Class`` otherwise. The ``CID``, ``canvasUID``, ``Class`` and ``pIC50``
    columns are removed from the result.

    Args:
        path: Anything ``pd.read_csv`` accepts (file path or file-like object).
        regression: Select the regression target (``pIC50``) instead of the
            classification target (``Class``).

    Returns:
        A new DataFrame with the remaining columns followed by ``Target``.
    """
    df = pd.read_csv(path)
    # drop_duplicates returns a new frame; the result has to be kept,
    # otherwise the de-duplication silently does nothing.
    df = df.drop_duplicates('mol').dropna()

    target_column = 'pIC50' if regression else 'Class'
    df = df.assign(Target=df[target_column])

    # Both raw label columns are removed regardless of which one became Target.
    return df.drop(columns=['CID', 'canvasUID', 'Class', 'pIC50'])

def split_data_BACE(df):
    """Split a prepared BACE frame by the value of its ``Model`` column.

    Features are every column except ``Target``, ``mol`` and ``Model``;
    targets are returned as single-column DataFrames holding ``Target``.
    The input frame is not modified.

    Args:
        df: DataFrame containing ``Target``, ``mol`` and ``Model`` columns,
            where ``Model`` takes the values ``'Train'``, ``'Test'`` and
            ``'Valid'``.

    Returns:
        ``(X_train, y_train, X_test, y_test, X_valid, y_valid)``.
    """
    features = df.drop(columns=['Target', 'mol'])
    targets = df[['Target', 'Model']]

    parts = []
    for subset in ('Train', 'Test', 'Valid'):
        in_subset = df['Model'] == subset
        # drop() without inplace returns fresh frames, so no filtered slice
        # is ever mutated (the source of SettingWithCopyWarning).
        parts.append(features.loc[in_subset].drop(columns='Model'))
        parts.append(targets.loc[in_subset].drop(columns='Model'))

    return tuple(parts)

In [92]:
# Both task variants are built from the same BACE CSV file.
# NOTE(review): this is an absolute, machine-specific path; the notebook
# will not run elsewhere without editing it.
BACE_CSV_PATH = r"C:\Users\wojci\Documents\GitHub\czasteczkowa-inzynierka\experiments\BACE\bace.csv"

df_regression = LoadCSV_BACE(BACE_CSV_PATH, regression=True)
df_classification = LoadCSV_BACE(BACE_CSV_PATH)

In [93]:
# Train / test / validation features and targets for the classification task,
# partitioned by the 'Model' column of the prepared frame.
X_train_class, y_train_class, X_test_class, y_test_class, X_valid_class, y_valid_class = split_data_BACE(df_classification)

In [94]:
# Train / test / validation features and targets for the regression task,
# partitioned by the 'Model' column of the prepared frame.
X_train_regre, y_train_regre, X_test_regre, y_test_regre, X_valid_regre, y_valid_regre = split_data_BACE(df_regression)

In [95]:
# (rows, columns) of the prepared classification frame.
df_classification.shape

(1513, 592)

In [96]:
# (rows, columns) of the regression training targets; a single 'Target' column.
y_train_regre.shape

(203, 1)

In [97]:
# Report the size of each classification split, then the fraction of all
# rows that each split represents (train, test, validation order).
class_splits = (X_train_class, X_test_class, X_valid_class)

for split in class_splits:
    print(split.shape)

for split in class_splits:
    print(f"{round(split.shape[0] / df_classification.shape[0], 2)}")

(203, 589)
(1265, 589)
(45, 589)
0.13
0.84
0.03


In [98]:
# Preview the first rows of the regression training features.
X_train_regre.head()

Unnamed: 0,MW,AlogP,HBA,HBD,RB,HeavyAtomCount,ChiralCenterCount,ChiralCenterCountAllPossible,RingCount,PSA,...,PEOE5 (PEOE5),PEOE6 (PEOE6),PEOE7 (PEOE7),PEOE8 (PEOE8),PEOE9 (PEOE9),PEOE10 (PEOE10),PEOE11 (PEOE11),PEOE12 (PEOE12),PEOE13 (PEOE13),PEOE14 (PEOE14)
0,431.56979,4.4014,3,2,5,32,2,2,4,77.239998,...,0.0,53.205711,78.640335,226.85541,107.43491,37.133846,0.0,7.98017,0.0,0.0
1,657.81073,2.6412,5,4,16,47,6,6,4,124.58,...,0.0,73.817162,47.1716,365.67694,174.07675,34.923889,7.98017,24.148668,0.0,24.663788
2,591.74091,2.5499,4,3,11,42,2,3,5,125.86,...,0.0,70.365707,47.941147,192.40652,255.75255,23.654478,0.230159,15.87979,0.0,24.663788
3,591.67828,3.168,4,3,12,40,4,5,3,123.84,...,0.0,56.657166,37.954151,194.35304,202.76335,36.498634,0.980913,8.188327,0.0,26.385181
4,629.71283,3.5086,3,3,11,44,2,3,5,116.63,...,0.0,78.945702,39.361153,179.71288,220.4613,23.654478,0.230159,15.87979,0.0,26.100143


In [99]:
# Preview the first rows of the regression test features.
X_test_regre.head()

Unnamed: 0,MW,AlogP,HBA,HBD,RB,HeavyAtomCount,ChiralCenterCount,ChiralCenterCountAllPossible,RingCount,PSA,...,PEOE5 (PEOE5),PEOE6 (PEOE6),PEOE7 (PEOE7),PEOE8 (PEOE8),PEOE9 (PEOE9),PEOE10 (PEOE10),PEOE11 (PEOE11),PEOE12 (PEOE12),PEOE13 (PEOE13),PEOE14 (PEOE14)
248,403.55969,5.7644,2,2,7,30,0,1,3,68.010002,...,0.0,84.122887,46.316166,247.78938,90.395477,37.133846,0.0,7.98017,0.0,0.0
249,615.73102,1.4277,5,5,13,44,0,7,4,135.58,...,0.0,67.9608,38.272877,304.05246,152.16188,34.923889,7.98017,32.336994,0.0,24.663788
250,498.6525,3.387,4,3,9,36,0,3,4,88.059998,...,0.0,48.077168,49.532818,332.80533,84.453911,34.435734,15.387257,8.188327,0.0,24.663788
251,484.62601,2.9008,4,3,9,35,0,3,4,88.059998,...,0.0,48.077168,45.445873,299.93298,95.216072,34.435734,15.387257,8.188327,0.0,24.663788
252,639.75238,3.8163,6,3,17,46,0,2,3,117.07,...,11.863713,37.771442,88.147522,261.31158,250.92554,35.014828,0.0,23.571255,0.0,24.663788


In [100]:
# Summary statistics for the numeric columns of the classification frame.
df_classification.describe()

Unnamed: 0,MW,AlogP,HBA,HBD,RB,HeavyAtomCount,ChiralCenterCount,ChiralCenterCountAllPossible,RingCount,PSA,...,PEOE6 (PEOE6),PEOE7 (PEOE7),PEOE8 (PEOE8),PEOE9 (PEOE9),PEOE10 (PEOE10),PEOE11 (PEOE11),PEOE12 (PEOE12),PEOE13 (PEOE13),PEOE14 (PEOE14),Target
count,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,...,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0
mean,479.661988,3.17708,3.732981,2.001322,8.04957,34.089227,0.522802,2.31725,3.769993,99.842829,...,52.348846,48.76374,181.83558,148.442348,30.371697,3.48865,11.74056,1.239762,14.387597,0.456709
std,122.083053,1.396633,1.444778,1.629343,4.741135,8.520088,1.162539,1.612558,0.87739,34.973718,...,25.9938,18.201519,99.717702,60.548833,12.162452,5.148336,9.073406,3.293804,13.32989,0.498287
min,138.187,-4.3611,0.0,0.0,0.0,10.0,0.0,0.0,0.0,16.610001,...,0.0,-3.551821,1.91697,-5.536391,-2.216191,-7.286308,-6.106466,-7.379991,-1.273524,0.0
25%,389.3313,2.3355,3.0,0.0,4.0,28.0,0.0,1.0,3.0,77.050003,...,34.319988,36.54715,102.23377,102.51045,20.13299,0.0,7.98017,0.0,0.0,0.0
50%,463.6283,3.1713,4.0,2.0,7.0,33.0,0.0,2.0,4.0,95.040001,...,51.479984,47.624382,171.91722,140.68362,30.107586,0.55013,8.188327,0.0,21.710098,0.0
75%,564.63953,4.0155,4.0,3.0,11.0,40.0,1.0,3.0,4.0,116.63,...,66.553795,58.844093,253.67908,185.65926,37.133846,7.98017,15.87979,0.0,24.663788,1.0
max,1350.4733,7.6174,12.0,15.0,40.0,97.0,10.0,12.0,7.0,525.06,...,161.34286,124.27273,865.47333,378.51627,121.6719,29.823961,80.218018,16.681131,61.65947,1.0


### Run configurations

In [101]:
def run_rf(X, y, n_estimators, max_depth, min_samples_split, min_samples_leaf, regression=False):
    """Cross-validate a random forest and return a one-line result string.

    Classification uses ``RandomForestClassifier`` with stratified folds and
    ROC AUC; regression uses ``RandomForestRegressor`` with plain folds and
    negative mean squared error. Both the splitter and the forest are seeded
    with ``RANDOM_STATE`` so repeated runs give the same score.

    Args:
        X: Feature matrix.
        y: Targets.
        n_estimators, max_depth, min_samples_split, min_samples_leaf:
            Forwarded to the forest constructor.
        regression: Select the regression variant.

    Returns:
        ``"<name>-<n_estimators>-<max_depth>-<min_samples_split>-"
        "<min_samples_leaf>; <mean score>"`` with the score formatted to four
        decimals.
    """
    forest_params = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        # Without a seed the bootstrap/feature sampling differs on every run,
        # making hyperparameter comparisons irreproducible.
        random_state=RANDOM_STATE,
    )
    if regression:
        name = "RandomForestRegressor"
        model = RandomForestRegressor(**forest_params)
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "RandomForestClassifier"
        model = RandomForestClassifier(**forest_params)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'roc_auc'
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    # The value is ROC AUC or negative MSE depending on the task, not accuracy.
    mean_score = scores.mean()
    return (f"{name}-{n_estimators}-{max_depth}-{min_samples_split}-{min_samples_leaf}; {mean_score:.4f}")

def run_lr(X, y, C, penalty, solver, regression=False):
    """Cross-validate a linear model and return a one-line result string.

    Classification uses ``LogisticRegression(C, penalty, solver)`` with
    stratified folds and ROC AUC, seeded with ``RANDOM_STATE``. Regression
    uses ``LinearRegression()`` with plain folds and negative mean squared
    error; in that case ``C``, ``penalty`` and ``solver`` are not passed to
    the model and only appear in the returned label.

    Args:
        X: Feature matrix.
        y: Targets.
        C, penalty, solver: Logistic-regression hyperparameters.
        regression: Select the regression variant.

    Returns:
        ``"<name>-<C>-<penalty>-<solver>; <mean score>"`` with the score
        formatted to four decimals.
    """
    if regression:
        name = "LinearRegression"
        model = LinearRegression()
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "LogisticRegression"
        # Seed the solver so repeated runs of the same configuration agree.
        model = LogisticRegression(C=C, penalty=penalty, solver=solver, random_state=RANDOM_STATE)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'roc_auc'
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    # The value is ROC AUC or negative MSE depending on the task, not accuracy.
    mean_score = scores.mean()
    return (f"{name}-{C}-{penalty}-{solver}; {mean_score:.4f}")

def run_nn(X, y, hidden_layer_sizes, activation, alpha, max_iter, regression=False):
    """Cross-validate a multi-layer perceptron and return a result string.

    Classification uses ``MLPClassifier`` with stratified folds and ROC AUC;
    regression uses ``MLPRegressor`` with plain folds and negative mean
    squared error. Both the splitter and the network are seeded with
    ``RANDOM_STATE`` so repeated runs give the same score.

    Args:
        X: Feature matrix.
        y: Targets.
        hidden_layer_sizes, activation, alpha, max_iter: Forwarded to the
            MLP constructor.
        regression: Select the regression variant.

    Returns:
        ``"<name>-<hidden_layer_sizes>-<activation>-<alpha>-<max_iter>; "
        "<mean score>"`` with the score formatted to four decimals.
    """
    mlp_params = dict(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        alpha=alpha,
        max_iter=max_iter,
        # Weight initialisation and batch shuffling are random; seed them so
        # configurations can be compared across runs.
        random_state=RANDOM_STATE,
    )
    if regression:
        name = "MLPRegressor"
        model = MLPRegressor(**mlp_params)
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "MLPClassifier"
        model = MLPClassifier(**mlp_params)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'roc_auc'
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    # The value is ROC AUC or negative MSE depending on the task, not accuracy.
    mean_score = scores.mean()
    return (f"{name}-{hidden_layer_sizes}-{activation}-{alpha}-{max_iter}; {mean_score:.4f}")

def run_gb(X, y, n_estimators, learning_rate, regression=False):
    """Cross-validate gradient boosting and return a one-line result string.

    Classification uses ``GradientBoostingClassifier`` with stratified folds
    and ROC AUC; regression uses ``GradientBoostingRegressor`` with plain
    folds and negative mean squared error. Both the splitter and the model
    are seeded with ``RANDOM_STATE``.

    Args:
        X: Feature matrix.
        y: Targets.
        n_estimators, learning_rate: Forwarded to the model constructor.
        regression: Select the regression variant.

    Returns:
        ``"<name>-<n_estimators>-<learning_rate>; <mean score>"`` with the
        score formatted to four decimals.
    """
    boosting_params = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        # Seed the estimator so repeated runs of a configuration agree.
        random_state=RANDOM_STATE,
    )
    if regression:
        name = "GradientBoostingRegressor"
        model = GradientBoostingRegressor(**boosting_params)
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "GradientBoostingClassifier"
        model = GradientBoostingClassifier(**boosting_params)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'roc_auc'
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    # The value is ROC AUC or negative MSE depending on the task, not accuracy.
    mean_score = scores.mean()
    return (f"{name}-{n_estimators}-{learning_rate}; {mean_score:.4f}")

def run_svm(X, y, c, d, e, regression=False):
    """Cross-validate a polynomial-kernel SVM and return a result string.

    Classification uses ``SVC`` with stratified folds and ROC AUC; ``e`` is
    not passed to the model there and only appears in the label. Regression
    uses ``SVR`` (with ``epsilon=e``), plain folds and negative mean squared
    error.

    Args:
        X: Feature matrix.
        y: Targets.
        c: Regularisation parameter ``C``.
        d: Polynomial degree.
        e: Epsilon for ``SVR``; label-only for ``SVC``.
        regression: Select the regression variant.

    Returns:
        ``"<name>-<c>-<d>-<e>; <mean score>"`` with the score formatted to
        four decimals.
    """
    if not regression:
        label, metric = "SVC", 'roc_auc'
        # SVC takes no epsilon, so `e` is deliberately left out here.
        estimator = SVC(C=c, degree=d, kernel="poly")
        splitter = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    else:
        label, metric = "SVR", 'neg_mean_squared_error'
        estimator = SVR(C=c, degree=d, epsilon=e, kernel="poly")
        splitter = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

    fold_scores = cross_val_score(estimator, X, y, cv=splitter, scoring=metric)
    return f"{label}-{c}-{d}-{e}; {fold_scores.mean():.4f}"

In [102]:
def run_all(X, y, regression=False):
    """Run every model family over its hyperparameter grid.

    The families are evaluated in a fixed order: random forest, linear /
    logistic regression, MLP, gradient boosting, SVM. Each grid is expanded
    as the Cartesian product of its value lists, and every result string is
    printed as soon as it is available.

    Args:
        X: Feature matrix handed to each ``run_*`` helper.
        y: Targets handed to each ``run_*`` helper.
        regression: Evaluate regressors instead of classifiers.

    Returns:
        List of result strings, one per evaluated configuration, in
        evaluation order.
    """
    results = []

    def sweep(runner, param_grid):
        # Grid values are unpacked positionally, so the key order of each
        # grid must match the runner's parameter order.
        for combination in product(*param_grid.values()):
            results.append(runner(X, y, *combination, regression))
            print(results[-1])

    print("Run")

    sweep(run_rf, {
        'n_estimators': [50, 100],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    })

    if regression:
        # LinearRegression takes none of these; a single placeholder
        # combination yields exactly one run.
        param_grid_lr = {
            'C': [0.001],
            'penalty': ['l1'],
            'solver': ['liblinear']
        }
    else:
        param_grid_lr = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga']
        }
    sweep(run_lr, param_grid_lr)

    sweep(run_nn, {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [200, 500, 1000]
    })

    sweep(run_gb, {
        'n_estimators': [10, 100, 200],
        'learning_rate': [0.1, 0.5, 1.0, 2.0]
    })

    if regression:
        param_grid_svm = {
            'C': [0.01, 0.1, 1, 10, 100, 1000],
            'degree': [2, 3, 4, 5],
            'epsilon': [0.01, 0.1, 1]
        }
    else:
        # SVC has no epsilon; the placeholder only appears in the label.
        param_grid_svm = {
            'C': [0.01, 0.1, 1, 10, 100, 1000],
            'degree': [2, 3, 4, 5],
            'epsilon': ["no epsilon"]
        }
    # The sweep runs for both tasks. Previously it was nested under
    # `if regression:`, so the classification SVM grid was never evaluated.
    sweep(run_svm, param_grid_svm)

    return results

In [103]:
def run_configured(regression=False, pca=False):
    """Run the full model comparison for one task and write it to CSV.

    The train and test splits of the chosen task (module-level
    ``X_*_regre`` / ``X_*_class`` frames) are concatenated and standardised;
    the validation split is not used. When ``pca`` is true the standardised
    features are additionally reduced with ``PCA(n_components=0.95)``. The
    result strings from ``run_all`` are split on ``'; '`` into two columns
    and saved to ``BACE_comparison_results_<task>_sc[pca]_svm.csv`` in the
    current working directory.

    NOTE(review): the scaler (and PCA) are fitted on all rows before
    cross-validation splits them, so held-out folds influence the
    preprocessing. Fitting them inside each fold (e.g. via a Pipeline)
    would avoid that; left unchanged here to keep results comparable.

    Args:
        regression: Use the regression data and regressors.
        pca: Apply PCA after scaling.

    Returns:
        None. The results are written to disk.
    """
    scaler = StandardScaler()

    if regression:
        X = scaler.fit_transform(pd.concat([X_train_regre, X_test_regre]))
        y = pd.concat([y_train_regre, y_test_regre])
    else:
        X = scaler.fit_transform(pd.concat([X_train_class, X_test_class]))
        y = pd.concat([y_train_class, y_test_class])

    if pca:
        # Use a separate name: rebinding the boolean `pca` flag to the
        # estimator made the filename check below depend on object truthiness.
        reducer = PCA(n_components=0.95)
        X = reducer.fit_transform(X)

    results = run_all(X, y, regression=regression)

    csv_path = "BACE_comparison_results_"
    csv_path += "regression" if regression else "classification"
    csv_path += "_sc"
    if pca:
        csv_path += "pca"
    csv_path += "_svm.csv"

    # Each result is "<configuration>; <score>". The 'Accuracy' column name
    # is kept for compatibility with existing files, although it holds
    # ROC AUC (classification) or negative MSE (regression).
    data_tuples = [tuple(item.split('; ')) for item in results]
    df = pd.DataFrame(data_tuples, columns=['Classifier', 'Accuracy'])
    df.to_csv(csv_path)

In [104]:
# Sweep every (task, PCA) pairing: classification first, then regression,
# each without and then with PCA.
run_grid = dict(regression=[False, True], pca=[False, True])

run_param_combinations = list(product(*run_grid.values()))

for r, p in run_param_combinations:
    run_configured(regression=r, pca=p)

Run
RandomForestClassifier-50-None-2-1; 0.8680
RandomForestClassifier-50-None-2-2; 0.8710
RandomForestClassifier-50-None-2-4; 0.8641
RandomForestClassifier-50-None-5-1; 0.8728
RandomForestClassifier-50-None-5-2; 0.8710
RandomForestClassifier-50-None-5-4; 0.8691
RandomForestClassifier-50-None-10-1; 0.8685
RandomForestClassifier-50-None-10-2; 0.8731
RandomForestClassifier-50-None-10-4; 0.8648
RandomForestClassifier-50-10-2-1; 0.8755
RandomForestClassifier-50-10-2-2; 0.8703
RandomForestClassifier-50-10-2-4; 0.8679
RandomForestClassifier-50-10-5-1; 0.8733
RandomForestClassifier-50-10-5-2; 0.8727
RandomForestClassifier-50-10-5-4; 0.8713
RandomForestClassifier-50-10-10-1; 0.8681
RandomForestClassifier-50-10-10-2; 0.8735
RandomForestClassifier-50-10-10-4; 0.8687
RandomForestClassifier-50-20-2-1; 0.8679
RandomForestClassifier-50-20-2-2; 0.8676
RandomForestClassifier-50-20-2-4; 0.8681
RandomForestClassifier-50-20-5-1; 0.8721
RandomForestClassifier-50-20-5-2; 0.8689
RandomForestClassifier-50-20-