# Imports 

In [44]:
import warnings
from itertools import product

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import (StratifiedKFold, KFold,
                                     cross_val_score, train_test_split)
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore", category=Warning)

N_SPLITS = 2
RANDOM_STATE = 148260

In [45]:
def CalculateMorganFingerprint(mol):
    mfpgen = AllChem.GetMorganGenerator(radius=2,fpSize=2048)
    fingerprint = np.array([mfpgen.GetFingerprintAsNumPy(x) for x in mol])
    fingerprint = pd.DataFrame(fingerprint, columns = ['mfp'+str(i) for i in range(fingerprint.shape[1])])
    return fingerprint

In [46]:
def CalculateDescriptors(mol):
    calc = Calculator(descriptors, ignore_3D=False)
    X_mordred = calc.pandas(mol, nproc=1)
    X_mordred = X_mordred.select_dtypes(['number'])
    #normalize
    X_mordred = (X_mordred-X_mordred.min())/(X_mordred.max()-X_mordred.min())
    #drop columns wth low std
    X_mordred = X_mordred.loc[:,X_mordred.std()>0.01]
    return X_mordred

In [47]:
def LoadDatasetCSV(path, threshold=7.0, regression = False):
    df = pd.read_csv(path)
    df['molecule_from_smiles'] = df['smiles'].apply(Chem.MolFromSmiles)
    df['smiles'] = df['smiles'].map(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
    df.drop_duplicates('smiles')
    df = df.dropna()
    if regression:
        df['Target'] = df['pIC50']
    else:
        df['Target'] = df['pIC50'] > threshold
    return df

In [48]:
def split_data(df, approach = 'desc', split = 0.7):
    #TODO: support for different approaches - if applicable
    if approach == 'desc':
        X = CalculateDescriptors(df['molecule_from_smiles'])
    else:
        X = CalculateMorganFingerprint(df['molecule_from_smiles'])
    y = df["Target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1-split), random_state=42)
    return X_train, y_train, X_test, y_test

In [49]:
def run_rf(X, y, n_estimators, max_depth, min_samples_split, min_samples_leaf, regression=False):
    if regression:
        name = "RandomForestRegressor"
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "RandomForestClassifier"
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'accuracy'
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    mean_accuracy = scores.mean()
    return (f"{name}-{n_estimators}-{max_depth}-{min_samples_split}-{min_samples_leaf}; {mean_accuracy:.4f}")

def run_lr(X, y, C, penalty, solver, regression=False):
    if regression:
        name = "LinearRegression"
        model = LinearRegression()
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "LogisticRegression"
        model = LogisticRegression(C=C, penalty=penalty, solver=solver)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'accuracy'
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    mean_accuracy = scores.mean()
    return (f"{name}-{C}-{penalty}-{solver}; {mean_accuracy:.4f}")

def run_nn(X, y, hidden_layer_sizes, activation, alpha, max_iter, regression=False):
    if regression:
        name = "MLPRegressor"
        model = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes, activation=activation, alpha=alpha, max_iter=max_iter)
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "MLPClassifier"
        model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, alpha=alpha, max_iter=max_iter)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'accuracy'
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    mean_accuracy = scores.mean()
    return (f"{name}-{hidden_layer_sizes}-{activation}-{alpha}-{max_iter}; {mean_accuracy:.4f}")

def run_gb(X, y, n_estimators, learning_rate, regression=False):
    if regression:
        name = "GradientBoostingRegressor"
        model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate)
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "GradientBoostingClassifier"
        model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'accuracy'
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    mean_accuracy = scores.mean()
    return (f"{name}-{n_estimators}-{learning_rate}; {mean_accuracy:.4f}")

In [60]:
from sklearn.svm import SVR
from sklearn.svm import SVC

def run_svm(X, y, c, d, regression=False):
    if regression:
        name = "SVR"
        model = SVR(C=c, degree=d, kernel="poly")
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "SVC"
        model = SVC(C=c, degree=d, kernel="poly")
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'accuracy'
    print("Eval\n")
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    mean_accuracy = scores.mean()
    return (f"{name}-{c}-{d}; {mean_accuracy:.4f}")

In [51]:
data_classification = LoadDatasetCSV("data\processed\simple_input_data.csv")
data_classification.head()

Unnamed: 0,smiles,pIC50,molecule_from_smiles,Target
0,Cc1ccccc1-c1ccc2nc(N)c(C[C@@H](C)C(=O)N[C@@H]3...,9.154901,<rdkit.Chem.rdchem.Mol object at 0x000001F984B...,True
1,CCCO[C@H]1C[NH2+][C@@H]([C@@H](O)[C@H](Cc2cc(F...,8.853872,<rdkit.Chem.rdchem.Mol object at 0x000001F984B...,True
2,CCc1cn2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001F984B...,True
3,CCOC[C@@H](Oc1cc(C[C@@H]2CS(=O)(=O)C[C@H]([NH2...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001F984B...,True
4,CCc1cn2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001F984B...,True


In [52]:
data_regression = LoadDatasetCSV("data\processed\simple_input_data.csv", regression=True)
data_regression.head()

Unnamed: 0,smiles,pIC50,molecule_from_smiles,Target
0,Cc1ccccc1-c1ccc2nc(N)c(C[C@@H](C)C(=O)N[C@@H]3...,9.154901,<rdkit.Chem.rdchem.Mol object at 0x000001F984B...,9.154901
1,CCCO[C@H]1C[NH2+][C@@H]([C@@H](O)[C@H](Cc2cc(F...,8.853872,<rdkit.Chem.rdchem.Mol object at 0x000001F984B...,8.853872
2,CCc1cn2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001F984B...,8.69897
3,CCOC[C@@H](Oc1cc(C[C@@H]2CS(=O)(=O)C[C@H]([NH2...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001F984B...,8.69897
4,CCc1cn2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001F984B...,8.69897


In [53]:
X_train_desc_classification, y_train_desc_classification, X_test_desc_classification, y_test_desc_classification = split_data(data_classification)

In [54]:
X_train_desc_classification

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
209,0.582920,0.571986,0.00,0.00,0.582398,0.456539,0.479510,0.582398,0.446868,0.792690,...,0.847445,0.521445,0.588711,0.407040,0.239499,0.604317,0.591743,0.596,0.565881,0.564214
613,0.366165,0.341732,0.00,0.25,0.373272,0.503185,0.528503,0.373272,0.582010,0.631118,...,0.719285,0.341022,0.362488,0.351430,0.084586,0.381295,0.376147,0.384,0.280719,0.355700
1131,0.234412,0.223823,0.00,0.50,0.216149,0.592990,0.622827,0.216149,0.374746,0.486042,...,0.694361,0.228140,0.220422,0.603534,0.032005,0.287770,0.256881,0.274,0.241499,0.194084
140,0.228498,0.219951,0.25,0.50,0.216418,0.555507,0.578297,0.216418,0.561592,0.477725,...,0.633569,0.333643,0.212989,0.492783,0.025660,0.230216,0.247706,0.260,0.156298,0.184704
1094,0.186547,0.188806,0.00,0.75,0.178725,0.656629,0.563294,0.178725,0.527251,0.423776,...,0.616091,0.312119,0.180768,0.622083,0.020099,0.230216,0.206422,0.226,0.177164,0.156566
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,0.331611,0.321370,0.00,0.25,0.322084,0.421017,0.442201,0.322084,0.403183,0.595470,...,0.696437,0.312289,0.335223,0.362681,0.070607,0.345324,0.344037,0.348,0.321097,0.308081
1294,0.309270,0.325988,0.00,0.00,0.309149,0.273714,0.287486,0.309149,0.386712,0.572930,...,0.573711,0.295718,0.306415,0.137594,0.063828,0.294964,0.302752,0.296,0.316847,0.311688
860,0.283667,0.268915,0.00,1.00,0.290459,0.861105,0.795958,0.290459,0.927731,0.549717,...,0.790551,0.402997,0.241079,0.499394,0.034358,0.359712,0.321101,0.364,0.134660,0.233045
1459,0.331611,0.321370,0.00,0.25,0.322084,0.421017,0.442201,0.322084,0.403183,0.595470,...,0.696437,0.312289,0.335223,0.362681,0.070607,0.345324,0.344037,0.348,0.321097,0.308081


In [55]:
X_train_desc_regression, y_train_desc_regression, X_test_desc_regression, y_test_desc_regression = split_data(data_regression)

In [56]:
X_train_fp_classification, y_train_fp_classification, X_test_fp_classification, y_test_fp_classification = split_data(data_classification, approach = 'fp')

In [57]:
X_train_fp_regression, y_train_fp_regression, X_test_fp_regression, y_test_fp_regression = split_data(data_regression, approach = 'fp')

# Descriptors

In [68]:
import time

def run_experiment(regression=False, fingerprint=False, pca=False):
    sc = StandardScaler()

    if fingerprint:
        if regression:
            X = sc.fit_transform(pd.concat([X_train_fp_regression, X_test_fp_regression]))
            y = pd.concat([y_train_fp_regression, y_test_fp_regression])
            
        else:
            X = sc.fit_transform(pd.concat([X_train_fp_classification, X_test_fp_classification]))
            y = pd.concat([y_train_fp_classification, y_test_fp_classification])

    else:
        if regression:
            X = sc.fit_transform(pd.concat([X_train_desc_regression, X_test_desc_regression]))
            y = pd.concat([y_train_desc_regression, y_test_desc_regression])
            
        else:
            X = sc.fit_transform(pd.concat([X_train_desc_classification, X_test_desc_classification]))
            y = pd.concat([y_train_desc_classification, y_test_desc_classification])

    if pca:
        pca = PCA(n_components=0.95)
        X = pca.fit_transform(X)
        
    results = []
    param_grid_svm={
        #'gamma': [0.001, 0.1, 1], ### Gamma 100 C 0.1 degree 3 still loops and stalls
        'C': [0.01, 0.1, 1, 10, 100, 1000], # C >= 10 are problematic
        'degree': [2, 3, 4, 5]
    }

    param_combinations = list(product(*param_grid_svm.values()))
    for combination in param_combinations:
        c, d = combination
        print(f"\nRegression: {regression}, Fingerprint: {fingerprint}, PCA: {pca}")
        for index, key in enumerate(param_grid_svm.keys()):
            print(f"{key}: {combination[index]}, ", end="")
        start = time.time()
        results.append(run_svm(X, y, c, d, regression))
        print(f"\nAccuracy: {results[-1]}\n")
        end = time.time()
        print(f"Elapsed time: {round(end - start, 2)}s\n")


    csv_path = "results_"
    if regression: csv_path += "regression" 
    else: csv_path += "classification"
    if fingerprint: csv_path += "_fingerprints"
    else: csv_path += "_descriptors"
    csv_path += "_sc"
    if pca: csv_path += "pca"
    csv_path += "_svm.csv"

    data_tuples = [tuple(item.split('; ')) for item in results]
    df = pd.DataFrame(data_tuples, columns=['Classifier', 'Accuracy'])
    df.to_csv(csv_path)
    
    return results, csv_path

In [69]:
run_grid = {
        'regression': [False, True],
        'fingerprint': [False, True],
        'pca': [False, True]
    }

run_param_combinations = list(product(*run_grid.values()))

result_dict = {}

for combination in run_param_combinations:
    n, m, s = combination
    res, run_name = run_experiment(n, m, s)
    result_dict[run_name] = res


Regression: False, Fingerprint: False, PCA: False
C: 0.01, degree: 2, Eval


Accuracy: SVC-0.01-2; 0.5616

Elapsed time: 0.32s


Regression: False, Fingerprint: False, PCA: False
C: 0.01, degree: 3, Eval


Accuracy: SVC-0.01-3; 0.5648

Elapsed time: 0.3s


Regression: False, Fingerprint: False, PCA: False
C: 0.01, degree: 4, Eval


Accuracy: SVC-0.01-4; 0.5708

Elapsed time: 0.3s


Regression: False, Fingerprint: False, PCA: False
C: 0.01, degree: 5, Eval


Accuracy: SVC-0.01-5; 0.5734

Elapsed time: 0.3s


Regression: False, Fingerprint: False, PCA: False
C: 0.1, degree: 2, Eval


Accuracy: SVC-0.1-2; 0.6287

Elapsed time: 0.29s


Regression: False, Fingerprint: False, PCA: False
C: 0.1, degree: 3, Eval


Accuracy: SVC-0.1-3; 0.6484

Elapsed time: 0.28s


Regression: False, Fingerprint: False, PCA: False
C: 0.1, degree: 4, Eval


Accuracy: SVC-0.1-4; 0.5951

Elapsed time: 0.3s


Regression: False, Fingerprint: False, PCA: False
C: 0.1, degree: 5, Eval


Accuracy: SVC-0.1-5; 0.5971

E

In [70]:
for key, run in result_dict.items():
    print(key)
    for test in run:
        print(test)
    print()

results_classification_descriptors_sc_svm.csv
SVC-0.01-2; 0.5616
SVC-0.01-3; 0.5648
SVC-0.01-4; 0.5708
SVC-0.01-5; 0.5734
SVC-0.1-2; 0.6287
SVC-0.1-3; 0.6484
SVC-0.1-4; 0.5951
SVC-0.1-5; 0.5971
SVC-1-2; 0.7689
SVC-1-3; 0.7913
SVC-1-4; 0.7511
SVC-1-5; 0.7288
SVC-10-2; 0.7834
SVC-10-3; 0.7979
SVC-10-4; 0.7854
SVC-10-5; 0.7828
SVC-100-2; 0.7683
SVC-100-3; 0.7788
SVC-100-4; 0.7939
SVC-100-5; 0.7893
SVC-1000-2; 0.7637
SVC-1000-3; 0.7696
SVC-1000-4; 0.7775
SVC-1000-5; 0.7729

results_classification_descriptors_scpca_svm.csv
SVC-0.01-2; 0.5616
SVC-0.01-3; 0.5675
SVC-0.01-4; 0.5714
SVC-0.01-5; 0.5747
SVC-0.1-2; 0.6340
SVC-0.1-3; 0.6557
SVC-0.1-4; 0.5997
SVC-0.1-5; 0.6004
SVC-1-2; 0.7617
SVC-1-3; 0.7900
SVC-1-4; 0.7571
SVC-1-5; 0.7367
SVC-10-2; 0.7749
SVC-10-3; 0.7992
SVC-10-4; 0.7907
SVC-10-5; 0.7887
SVC-100-2; 0.7597
SVC-100-3; 0.7788
SVC-100-4; 0.7861
SVC-100-5; 0.7814
SVC-1000-2; 0.7406
SVC-1000-3; 0.7485
SVC-1000-4; 0.7709
SVC-1000-5; 0.7676

results_classification_fingerprints_sc_svm.csv
