# Imports 

In [1]:
import warnings
from itertools import product

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import (StratifiedKFold, KFold,
                                     cross_val_score, train_test_split)
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler

# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action="ignore", category=Warning)

# Number of folds used by the cross-validation splitters.
N_SPLITS = 2
# Seed passed to the cross-validation splitters.
RANDOM_STATE = 148_260

In [2]:
def CalculateMorganFingerprint(mol):
    """Build a table of Morgan fingerprints, one row per molecule.

    Parameters
    ----------
    mol : iterable
        Molecules to fingerprint (presumably RDKit ``Mol`` objects, as
        produced by ``Chem.MolFromSmiles`` -- confirm against callers).

    Returns
    -------
    pandas.DataFrame
        One row per input molecule and one column per fingerprint position,
        named ``mfp0``, ``mfp1``, ... The generator is configured with
        ``radius=2`` and ``fpSize=2048``.
    """
    generator = AllChem.GetMorganGenerator(radius=2, fpSize=2048)
    # Stack the per-molecule vectors into a single 2-D array.
    bits = np.array(list(map(generator.GetFingerprintAsNumPy, mol)))
    column_names = [f"mfp{position}" for position in range(bits.shape[1])]
    return pd.DataFrame(bits, columns=column_names)

In [3]:
def CalculateDescriptors(mol):
    """Compute Mordred descriptors and return the min-max scaled numeric ones.

    Parameters
    ----------
    mol : iterable
        Molecules handed to ``Calculator.pandas`` (presumably RDKit ``Mol``
        objects -- confirm against callers).

    Returns
    -------
    pandas.DataFrame
        Numeric descriptor columns, each rescaled with
        ``(x - min) / (max - min)``, keeping only the columns whose standard
        deviation after scaling exceeds 0.01.
    """
    calculator = Calculator(descriptors, ignore_3D=False)
    # Keep numeric columns only; anything non-numeric is discarded.
    numeric = calculator.pandas(mol, nproc=1).select_dtypes(['number'])
    # Min-max normalisation, column by column.
    lowest = numeric.min()
    span = numeric.max() - lowest
    scaled = (numeric - lowest) / span
    # Discard (near-)constant columns. A constant column becomes NaN after
    # scaling, its std is NaN, and the comparison is False, so it is dropped.
    informative = scaled.std() > 0.01
    return scaled.loc[:, informative]

In [4]:
def LoadDatasetCSV(path, threshold=7.0, regression = False):
    """Load a SMILES/pIC50 CSV, clean it, and attach a ``Target`` column.

    The CSV must contain the columns ``smiles`` and ``pIC50``.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed to ``pandas.read_csv``.
    threshold : float, default 7.0
        Classification cut-off: ``Target`` is ``pIC50 > threshold``.
        Ignored when ``regression`` is true.
    regression : bool, default False
        If true, ``Target`` is a copy of ``pIC50``; otherwise it is the
        boolean thresholded label.

    Returns
    -------
    pandas.DataFrame
        The input rows without missing values, unparsable SMILES or duplicate
        canonical SMILES. ``smiles`` is replaced by the canonical SMILES, and
        the columns ``molecule_from_smiles`` (parsed molecule) and ``Target``
        are added.
    """
    # Remove incomplete rows first so the SMILES parser never sees NaN.
    df = pd.read_csv(path).dropna()
    # Parse every SMILES exactly once; RDKit yields None when parsing fails.
    df['molecule_from_smiles'] = df['smiles'].apply(Chem.MolFromSmiles)
    # Drop unparsable molecules before canonicalising; MolToSmiles(None) raises.
    df = df[df['molecule_from_smiles'].notna()].copy()
    df['smiles'] = df['molecule_from_smiles'].map(Chem.MolToSmiles)
    # drop_duplicates returns a new frame; the result has to be kept.
    df = df.drop_duplicates('smiles').copy()
    if regression:
        df['Target'] = df['pIC50']
    else:
        df['Target'] = df['pIC50'] > threshold
    return df

In [5]:
def split_data(df, approach = 'desc', split = 0.7):
    """Featurise the molecules in ``df`` and split into train and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``molecule_from_smiles`` and ``Target``.
    approach : str, default 'desc'
        ``'desc'`` uses ``CalculateDescriptors``; any other value uses
        ``CalculateMorganFingerprint``.
    split : float, default 0.7
        Fraction of rows assigned to the training part; the test size is
        ``1 - split``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` -- note the order differs from
        the one returned by ``train_test_split``.
    """
    #TODO: support for different approaches - if applicable
    featurize = CalculateDescriptors if approach == 'desc' else CalculateMorganFingerprint
    features = featurize(df['molecule_from_smiles'])
    target = df["Target"]
    parts = train_test_split(features, target, test_size=(1-split), random_state=42)
    X_train, X_test, y_train, y_test = parts
    return X_train, y_train, X_test, y_test

In [6]:
def run_rf(X, y, n_estimators, max_depth, min_samples_split, min_samples_leaf, regression=False):
    """Cross-validate a random forest and return a one-line result string.

    Parameters
    ----------
    X, y
        Feature matrix and target passed to ``cross_val_score``.
    n_estimators, max_depth, min_samples_split, min_samples_leaf
        Hyper-parameters forwarded unchanged to the forest estimator.
    regression : bool, default False
        If true, a ``RandomForestRegressor`` is scored with
        ``neg_mean_squared_error`` on a shuffled ``KFold``; otherwise a
        ``RandomForestClassifier`` is scored with ``accuracy`` on a shuffled
        ``StratifiedKFold``.

    Returns
    -------
    str
        ``"<model>-<params joined by '-'>; <mean CV score, 4 decimals>"``.
    """
    forest_kwargs = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        # Seed the forest as well as the splitter; otherwise the same
        # configuration yields a different score on every run.
        random_state=RANDOM_STATE,
    )
    if regression:
        name = "RandomForestRegressor"
        model = RandomForestRegressor(**forest_kwargs)
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "RandomForestClassifier"
        model = RandomForestClassifier(**forest_kwargs)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'accuracy'
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    # Mean accuracy for classification, mean negative MSE for regression.
    mean_score = scores.mean()
    return (f"{name}-{n_estimators}-{max_depth}-{min_samples_split}-{min_samples_leaf}; {mean_score:.4f}")

def run_lr(X, y, C, penalty, solver, regression=False):
    """Cross-validate a linear model and return a one-line result string.

    Parameters
    ----------
    X, y
        Feature matrix and target passed to ``cross_val_score``.
    C, penalty, solver
        Forwarded to ``LogisticRegression`` in classification mode. In
        regression mode they are not used by the model but still appear in
        the returned label.
    regression : bool, default False
        If true, ``LinearRegression`` is scored with
        ``neg_mean_squared_error`` on a shuffled ``KFold``; otherwise
        ``LogisticRegression`` is scored with ``accuracy`` on a shuffled
        ``StratifiedKFold``.

    Returns
    -------
    str
        ``"<model>-<C>-<penalty>-<solver>; <mean CV score, 4 decimals>"``.
    """
    if not regression:
        name, scoring = "LogisticRegression", 'accuracy'
        model = LogisticRegression(C=C, penalty=penalty, solver=solver)
        splitter = StratifiedKFold
    else:
        name, scoring = "LinearRegression", 'neg_mean_squared_error'
        model = LinearRegression()
        splitter = KFold
    cv = splitter(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    mean_score = cross_val_score(model, X, y, cv=cv, scoring=scoring).mean()
    return f"{name}-{C}-{penalty}-{solver}; {mean_score:.4f}"

def run_nn(X, y, hidden_layer_sizes, activation, alpha, max_iter, regression=False):
    """Cross-validate a multi-layer perceptron and return a result string.

    Parameters
    ----------
    X, y
        Feature matrix and target passed to ``cross_val_score``.
    hidden_layer_sizes, activation, alpha, max_iter
        Hyper-parameters forwarded unchanged to the MLP estimator.
    regression : bool, default False
        If true, an ``MLPRegressor`` is scored with
        ``neg_mean_squared_error`` on a shuffled ``KFold``; otherwise an
        ``MLPClassifier`` is scored with ``accuracy`` on a shuffled
        ``StratifiedKFold``.

    Returns
    -------
    str
        ``"<model>-<params joined by '-'>; <mean CV score, 4 decimals>"``.
    """
    mlp_kwargs = dict(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        alpha=alpha,
        max_iter=max_iter,
        # Seed the network as well as the splitter; otherwise the same
        # configuration yields a different score on every run.
        random_state=RANDOM_STATE,
    )
    if regression:
        name = "MLPRegressor"
        model = MLPRegressor(**mlp_kwargs)
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "MLPClassifier"
        model = MLPClassifier(**mlp_kwargs)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'accuracy'
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    # Mean accuracy for classification, mean negative MSE for regression.
    mean_score = scores.mean()
    return (f"{name}-{hidden_layer_sizes}-{activation}-{alpha}-{max_iter}; {mean_score:.4f}")

def run_gb(X, y, n_estimators, learning_rate, regression=False):
    """Cross-validate a gradient-boosting model and return a result string.

    Parameters
    ----------
    X, y
        Feature matrix and target passed to ``cross_val_score``.
    n_estimators, learning_rate
        Hyper-parameters forwarded unchanged to the boosting estimator.
    regression : bool, default False
        If true, a ``GradientBoostingRegressor`` is scored with
        ``neg_mean_squared_error`` on a shuffled ``KFold``; otherwise a
        ``GradientBoostingClassifier`` is scored with ``accuracy`` on a
        shuffled ``StratifiedKFold``.

    Returns
    -------
    str
        ``"<model>-<n_estimators>-<learning_rate>; <mean CV score, 4 decimals>"``.
    """
    boosting_kwargs = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        # Seed the estimator as well as the splitter so that repeated runs
        # of the same configuration give the same score.
        random_state=RANDOM_STATE,
    )
    if regression:
        name = "GradientBoostingRegressor"
        model = GradientBoostingRegressor(**boosting_kwargs)
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "GradientBoostingClassifier"
        model = GradientBoostingClassifier(**boosting_kwargs)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'accuracy'
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    # Mean accuracy for classification, mean negative MSE for regression.
    mean_score = scores.mean()
    return (f"{name}-{n_estimators}-{learning_rate}; {mean_score:.4f}")

In [7]:
from sklearn.svm import SVR
from sklearn.svm import SVC

def run_svm(X, y, g, c, d, regression=False):
    """Cross-validate a polynomial-kernel SVM and return a result string.

    Parameters
    ----------
    X, y
        Feature matrix and target passed to ``cross_val_score``.
    g : float
        Value for the estimator's ``gamma``.
    c : float
        Value for the estimator's ``C``.
    d : int
        Value for the estimator's ``degree``.
    regression : bool, default False
        If true, an ``SVR`` is scored with ``neg_mean_squared_error`` on a
        shuffled ``KFold``; otherwise an ``SVC`` is scored with ``accuracy``
        on a shuffled ``StratifiedKFold``.

    Returns
    -------
    str
        ``"<model>-<g>-<c>-<d>; <mean CV score, 4 decimals>"``.
    """
    # The kernel is fixed to "poly"; only gamma, C and degree vary.
    svm_kwargs = dict(gamma=g, C=c, degree=d, kernel="poly")
    if not regression:
        name, scoring = "SVC", 'accuracy'
        model = SVC(**svm_kwargs)
        splitter = StratifiedKFold
    else:
        name, scoring = "SVR", 'neg_mean_squared_error'
        model = SVR(**svm_kwargs)
        splitter = KFold
    cv = splitter(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    mean_score = cross_val_score(model, X, y, cv=cv, scoring=scoring).mean()
    return f"{name}-{g}-{c}-{d}; {mean_score:.4f}"

In [8]:
# Classification dataset: Target is the boolean pIC50 > 7.0 (default threshold).
# Forward slashes avoid the invalid "\p" / "\s" escape sequences of the
# backslash form and work on both Windows and POSIX.
data_classification = LoadDatasetCSV("data/processed/simple_input_data.csv")
data_classification.head()

Unnamed: 0,smiles,pIC50,molecule_from_smiles,Target
0,Cc1ccccc1-c1ccc2nc(N)c(C[C@@H](C)C(=O)N[C@@H]3...,9.154901,<rdkit.Chem.rdchem.Mol object at 0x000001F94A0...,True
1,CCCO[C@H]1C[NH2+][C@@H]([C@@H](O)[C@H](Cc2cc(F...,8.853872,<rdkit.Chem.rdchem.Mol object at 0x000001F94A0...,True
2,CCc1cn2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001F94A0...,True
3,CCOC[C@@H](Oc1cc(C[C@@H]2CS(=O)(=O)C[C@H]([NH2...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001F94A0...,True
4,CCc1cn2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001F94A0...,True


In [9]:
# Regression dataset: Target is the raw pIC50 value.
# Forward slashes avoid the invalid "\p" / "\s" escape sequences of the
# backslash form and work on both Windows and POSIX.
data_regression = LoadDatasetCSV("data/processed/simple_input_data.csv", regression=True)
data_regression.head()

Unnamed: 0,smiles,pIC50,molecule_from_smiles,Target
0,Cc1ccccc1-c1ccc2nc(N)c(C[C@@H](C)C(=O)N[C@@H]3...,9.154901,<rdkit.Chem.rdchem.Mol object at 0x000001F94B1...,9.154901
1,CCCO[C@H]1C[NH2+][C@@H]([C@@H](O)[C@H](Cc2cc(F...,8.853872,<rdkit.Chem.rdchem.Mol object at 0x000001F94B1...,8.853872
2,CCc1cn2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001F94B1...,8.69897
3,CCOC[C@@H](Oc1cc(C[C@@H]2CS(=O)(=O)C[C@H]([NH2...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001F94B1...,8.69897
4,CCc1cn2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001F94B1...,8.69897


In [10]:
# Descriptor features with the classification target (default 'desc' approach).
(
    X_train_desc_classification,
    y_train_desc_classification,
    X_test_desc_classification,
    y_test_desc_classification,
) = split_data(data_classification)

In [11]:
# Descriptor features with the regression target (default 'desc' approach).
(
    X_train_desc_regression,
    y_train_desc_regression,
    X_test_desc_regression,
    y_test_desc_regression,
) = split_data(data_regression)

In [12]:
# Morgan-fingerprint features with the classification target.
(
    X_train_fp_classification,
    y_train_fp_classification,
    X_test_fp_classification,
    y_test_fp_classification,
) = split_data(data_classification, approach='fp')

In [13]:
# Morgan-fingerprint features with the regression target.
(
    X_train_fp_regression,
    y_train_fp_regression,
    X_test_fp_regression,
    y_test_fp_regression,
) = split_data(data_regression, approach='fp')

# Descriptors

In [17]:
import time

def run_experiment(regression=False, fingerprint=False, pca=False):
    """Run the SVM hyper-parameter grid on one dataset variant and save it.

    The train and test parts of the selected variant (module-level
    ``X_train_*`` / ``X_test_*`` / ``y_train_*`` / ``y_test_*`` variables) are
    concatenated back together, standardised, optionally reduced with PCA,
    and every combination of the SVM grid is scored through ``run_svm``.

    Parameters
    ----------
    regression : bool, default False
        Use the regression target and ``SVR`` instead of classification.
    fingerprint : bool, default False
        Use the fingerprint features instead of the descriptor features.
    pca : bool, default False
        If true, apply ``PCA(n_components=0.95)`` after scaling.

    Returns
    -------
    tuple
        ``(results, csv_path)``: the list of result strings produced by
        ``run_svm`` and the name of the CSV file they were written to.
    """
    scaler = StandardScaler()

    # Pick the train/test parts of the requested dataset variant.
    if fingerprint:
        if regression:
            X_parts = [X_train_fp_regression, X_test_fp_regression]
            y_parts = [y_train_fp_regression, y_test_fp_regression]
        else:
            X_parts = [X_train_fp_classification, X_test_fp_classification]
            y_parts = [y_train_fp_classification, y_test_fp_classification]
    else:
        if regression:
            X_parts = [X_train_desc_regression, X_test_desc_regression]
            y_parts = [y_train_desc_regression, y_test_desc_regression]
        else:
            X_parts = [X_train_desc_classification, X_test_desc_classification]
            y_parts = [y_train_desc_classification, y_test_desc_classification]

    # NOTE(review): the scaler (and PCA below) are fitted on all rows before
    # cross-validation, so each validation fold influences the preprocessing.
    X = scaler.fit_transform(pd.concat(X_parts))
    y = pd.concat(y_parts)

    if pca:
        # Use a separate name so the boolean ``pca`` flag is not overwritten
        # by the estimator; the flag is printed and checked again below.
        reducer = PCA(n_components=0.95)
        X = reducer.fit_transform(X)

    results = []
    param_grid_svm={
        'gamma': [0.001, 0.1, 1], ### Gamma 100 C 0.1 degree 3 still loops and stalls
        'C': [0.01, 0.1, 1], # C >= 10 are problematic
        'degree': [2, 3]#[2, 3, 4]
    }

    param_names = list(param_grid_svm)
    for combination in product(*param_grid_svm.values()):
        gamma, C, degree = combination
        print(f"\nRegression: {regression}, Fingerprint: {fingerprint}, PCA: {pca}")
        for param_name, value in zip(param_names, combination):
            print(f"{param_name}: {value}, ", end="")
        start = time.time()
        results.append(run_svm(X, y, gamma, C, degree, regression))
        # Stop the clock before printing so only the model run is timed.
        end = time.time()
        print(f"\nAccuracy: {results[-1]}\n")
        print(f"Elapsed time: {round(end - start, 2)}s\n")

    # Assemble the output file name from the experiment flags.
    csv_path = "results_"
    csv_path += "regression" if regression else "classification"
    csv_path += "_fingerprints" if fingerprint else "_descriptors"
    csv_path += "_sc"
    if pca:
        csv_path += "pca"
    csv_path += "_svm.csv"

    # Each result string is "<label>; <score>"; split it into two columns.
    data_tuples = [tuple(item.split('; ')) for item in results]
    df = pd.DataFrame(data_tuples, columns=['Classifier', 'Accuracy'])
    df.to_csv(csv_path)

    return results, csv_path

In [18]:
# Experiment variants to run; key order matches run_experiment's
# positional parameters (regression, fingerprint, pca).
run_grid = {
    'regression': [False, True],
    'fingerprint': [False],  # , True],
    'pca': [False],
}

run_param_combinations = list(product(*run_grid.values()))

# Maps each output CSV name to the list of result strings of that run.
result_dict = {}

for combination in run_param_combinations:
    res, run_name = run_experiment(*combination)
    result_dict[run_name] = res


Regression: False, Fingerprint: False, PCA: False
gamma: 0.001, C: 0.01, degree: 2, 
Accuracy: SVC-0.001-0.01-2; 0.5609

Elapsed time: 0.28s


Regression: False, Fingerprint: False, PCA: False
gamma: 0.001, C: 0.01, degree: 3, 
Accuracy: SVC-0.001-0.01-3; 0.5727

Elapsed time: 0.28s


Regression: False, Fingerprint: False, PCA: False
gamma: 0.001, C: 0.1, degree: 2, 
Accuracy: SVC-0.001-0.1-2; 0.6689

Elapsed time: 0.28s


Regression: False, Fingerprint: False, PCA: False
gamma: 0.001, C: 0.1, degree: 3, 
Accuracy: SVC-0.001-0.1-3; 0.7024

Elapsed time: 0.27s


Regression: False, Fingerprint: False, PCA: False
gamma: 0.001, C: 1, degree: 2, 
Accuracy: SVC-0.001-1-2; 0.7742

Elapsed time: 0.22s


Regression: False, Fingerprint: False, PCA: False
gamma: 0.001, C: 1, degree: 3, 
Accuracy: SVC-0.001-1-3; 0.7933

Elapsed time: 0.22s


Regression: False, Fingerprint: False, PCA: False
gamma: 0.1, C: 0.01, degree: 2, 
Accuracy: SVC-0.1-0.01-2; 0.7650

Elapsed time: 0.18s


Regression: False,

In [23]:
# Print each run's CSV name followed by its result lines and a blank line.
for key, run in result_dict.items():
    print(key, *run, sep="\n")
    print()

results_classification_descriptors_sc_svm.csv
SVC-0.001-0.01-2; 0.5609
SVC-0.001-0.01-3; 0.5727
SVC-0.001-0.1-2; 0.6689
SVC-0.001-0.1-3; 0.7024
SVC-0.001-1-2; 0.7742
SVC-0.001-1-3; 0.7933
SVC-0.1-0.01-2; 0.7650
SVC-0.1-0.01-3; 0.7696
SVC-0.1-0.1-2; 0.7637
SVC-0.1-0.1-3; 0.7696
SVC-0.1-1-2; 0.7637
SVC-0.1-1-3; 0.7696
SVC-1-0.01-2; 0.7637
SVC-1-0.01-3; 0.7696
SVC-1-0.1-2; 0.7637
SVC-1-0.1-3; 0.7696
SVC-1-1-2; 0.7637
SVC-1-1-3; 0.7696

results_regression_descriptors_sc_svm.csv
SVR-0.001-0.01-2; -1.6425
SVR-0.001-0.01-3; -1.7554
SVR-0.001-0.1-2; -1.1639
SVR-0.001-0.1-3; -1.6169
SVR-0.001-1-2; -0.9510
SVR-0.001-1-3; -1.5500
SVR-0.1-0.01-2; -1.6840
SVR-0.1-0.01-3; -1.9869
SVR-0.1-0.1-2; -1.6697
SVR-0.1-0.1-3; -1.9869
SVR-0.1-1-2; -1.6697
SVR-0.1-1-3; -1.9868
SVR-1-0.01-2; -1.6696
SVR-1-0.01-3; -1.9869
SVR-1-0.1-2; -1.6696
SVR-1-0.1-3; -1.9869
SVR-1-1-2; -1.6696
SVR-1-1-3; -1.9869

