In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, accuracy_score

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from mordred import Calculator, descriptors

In [21]:
def CalculateMorganFingerprint(mol):
    mfpgen = AllChem.GetMorganGenerator(radius=2,fpSize=2048)
    fingerprint = np.array([mfpgen.GetFingerprintAsNumPy(x) for x in mol])
    fingerprint = pd.DataFrame(fingerprint, columns = ['mfp'+str(i) for i in range(fingerprint.shape[1])])
    return fingerprint

In [22]:
def CalculateDescriptors(mol):
    calc = Calculator(descriptors, ignore_3D=False)
    X_mordred = calc.pandas(mol, nproc=1)
    X_mordred = X_mordred.select_dtypes(['number'])
    #normalize
    X_mordred = (X_mordred-X_mordred.min())/(X_mordred.max()-X_mordred.min())
    #drop columns wth low std
    X_mordred = X_mordred.loc[:,X_mordred.std()>0.01]
    return X_mordred

In [23]:
def LoadDatasetCSV(path, threshold=7.0):
    df = pd.read_csv(path)
    df['mol_from_smiles'] = df['smiles'].apply(Chem.MolFromSmiles)
    df['smiles'] = df['smiles'].map(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)))
    df.drop_duplicates('smiles')
    df = df.dropna()
    df['Class'] = df['pIC50'] > threshold
    return df

In [24]:
def split_data(df, approach = 'desc', split = 0.7):
    #TODO: support for different approaches - if applicable
    if approach == 'desc':
        X = CalculateDescriptors(df['mol_from_smiles'])
    else:
        X = CalculateMorganFingerprint(df['mol_from_smiles'])
    y = df["Class"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1-split), random_state=42)
    return X_train, y_train, X_test, y_test

In [25]:
base_data = LoadDatasetCSV("data\processed\simple_input_data.csv")
base_data.head()

Unnamed: 0,smiles,pIC50,mol_from_smiles,Class
0,Cc1ccccc1-c1ccc2nc(N)c(C[C@@H](C)C(=O)N[C@@H]3...,9.154901,<rdkit.Chem.rdchem.Mol object at 0x000001694CB...,True
1,CCCO[C@H]1C[NH2+][C@@H]([C@@H](O)[C@H](Cc2cc(F...,8.853872,<rdkit.Chem.rdchem.Mol object at 0x000001694CB...,True
2,CCc1cn2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001694CB...,True
3,CCOC[C@@H](Oc1cc(C[C@@H]2CS(=O)(=O)C[C@H]([NH2...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001694CB...,True
4,CCc1cn2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...,8.69897,<rdkit.Chem.rdchem.Mol object at 0x000001694CB...,True


In [26]:
#X_train, y_train, X_test, y_test = split_data(base_data)

In [27]:
### Run on fingerprints
X_train, y_train, X_test, y_test = split_data(base_data, approach = 'fp')

In [28]:
#X_train_fp, y_train_fp, X_test_fp, y_test_fp = split_data(base_data, approach = 'fp')

In [29]:
scaling = {
    'No_Preprocessing': None,
    'StandardScaler_t_t': StandardScaler(with_mean = True, with_std = True),
    'StandardScaler_t_f': StandardScaler(with_mean = True, with_std = False),
    'StandardScaler_f_t': StandardScaler(with_mean = False, with_std = True),
    'StandardScaler_f_f': StandardScaler(with_mean = False, with_std = False)
}

In [30]:
pcas = {
    'No_PCA': None,
    'PCA': PCA(n_components=0.95),
    'PCA_mle': PCA(n_components='mle'),
}

In [31]:
models = {
    'DecisionTree': GradientBoostingClassifier(),
    'RandomForest': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(),
    'NeuralNetwork': MLPClassifier()
}

# Scaled-PCA data

In [32]:
from itertools import product
import warnings
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings("ignore", category=Warning)

In [33]:
sc = StandardScaler()
pca = PCA(n_components=0.95)

In [34]:
X_train_pipe = sc.fit_transform(X_train)
X_test_pipe = sc.transform(X_test)
X_train_pipe = pca.fit_transform(X_train_pipe)
X_test_pipe = pca.transform(X_test_pipe)

KeyboardInterrupt: 

In [None]:
X_pipe = sc.fit_transform(pd.concat([X_train, X_test]))
X_pipe = pca.fit_transform(X_pipe)
y_pipe = pd.concat([y_train, y_test])

In [None]:
def run_rf(X, y, n_estimators, max_depth, min_samples_split, min_samples_leaf):
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = scores.mean()
    return (f"RandomForestClassifier-{n_estimators}-{max_depth}-{min_samples_split}-{min_samples_leaf}; {mean_accuracy:.4f}")

def run_lr(X, y, C, penalty, solver):
    model = LogisticRegression(C=C, penalty=penalty, solver=solver)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = scores.mean()
    return (f"LogisticRegression-{C}-{penalty}-{solver}; {mean_accuracy:.4f}")

def run_nn(X, y, hidden_layer_sizes, activation, alpha, max_iter):
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, alpha=alpha, max_iter=max_iter)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = scores.mean()
    return (f"NeuralNetwork-{hidden_layer_sizes}-{activation}-{alpha}-{max_iter}; {mean_accuracy:.4f}")

def run_gb(X, y, n_estimators, learning_rate):
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = scores.mean()
    return (f"GradientBoostingClassifier-{n_estimators}-{learning_rate}; {mean_accuracy:.4f}")

In [None]:
def run_all(X, y):
    results = []
    param_grid={
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    param_combinations = list(product(*param_grid.values()))
    for combination in param_combinations:
        n, m, s, l = combination
        results.append(run_rf(X, y, n, m, s, l))
        print(f"finished RF {combination}")
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    }
    param_combinations = list(product(*param_grid.values()))
    for combination in param_combinations:
        C, p, s = combination
        results.append(run_lr(X, y, C, p, s))
        print(f"finished LR {combination}")
    param_grid = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [200, 500, 1000]
    }
    param_combinations = list(product(*param_grid.values()))
    for combination in param_combinations:
        h, ac, a, i = combination
        results.append(run_nn(X, y, h, ac, a, i))
        print(f"finished NN {combination}")
    param_grid={
        'n_estimators': [10, 100, 1000], 
        'learning_rate': [0.1,0.5,1.0,2.0]
    }
    param_combinations = list(product(*param_grid.values()))
    for combination in param_combinations:
        n, lr = combination
        results.append(run_gb(X, y, n, lr))
        print(f"finished GB {combination}")
        
    return results

In [None]:
results = run_all(X_pipe, y_pipe)

KeyboardInterrupt: 

In [None]:
data_tuples = [tuple(item.split('; ')) for item in results]

df = pd.DataFrame(data_tuples, columns=['Classifier', 'Accuracy'])

df.head()

ValueError: 2 columns passed, passed data had 3 columns