# Imports 

In [None]:
import warnings
from itertools import product

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import (StratifiedKFold, KFold,
                                     cross_val_score, train_test_split)
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler

# Silence every warning: Warning is the base class of all warning categories.
warnings.filterwarnings("ignore", category=Warning)

# Number of folds used by the cross-validation splitters in the run_* helpers.
N_SPLITS = 2
# Seed passed as random_state to the shuffling cross-validation splitters.
RANDOM_STATE = 148260

In [None]:
def CalculateMorganFingerprint(mol):
    """Build a table of Morgan fingerprints, one row per molecule.

    Args:
        mol: Iterable of RDKit molecules (the notebook passes the
            ``molecule_from_smiles`` column).

    Returns:
        ``pandas.DataFrame`` whose columns are named ``mfp0``, ``mfp1``, ...
        (one per fingerprint position) and whose rows follow the iteration
        order of ``mol``.
    """
    # Radius-2 Morgan generator producing 2048-long fingerprints.
    generator = AllChem.GetMorganGenerator(radius=2, fpSize=2048)
    bit_rows = [generator.GetFingerprintAsNumPy(molecule) for molecule in mol]
    bit_matrix = np.array(bit_rows)
    column_names = ['mfp' + str(position) for position in range(bit_matrix.shape[1])]
    return pd.DataFrame(bit_matrix, columns=column_names)

In [None]:
def CalculateDescriptors(mol):
    """Compute min-max normalized Mordred descriptors for a set of molecules.

    Args:
        mol: Molecules handed to ``Calculator.pandas`` (the notebook passes the
            ``molecule_from_smiles`` column).

    Returns:
        ``pandas.DataFrame`` containing only the numeric descriptor columns,
        rescaled column-wise with ``(x - min) / (max - min)``, and restricted
        to columns whose standard deviation after rescaling exceeds 0.01.
    """
    calculator = Calculator(descriptors, ignore_3D=False)
    # Keep only numeric columns of the descriptor table.
    numeric = calculator.pandas(mol, nproc=1).select_dtypes(['number'])

    # Min-max normalization, column by column.
    column_min = numeric.min()
    column_range = numeric.max() - column_min
    normalized = (numeric - column_min) / column_range

    # Drop columns with low standard deviation. A constant column has a zero
    # range, becomes NaN above, and fails this comparison as well.
    informative = normalized.std() > 0.01
    return normalized.loc[:, informative]

In [None]:
def LoadDatasetCSV(path, threshold=7.0, regression = False):
    """Load a SMILES/pIC50 CSV file and prepare it for modelling.

    The file must provide a ``smiles`` and a ``pIC50`` column. Each SMILES is
    parsed once with RDKit; the parsed molecule is stored in
    ``molecule_from_smiles`` and ``smiles`` is replaced by its canonical form.
    Rows that cannot be parsed, rows with missing values and rows whose
    canonical SMILES duplicates an earlier row are removed.

    Args:
        path: Location of the CSV file, forwarded to ``pandas.read_csv``.
        threshold: pIC50 value above which a compound is labelled ``True``
            (classification mode only).
        regression: If true, ``Target`` is the raw ``pIC50`` value; otherwise
            it is the boolean ``pIC50 > threshold``.

    Returns:
        ``pandas.DataFrame`` with the original columns plus
        ``molecule_from_smiles`` and ``Target``.
    """
    df = pd.read_csv(path)

    # Rows without a SMILES string cannot be parsed; they would be removed by
    # the final dropna() anyway, so discard them before calling RDKit.
    df = df.dropna(subset=['smiles']).copy()

    # Parse every SMILES exactly once and reuse the molecule below.
    df['molecule_from_smiles'] = df['smiles'].apply(Chem.MolFromSmiles)

    # MolFromSmiles yields None for strings it cannot parse; drop those rows
    # before canonicalizing, because MolToSmiles cannot handle None.
    df = df.dropna(subset=['molecule_from_smiles']).copy()
    df['smiles'] = df['molecule_from_smiles'].map(Chem.MolToSmiles)

    # Remove incomplete rows first so that a duplicate with missing data can
    # never displace a complete record of the same molecule.
    df = df.dropna()
    # drop_duplicates returns a new frame; the result must be kept. The first
    # occurrence of each canonical SMILES is retained.
    df = df.drop_duplicates('smiles').copy()

    if regression:
        df['Target'] = df['pIC50']
    else:
        df['Target'] = df['pIC50'] > threshold
    return df

In [None]:
def split_data(df, approach = 'desc', split = 0.7):
    """Featurize the molecules in ``df`` and split them into train/test parts.

    Args:
        df: Frame with a ``molecule_from_smiles`` column and a ``Target``
            column.
        approach: ``'desc'`` selects Mordred descriptors; any other value
            selects Morgan fingerprints.
        split: Fraction of the samples assigned to the training part.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``.
    """
    #TODO: support for different approaches - if applicable
    featurize = CalculateDescriptors if approach == 'desc' else CalculateMorganFingerprint
    features = featurize(df['molecule_from_smiles'])
    target = df["Target"]

    # train_test_split returns X_train, X_test, y_train, y_test; the order is
    # rearranged for the caller below.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=(1-split), random_state=42
    )
    return X_train, y_train, X_test, y_test

In [None]:
def run_rf(X, y, n_estimators, max_depth, min_samples_split, min_samples_leaf, regression=False):
    """Cross-validate a random forest and return a one-line result string.

    Args:
        X: Feature matrix passed to ``cross_val_score``.
        y: Target values (class labels, or continuous values when
            ``regression`` is true).
        n_estimators: Forwarded to the forest constructor.
        max_depth: Forwarded to the forest constructor.
        min_samples_split: Forwarded to the forest constructor.
        min_samples_leaf: Forwarded to the forest constructor.
        regression: If true, evaluate ``RandomForestRegressor`` with shuffled
            ``KFold`` and ``neg_mean_squared_error``; otherwise evaluate
            ``RandomForestClassifier`` with shuffled ``StratifiedKFold`` and
            ``accuracy``.

    Returns:
        String of the form
        ``"<model>-<n_estimators>-<max_depth>-<min_samples_split>-<min_samples_leaf>; <score>"``
        where ``<score>`` is the mean fold score with four decimals.
    """
    # Seed the forest as well as the splitter; with an unseeded model the same
    # configuration yields a different score on every run.
    forest_kwargs = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=RANDOM_STATE,
    )
    if regression:
        name = "RandomForestRegressor"
        model = RandomForestRegressor(**forest_kwargs)
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "RandomForestClassifier"
        model = RandomForestClassifier(**forest_kwargs)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'accuracy'
    # The score is accuracy for classification and negated MSE for regression.
    mean_score = cross_val_score(model, X, y, cv=cv, scoring=scoring).mean()
    return (f"{name}-{n_estimators}-{max_depth}-{min_samples_split}-{min_samples_leaf}; {mean_score:.4f}")

def run_lr(X, y, C, penalty, solver, regression=False):
    """Cross-validate a linear model and return a one-line result string.

    Args:
        X: Feature matrix passed to ``cross_val_score``.
        y: Target values (class labels, or continuous values when
            ``regression`` is true).
        C: Forwarded to ``LogisticRegression`` (classification only).
        penalty: Forwarded to ``LogisticRegression`` (classification only).
        solver: Forwarded to ``LogisticRegression`` (classification only).
        regression: If true, evaluate a plain ``LinearRegression`` with
            shuffled ``KFold`` and ``neg_mean_squared_error``; otherwise
            evaluate ``LogisticRegression`` with shuffled ``StratifiedKFold``
            and ``accuracy``.

    Returns:
        String of the form ``"<model>-<C>-<penalty>-<solver>; <score>"`` where
        ``<score>`` is the mean fold score with four decimals. In regression
        mode the three hyper-parameters still appear in the label although
        ``LinearRegression`` does not use them.
    """
    if regression:
        name = "LinearRegression"
        # LinearRegression has no C/penalty/solver, so they are ignored here.
        model = LinearRegression()
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "LogisticRegression"
        # Seed the solver so repeated runs of one configuration agree.
        model = LogisticRegression(C=C, penalty=penalty, solver=solver, random_state=RANDOM_STATE)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'accuracy'
    # The score is accuracy for classification and negated MSE for regression.
    mean_score = cross_val_score(model, X, y, cv=cv, scoring=scoring).mean()
    return (f"{name}-{C}-{penalty}-{solver}; {mean_score:.4f}")

def run_nn(X, y, hidden_layer_sizes, activation, alpha, max_iter, regression=False):
    """Cross-validate a multi-layer perceptron and return a result string.

    Args:
        X: Feature matrix passed to ``cross_val_score``.
        y: Target values (class labels, or continuous values when
            ``regression`` is true).
        hidden_layer_sizes: Forwarded to the MLP constructor.
        activation: Forwarded to the MLP constructor.
        alpha: Forwarded to the MLP constructor.
        max_iter: Forwarded to the MLP constructor.
        regression: If true, evaluate ``MLPRegressor`` with shuffled ``KFold``
            and ``neg_mean_squared_error``; otherwise evaluate
            ``MLPClassifier`` with shuffled ``StratifiedKFold`` and
            ``accuracy``.

    Returns:
        String of the form
        ``"<model>-<hidden_layer_sizes>-<activation>-<alpha>-<max_iter>; <score>"``
        where ``<score>`` is the mean fold score with four decimals.
    """
    # Seed the network as well as the splitter; with an unseeded model the
    # same configuration yields a different score on every run.
    mlp_kwargs = dict(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        alpha=alpha,
        max_iter=max_iter,
        random_state=RANDOM_STATE,
    )
    if regression:
        name = "MLPRegressor"
        model = MLPRegressor(**mlp_kwargs)
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "MLPClassifier"
        model = MLPClassifier(**mlp_kwargs)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'accuracy'
    # The score is accuracy for classification and negated MSE for regression.
    mean_score = cross_val_score(model, X, y, cv=cv, scoring=scoring).mean()
    return (f"{name}-{hidden_layer_sizes}-{activation}-{alpha}-{max_iter}; {mean_score:.4f}")

def run_gb(X, y, n_estimators, learning_rate, regression=False):
    """Cross-validate a gradient-boosting model and return a result string.

    Args:
        X: Feature matrix passed to ``cross_val_score``.
        y: Target values (class labels, or continuous values when
            ``regression`` is true).
        n_estimators: Forwarded to the gradient-boosting constructor.
        learning_rate: Forwarded to the gradient-boosting constructor.
        regression: If true, evaluate ``GradientBoostingRegressor`` with
            shuffled ``KFold`` and ``neg_mean_squared_error``; otherwise
            evaluate ``GradientBoostingClassifier`` with shuffled
            ``StratifiedKFold`` and ``accuracy``.

    Returns:
        String of the form ``"<model>-<n_estimators>-<learning_rate>; <score>"``
        where ``<score>`` is the mean fold score with four decimals.
    """
    # Seed the booster as well as the splitter so that repeated runs of the
    # same configuration report the same score.
    gb_kwargs = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=RANDOM_STATE,
    )
    if regression:
        name = "GradientBoostingRegressor"
        model = GradientBoostingRegressor(**gb_kwargs)
        cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'neg_mean_squared_error'
    else:
        name = "GradientBoostingClassifier"
        model = GradientBoostingClassifier(**gb_kwargs)
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
        scoring = 'accuracy'
    # The score is accuracy for classification and negated MSE for regression.
    mean_score = cross_val_score(model, X, y, cv=cv, scoring=scoring).mean()
    return (f"{name}-{n_estimators}-{learning_rate}; {mean_score:.4f}")

In [None]:
def run_all(X, y, regression=False):
    """Cross-validate every model over its hyper-parameter grid.

    Args:
        X: Feature matrix handed to each ``run_*`` helper.
        y: Target values handed to each ``run_*`` helper.
        regression: Forwarded to each helper to select the regression or the
            classification variant of the model.

    Returns:
        List of result strings, one per hyper-parameter combination, ordered
        random forest, linear model, MLP, gradient boosting.
    """
    rf_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }
    if regression:
        # run_lr fits a parameter-free LinearRegression in this mode, so a
        # single placeholder combination is enough.
        lr_grid = {
            'C': [0.001],
            'penalty': ['l1'],
            'solver': ['liblinear'],
        }
    else:
        lr_grid = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga'],
        }
    mlp_grid = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01],
        'max_iter': [200, 500, 1000],
    }
    gb_grid = {
        'n_estimators': [10, 100, 200],
        'learning_rate': [0.1, 0.5, 1.0, 2.0],
    }

    # Each helper takes the grid values positionally, in the key order of its
    # grid, followed by the regression flag.
    sweeps = (
        (run_rf, rf_grid),
        (run_lr, lr_grid),
        (run_nn, mlp_grid),
        (run_gb, gb_grid),
    )
    return [
        runner(X, y, *combination, regression)
        for runner, grid in sweeps
        for combination in product(*grid.values())
    ]

In [None]:
# Forward slashes work on every platform; the backslash form relied on the
# invalid escape sequences "\p" and "\s" and only resolved on Windows.
data_classification = LoadDatasetCSV("data/processed/simple_input_data.csv")
data_classification.head()

In [None]:
# Forward slashes work on every platform; the backslash form relied on the
# invalid escape sequences "\p" and "\s" and only resolved on Windows.
data_regression = LoadDatasetCSV("data/processed/simple_input_data.csv", regression=True)
data_regression.head()

In [None]:
# Descriptor features (split_data's default approach) with the classification
# target; the default split keeps 70% of the samples for training.
X_train_desc_classification, y_train_desc_classification, X_test_desc_classification, y_test_desc_classification = split_data(data_classification)

In [None]:
# Descriptor features (split_data's default approach) with the regression
# target; the default split keeps 70% of the samples for training.
X_train_desc_regression, y_train_desc_regression, X_test_desc_regression, y_test_desc_regression = split_data(data_regression)

In [None]:
# Any approach other than 'desc' makes split_data use Morgan fingerprints;
# here they are paired with the classification target.
X_train_fp_classification, y_train_fp_classification, X_test_fp_classification, y_test_fp_classification = split_data(data_classification, approach = 'fp')

In [None]:
# Any approach other than 'desc' makes split_data use Morgan fingerprints;
# here they are paired with the regression target.
X_train_fp_regression, y_train_fp_regression, X_test_fp_regression, y_test_fp_regression = split_data(data_regression, approach = 'fp')

# Descriptors

## Classification

### Scaled-PCA data

In [None]:
#TODO - test out different PCA parameters

# Preprocessing for this experiment: standardization followed by PCA. With a
# float n_components, scikit-learn's PCA keeps as many components as are
# needed to explain that fraction of the variance.
sc = StandardScaler()
pca = PCA(n_components=0.95)

In [None]:
# Re-join the train and test partitions and preprocess all samples at once.
# NOTE(review): the scaler and PCA are fit on the complete data set before
# cross-validation, so the validation folds influence the preprocessing.
X_experimental = sc.fit_transform(pd.concat([X_train_desc_classification, X_test_desc_classification]))
X_experimental = pca.fit_transform(X_experimental)
# Concatenated in the same train-then-test order as the features above.
y_experimental = pd.concat([y_train_desc_classification, y_test_desc_classification])

In [None]:
# Classification sweep: run_all's default regression=False is intended here.
results = run_all(X_experimental, y_experimental)

In [None]:
# Every result string has the form "<model label>; <mean score>"; split it
# back into those two fields.
data_tuples = [tuple(item.split('; ')) for item in results]

df = pd.DataFrame(data_tuples, columns=['Classifier', 'Accuracy'])

# Written to the current working directory; the frame's index is included.
df.to_csv("results_classification_descriptors_scpca.csv")

### Only scaled

In [None]:
# Only standardization is applied in this experiment (no PCA step).
sc = StandardScaler()

In [None]:
# Re-join the train and test partitions and standardize all samples at once.
# NOTE(review): the scaler is fit on the complete data set before
# cross-validation, so the validation folds influence the preprocessing.
X_experimental = sc.fit_transform(pd.concat([X_train_desc_classification, X_test_desc_classification]))
# Concatenated in the same train-then-test order as the features above.
y_experimental = pd.concat([y_train_desc_classification, y_test_desc_classification])

In [None]:
# Classification sweep: run_all's default regression=False is intended here.
results = run_all(X_experimental, y_experimental)

In [None]:
# Every result string has the form "<model label>; <mean score>"; split it
# back into those two fields.
data_tuples = [tuple(item.split('; ')) for item in results]

df = pd.DataFrame(data_tuples, columns=['Classifier', 'Accuracy'])

# Written to the current working directory; the frame's index is included.
df.to_csv("results_classification_descriptors_sc.csv")

## Regression approach

### Scaled-PCA data

In [None]:
#TODO - test out different PCA parameters

# Preprocessing for this experiment: standardization followed by PCA. With a
# float n_components, scikit-learn's PCA keeps as many components as are
# needed to explain that fraction of the variance.
sc = StandardScaler()
pca = PCA(n_components=0.95)

In [None]:
# Re-join the train and test partitions and preprocess all samples at once.
# NOTE(review): the scaler and PCA are fit on the complete data set before
# cross-validation, so the validation folds influence the preprocessing.
X_experimental = sc.fit_transform(pd.concat([X_train_desc_regression, X_test_desc_regression]))
X_experimental = pca.fit_transform(X_experimental)
# Concatenated in the same train-then-test order as the features above.
y_experimental = pd.concat([y_train_desc_regression, y_test_desc_regression])

In [None]:
# The target is the continuous pIC50 value, so the regression variants must be
# requested explicitly; the default classification path uses classifiers with
# StratifiedKFold, which cannot handle a continuous target.
results = run_all(X_experimental, y_experimental, regression=True)

In [None]:
# Every result string has the form "<model label>; <mean score>"; split it
# back into those two fields.
data_tuples = [tuple(item.split('; ')) for item in results]

# NOTE: when run_all is executed in regression mode the score is the mean
# neg_mean_squared_error, even though the column is labelled 'Accuracy'.
df = pd.DataFrame(data_tuples, columns=['Classifier', 'Accuracy'])

# Written to the current working directory; the frame's index is included.
df.to_csv("results_regression_descriptors_scpca.csv")

### Only scaled

In [None]:
# Only standardization is applied in this experiment (no PCA step).
sc = StandardScaler()

In [None]:
# Re-join the train and test partitions and standardize all samples at once.
# NOTE(review): the scaler is fit on the complete data set before
# cross-validation, so the validation folds influence the preprocessing.
X_experimental = sc.fit_transform(pd.concat([X_train_desc_regression, X_test_desc_regression]))
# Concatenated in the same train-then-test order as the features above.
y_experimental = pd.concat([y_train_desc_regression, y_test_desc_regression])

In [None]:
# The target is the continuous pIC50 value, so the regression variants must be
# requested explicitly; the default classification path uses classifiers with
# StratifiedKFold, which cannot handle a continuous target.
results = run_all(X_experimental, y_experimental, regression=True)

In [None]:
# Every result string has the form "<model label>; <mean score>"; split it
# back into those two fields.
data_tuples = [tuple(item.split('; ')) for item in results]

# NOTE: when run_all is executed in regression mode the score is the mean
# neg_mean_squared_error, even though the column is labelled 'Accuracy'.
df = pd.DataFrame(data_tuples, columns=['Classifier', 'Accuracy'])

# Written to the current working directory; the frame's index is included.
df.to_csv("results_regression_descriptors_sc.csv")

# Fingerprints

## Classification

### Scaled-PCA data

In [None]:
#TODO - test out different PCA parameters

# Preprocessing for this experiment: standardization followed by PCA. With a
# float n_components, scikit-learn's PCA keeps as many components as are
# needed to explain that fraction of the variance.
sc = StandardScaler()
pca = PCA(n_components=0.95)

In [None]:
# Re-join the train and test partitions and preprocess all samples at once.
# NOTE(review): the scaler and PCA are fit on the complete data set before
# cross-validation, so the validation folds influence the preprocessing.
X_experimental = sc.fit_transform(pd.concat([X_train_fp_classification, X_test_fp_classification]))
X_experimental = pca.fit_transform(X_experimental)
# Concatenated in the same train-then-test order as the features above.
y_experimental = pd.concat([y_train_fp_classification, y_test_fp_classification])

In [None]:
# Classification sweep: run_all's default regression=False is intended here.
results = run_all(X_experimental, y_experimental)

In [None]:
# Every result string has the form "<model label>; <mean score>"; split it
# back into those two fields.
data_tuples = [tuple(item.split('; ')) for item in results]

df = pd.DataFrame(data_tuples, columns=['Classifier', 'Accuracy'])

# Written to the current working directory; the frame's index is included.
df.to_csv("results_classification_fingerprints_scpca.csv")

### Only scaled

In [None]:
# Only standardization is applied in this experiment (no PCA step).
sc = StandardScaler()

In [None]:
# Re-join the train and test partitions and standardize all samples at once.
# NOTE(review): the scaler is fit on the complete data set before
# cross-validation, so the validation folds influence the preprocessing.
X_experimental = sc.fit_transform(pd.concat([X_train_fp_classification, X_test_fp_classification]))
# Concatenated in the same train-then-test order as the features above.
y_experimental = pd.concat([y_train_fp_classification, y_test_fp_classification])

In [None]:
# Classification sweep: run_all's default regression=False is intended here.
results = run_all(X_experimental, y_experimental)

In [None]:
# Every result string has the form "<model label>; <mean score>"; split it
# back into those two fields.
data_tuples = [tuple(item.split('; ')) for item in results]

df = pd.DataFrame(data_tuples, columns=['Classifier', 'Accuracy'])

# Written to the current working directory; the frame's index is included.
df.to_csv("results_classification_fingerprints_sc.csv")

## Regression approach

### Scaled-PCA data

In [None]:
#TODO - test out different PCA parameters

# Preprocessing for this experiment: standardization followed by PCA. With a
# float n_components, scikit-learn's PCA keeps as many components as are
# needed to explain that fraction of the variance.
sc = StandardScaler()
pca = PCA(n_components=0.95)

In [None]:
# Re-join the train and test partitions and preprocess all samples at once.
# NOTE(review): the scaler and PCA are fit on the complete data set before
# cross-validation, so the validation folds influence the preprocessing.
X_experimental = sc.fit_transform(pd.concat([X_train_fp_regression, X_test_fp_regression]))
X_experimental = pca.fit_transform(X_experimental)
# Concatenated in the same train-then-test order as the features above.
y_experimental = pd.concat([y_train_fp_regression, y_test_fp_regression])

In [None]:
# The target is the continuous pIC50 value, so the regression variants must be
# requested explicitly; the default classification path uses classifiers with
# StratifiedKFold, which cannot handle a continuous target.
results = run_all(X_experimental, y_experimental, regression=True)

In [None]:
# Every result string has the form "<model label>; <mean score>"; split it
# back into those two fields.
data_tuples = [tuple(item.split('; ')) for item in results]

# NOTE: when run_all is executed in regression mode the score is the mean
# neg_mean_squared_error, even though the column is labelled 'Accuracy'.
df = pd.DataFrame(data_tuples, columns=['Classifier', 'Accuracy'])

# Written to the current working directory; the frame's index is included.
df.to_csv("results_regression_fingerprints_scpca.csv")

### Only scaled

In [None]:
# Only standardization is applied in this experiment (no PCA step).
sc = StandardScaler()

In [None]:
# Re-join the train and test partitions and standardize all samples at once.
# NOTE(review): the scaler is fit on the complete data set before
# cross-validation, so the validation folds influence the preprocessing.
X_experimental = sc.fit_transform(pd.concat([X_train_fp_regression, X_test_fp_regression]))
# Concatenated in the same train-then-test order as the features above.
y_experimental = pd.concat([y_train_fp_regression, y_test_fp_regression])

In [None]:
# The target is the continuous pIC50 value, so the regression variants must be
# requested explicitly; the default classification path uses classifiers with
# StratifiedKFold, which cannot handle a continuous target.
results = run_all(X_experimental, y_experimental, regression=True)

In [None]:
# Every result string has the form "<model label>; <mean score>"; split it
# back into those two fields.
data_tuples = [tuple(item.split('; ')) for item in results]

# NOTE: when run_all is executed in regression mode the score is the mean
# neg_mean_squared_error, even though the column is labelled 'Accuracy'.
df = pd.DataFrame(data_tuples, columns=['Classifier', 'Accuracy'])

# Written to the current working directory; the frame's index is included.
df.to_csv("results_regression_fingerprints_sc.csv")