In [6]:
import numpy as np

from sklearn.linear_model import Ridge, Lasso, ElasticNet, BayesianRidge, LogisticRegression, SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


from warnings import filterwarnings

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from tensorflow.keras.callbacks import History
from keras.callbacks import LearningRateScheduler
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from keras.layers import BatchNormalization

import matplotlib.pyplot as plt

import utils

In [7]:
seed = 1

In [8]:
def SVMRegr(X, y, cv):
    params = {'C':np.hstack([np.linspace(1e-9, 2, 1), np.linspace(10, 1e9, 1), np.linspace(0.1, 2, 1), np.array([1.0])])}
    grid = GridSearchCV(SVR(), params, cv=cv, return_train_score=True)
    grid.fit(X, y)
    return grid

In [25]:
def perform_classification_as_regression(X, y):    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

    kfold = KFold(n_splits=5, random_state=seed, shuffle=True)
    
    np.seterr(divide='ignore', invalid='ignore')
    filterwarnings('ignore')

    model = SVMRegr(X_train, y_train, kfold)
    prediction = model.best_estimator_.predict(X_test)

    for IC50_threshold in [500, 5_000, 10_000, 50_000]:
        print(f'IC50 threshold for classification: {IC50_threshold}')
        IC50_threshold = np.log10(IC50_threshold)
        test_clas = np.where(y_test<IC50_threshold, 1, 0)
        prediction_clas = np.where(prediction<IC50_threshold, 1, 0)

        print(f'{round(y[y<IC50_threshold].shape[0]/y.shape[0] * 100, 2)}% of substances classified as toxic, {round(y[y>=IC50_threshold].shape[0]/y.shape[0] * 100, 2)}% classified as non toxic.')
        
        
        accuracy = accuracy_score(test_clas, prediction_clas)
        precision = precision_score(test_clas, prediction_clas)
        recall = recall_score(test_clas, prediction_clas)
        f1 = f1_score(test_clas, prediction_clas)

        print(f"{'accuracy'.ljust(10)}    {'f1'.ljust(10)}    {'precision'.ljust(10)}    {'recall'.ljust(10)}")
        print(f"{str(round(accuracy, 6)).ljust(10)}    {str(round(f1, 6)).ljust(10)}    {str(round(precision, 6)).ljust(10)}    {str(round(recall, 6)).ljust(10)}")
        print('\n')
    

In [26]:
for perc in [0.0119, 0.02, 0.059, 0.133, 0.21, 0.29]:
#     df_hashed = utils.get_hashed_fingerprints(min_perc_used=0)
#     df_maccsfp = utils.get_MACCSFP_fingerprints(min_perc_used=0)
#     df_klekota = utils.get_KlekotaRoth_fingerprints(min_perc_used=0)
    df_mixed = utils.get_mixed_fingerprints(min_perc_used=0)
    dfs = [
#         (df_hashed, "Hashed Extended Fingerprints"), 
#         (df_maccsfp, "MACCSFP Fingerprints"), 
#         (df_klekota, "Klekota&Roth Fingerprints"), 
        (df_mixed, "Mixed Fingerprints")
    ]
    
    print(f'\nUsing custom feture selection with {perc*100}% of all features.\n')
    for df, title in dfs:
        X = df.drop('IC50', axis=1)
        y = df['IC50']
        select = int(X.shape[1] * perc)
        np.seterr(divide='ignore', invalid='ignore')
        X = SelectKBest(f_regression, k=select).fit_transform(X, y)
        print(title, '\n')
        perform_classification_as_regression(X, y)
        print()
    print('\n\n')

Preparing files for mixed fingerprints.

Preparing (ready_sets/cardiotoxicity_hERG_MACCSFP.csv) file.
DataFrame base shape: (11504, 167)
Shape after removing wrong values: (10635, 167)
Shape after removing outliers: (10396, 167)

Preparing (ready_sets/cardiotoxicity_hERG_KlekFP.csv) file.
DataFrame base shape: (11504, 4861)
Shape after removing wrong values: (10635, 4861)
Shape after removing outliers: (10396, 4861)

Preparing (ready_sets/cardiotoxicity_hERG_ExtFP.csv) file.
DataFrame base shape: (11504, 1025)
Shape after removing wrong values: (10635, 1025)
Shape after removing outliers: (10396, 1025)


Using custom feture selection with 1.1900000000000002% of all features.

Mixed Fingerprints 

IC50 threshold for classification: 500
11.55% of substances classified as toxic, 88.45% classified as non toxic.
accuracy      f1            precision     recall    
0.916346      0.553846      0.715232      0.451883  


IC50 threshold for classification: 5000
37.78% of substances classified a

DataFrame base shape: (11504, 4861)
Shape after removing wrong values: (10635, 4861)
Shape after removing outliers: (10396, 4861)

Preparing (ready_sets/cardiotoxicity_hERG_ExtFP.csv) file.
DataFrame base shape: (11504, 1025)
Shape after removing wrong values: (10635, 1025)
Shape after removing outliers: (10396, 1025)


Using custom feture selection with 28.999999999999996% of all features.

Mixed Fingerprints 

IC50 threshold for classification: 500
11.55% of substances classified as toxic, 88.45% classified as non toxic.
accuracy      f1            precision     recall    
0.936538      0.695853      0.774359      0.631799  


IC50 threshold for classification: 5000
37.78% of substances classified as toxic, 62.22% classified as non toxic.
accuracy      f1            precision     recall    
0.825962      0.771753      0.754624      0.789677  


IC50 threshold for classification: 10000
50.17% of substances classified as toxic, 49.83% classified as non toxic.
accuracy      f1          