Plik z modelami do klasyfikacji. Tym razem modele testowałem już tylko na połączeniu wszystkich trzech plików z fingerprintami, ze względnu na to, że w przypadku regresji zawsze w ten sposób orzymywałem najlepsze wyniki. Testowałem również jak model radzi sobie w zależności od ustalonego progu IC50 (progu od którego klasyfikujemy związek jako toksyczny). Podobnie jak dla regresji, większa ilość zmiennych poprawia wyniki modelu, jednak korzystanie tylko z małej części zmiennych jest w stanie osiągać dość podobne wyniki, a przy tym czas wykonania jest zdecydowanie mniejszy. W notebooku przetestowałem sporo modeli płytkich oraz prostą sieć neuronową. Do oceny wyników używałem accuracy oraz f1_score dla mniej zbalansowanych progów IC50. Najlepsze wynik podobnie jak w przypadku regresji otrzymaweł w modelach opartych ns SVM, dla progu IC50=10 000(tyle samo związków toksycznych i nietoksycznych) dochododziłem do accuracy ~80%. W przypadku sieci neuronowej było to nieco mniej. Dokładniejsze wyniki można prześledzić poniżej. Ze względu na to, że testowałem dośc dużo kombinacji modeli/danych/parametrów cały notebook liczy się dość długo. W razie własnych testów polecam wywoływać funkcje z ograniczoną liczbą zmiennych, wtedy powinno działać szybciej. Powinno dać się to zrobić przez usunięcie częsci parametrów w miejscu gdzie wywołuje główne funckje.

In [1]:
import numpy as np

from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from warnings import filterwarnings

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from tensorflow.keras.callbacks import History
from keras.callbacks import LearningRateScheduler
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from keras.layers import BatchNormalization

import matplotlib.pyplot as plt

import utils

In [2]:
seed = 1

In [3]:
def SVC_RBF(X, y, cv):
    classifier = SVC(kernel='rbf')
    
    param_grid = {
        'C': [10, 100],
        'gamma':[0.001, 0.01, 0.1, 'auto'],
    }
    
    grid = GridSearchCV(classifier, param_grid, cv=cv, return_train_score=True)
    grid.fit(X, y)
    return grid

In [4]:
def SVC_Poly(X, y, cv):
    classifier = SVC(kernel='poly')
    
    param_grid = {
        'C': [0.01, 1],
        'coef0': [0.01, 1],
        'degree': [2]
    }
    
    grid = GridSearchCV(classifier, param_grid, cv=cv, return_train_score=True)
    grid.fit(X, y)
    return grid

In [5]:
def SVC_Linear(X, y, cv):
    classifier = LinearSVC(loss='squared_hinge', penalty='l2', dual=False, max_iter=50000)
    
    param_grid = {
        'C': [0.1, 1, 10],
    }
    
    grid = GridSearchCV(classifier, param_grid, cv=cv, return_train_score=True)
    grid.fit(X, y)
    return grid

In [6]:
def Logistic(X, y, cv):
    classifier = LogisticRegression(solver='liblinear')
    
    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100, 1000],
    }
    
    grid = GridSearchCV(classifier, param_grid, cv=cv, return_train_score=True)
    grid.fit(X, y)
    return grid

In [7]:
def RandomForest(X, y, cv):
    classifier = RandomForestClassifier(random_state=seed, max_leaf_nodes=500)
    
    param_grid = {
        'n_estimators': [100, 500],
        'max_leaf_nodes': [2, 4, 8, 16]
    }
    
    grid = GridSearchCV(classifier, param_grid, cv=cv, return_train_score=True)
    grid.fit(X, y)
    return grid

In [8]:
def GradientBoosting(X, y, cv):
    classifier = GradientBoostingClassifier(random_state=seed)
    
    param_grid = {
        'n_estimators': [100, 500],
        'learning_rate': [0.01, 0.1]
    }
    
    grid = GridSearchCV(classifier, param_grid, cv=cv, return_train_score=True)
    grid.fit(X, y)
    return grid

In [9]:
def XGBoost(X, y, cv):
    classifier = XGBClassifier(use_label_encoder=False, verbosity=0)
    
    param_grid = {
        'max_depth': [3, 5, 8, 10],
        'learning_rate': [0.001, 0.01],
        'n_estimators': [50, 100],
    }
    
    grid = GridSearchCV(classifier, param_grid, cv=cv, return_train_score=True)
    grid.fit(X, y)
    return grid

In [10]:
def KNearestNeighbors(X, y, cv):
    classifier = KNeighborsClassifier(algorithm='ball_tree')
    
    param_grid = {
        'n_neighbors': [5, 10],
        'leaf_size': [20, 50],
    }
    
    grid = GridSearchCV(classifier, param_grid, cv=cv, return_train_score=True)
    grid.fit(X, y)
    return grid

In [11]:
def perform_classification(X, y):
    
    model_functions = [
    ("SVC RBF", SVC_RBF),
    ("SVC Poly", SVC_Poly),
    ("SVC Linear", SVC_Linear),
    ("Logistic", Logistic),
    ("Random Forest", RandomForest),
    ("Gradient Boosting", GradientBoosting),
    ("XGBoosting", XGBoost),
    ("K Nearest Neighbors", KNearestNeighbors),
    ]
    
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

    kfold = KFold(n_splits=5, random_state=seed, shuffle=True)
    
    np.seterr(divide='ignore', invalid='ignore')
    filterwarnings('ignore')

    print(f"{'Classification model:'.ljust(22)}   {'accuracy:'.ljust(8)}   F1:")

    best_accuracy = 0
    best_acc_result = ("None", 0, {})
    
    best_f1 = 0
    best_precision = 0
    best_recall = 0
    best_f1_result = ("None", 0, {})

    for (name, function) in model_functions:
        model = function(X_train, y_train, kfold)
        prediction = model.best_estimator_.predict(X_test)
        accuracy = accuracy_score(y_test, prediction)
        precision = precision_score(y_test, prediction)
        recall = recall_score(y_test, prediction)
        f1 = f1_score(y_test, prediction)
        if (accuracy > best_accuracy):
            best_accuracy = accuracy
            best_acc_result = (name, accuracy, model.best_params_)
        if (f1 > best_f1):
            best_f1 = f1
            best_precision = precision
            best_recall = recall
            best_f1_result = (name, f1, model.best_params_)

        print(f"{name.ljust(22)}   {str(round(accuracy, 6)).ljust(9)}   {str(round(f1, 6)).ljust(9)}   {model.best_params_}")

    print(f"\nBest accuracy\n{best_acc_result[0].ljust(22)}   {str(round(best_acc_result[1], 6)).ljust(9)}   {best_acc_result[2]}\n")
    print(f"\nBest F1\n{best_f1_result[0].ljust(22)}   {str(round(best_f1_result[1], 6)).ljust(9)}   {best_f1_result[2]}")
    print(f"{'    preccision'.ljust(22)}   {best_precision}")
    print(f"{'    recall'.ljust(22)}   {best_recall}")

In [None]:
for perc in [0.9, 0.8, 0.5, 0.3, 0.1, 0.01]:
#     df_hashed = utils.get_hashed_fingerprints(min_perc_used=perc)
#     df_maccsfp = utils.get_MACCSFP_fingerprints(min_perc_used=perc)
#     df_klekota = utils.get_KlekotaRoth_fingerprints(min_perc_used=perc)
    df_mixed = utils.get_mixed_fingerprints(min_perc_used=perc)
    dfs = [
#         (df_hashed, "Hashed Extended Fingerprints"), 
#         (df_maccsfp, "MACCSFP Fingerprints"), 
#         (df_klekota, "Klekota&Roth Fingerprints"), 
        (df_mixed, "Mixed Fingerprints")
    ]

    print(f'\nUsing fetures that are present in at least {perc*100}% of substances.\n')
    for df, title in dfs:
        for IC50_threshold in [500, 5_000, 10_000, 50_000]:
            print(f'IC50 threshold for classification: {IC50_threshold} \n\n')
            df_class = utils.classify_on_IC50(df.copy(), IC50_threshold)
            X = df_class.drop('IC50', axis=1)
            y = df_class['IC50']
            print(f'{round(y[y==1].shape[0]/y.shape[0] * 100, 2)}% of substances classified as toxic, {round(y[y==0].shape[0]/y.shape[0] * 100, 2)}% classified as non toxic.' , '\n')
            print(title, '\n')
            perform_classification(X, y)
            print()
    print('\n\n')

Preparing files for mixed fingerprints.

Preparing (ready_sets/cardiotoxicity_hERG_MACCSFP.csv) file.
DataFrame base shape: (11504, 167)
Shape after removing wrong values: (10635, 167)
Shape after removing least used features: (10635, 15)
Shape after removing outliers: (10396, 15)

Preparing (ready_sets/cardiotoxicity_hERG_KlekFP.csv) file.
DataFrame base shape: (11504, 4861)
Shape after removing wrong values: (10635, 4861)
Shape after removing least used features: (10635, 9)
Shape after removing outliers: (10396, 9)

Preparing (ready_sets/cardiotoxicity_hERG_ExtFP.csv) file.
DataFrame base shape: (11504, 1025)
Shape after removing wrong values: (10635, 1025)
Shape after removing least used features: (10635, 51)
Shape after removing outliers: (10396, 51)


Using fetures that are present in at least 90.0% of substances.

IC50 threshold for classification: 500 


11.55% of substances classified as toxic, 88.45% classified as non toxic. 

Mixed Fingerprints 

Classification model:    accu

SVC RBF                  0.709615    0.70334     {'C': 10, 'gamma': 0.1}
SVC Poly                 0.680288    0.680135    {'C': 1, 'coef0': 0.01, 'degree': 2}
SVC Linear               0.630288    0.637435    {'C': 1}
Logistic                 0.624038    0.626908    {'C': 0.1}
Random Forest            0.612981    0.637877    {'max_leaf_nodes': 16, 'n_estimators': 500}
Gradient Boosting        0.676442    0.676908    {'learning_rate': 0.1, 'n_estimators': 500}
XGBoosting               0.658654    0.697615    {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}
K Nearest Neighbors      0.686058    0.709131    {'leaf_size': 20, 'n_neighbors': 5}

Best accuracy
SVC RBF                  0.709615    {'C': 10, 'gamma': 0.1}


Best F1
K Nearest Neighbors      0.709131    {'leaf_size': 20, 'n_neighbors': 5}
    preccision           0.6803418803418804
    recall               0.7404651162790697

IC50 threshold for classification: 50000 


91.26% of substances classified as toxic, 8.74% c

SVC Poly                 0.923558    0.601504    {'C': 1, 'coef0': 0.01, 'degree': 2}
SVC Linear               0.915865    0.588235    {'C': 0.1}
Logistic                 0.916827    0.568579    {'C': 0.1}
Random Forest            0.888942    0.079681    {'max_leaf_nodes': 16, 'n_estimators': 100}
Gradient Boosting        0.933173    0.660147    {'learning_rate': 0.1, 'n_estimators': 500}
XGBoosting               0.922115    0.597015    {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}
K Nearest Neighbors      0.919712    0.61959     {'leaf_size': 50, 'n_neighbors': 5}

Best accuracy
Gradient Boosting        0.933173    {'learning_rate': 0.1, 'n_estimators': 500}


Best F1
SVC RBF                  0.669604    {'C': 100, 'gamma': 0.01}
    preccision           0.7069767441860465
    recall               0.6359832635983264

IC50 threshold for classification: 5000 


37.78% of substances classified as toxic, 62.22% classified as non toxic. 

Mixed Fingerprints 

Classification

Tutaj testuje metode SelectKBest do wyboru zmiennych.

In [12]:
# perc values adjusted to compare with previous method
for perc in [0.0119, 0.02, 0.059, 0.133, 0.21, 0.29]:
#     df_hashed = utils.get_hashed_fingerprints(min_perc_used=perc)
#     df_maccsfp = utils.get_MACCSFP_fingerprints(min_perc_used=perc)
#     df_klekota = utils.get_KlekotaRoth_fingerprints(min_perc_used=perc)
    df_mixed = utils.get_mixed_fingerprints(min_perc_used=perc)
    dfs = [
#         (df_hashed, "Hashed Extended Fingerprints"), 
#         (df_maccsfp, "MACCSFP Fingerprints"), 
#         (df_klekota, "Klekota&Roth Fingerprints"), 
        (df_mixed, "Mixed Fingerprints")
    ]

    print(f'\nUsing fetures that are present in at least {perc*100}% of substances.\n')
    for df, title in dfs:
        for IC50_threshold in [500, 5_000, 10_000, 50_000]:
            print(f'IC50 threshold for classification: {IC50_threshold} \n\n')
            df_class = utils.classify_on_IC50(df.copy(), IC50_threshold)
            X = df_class.drop('IC50', axis=1)
            y = df_class['IC50']
            select = int(X.shape[1] * perc)
            print(f'{round(y[y==1].shape[0]/y.shape[0] * 100, 2)}% of substances classified as toxic, {round(y[y==0].shape[0]/y.shape[0] * 100, 2)}% classified as non toxic.' , '\n')
            np.seterr(divide='ignore', invalid='ignore')
            X = SelectKBest(f_regression, k=select).fit_transform(X, y)
            print(title, '\n')
            perform_classification(X, y)
            print()
    print('\n\n')

Preparing files for mixed fingerprints.

Preparing (ready_sets/cardiotoxicity_hERG_MACCSFP.csv) file.
DataFrame base shape: (11504, 167)
Shape after removing wrong values: (10635, 167)
Shape after removing least used features: (10635, 138)
Shape after removing outliers: (10396, 138)

Preparing (ready_sets/cardiotoxicity_hERG_KlekFP.csv) file.
DataFrame base shape: (11504, 4861)
Shape after removing wrong values: (10635, 4861)
Shape after removing least used features: (10635, 565)
Shape after removing outliers: (10396, 565)

Preparing (ready_sets/cardiotoxicity_hERG_ExtFP.csv) file.
DataFrame base shape: (11504, 1025)
Shape after removing wrong values: (10635, 1025)
Shape after removing least used features: (10635, 1008)
Shape after removing outliers: (10396, 1008)


Using fetures that are present in at least 1.1900000000000002% of substances.

IC50 threshold for classification: 500 


11.55% of substances classified as toxic, 88.45% classified as non toxic. 

Mixed Fingerprints 

Class

Mixed Fingerprints 

Classification model:    accuracy:   F1:
SVC RBF                  0.712019    0.718118    {'C': 100, 'gamma': 0.1}
SVC Poly                 0.670192    0.683579    {'C': 1, 'coef0': 0.01, 'degree': 2}
SVC Linear               0.654808    0.653475    {'C': 0.1}
Logistic                 0.651442    0.650265    {'C': 0.1}
Random Forest            0.648077    0.616352    {'max_leaf_nodes': 16, 'n_estimators': 500}
Gradient Boosting        0.675481    0.681754    {'learning_rate': 0.1, 'n_estimators': 500}
XGBoosting               0.694712    0.703963    {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}
K Nearest Neighbors      0.700962    0.708529    {'leaf_size': 20, 'n_neighbors': 5}

Best accuracy
SVC RBF                  0.712019    {'C': 100, 'gamma': 0.1}


Best F1
SVC RBF                  0.718118    {'C': 100, 'gamma': 0.1}
    preccision           0.7266666666666667
    recall               0.7097674418604651

IC50 threshold for classification: 500

SVC RBF                  0.929327    0.663616    {'C': 100, 'gamma': 0.1}
SVC Poly                 0.913942    0.506887    {'C': 1, 'coef0': 1, 'degree': 2}
SVC Linear               0.895192    0.296774    {'C': 10}
Logistic                 0.894231    0.337349    {'C': 1}
Random Forest            0.888942    0.079681    {'max_leaf_nodes': 16, 'n_estimators': 100}
Gradient Boosting        0.924519    0.594315    {'learning_rate': 0.1, 'n_estimators': 500}
XGBoosting               0.922596    0.581818    {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}
K Nearest Neighbors      0.912019    0.577367    {'leaf_size': 50, 'n_neighbors': 5}

Best accuracy
SVC RBF                  0.929327    {'C': 100, 'gamma': 0.1}


Best F1
SVC RBF                  0.663616    {'C': 100, 'gamma': 0.1}
    preccision           0.7323232323232324
    recall               0.606694560669456

IC50 threshold for classification: 5000 


37.78% of substances classified as toxic, 62.22% classified as n

Mixed Fingerprints 

Classification model:    accuracy:   F1:
SVC RBF                  0.934135    0.964682    {'C': 10, 'gamma': 0.01}
SVC Poly                 0.93125     0.963455    {'C': 1, 'coef0': 0.01, 'degree': 2}
SVC Linear               0.923558    0.959511    {'C': 1}
Logistic                 0.925962    0.960734    {'C': 0.1}
Random Forest            0.921154    0.958649    {'max_leaf_nodes': 16, 'n_estimators': 100}
Gradient Boosting        0.933173    0.964166    {'learning_rate': 0.1, 'n_estimators': 500}
XGBoosting               0.928846    0.961856    {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}
K Nearest Neighbors      0.927404    0.961133    {'leaf_size': 50, 'n_neighbors': 5}

Best accuracy
SVC RBF                  0.934135    {'C': 10, 'gamma': 0.01}


Best F1
SVC RBF                  0.964682    {'C': 10, 'gamma': 0.01}
    preccision           0.9473417721518987
    recall               0.9826680672268907




Preparing files for mixed fingerprint

Na koniec przetestowałem jeszcze prostą sieć neuronową

In [12]:
def perform_NN_classification(X, y):
    history = History()
    model = Sequential()

    size = X.shape[1]
    model.add(Dense(size//2, activation="relu",input_shape=(size,)))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(size//4, activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(size//8, activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(size//16, activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(size//32, activation="relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(1,activation="sigmoid"))
    model.summary()


    early_stopping = EarlyStopping(patience=30, monitor="val_loss")
    model.compile(loss='binary_crossentropy', optimizer="Adam", metrics=["accuracy"])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

    model.fit(X_train, y_train, validation_data= (X_test, y_test), batch_size=256, epochs=200, validation_split=0.2, callbacks=[early_stopping, history], verbose=0)
    prediction = (model.predict(X_test) > 0.5).astype("int32")
    accuracy = accuracy_score(y_test, prediction)
    f1 = f1_score(y_test, prediction)
    print(f'\nFinal accuracy:   {accuracy}')
    print(f'\nFinal f1 score:   {f1}')
    print()
    

In [13]:
for perc in [0.9, 0.8, 0.5, 0.3, 0.1, 0.01]:
#     df_hashed = utils.get_hashed_fingerprints(min_perc_used=perc)
#     df_maccsfp = utils.get_MACCSFP_fingerprints(min_perc_used=perc)
#     df_klekota = utils.get_KlekotaRoth_fingerprints(min_perc_used=perc)
    df_mixed = utils.get_mixed_fingerprints(min_perc_used=perc)
    dfs = [
#         (df_hashed, "Hashed Extended Fingerprints"), 
#         (df_maccsfp, "MACCSFP Fingerprints"), 
#         (df_klekota, "Klekota&Roth Fingerprints"), 
        (df_mixed, "Mixed Fingerprints")
    ]

    print(f'\nUsing fetures that are present in at least {perc*100}% of substances.\n')
    for df, title in dfs:
        for IC50_threshold in [500, 5_000, 10_000, 50_000]:
            print(f'IC50 threshold for classification: {IC50_threshold} \n\n')
            df_class = utils.classify_on_IC50(df.copy(), IC50_threshold)
            X = df_class.drop('IC50', axis=1)
            y = df_class['IC50']
            print(f'{round(y[y==1].shape[0]/y.shape[0] * 100, 2)}% of substances classified as toxic, {round(y[y==0].shape[0]/y.shape[0] * 100, 2)}% classified as non toxic.' , '\n')
            print(title, '\n')
            perform_NN_classification(X, y)
            print()
    print('\n\n')

Preparing files for mixed fingerprints.

Preparing (ready_sets/cardiotoxicity_hERG_MACCSFP.csv) file.
DataFrame base shape: (11504, 167)
Shape after removing wrong values: (10635, 167)
Shape after removing least used features: (10635, 15)
Shape after removing outliers: (10396, 15)

Preparing (ready_sets/cardiotoxicity_hERG_KlekFP.csv) file.
DataFrame base shape: (11504, 4861)
Shape after removing wrong values: (10635, 4861)
Shape after removing least used features: (10635, 9)
Shape after removing outliers: (10396, 9)

Preparing (ready_sets/cardiotoxicity_hERG_ExtFP.csv) file.
DataFrame base shape: (11504, 1025)
Shape after removing wrong values: (10635, 1025)
Shape after removing least used features: (10635, 51)
Shape after removing outliers: (10396, 51)


Using fetures that are present in at least 90.0% of substances.

IC50 threshold for classification: 500 


11.55% of substances classified as toxic, 88.45% classified as non toxic. 

Mixed Fingerprints 

Model: "sequential"
_________


Final accuracy:   0.4831730769230769

Final f1 score:   0.0


IC50 threshold for classification: 50000 


91.26% of substances classified as toxic, 8.74% classified as non toxic. 

Mixed Fingerprints 

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 36)                2628      
                                                                 
 batch_normalization_15 (Bat  (None, 36)               144       
 chNormalization)                                                
                                                                 
 dropout_15 (Dropout)        (None, 36)                0         
                                                                 
 dense_19 (Dense)            (None, 18)                666       
                                                                 
 batch_normalization_16 (Bat  (None, 18)         


Final accuracy:   0.6538461538461539

Final f1 score:   0.16473317865429238


IC50 threshold for classification: 10000 


50.17% of substances classified as toxic, 49.83% classified as non toxic. 

Mixed Fingerprints 

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_36 (Dense)            (None, 60)                7320      
                                                                 
 batch_normalization_30 (Bat  (None, 60)               240       
 chNormalization)                                                
                                                                 
 dropout_30 (Dropout)        (None, 60)                0         
                                                                 
 dense_37 (Dense)            (None, 30)                1830      
                                                                 
 batch_normalization_31 (Bat  (N


Final accuracy:   0.9115384615384615

Final f1 score:   0.5422885572139303


IC50 threshold for classification: 5000 


37.78% of substances classified as toxic, 62.22% classified as non toxic. 

Mixed Fingerprints 

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_54 (Dense)            (None, 178)               63546     
                                                                 
 batch_normalization_45 (Bat  (None, 178)              712       
 chNormalization)                                                
                                                                 
 dropout_45 (Dropout)        (None, 178)               0         
                                                                 
 dense_55 (Dense)            (None, 89)                15931     
                                                                 
 batch_normalization_46 (Bat  (Non


Final accuracy:   0.9264423076923077

Final f1 score:   0.9609992352791231





Preparing files for mixed fingerprints.

Preparing (ready_sets/cardiotoxicity_hERG_MACCSFP.csv) file.
DataFrame base shape: (11504, 167)
Shape after removing wrong values: (10635, 167)
Shape after removing least used features: (10635, 80)
Shape after removing outliers: (10396, 80)

Preparing (ready_sets/cardiotoxicity_hERG_KlekFP.csv) file.
DataFrame base shape: (11504, 4861)
Shape after removing wrong values: (10635, 4861)
Shape after removing least used features: (10635, 70)
Shape after removing outliers: (10396, 70)

Preparing (ready_sets/cardiotoxicity_hERG_ExtFP.csv) file.
DataFrame base shape: (11504, 1025)
Shape after removing wrong values: (10635, 1025)
Shape after removing least used features: (10635, 655)
Shape after removing outliers: (10396, 655)


Using fetures that are present in at least 30.0% of substances.

IC50 threshold for classification: 500 


11.55% of substances classified as toxic,

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_90 (Dense)            (None, 401)               322003    
                                                                 
 batch_normalization_75 (Bat  (None, 401)              1604      
 chNormalization)                                                
                                                                 
 dropout_75 (Dropout)        (None, 401)               0         
                                                                 
 dense_91 (Dense)            (None, 200)               80400     
                                                                 
 batch_normalization_76 (Bat  (None, 200)              800       
 chNormalization)                                                
                                                                 
 dropout_76 (Dropout)        (None, 200)             


Final accuracy:   0.8052884615384616

Final f1 score:   0.7272727272727273


IC50 threshold for classification: 10000 


50.17% of substances classified as toxic, 49.83% classified as non toxic. 

Mixed Fingerprints 

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_108 (Dense)           (None, 641)               823044    
                                                                 
 batch_normalization_90 (Bat  (None, 641)              2564      
 chNormalization)                                                
                                                                 
 dropout_90 (Dropout)        (None, 641)               0         
                                                                 
 dense_109 (Dense)           (None, 320)               205440    
                                                                 
 batch_normalization_91 (Bat  (N


Final accuracy:   0.9115384615384615

Final f1 score:   0.47727272727272724


IC50 threshold for classification: 5000 


37.78% of substances classified as toxic, 62.22% classified as non toxic. 

Mixed Fingerprints 

Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_126 (Dense)           (None, 881)               1553203   
                                                                 
 batch_normalization_105 (Ba  (None, 881)              3524      
 tchNormalization)                                               
                                                                 
 dropout_105 (Dropout)       (None, 881)               0         
                                                                 
 dense_127 (Dense)           (None, 440)               388080    
                                                                 
 batch_normalization_106 (Ba  (N


Final accuracy:   0.9350961538461539

Final f1 score:   0.9655348480980342





