<a href="https://colab.research.google.com/github/mounsifelatouch/cdd/blob/master/notebooks/4_cdd_ml_part_4_models_building.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Bioinformatics Project - Computational Drug Discovery [Part 4] Classification Models Building**

**MOUNSIF EL ATOUCH**

In this Jupyter notebook, we will be building a machine learning model using the ChEMBL bioactivity data.

In **Part 4**, we will build and compare several classification models, tuning each one with a cross-validated grid search.

---

## **Importing libraries**

In [None]:
import os
import pandas as pd

from scipy.spatial.distance import *
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

seed = 42

from sklearn.model_selection import *
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import *

from scipy import interp
from math import *
from sklearn.metrics import RocCurveDisplay

import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC  
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

## **Functions**

In [None]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for a fitted classifier.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels, evaluated when ``train`` is truthy.
    X_test, y_test : held-out features and labels, evaluated when ``train`` is falsy.
    train : bool, default True
        Selects which split is reported.

    Returns
    -------
    None
        The report is written to stdout.
    """
    # Choose the split once so both cases share a single reporting path
    # (previously two copy-pasted branches, and a falsy non-bool such as
    # ``None`` matched neither branch and printed nothing).
    if train:
        title, features, labels = "Train Result", X_train, y_train
    else:
        title, features, labels = "Test Result", X_test, y_test

    pred = clf.predict(features)
    clf_report = pd.DataFrame(classification_report(labels, pred, output_dict=True))
    print(f"{title}:\n================================================")
    print(f"Accuracy Score: {accuracy_score(labels, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(labels, pred)}\n")

In [None]:
def print_auc(clf, X_train, y_train, X_test, y_test, model_name):
    """Fit ``clf`` on the training split, print its reports and return its metrics.

    Parameters
    ----------
    clf : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        It is (re)fitted in place on ``X_train`` / ``y_train``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used for every metric except
        ``train_score``.
    model_name : str
        Label of the model. Currently unused; kept so existing calls keep working.

    Returns
    -------
    dict
        ``fpr`` / ``tpr`` (ROC curve points), ``auc`` (ROC AUC), ``auc_pr`` (area
        under the precision-recall curve), ``precision_sc`` / ``recall_sc``
        (per-class arrays, ``average=None``), ``f1``, and ``test_score`` /
        ``train_score`` (accuracy in percent).
    """
    clf = clf.fit(X_train, y_train)

    # Score of the class in column 1 of predict_proba (second entry of clf.classes_).
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Area under the precision-recall curve.
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    auc_precision_recall = auc(recall, precision)

    # Predict each split once and reuse the labels for every metric below
    # (the test split used to be re-predicted four times).
    y_test_pred = clf.predict(X_test)
    y_train_pred = clf.predict(X_train)

    precision_sc = precision_score(y_test, y_test_pred, average=None)
    recall_sc = recall_score(y_test, y_test_pred, average=None)
    test_score = accuracy_score(y_test, y_test_pred) * 100
    train_score = accuracy_score(y_train, y_train_pred) * 100
    f1 = f1_score(y_test, y_test_pred)

    print_score(clf, X_train, y_train, X_test, y_test, train=True)
    print_score(clf, X_train, y_train, X_test, y_test, train=False)

    metriques = {"fpr": fpr, "recall_sc": recall_sc, "precision_sc": precision_sc, "tpr": tpr,
                 "test_score": test_score, "train_score": train_score, "f1": f1,
                 "auc_pr": auc_precision_recall, "auc": roc_auc}
    return metriques

## **Load the data set**

In [None]:
! wget https://raw.githubusercontent.com/mounsifelatouch/cdd/master/data/bioactivity_data_PubchemFingerprinter.csv

In [None]:
df = pd.read_csv('bioactivity_data_PubchemFingerprinter.csv')
# Preview the first rows only instead of rendering the whole frame.
df.head()

## **Input and output features**

### **Input features**

In [None]:
# Feature matrix: every column except the 'Activity' label
X = df.drop(columns='Activity')

### **Output features**

In [None]:
# Target vector: the 'Activity' column of the loaded frame
y = df['Activity']

In [None]:
# Number of samples per class label
counts = y.value_counts()

# Ratio of class 0 to class 1; offered later as an XGBoost `scale_pos_weight` candidate
n_class0, n_class1 = counts[0], counts[1]
imbalance_ratio = n_class0 / n_class1

### **Let's examine the data dimension**

In [None]:
# Shapes of the feature matrix and of the target vector
X.shape, y.shape

### **Feature selection (forward selection on OLS p-values)**

In [None]:
! pip install statsmodels

In [None]:
import statsmodels.api as sm

def forward_selection(data, target, significance_level=0.05):
    """Greedy forward feature selection driven by OLS p-values.

    At every step an OLS model (with intercept) is fitted for each remaining
    column on top of the columns already selected; the column with the smallest
    p-value is added if that p-value is below ``significance_level``, otherwise
    the search stops.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate features, one per column.
    target : array-like
        Response passed to ``sm.OLS``.
    significance_level : float, default 0.05
        Maximum p-value for a feature to be added.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    candidate_features = data.columns.tolist()
    best_features = []
    while len(best_features) < len(candidate_features):
        # Keep the original column order (a set difference has arbitrary order),
        # so ties on the p-value are broken the same way on every run.
        already_selected = set(best_features)
        remaining_features = [col for col in candidate_features if col not in already_selected]
        # Explicit float dtype: on pandas versions where an untyped empty Series
        # defaults to object dtype, idxmin() below is not allowed.
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        # A NaN minimum compares False, which also ends the search.
        if new_pval.min() < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

In [None]:
# Run forward selection on the full feature matrix with the default significance level (0.05)
# NOTE(review): this uses all rows, i.e. before the train/test split below — confirm this is intended.
best_features = forward_selection(X, y)

In [None]:
# Number of features retained by forward selection
len(best_features)

In [None]:
# Keep only the columns chosen by forward selection
X = X[best_features]

In [None]:
# Shape of the feature matrix after feature selection
X.shape

## **Data split (80/20 ratio)**

In [None]:
# Split row positions (not the frames themselves) 80/20.
# stratify=y keeps the class proportions of the imbalanced target identical
# in the training and test parts.
row_positions = np.arange(len(X))
idx_train, idx_test = train_test_split(row_positions, test_size=.2, random_state=seed, stratify=y)

In [None]:
# idx_train / idx_test are row positions (built with np.arange), so select with
# .iloc; .loc would only give the same rows while the index is the default 0..n-1.
X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
X_test, y_test = X.iloc[idx_test], y.iloc[idx_test]

In [None]:
# Shapes of the training features and labels
X_train.shape, y_train.shape

In [None]:
# Class distribution in the held-out test labels
y_test.value_counts()

## **Data balancing**

### **Random Oversampling**

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Oversample only the minority class. random_state is fixed (like the SMOTE and
# ADASYN cells) so the duplicated rows are the same on every run.
ros = RandomOverSampler(sampling_strategy='minority', random_state=seed)

# Resample the training split only; the test split is left untouched.
# NOTE(review): X_train_ros / y_train_ros are not used by any later visible cell.
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

### **SMOTE**

In [None]:
from imblearn.over_sampling import SMOTE

# Named `smote`, not `sm`: `sm` is the statsmodels alias used by
# forward_selection, and rebinding it breaks a re-run of that cell.
smote = SMOTE(random_state=seed)

# Resample the training split only; the test split is left untouched.
# NOTE(review): X_train_sm / y_train_sm are not used by any later visible cell.
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

### **ADASYN**

In [None]:
from imblearn.over_sampling import ADASYN

# Seeded ADASYN sampler for the training split
adasyn = ADASYN(random_state=seed)

# Resample the training data, then unpack features and labels
adasyn_resampled = adasyn.fit_resample(X_train, y_train)
X_train_adasyn, y_train_adasyn = adasyn_resampled



---



In [None]:
# Give each split a fresh 0..n-1 index, discarding the original row labels
X_train, y_train, X_test, y_test = (
    part.reset_index(drop=True) for part in (X_train, y_train, X_test, y_test)
)

In [None]:
# Per-class weight = n_samples / (n_classes * samples_in_class), keyed by class label
n_samples = X_train.shape[0]
n_classes = 2
class_weights = {
    label: weight
    for label, weight in zip(np.unique(y_train), n_samples / (n_classes * np.bincount(y_train)))
}

# **GridSearchCV**
* cv = 10
* *`'balanced_accuracy'`* as a scoring metric to deal with imbalanced datasets. It is defined as the average of recall obtained on each class.

In [None]:
# Collects the best hyperparameter dict of each grid search below, in cell order
best_params = []

## **ExtraTreesClassifier**

In [None]:
# Search space; max_features is tried at sqrt(n_features) and log2(n_features)
n_features = X.shape[1]
param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [10, 25, 50, 100],
    'max_features': [int(sqrt(n_features)), int(log2(n_features))],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'class_weight': ['balanced', None],
}

# 10-fold cross-validated grid search scored on balanced accuracy
grid_search = GridSearchCV(
    ExtraTreesClassifier(random_state=seed, class_weight='balanced'),
    param_grid=param_grid,
    cv=10,
    scoring='balanced_accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_params.append(grid_search.best_params_)

# Report the winning configuration and its mean cross-validation score
print("Best hyperparameters: ", grid_search.best_params_)
print(f"Best mean cross-validation score: {grid_search.best_score_:.2f}")

# Rebuild the estimator from the winning parameters and fit it on the training split
clf_best = ExtraTreesClassifier(**grid_search.best_params_, random_state=seed)
model1 = clf_best.fit(X_train, y_train)

# print_auc refits the model, prints the train/test reports and returns the metrics
clf1 = print_auc(model1, X_train, y_train, X_test, y_test, 'ExtraTreesClassifier')
df1 = pd.DataFrame([{
    'model': 'ExtraTreesClassifier',
    'f1_score': clf1['f1'],
    'auc': clf1['auc'],
    'auc_pr': clf1['auc_pr'],
    'recall_classe(0)': clf1['recall_sc'][0],
    'recall_classe(1)': clf1['recall_sc'][1],
    'precision_classe(0)': clf1['precision_sc'][0],
    'precision_classe(1)': clf1['precision_sc'][1],
}])
df1.style.hide_index()

## **XGBClassifier**

In [None]:
# Search space; scale_pos_weight is tried at 1 and at the class-0/class-1 ratio
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 10, 25],
    gamma=[0.1, 0.5, 1.0],
    subsample=[0.5, 0.8, 1.0],
    colsample_bytree=[0.5, 0.8, 1.0],
    scale_pos_weight=[1, imbalance_ratio],
)

# 10-fold cross-validated grid search scored on balanced accuracy
grid_search = GridSearchCV(
    xgb.XGBClassifier(objective='binary:logistic'),
    param_grid=param_grid,
    cv=10,
    scoring='balanced_accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_params.append(grid_search.best_params_)

# Report the winning configuration and its mean cross-validation score
print("Best hyperparameters: ", grid_search.best_params_)
print(f"Best mean cross-validation score: {grid_search.best_score_:.2f}")

# Rebuild the estimator from the winning parameters and fit it on the training split
clf_best = xgb.XGBClassifier(**grid_search.best_params_, objective='binary:logistic')
model2 = clf_best.fit(X_train, y_train)

# print_auc refits the model, prints the train/test reports and returns the metrics
clf2 = print_auc(model2, X_train, y_train, X_test, y_test, 'XGBClassifier')
df2 = pd.DataFrame([{
    'model': 'XGBClassifier',
    'f1_score': clf2['f1'],
    'auc': clf2['auc'],
    'auc_pr': clf2['auc_pr'],
    'recall_classe(0)': clf2['recall_sc'][0],
    'recall_classe(1)': clf2['recall_sc'][1],
    'precision_classe(0)': clf2['precision_sc'][0],
    'precision_classe(1)': clf2['precision_sc'][1],
}])
df2.style.hide_index()

## **AdaBoostClassifier**

In [None]:
# Weak learners to try: trees of depth 1 and depth 3
weak_learners = [DecisionTreeClassifier(max_depth=depth) for depth in (1, 3)]
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'base_estimator': weak_learners,
}

# 10-fold cross-validated grid search scored on balanced accuracy
grid_search = GridSearchCV(
    AdaBoostClassifier(random_state=seed),
    param_grid=param_grid,
    cv=10,
    scoring='balanced_accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_params.append(grid_search.best_params_)

# Report the winning configuration and its mean cross-validation score
print("Best hyperparameters: ", grid_search.best_params_)
print(f"Best mean cross-validation score: {grid_search.best_score_:.2f}")

# Rebuild the estimator from the winning parameters and fit it on the training split
clf_best = AdaBoostClassifier(**grid_search.best_params_, random_state=seed)
model3 = clf_best.fit(X_train, y_train)

# print_auc refits the model, prints the train/test reports and returns the metrics
clf3 = print_auc(model3, X_train, y_train, X_test, y_test, 'AdaBoostClassifier')
df3 = pd.DataFrame([{
    'model': 'AdaBoostClassifier',
    'f1_score': clf3['f1'],
    'auc': clf3['auc'],
    'auc_pr': clf3['auc_pr'],
    'recall_classe(0)': clf3['recall_sc'][0],
    'recall_classe(1)': clf3['recall_sc'][1],
    'precision_classe(0)': clf3['precision_sc'][0],
    'precision_classe(1)': clf3['precision_sc'][1],
}])
df3.style.hide_index()

## **DecisionTreeClassifier**

In [None]:
clf4 = DecisionTreeClassifier(random_state=seed)
# max_features: 'auto' meant sqrt(n_features) for this classifier and is no longer
# accepted by newer scikit-learn, so it both duplicated the sqrt candidate and
# produced failing fits. 'sqrt' / 'log2' cover the same two settings by name.
param_grid = {'min_samples_split': [2, 5, 10, 20],
              'max_depth': [2, 3, 4, 5, None],
              'min_samples_leaf': [1, 2, 4, 8],
              'criterion': ['gini', 'entropy', 'log_loss'],
              'max_features': ['sqrt', 'log2']}

# 10-fold cross-validated grid search scored on balanced accuracy
grid_search = GridSearchCV(clf4, param_grid=param_grid, cv=10, scoring='balanced_accuracy', n_jobs=-1)

# Fit the GridSearchCV object with the training data
grid_search.fit(X_train, y_train)
best_params.append(grid_search.best_params_)

# Print the best hyperparameters and the corresponding mean cross-validation score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best mean cross-validation score: {:.2f}".format(grid_search.best_score_))

# Use the best hyperparameters to fit the model to the training data
clf_best = DecisionTreeClassifier(**grid_search.best_params_, random_state=seed)
model4 = clf_best.fit(X_train, y_train)

# print_auc refits the model, prints the train/test reports and returns the metrics
clf4 = print_auc(model4, X_train, y_train, X_test, y_test, 'DecisionTreeClassifier')
df4 = pd.DataFrame(data=[['DecisionTreeClassifier', clf4['f1'], clf4['auc'], clf4['auc_pr'], clf4['recall_sc'][0], clf4['recall_sc'][1], clf4['precision_sc'][0], clf4['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df4.style.hide_index()

## **GaussianNB**

In [None]:
# Only var_smoothing is tuned for Gaussian naive Bayes
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}

# 10-fold cross-validated grid search scored on balanced accuracy
grid_search = GridSearchCV(
    GaussianNB(),
    param_grid=param_grid,
    cv=10,
    scoring='balanced_accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_params.append(grid_search.best_params_)

# Report the winning configuration and its mean cross-validation score
print("Best hyperparameters: ", grid_search.best_params_)
print(f"Best mean cross-validation score: {grid_search.best_score_:.2f}")

# Rebuild the estimator from the winning parameters and fit it on the training split
clf_best = GaussianNB(**grid_search.best_params_)
model5 = clf_best.fit(X_train, y_train)

# print_auc refits the model, prints the train/test reports and returns the metrics
clf5 = print_auc(model5, X_train, y_train, X_test, y_test, 'GaussianNB')
df5 = pd.DataFrame([{
    'model': 'GaussianNB',
    'f1_score': clf5['f1'],
    'auc': clf5['auc'],
    'auc_pr': clf5['auc_pr'],
    'recall_classe(0)': clf5['recall_sc'][0],
    'recall_classe(1)': clf5['recall_sc'][1],
    'precision_classe(0)': clf5['precision_sc'][0],
    'precision_classe(1)': clf5['precision_sc'][1],
}])
df5.style.hide_index()

## **LGBMClassifier**

In [None]:
# subsample_freq=1 turns row bagging on at every iteration. With LightGBM's default
# (subsample_freq=0) bagging is disabled, so the 'subsample' values in the grid had
# no effect and only tripled the search. random_state ties the run to the
# notebook-wide seed.
clf6 = lgb.LGBMClassifier(subsample_freq=1, random_state=seed)
param_grid = {'num_leaves': [10, 20, 30],
              'max_depth': [3, 5, 7],
              'learning_rate': [0.01, 0.1],
              'n_estimators': [50, 100, 200],
              'min_child_samples': [10, 20, 30],
              'subsample': [0.8, 0.9, 1.0],
              'colsample_bytree': [0.8, 0.9, 1.0],
              'class_weight': ['balanced', None]}

# 10-fold cross-validated grid search scored on balanced accuracy
grid_search = GridSearchCV(clf6, param_grid=param_grid, cv=10, scoring='balanced_accuracy', n_jobs=-1)

# Fit the GridSearchCV object with the training data
grid_search.fit(X_train, y_train)
best_params.append(grid_search.best_params_)

# Print the best hyperparameters and the corresponding mean cross-validation score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best mean cross-validation score: {:.2f}".format(grid_search.best_score_))

# Refit with the best hyperparameters and the same fixed settings as the search
clf_best = lgb.LGBMClassifier(**grid_search.best_params_, subsample_freq=1, random_state=seed)
model6 = clf_best.fit(X_train, y_train)

# print_auc refits the model, prints the train/test reports and returns the metrics
clf6 = print_auc(model6, X_train, y_train, X_test, y_test, 'LGBMClassifier')
df6 = pd.DataFrame(data=[['LGBMClassifier', clf6['f1'], clf6['auc'], clf6['auc_pr'], clf6['recall_sc'][0], clf6['recall_sc'][1], clf6['precision_sc'][0], clf6['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df6.style.hide_index()

## **KNeighborsClassifier**

In [None]:
clf7 = KNeighborsClassifier()
# 'leaf_size' is not searched: it is only passed to the BallTree / KDTree backends,
# so with algorithm='brute' it tripled the grid without changing any model.
param_grid = {'n_neighbors': [3, 5, 7],
              'weights': ['uniform', 'distance'],
              'p': [1, 2],
              'algorithm': ['brute']}

# 10-fold cross-validated grid search scored on balanced accuracy
grid_search = GridSearchCV(clf7, param_grid=param_grid, cv=10, scoring='balanced_accuracy', n_jobs=-1)

# Fit the GridSearchCV object with the training data
grid_search.fit(X_train, y_train)
best_params.append(grid_search.best_params_)

# Print the best hyperparameters and the corresponding mean cross-validation score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best mean cross-validation score: {:.2f}".format(grid_search.best_score_))

# Use the best hyperparameters to fit the model to the training data
clf_best = KNeighborsClassifier(**grid_search.best_params_)
model7 = clf_best.fit(X_train, y_train)

# print_auc refits the model, prints the train/test reports and returns the metrics.
# The model label was misspelled 'KNeiborsClassifier' and ended up in the results table.
clf7 = print_auc(model7, X_train, y_train, X_test, y_test, 'KNeighborsClassifier')
df7 = pd.DataFrame(data=[['KNeighborsClassifier', clf7['f1'], clf7['auc'], clf7['auc_pr'], clf7['recall_sc'][0], clf7['recall_sc'][1], clf7['precision_sc'][0], clf7['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df7.style.hide_index()

## **RandomForestClassifier**

In [None]:
# Search space for the random forest
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    criterion=['gini', 'entropy'],
)

# 10-fold cross-validated grid search scored on balanced accuracy
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=seed),
    param_grid=param_grid,
    cv=10,
    scoring='balanced_accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_params.append(grid_search.best_params_)

# Report the winning configuration and its mean cross-validation score
print("Best hyperparameters: ", grid_search.best_params_)
print(f"Best mean cross-validation score: {grid_search.best_score_:.2f}")

# Rebuild the estimator from the winning parameters and fit it on the training split
clf_best = RandomForestClassifier(**grid_search.best_params_, random_state=seed)
model8 = clf_best.fit(X_train, y_train)

# print_auc refits the model, prints the train/test reports and returns the metrics
clf8 = print_auc(model8, X_train, y_train, X_test, y_test, 'RandomForestClassifier')
df8 = pd.DataFrame([{
    'model': 'RandomForestClassifier',
    'f1_score': clf8['f1'],
    'auc': clf8['auc'],
    'auc_pr': clf8['auc_pr'],
    'recall_classe(0)': clf8['recall_sc'][0],
    'recall_classe(1)': clf8['recall_sc'][1],
    'precision_classe(0)': clf8['precision_sc'][0],
    'precision_classe(1)': clf8['precision_sc'][1],
}])
df8.style.hide_index()

## **SVC**

In [None]:
# probability=True calibrates probabilities with an internal shuffled
# cross-validation; a fixed random_state makes predict_proba (and the ROC / PR
# metrics derived from it) reproducible.
clf9 = SVC(probability=True, random_state=seed)
param_grid = {'C': [0.1, 1, 10],
              'kernel': ['linear', 'poly', 'rbf'],
              'degree': [2, 3, 4],
              'gamma': ['scale', 'auto'],
              'class_weight': [None, 'balanced']}

# 10-fold cross-validated grid search scored on balanced accuracy
grid_search = GridSearchCV(clf9, param_grid=param_grid, cv=10, scoring='balanced_accuracy', n_jobs=-1)

# Fit the GridSearchCV object with the training data
grid_search.fit(X_train, y_train)
best_params.append(grid_search.best_params_)

# Print the best hyperparameters and the corresponding mean cross-validation score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best mean cross-validation score: {:.2f}".format(grid_search.best_score_))

# Use the best hyperparameters to fit the model to the training data
clf_best = SVC(**grid_search.best_params_, probability=True, random_state=seed)
model9 = clf_best.fit(X_train, y_train)

# print_auc refits the model, prints the train/test reports and returns the metrics
clf9 = print_auc(model9, X_train, y_train, X_test, y_test, 'SVM')
df9 = pd.DataFrame(data=[['SVM', clf9['f1'], clf9['auc'], clf9['auc_pr'], clf9['recall_sc'][0], clf9['recall_sc'][1], clf9['precision_sc'][0], clf9['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df9.style.hide_index()

## **LogisticRegression**

In [None]:
# Seed the search estimator like the final fit, so both use the same random state.
clf10 = LogisticRegression(random_state=seed)

# Only valid penalty/solver pairs are searched: of the four solvers, 'liblinear' is
# the only one that supports the 'l1' penalty. The previous flat grid also produced
# l1 + lbfgs / newton-cg / sag candidates, whose fits all fail.
shared_grid = {'C': [0.1, 1, 10],
               'max_iter': [100, 200, 500],
               'class_weight': [None, 'balanced']}
param_grid = [
    {**shared_grid, 'penalty': ['l1'], 'solver': ['liblinear']},
    {**shared_grid, 'penalty': ['l2'], 'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag']},
]

# 10-fold cross-validated grid search scored on balanced accuracy
grid_search = GridSearchCV(clf10, param_grid=param_grid, cv=10, scoring='balanced_accuracy', n_jobs=-1)

# Fit the GridSearchCV object with the training data
grid_search.fit(X_train, y_train)
best_params.append(grid_search.best_params_)

# Print the best hyperparameters and the corresponding mean cross-validation score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best mean cross-validation score: {:.2f}".format(grid_search.best_score_))

# Use the best hyperparameters to fit the model to the training data
clf_best = LogisticRegression(**grid_search.best_params_, random_state=seed)
model10 = clf_best.fit(X_train, y_train)

# print_auc refits the model, prints the train/test reports and returns the metrics
clf10 = print_auc(model10, X_train, y_train, X_test, y_test, 'LogisticRegression')
df10 = pd.DataFrame(data=[['LogisticRegression', clf10['f1'], clf10['auc'], clf10['auc_pr'], clf10['recall_sc'][0], clf10['recall_sc'][1], clf10['precision_sc'][0], clf10['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df10.style.hide_index()

In [None]:
# Stack the one-row summaries of all models into a single comparison table.
# The list gets its own name so the dataset loaded into `df` is not overwritten
# (rebinding `df` broke a re-run of the cells that build X and y from it).
summary_frames = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10]
df_finale = pd.concat(summary_frames, ignore_index=True)
df_finale

In [None]:
# Save the model comparison table to CSV without the index column
df_finale.to_csv('df_finale.csv', index=False)

In [None]:
plt.rcParams["figure.figsize"] = [16, 9]

# Each curve and the AUC in its legend entry are read from the SAME metrics dict.
# Previously most labels showed another model's AUC, e.g. the LGBM curve was
# labelled with clf10["auc"], the LogisticRegression value.
roc_results = {
    "LGBMClassifier": clf6,
    "SVM": clf9,
    "GaussianNB": clf5,
    "ExtraTrees": clf1,
    "KNN": clf7,
    "DT": clf4,
    "LR": clf10,
    "RF": clf8,
    "AdaBoost": clf3,
    "XGB": clf2,
}
for label, metrics in roc_results.items():
    plt.plot(metrics["fpr"], metrics["tpr"], label=f"{label}, auc={metrics['auc']}")

# fpr is plotted on the x axis and tpr on the y axis (the labels were swapped)
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.title("ROC curves")
plt.legend(loc=4)
plt.show()

# The original lines used names that no visible cell defines (`best_model`,
# `fingerprints_test`, `bioactivity_test`) and called `joblib` without importing it.
import joblib
from sklearn.metrics import ConfusionMatrixDisplay

# NOTE(review): "best" is taken here to mean the highest ROC AUC on the test
# split — confirm the intended selection criterion.
candidates = [
    (model1, clf1), (model2, clf2), (model3, clf3), (model4, clf4), (model5, clf5),
    (model6, clf6), (model7, clf7), (model8, clf8), (model9, clf9), (model10, clf10),
]
best_model, best_metrics = max(candidates, key=lambda pair: pair[1]["auc"])
print(f"Best model: {type(best_model).__name__} (test ROC AUC = {best_metrics['auc']:.3f})")

# Confusion matrix of the selected model on the held-out test split
ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test)
plt.show()

joblib.dump(best_model, 'best_model.sav')