# **Bioinformatics Project - Computational Drug Discovery [Part 4] Classification Models Building**

**MOUNSIF EL ATOUCH**

In this Jupyter notebook, we will be building a machine learning model using the ChEMBL bioactivity data.

In **Part 4**, we will be building classification models

---

## **Importing libraries**

In [1]:
import os
import pandas as pd

from scipy.spatial.distance import *
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

seed = 42

from sklearn.model_selection import *
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import *

from scipy import interp
from math import *
from sklearn.metrics import RocCurveDisplay

import warnings
warnings.filterwarnings('ignore')

In [4]:
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC  
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

In [5]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [6]:
def print_auc(clf, X_train, y_train, X_test, y_test, model_name) :
    
    clf = clf.fit(X_train, y_train)
    y_pred_proba = clf.predict_proba(X_test)[::,1]
    fpr, tpr, thr = roc_curve(y_test,  y_pred_proba)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    #plt.plot(fpr, tpr, label=model_name+" auc="+str(roc_auc))
    #plt.legend(loc=4)
    #plt.title("ROC curve")

    #plt.show()
    # Data to plot precision - recall curve
    precision, recall, thresholds_log = precision_recall_curve(y_test, y_pred_proba)
    # Use AUC function to calculate the area under the curve of precision recall curve
    auc_precision_recall = auc(recall, precision)
    
    #plt.plot(recall, precision, label=model_name+" auc_prc="+str(auc_precision_recall))
    #plt.xlabel("Recall")
    #plt.ylabel("Precision")
    #plt.legend(loc=4)
    #plt.title("Precision-Recall Curve")

    #plt.show()
    #print_score(clf, X_train, y_train, X_test, y_test, train=True)
    precision_sc=precision_score(y_test, clf.predict(X_test), average=None)
    recall_sc = recall_score(y_test, clf.predict(X_test), average=None)
    test_score = accuracy_score(y_test, clf.predict(X_test)) * 100
    train_score = accuracy_score(y_train, clf.predict(X_train)) * 100
    f1 = f1_score(y_test, clf.predict(X_test))
    #print_score(clf, X_train, y_train, X_test, y_test, train=True)
    #print_score(clf, X_train, y_train, X_test, y_test, train=False)
    metriques = {"fpr" : fpr, "recall_sc" : recall_sc, "precision_sc" : precision_sc, "tpr" : tpr,
                 "test_score" : test_score, "train_score" : train_score, "f1" : f1,
                 "auc_pr" : auc_precision_recall, "auc": roc_auc}
    return metriques

## **Load the data set**

In [7]:
! wget https://raw.githubusercontent.com/mounsifelatouch/cdd/master/data/bioactivity_data_GraphOnlyFingerprinter.csv

--2023-06-01 09:21:12--  https://raw.githubusercontent.com/mounsifelatouch/cdd/master/data/bioactivity_data_GraphOnlyFingerprinter.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1339590 (1.3M) [text/plain]
Saving to: 'bioactivity_data_GraphOnlyFingerprinter.csv.1'

     0K .......... .......... .......... .......... ..........  3%  131K 10s
    50K .......... .......... .......... .......... ..........  7%  249K 7s
   100K .......... .......... .......... .......... .......... 11%  476K 5s
   150K .......... .......... .......... .......... .......... 15%  740K 4s
   200K .......... .......... .......... .......... .......... 19%  239K 4s
   250K .......... .......... .......... .......... .......... 22%  751K 3s
   300K .......... .......... ..........

In your case, if missing an active compound has significant consequences (e.g., lost opportunity to develop a potential drug), you may want to prioritize recall for the positive (active) class. This ensures that the model is able to correctly identify as many active compounds as possible, even if it means accepting a higher rate of false positives.

However, you should also consider the precision of the positive class, since false positives could be costly in terms of wasted resources and effort. For example, if a computational chemist or a medicinal chemist is tasked with synthesizing and testing a large number of predicted active compounds, a high rate of false positives could result in wasted resources and effort.

Therefore, you may want to aim for a balanced precision and recall for the positive positive class, where the model is able to correctly identify most of the active compounds while minimizing the false positive rate. This can be achieved by using a suitable evaluation metric, such as F1-score, which balances precision and recall.

In addition, you may want to consider the class imbalance in your dataset and use techniques such as oversampling, undersampling, or the SMOTE algorithm to balance the class distribution and improve the performance of the model for the positive class.

In [8]:
df = pd.read_csv('bioactivity_data_GraphOnlyFingerprinter.csv')
df

Unnamed: 0,GraphFP1,GraphFP2,GraphFP3,GraphFP4,GraphFP5,GraphFP6,GraphFP7,GraphFP8,GraphFP9,GraphFP10,...,GraphFP1016,GraphFP1017,GraphFP1018,GraphFP1019,GraphFP1020,GraphFP1021,GraphFP1022,GraphFP1023,GraphFP1024,Activity
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
644,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
645,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
646,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## **Input features**

### **Input features**

In [9]:
X = df.drop('Activity', axis=1)

### **Output features**

In [10]:
y = df['Activity']

In [11]:
# count the number of instances in each class
counts = y.value_counts()

# calculate the ratio of the negative class to the positive class
imbalance_ratio = counts[0] / counts[1]

### **Let's examine the data dimension**

In [12]:
X.shape, y.shape

((648, 1024), (648,))

## **Data split (80/20 ratio)**

In [13]:
idx_train, idx_test = train_test_split(np.arange(len(X)), stratify=y, test_size=.2)

In [14]:
X_train, y_train = X.loc[idx_train], y.loc[idx_train]
X_test, y_test = X.loc[idx_test], y.loc[idx_test]

In [15]:
X_train.shape, y_train.shape

((518, 1024), (518,))

## **Data balancing**

### **Random Oversampling**

In [16]:
from imblearn.over_sampling import RandomOverSampler

# create an oversampler object with a 1:1 ratio of positive to negative samples
ros = RandomOverSampler(sampling_strategy='minority')

# fit and apply the oversampler to the training data
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

### **SMOTE**

In [17]:
from imblearn.over_sampling import SMOTE

# Perform oversampling on the minority class in the training set
sm = SMOTE(random_state=seed)

# fit and apply the oversampler to the training data
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

### **ADASYN**

In [18]:
from imblearn.over_sampling import ADASYN

# Perform oversampling on the minority class in the training set
adasyn = ADASYN(random_state=seed)

# fit and apply the oversampler to the training data
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)



---



In [19]:
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [20]:
n_samples, n_classes = 518, 2
class_weights = dict(zip(np.unique(y_train), n_samples / (n_classes * np.bincount(y_train))))

# **GridSearchCV**
* cv = 10
* 'balanced_accuracy' as a scoring metric to deal with imbalanced datasets. It is defined as the average of recall obtained on each class.

In [21]:
def gridsearch_cv(clf, X_train, y_train, param_grid):
    
    # Create a GridSearchCV object and fit the data
    clf_random = GridSearchCV(clf, param_grid, cv=10, scoring='balanced_accuracy', n_jobs=2)
    clf_random.fit(X_train, y_train)

    # Print the best hyperparameters and the corresponding mean cross-validation score
    print('--------------------------------------------------------------------')
    print("Best hyperparameters:", clf_random.best_params_)
    print("Best score:", clf_random.best_score_)

    pred_proba = clf_random.predict_proba(X_test) 

    precision, recall, _ = precision_recall_curve(y_test, pred_proba[:, 1])
    pr_auc = auc(recall, precision)

    y_train_pred = clf_random.predict(X_train)
    train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel())*100
    y_test_pred = clf_random.predict(X_test)
    test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel())*100

    acc = accuracy_score(y_test, y_test_pred) 
    prec = precision_score(y_test, y_test_pred)
    rec = recall_score(y_test, y_test_pred)   
    f1 = f1_score(y_test, y_test_pred)  
    roc_auc = roc_auc_score(y_test, y_test_pred)
    print(f'Accuracy: {acc}, Precision: {prec}, Recall: {rec}, f1_score: {f1}, ROC AUC: {roc_auc}')

## **ExtraTreesClassifier**

In [22]:
clf1 = ExtraTreesClassifier(random_state=seed, class_weight=class_weights)
param_grid1 = {'n_estimators': [100, 500, 1000],
        'max_depth': [10, 25, 40, 50],
        'max_features': [int(sqrt(X.shape[1])), int(log2(X.shape[1]))]}

gridsearch_cv(clf1, X_train, y_train, param_grid1)

--------------------------------------------------------------------
Best hyperparameters: {'max_depth': 10, 'max_features': 10, 'n_estimators': 100}
Best score: 0.7414539007092198
Accuracy: 0.9384615384615385, Precision: 0.6153846153846154, Recall: 0.7272727272727273, f1_score: 0.6666666666666667, ROC AUC: 0.8426279602750191


In [23]:
model1 = ExtraTreesClassifier(random_state=seed, max_depth=10, n_estimators=100, class_weight=class_weights, max_features=10).fit(X_train, y_train)
clf1 = print_auc(model1, X_train, y_train, X_test, y_test, 'ExtraTreesClassifier')
df1 = pd.DataFrame(data=[['ExtraTreesClassifier', clf1['f1'], clf1['auc'], clf1['auc_pr'], clf1['recall_sc'][0], clf1['recall_sc'][1], clf1['precision_sc'][0], clf1['precision_sc'][1]]], 
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df1.style.hide_index()

model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
ExtraTreesClassifier,0.666667,0.902215,0.718749,0.957983,0.727273,0.974359,0.615385


## **XGBClassifier**

In [24]:
clf2 = xgb.XGBClassifier(objective='binary:logistic', scale_pos_weight=imbalance_ratio)
param_grid2 = {'learning_rate' : [0.001, 0.1, 1.0],
        'max_depth' : [3, 7, 25, 100],
        'gamma' : [0.1, 0.5, 1.0]}

gridsearch_cv(clf2, X_train, y_train, param_grid2)

--------------------------------------------------------------------
Best hyperparameters: {'gamma': 0.1, 'learning_rate': 0.001, 'max_depth': 3}
Best score: 0.7622828014184397
Accuracy: 0.8692307692307693, Precision: 0.36363636363636365, Recall: 0.7272727272727273, f1_score: 0.4848484848484849, ROC AUC: 0.8048128342245988


In [35]:
model2 = xgb.XGBClassifier(learning_rate=1.0, max_depth=25, gamma=0.1, subsample=0.8, colsample_bytree=0.8, 
                           objective='binary:logistic', scale_pos_weight=imbalance_ratio).fit(X_train, y_train)
clf2 = print_auc(model2, X_train, y_train, X_test, y_test, 'XGBClassifier')
df2 = pd.DataFrame(data=[['XGBClassifier', clf2['f1'], clf2['auc'], clf2['auc_pr'], clf2['recall_sc'][0], clf2['recall_sc'][1], clf2['precision_sc'][0], clf2['precision_sc'][1]]], 
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df2.style.hide_index()

model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
XGBClassifier,0.75,0.918258,0.724134,0.966387,0.818182,0.982906,0.692308


## **AdaBoostClassifier**

In [29]:
clf3 = AdaBoostClassifier(random_state=seed)
param_grid3 = {'n_estimators' : [25, 50, 100]}

gridsearch_cv(clf3, X_train, y_train, param_grid3)

--------------------------------------------------------------------
Best hyperparameters: {'n_estimators': 50}
Best score: 0.7059397163120568
Accuracy: 0.9384615384615385, Precision: 0.7142857142857143, Recall: 0.45454545454545453, f1_score: 0.5555555555555556, ROC AUC: 0.7188693659281895


In [38]:
model3 = AdaBoostClassifier(random_state=seed, n_estimators=100).fit(X_train, y_train)
clf3 = print_auc(model3, X_train, y_train, X_test, y_test, 'AdaBoostClassifier')
df3 = pd.DataFrame(data=[['AdaBoostClassifier', clf3['f1'], clf3['auc'], clf3['auc_pr'], clf3['recall_sc'][0], clf3['recall_sc'][1], clf3['precision_sc'][0], clf3['precision_sc'][1]]], 
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df3.style.hide_index()

model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
AdaBoostClassifier,0.727273,0.955309,0.800975,0.97479,0.727273,0.97479,0.727273


## **DecisionTreeClassifier**

In [39]:
clf4 = DecisionTreeClassifier(random_state=seed)
param_grid4 = {'min_samples_leaf' : [2, 5, 7],
         'max_depth' : [10, 25, 50],
         'criterion' : ['gini', 'entropy', 'log_loss']}

gridsearch_cv(clf4, X_train, y_train, param_grid4)

--------------------------------------------------------------------
Best hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 7}
Best score: 0.7094060283687943
Accuracy: 0.9307692307692308, Precision: 0.6, Recall: 0.5454545454545454, f1_score: 0.5714285714285713, ROC AUC: 0.755920550038197


In [75]:
model4 = DecisionTreeClassifier(random_state=seed, min_samples_leaf=2, max_depth=25, criterion='gini').fit(X_train, y_train)
clf4 = print_auc(model4, X_train, y_train, X_test, y_test, 'DecisionTreeClassifier')
df4 = pd.DataFrame(data=[['DecisionTreeClassifier', clf4['f1'], clf4['auc'], clf4['auc_pr'], clf4['recall_sc'][0], clf4['recall_sc'][1], clf4['precision_sc'][0], clf4['precision_sc'][1]]], 
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df4.style.hide_index()

model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
DecisionTreeClassifier,0.761905,0.890374,0.726039,0.983193,0.727273,0.975,0.8


## **GaussianNB**

In [61]:
clf5 = GaussianNB()
param_grid5 = {'var_smoothing': [1, 3, 5, 8]}

gridsearch_cv(clf5, X_train, y_train, param_grid5)

--------------------------------------------------------------------
Best hyperparameters: {'var_smoothing': 1}
Best score: 0.7518351063829788
Accuracy: 0.7923076923076923, Precision: 0.25, Recall: 0.7272727272727273, f1_score: 0.37209302325581395, ROC AUC: 0.7627960275019099


In [81]:
model5 = GaussianNB(var_smoothing=8).fit(X_train, y_train)
clf5 = print_auc(model5, X_train, y_train, X_test, y_test, 'GaussianNB')
df5 = pd.DataFrame(data=[['GaussianNB', clf5['f1'], clf5['auc'], clf5['auc_pr'], clf5['recall_sc'][0], clf5['recall_sc'][1], clf5['precision_sc'][0], clf5['precision_sc'][1]]], 
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df5.style.hide_index()

model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
GaussianNB,0.533333,0.858671,0.484149,0.907563,0.727273,0.972973,0.421053


## **LGBMClassifier**

In [82]:
clf6 = lgb.LGBMClassifier(class_weight='balanced')
param_grid6 = {'num_leaves' : [20, 40, 80, 100],
         'reg_alpha' : [0.1, 0.5],
         'min_data_in_leaf' : [10, 30, 50],
         'scale_pos_weight' : [10],
         'max_depth' : [7, 10]}
         
gridsearch_cv(clf6, X_train, y_train, param_grid6)

--------------------------------------------------------------------
Best hyperparameters: {'max_depth': 7, 'min_data_in_leaf': 50, 'num_leaves': 20, 'reg_alpha': 0.1, 'scale_pos_weight': 10}
Best score: 0.7899157801418439
Accuracy: 0.9153846153846154, Precision: 0.5, Recall: 0.8181818181818182, f1_score: 0.6206896551724137, ROC AUC: 0.871275783040489


In [88]:
model6 = lgb.LGBMClassifier(num_leaves=20, reg_alpha=0.1, min_data_in_leaf=10, scale_pos_weight=10, max_depth=7).fit(X_train, y_train)
clf6 = print_auc(model6, X_train, y_train, X_test, y_test, 'LGBMClassifier')
df6 = pd.DataFrame(data=[['LGBMClassifier', clf6['f1'], clf6['auc'], clf6['auc_pr'], clf6['recall_sc'][0], clf6['recall_sc'][1], clf6['precision_sc'][0], clf6['precision_sc'][1]]], 
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df6.style.hide_index()



model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
LGBMClassifier,0.727273,0.932009,0.708025,0.97479,0.727273,0.97479,0.727273


## **KNeighborsClassifier**

In [89]:
clf7 = KNeighborsClassifier()
param_grid7 = {'n_neighbors' : [3, 5, 7],
         'weights': ['distance'],
         'metric' : ['euclidean', 'manhattan'],
         'algorithm' : ['brute']}

gridsearch_cv(clf7, X_train, y_train, param_grid7)

--------------------------------------------------------------------
Best hyperparameters: {'algorithm': 'brute', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
Best score: 0.6820035460992908
Accuracy: 0.9615384615384616, Precision: 0.8, Recall: 0.7272727272727273, f1_score: 0.761904761904762, ROC AUC: 0.8552330022918258


In [92]:
model7 = KNeighborsClassifier(n_neighbors=3, weights='distance', metric='euclidean', algorithm='brute').fit(X_train, y_train)
clf7 = print_auc(model7, X_train, y_train, X_test, y_test, 'KNeiborsClassifier')
df7 = pd.DataFrame(data=[['KNeiborsClassifier', clf7['f1'], clf7['auc'], clf7['auc_pr'], clf7['recall_sc'][0], clf7['recall_sc'][1], clf7['precision_sc'][0], clf7['precision_sc'][1]]], 
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df7.style.hide_index()

model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
KNeiborsClassifier,0.761905,0.944232,0.869955,0.983193,0.727273,0.975,0.8


## **RandomForestClassifier**

In [93]:
clf8 = RandomForestClassifier(random_state=seed)
param_grid8 = {'n_estimators' : [500, 750, 1000],
         'max_depth' : [10, 12, 20],
         'bootstrap' : [True, False]}

gridsearch_cv(clf8, X_train, y_train, param_grid8)

--------------------------------------------------------------------
Best hyperparameters: {'bootstrap': False, 'max_depth': 10, 'n_estimators': 750}
Best score: 0.6876507092198582
Accuracy: 0.9615384615384616, Precision: 0.8, Recall: 0.7272727272727273, f1_score: 0.761904761904762, ROC AUC: 0.8552330022918258


In [95]:
model8 = RandomForestClassifier(random_state=seed, n_estimators=750, max_depth=10, bootstrap=False).fit(X_train, y_train)
clf8 = print_auc(model8, X_train, y_train, X_test, y_test, 'RandomForestClassifier')
df8 = pd.DataFrame(data=[['RandomForestClassifier', clf8['f1'], clf8['auc'], clf8['auc_pr'], clf8['recall_sc'][0], clf8['recall_sc'][1], clf8['precision_sc'][0], clf8['precision_sc'][1]]], 
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df8.style.hide_index()

model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
RandomForestClassifier,0.761905,0.925134,0.768241,0.983193,0.727273,0.975,0.8


## **SVC**

In [None]:
clf9 = SVC(probability=True, class_weight='balanced')
param_grid9 = {'C' : [10, 100], 
         'gamma' : [10, 100, 500],
         'kernel' : ['rbf', 'poly'],
         'shrinking' : [False, True]}

gridsearch_cv(clf9, X_train, y_train, param_grid9)

In [None]:
model9 = SVC(C=10, kernel='poly', gamma=20, class_weight='balanced', probability=True).fit(X_train, y_train)
clf9 = print_auc(model9, X_train, y_train, X_test, y_test, 'SVM')
df9 = pd.DataFrame(data=[['SVM', clf9['f1'], clf9['auc'], clf9['auc_pr'], clf9['recall_sc'][0], clf9['recall_sc'][1], clf9['precision_sc'][0], clf9['precision_sc'][1]]], 
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df9.style.hide_index()

## **MLPClassifier**

In [None]:
clf10 = MLPClassifier(random_state=seed)
param_grid10 = {'hidden_layer_sizes' : [30, 50],
          'alpha' : [0.0001, 0.05],
          'solver' : ['adam'],
          'learning_rate' : ['invscaling', 'adaptive'],
          'activation' : ['logistic', 'tanh']}

gridsearch_cv(clf10, X_train, y_train, param_grid10)

In [None]:
model10 = MLPClassifier(random_state=seed, alpha=0.0001, solver='adam', hidden_layer_sizes=50, activation='logistic').fit(X_train, y_train)
clf10 = print_auc(model10, X_train, y_train, X_test, y_test, 'MLPClassifier')
df10 = pd.DataFrame(data=[['MLPClassifier', clf10['f1'], clf10['auc'], clf10['auc_pr'], clf10['recall_sc'][0], clf10['recall_sc'][1], clf10['precision_sc'][0], clf10['precision_sc'][1]]], 
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df10.style.hide_index()

## **LogisticRegression**

In [None]:
clf11 = LogisticRegression(class_weight='balanced')
param_grid11 = {'penalty' : ['none', 'l1', 'l2', 'elasticnet'],
          'C':[0.001, 0.009, 0.01, .09, 1, 5, 10, 25]}

gridsearch_cv(clf11, X_train, y_train, param_grid11)

In [None]:
model11 = LogisticRegression(class_weight='balanced', C=0.001, penalty='l2').fit(X_train, y_train)
clf11 = print_auc(model11, X_train, y_train, X_test, y_test, 'LogisticRegression')
df11 = pd.DataFrame(data=[['LogisticRegression', clf11['f1'], clf11['auc'], clf11['auc_pr'], clf11['recall_sc'][0], clf11['recall_sc'][1], clf11['precision_sc'][0], clf11['precision_sc'][1]]], 
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df11.style.hide_index()

In [None]:
df = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11]
df_finale = pd.concat(df).style.hide_index()
df_finale

In [None]:
plt.rcParams["figure.figsize"] = [16, 9]
plt.plot(clf6["fpr"], clf6["tpr"], label="LGBMClassifier, auc="+str(clf10["auc"]))
plt.plot(clf11["fpr"], clf11["tpr"], label="LR, auc="+str(clf1["auc"]))
plt.plot(clf9["fpr"], clf9["tpr"], label="SVM, auc="+str(clf2["auc"]))
plt.plot(clf5["fpr"], clf5["tpr"], label="GaussianNB, auc="+str(clf3["auc"]))
plt.plot(clf1["fpr"], clf1["tpr"], label="ExtraTrees, auc="+str(clf6["auc"]))
plt.plot(clf7["fpr"], clf7["tpr"], label="KNN, auc="+str(clf5["auc"]))
plt.plot(clf4["fpr"], clf4["tpr"], label="DT, auc="+str(clf8["auc"]))
plt.plot(clf10["fpr"], clf10["tpr"], label="MLP, auc="+str(clf9["auc"]))
plt.plot(clf8["fpr"], clf8["tpr"], label="RF, auc="+str(clf4["auc"]))
plt.plot(clf3["fpr"], clf3["tpr"], label="AdaBoost, auc="+str(clf7["auc"]))
plt.plot(clf2["fpr"], clf2["tpr"], label="XGB, auc="+str(clf11["auc"]))

plt.xlabel("tpr")
plt.ylabel("fpr")
plt.legend(loc=4)
plt.show()