<a href="https://colab.research.google.com/github/mounsifelatouch/cdd/blob/master/notebooks/4_cdd_ml_part_4_models_building.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Bioinformatics Project - Computational Drug Discovery [Part 4] Classification Models Building**

**MOUNSIF EL ATOUCH**

In this Jupyter notebook, we will be building a machine learning model using the ChEMBL bioactivity data.

In **Part 4**, we will build and compare several classification models.

---

## **Importing libraries**

In [None]:
import os
import pandas as pd

from scipy.spatial.distance import *
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

seed = 42

from sklearn.model_selection import *
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import *

from scipy import interp
from math import *
from sklearn.metrics import RocCurveDisplay

import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC  
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for a fitted classifier.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    train : bool, default True
        When truthy, the training split is reported; otherwise the test split is.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Select the split once so the reporting code is not duplicated per branch.
    if train:
        title, X_eval, y_eval = "Train Result", X_train, y_train
    else:
        title, X_eval, y_eval = "Test Result", X_test, y_test

    pred = clf.predict(X_eval)
    clf_report = pd.DataFrame(classification_report(y_eval, pred, output_dict=True))
    print(f"{title}:\n================================================")
    print(f"Accuracy Score: {accuracy_score(y_eval, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(y_eval, pred)}\n")

In [None]:
def print_auc(clf, X_train, y_train, X_test, y_test, model_name):
    """Fit ``clf`` on the training split and collect evaluation metrics.

    Despite its name, this function prints nothing: it returns the metrics.

    Parameters
    ----------
    clf : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        It is (re)fitted on ``X_train`` / ``y_train`` inside this function.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    model_name : str
        Accepted for call-site compatibility; not used by the computation.

    Returns
    -------
    dict
        ``fpr`` / ``tpr`` : ROC curve points on the test split.
        ``recall_sc`` / ``precision_sc`` : per-class recall and precision (test split).
        ``test_score`` / ``train_score`` : accuracy in percent.
        ``f1`` : F1 score on the test split.
        ``auc_pr`` : area under the precision-recall curve.
        ``auc`` : ROC AUC on the test split.
    """
    clf = clf.fit(X_train, y_train)

    # Second column of predict_proba, i.e. the class at index 1 of clf.classes_.
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Area under the precision-recall curve.
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    auc_precision_recall = auc(recall, precision)

    # Hard predictions are computed once per split and shared by all metrics below.
    y_test_pred = clf.predict(X_test)
    y_train_pred = clf.predict(X_train)

    precision_sc = precision_score(y_test, y_test_pred, average=None)
    recall_sc = recall_score(y_test, y_test_pred, average=None)
    test_score = accuracy_score(y_test, y_test_pred) * 100
    train_score = accuracy_score(y_train, y_train_pred) * 100
    f1 = f1_score(y_test, y_test_pred)

    metriques = {"fpr": fpr, "recall_sc": recall_sc, "precision_sc": precision_sc, "tpr": tpr,
                 "test_score": test_score, "train_score": train_score, "f1": f1,
                 "auc_pr": auc_precision_recall, "auc": roc_auc}
    return metriques

## **Load the data set**

In [None]:
# Fetch the dataset CSV from the project's GitHub repository into the current working directory.
! wget https://raw.githubusercontent.com/mounsifelatouch/cdd/master/data/bioactivity_data_GraphOnlyFingerprinter.csv

In [None]:
# Load the downloaded CSV from the working directory and display the resulting frame.
df = pd.read_csv(filepath_or_buffer='bioactivity_data_GraphOnlyFingerprinter.csv')
df

## **Input and output features**

### **Input features**

In [None]:
# Feature matrix: every column except the 'Activity' label.
X = df.drop(columns='Activity')

### **Output features**

In [None]:
# Target vector: the 'Activity' column.
y = df.loc[:, 'Activity']

In [None]:
# count the number of instances in each class
counts = y.value_counts()

# calculate the ratio of the negative class to the positive class
imbalance_ratio = counts[0] / counts[1]

### **Let's examine the data dimension**

In [None]:
# Display the dimensions of the feature matrix and of the target vector.
(X.shape, y.shape)

## **Data split (80/20 ratio)**

In [None]:
# Stratified 80/20 split over row positions; the fixed random_state makes the
# split (and therefore every downstream score) reproducible across runs.
idx_train, idx_test = train_test_split(np.arange(len(X)), stratify=y, test_size=.2, random_state=seed)

In [None]:
# idx_train / idx_test are row positions (built from np.arange), so select
# positionally with iloc instead of relying on the index labels matching them.
X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
X_test, y_test = X.iloc[idx_test], y.iloc[idx_test]

In [None]:
# Display the dimensions of the training split.
(X_train.shape, y_train.shape)

## **Data balancing**

### **Random Oversampling**

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Oversampler that resamples only the minority class; seeded like the other
# samplers in this notebook so the resampled set is reproducible.
ros = RandomOverSampler(sampling_strategy='minority', random_state=seed)

# Fit and apply the oversampler to the training data only.
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

### **SMOTE**

In [None]:
from imblearn.over_sampling import SMOTE

# Seeded SMOTE oversampler for the minority class of the training set.
sm = SMOTE(random_state=seed)

# Resample the training split only.
X_train_sm, y_train_sm = sm.fit_resample(X=X_train, y=y_train)

### **ADASYN**

In [None]:
from imblearn.over_sampling import ADASYN

# Seeded ADASYN oversampler for the minority class of the training set.
adasyn = ADASYN(random_state=seed)

# Resample the training split only.
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X=X_train, y=y_train)



---



In [None]:
# Replace the index of every split with a fresh 0..n-1 range, discarding the old labels.
X_train, y_train, X_test, y_test = (
    split.reset_index(drop=True) for split in (X_train, y_train, X_test, y_test)
)

In [None]:
# Class weights computed as n_samples / (n_classes * count_per_class).
# Sizes and counts are derived from y_train itself rather than hard-coded, and
# labels are paired with their own counts via np.unique(..., return_counts=True).
_class_labels, _class_counts = np.unique(y_train, return_counts=True)
n_samples, n_classes = len(y_train), len(_class_labels)
class_weights = dict(zip(_class_labels, n_samples / (n_classes * _class_counts)))

# **Stratified 5-Fold CV**

In [None]:
def gridsearch_cv(clf, X_train, y_train, param_grid):
    """Grid-search ``clf`` with 5-fold CV (F1 scoring) and report test-set metrics.

    Parameters
    ----------
    clf : estimator exposing ``predict`` and ``predict_proba``.
    X_train, y_train : data used for the cross-validated search.
    param_grid : dict
        Hyperparameter grid forwarded to ``GridSearchCV``.

    Notes
    -----
    The held-out evaluation reads ``X_test`` and ``y_test`` from the notebook's
    global namespace. Results are printed; nothing is returned.
    """
    # Search the grid supplied by the caller.
    clf_random = GridSearchCV(clf, param_grid, cv=5, verbose=2, scoring='f1', n_jobs=-1)
    clf_random.fit(X_train, y_train)

    # Print the best hyperparameters and the corresponding mean cross-validation score
    print('--------------------------------------------------------------------')
    print("Best hyperparameters:", clf_random.best_params_)
    print("Best score:", clf_random.best_score_)

    # Scores for the class at index 1 feed the ROC AUC; hard labels feed the rest.
    y_test_proba = clf_random.predict_proba(X_test)[:, 1]
    y_test_pred = clf_random.predict(X_test)

    acc = accuracy_score(y_test, y_test_pred)
    prec = precision_score(y_test, y_test_pred)
    rec = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)
    # ROC AUC is a ranking metric: compute it from probabilities, not thresholded labels.
    roc_auc = roc_auc_score(y_test, y_test_proba)
    print(f'Accuracy: {acc}, Precision: {prec}, Recall: {rec}, f1_score: {f1}, ROC AUC: {roc_auc}')

## **ExtraTreesClassifier**

In [None]:
# Candidate max_features values are derived from the number of feature columns.
n_features = X.shape[1]

clf1 = ExtraTreesClassifier(class_weight=class_weights, random_state=seed)
param_grid1 = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [10, 25, 40, 50],
    'max_features': [int(sqrt(n_features)), int(log2(n_features))],
}

gridsearch_cv(clf1, X_train, y_train, param_grid1)

In [None]:
# Fit the chosen ExtraTrees configuration and summarise its test metrics in a one-row frame.
model1 = ExtraTreesClassifier(
    random_state=seed,
    max_depth=10,
    n_estimators=1000,
    class_weight=class_weights,
    max_features=10,
).fit(X_train, y_train)
clf1 = print_auc(model1, X_train, y_train, X_test, y_test, 'ExtraTreesClassifier')
df1 = pd.DataFrame([{
    'model': 'ExtraTreesClassifier',
    'f1_score': clf1['f1'],
    'auc': clf1['auc'],
    'auc_pr': clf1['auc_pr'],
    'recall_classe(0)': clf1['recall_sc'][0],
    'recall_classe(1)': clf1['recall_sc'][1],
    'precision_classe(0)': clf1['precision_sc'][0],
    'precision_classe(1)': clf1['precision_sc'][1],
}])
df1

## **XGBClassifier**

In [None]:
clf2 = xgb.XGBClassifier(objective='binary:logistic', scale_pos_weight=imbalance_ratio)
grid2 = {'learning_rate' : [0.001, 0.1, 1.0],
        'max_depth' : [3, 7, 25, 100],
        'gamma' : [0.1, 0.5, 1.0]}

# Tune with the notebook's grid-search helper on the training split.
gridsearch_cv(clf2, X_train, y_train, grid2)

In [None]:
# Uses the X_train/y_train and X_test/y_test split defined in this notebook.
model2 = xgb.XGBClassifier(learning_rate=1.0, max_depth=25, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                           objective='binary:logistic', scale_pos_weight=imbalance_ratio).fit(X_train, y_train)
clf2 = print_auc(model2, X_train, y_train, X_test, y_test, 'XGBClassifier')
df2 = pd.DataFrame(data=[['XGBClassifier', clf2['f1'], clf2['auc'], clf2['auc_pr'], clf2['recall_sc'][0], clf2['recall_sc'][1], clf2['precision_sc'][0], clf2['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df2

## **AdaBoostClassifier**

In [None]:
clf3 = AdaBoostClassifier(random_state=seed)
grid3 = {'n_estimators' : [25, 50, 100]}

# Tune with the notebook's grid-search helper on the training split.
gridsearch_cv(clf3, X_train, y_train, grid3)

In [None]:
# Uses the X_train/y_train and X_test/y_test split defined in this notebook.
model3 = AdaBoostClassifier(random_state=seed, n_estimators=100).fit(X_train, y_train)
clf3 = print_auc(model3, X_train, y_train, X_test, y_test, 'AdaBoostClassifier')
df3 = pd.DataFrame(data=[['AdaBoostClassifier', clf3['f1'], clf3['auc'], clf3['auc_pr'], clf3['recall_sc'][0], clf3['recall_sc'][1], clf3['precision_sc'][0], clf3['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df3

## **DecisionTreeClassifier**

In [None]:
clf4 = DecisionTreeClassifier(random_state=seed)
grid4 = {'min_samples_leaf' : [2, 5, 7],
         'max_depth' : [10, 25, 50],
         'criterion' : ['gini', 'entropy', 'log_loss']}

# Tune with the notebook's grid-search helper on the training split.
gridsearch_cv(clf4, X_train, y_train, grid4)

In [None]:
# Uses the X_train/y_train and X_test/y_test split defined in this notebook.
model4 = DecisionTreeClassifier(random_state=seed, min_samples_leaf=5, max_depth=25, criterion='gini').fit(X_train, y_train)
clf4 = print_auc(model4, X_train, y_train, X_test, y_test, 'DecisionTreeClassifier')
df4 = pd.DataFrame(data=[['DecisionTreeClassifier', clf4['f1'], clf4['auc'], clf4['auc_pr'], clf4['recall_sc'][0], clf4['recall_sc'][1], clf4['precision_sc'][0], clf4['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df4

## **GaussianNB**

In [None]:
clf5 = GaussianNB()

grid5 = {'var_smoothing': [1, 3, 5]}

# Tune with the notebook's grid-search helper on the training split.
gridsearch_cv(clf5, X_train, y_train, grid5)

In [None]:
# Uses the X_train/y_train and X_test/y_test split defined in this notebook.
model5 = GaussianNB(var_smoothing=5).fit(X_train, y_train)
clf5 = print_auc(model5, X_train, y_train, X_test, y_test, 'GaussianNB')
df5 = pd.DataFrame(data=[['GaussianNB', clf5['f1'], clf5['auc'], clf5['auc_pr'], clf5['recall_sc'][0], clf5['recall_sc'][1], clf5['precision_sc'][0], clf5['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df5

## **LGBMClassifier**

In [None]:
clf6 = lgb.LGBMClassifier(class_weight='balanced')
grid6 = {'num_leaves' : [20, 40, 80, 100],
         'reg_alpha' : [0.1, 0.5],
         'min_data_in_leaf' : [10, 30, 50],
         'scale_pos_weight' : [10],
         'max_depth' : [7, 10]}

# Tune with the notebook's grid-search helper on the training split.
gridsearch_cv(clf6, X_train, y_train, grid6)

In [None]:
# Uses the X_train/y_train and X_test/y_test split defined in this notebook.
# NOTE(review): unlike the tuned estimator clf6, this model is built without
# class_weight='balanced' — confirm that is intended.
model6 = lgb.LGBMClassifier(num_leaves=20, reg_alpha=0.1, min_data_in_leaf=10, scale_pos_weight=10, max_depth=7).fit(X_train, y_train)
clf6 = print_auc(model6, X_train, y_train, X_test, y_test, 'LGBMClassifier')
df6 = pd.DataFrame(data=[['LGBMClassifier', clf6['f1'], clf6['auc'], clf6['auc_pr'], clf6['recall_sc'][0], clf6['recall_sc'][1], clf6['precision_sc'][0], clf6['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df6

## **KNeighborsClassifier**

In [None]:
clf7 = KNeighborsClassifier()
grid7 = {'n_neighbors' : [3, 5, 7],
         'weights': ['distance'],
         'metric' : ['euclidean', 'manhattan'],
         'algorithm' : ['brute']}

# Tune with the notebook's grid-search helper on the training split.
gridsearch_cv(clf7, X_train, y_train, grid7)

In [None]:
# Uses the X_train/y_train and X_test/y_test split defined in this notebook.
model7 = KNeighborsClassifier(n_neighbors=7, weights='distance', metric='euclidean', algorithm='brute').fit(X_train, y_train)
clf7 = print_auc(model7, X_train, y_train, X_test, y_test, 'KNeighborsClassifier')
df7 = pd.DataFrame(data=[['KNeighborsClassifier', clf7['f1'], clf7['auc'], clf7['auc_pr'], clf7['recall_sc'][0], clf7['recall_sc'][1], clf7['precision_sc'][0], clf7['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df7

## **RandomForestClassifier**

In [None]:
clf8 = RandomForestClassifier(random_state=seed)
grid8 = {'n_estimators' : [500, 750, 1000],
         'max_depth' : [10, 12, 20],
         'bootstrap' : [True, False]}

# Tune with the notebook's grid-search helper on the training split.
gridsearch_cv(clf8, X_train, y_train, grid8)

In [None]:
# Uses the X_train/y_train and X_test/y_test split defined in this notebook.
model8 = RandomForestClassifier(random_state=seed, n_estimators=500, max_depth=12, bootstrap=True).fit(X_train, y_train)
clf8 = print_auc(model8, X_train, y_train, X_test, y_test, 'RandomForestClassifier')
df8 = pd.DataFrame(data=[['RandomForestClassifier', clf8['f1'], clf8['auc'], clf8['auc_pr'], clf8['recall_sc'][0], clf8['recall_sc'][1], clf8['precision_sc'][0], clf8['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df8

## **SVC**

In [None]:
clf9 = SVC(probability=True, class_weight='balanced')
grid9 = {'C' : [10, 100, 1000],
         'gamma' : [10, 100, 500],
         'kernel' : ['rbf', 'poly'],
         'shrinking' : [False, True]}

# Tune with the notebook's grid-search helper on the training split.
gridsearch_cv(clf9, X_train, y_train, grid9)

In [None]:
# Uses the X_train/y_train and X_test/y_test split defined in this notebook.
# NOTE(review): gamma=20 is not one of the values searched in grid9 — confirm.
model9 = SVC(C=10, kernel='poly', gamma=20, class_weight='balanced', probability=True).fit(X_train, y_train)
clf9 = print_auc(model9, X_train, y_train, X_test, y_test, 'SVM')
df9 = pd.DataFrame(data=[['SVM', clf9['f1'], clf9['auc'], clf9['auc_pr'], clf9['recall_sc'][0], clf9['recall_sc'][1], clf9['precision_sc'][0], clf9['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df9

## **MLPClassifier**

In [None]:
clf10 = MLPClassifier(random_state=seed)
grid10 = {'hidden_layer_sizes' : [30, 50],
          'alpha' : [0.0001, 0.05],
          'solver' : ['adam'],
          'learning_rate' : ['invscaling', 'adaptive'],
          'activation' : ['logistic', 'tanh']}

# Tune with the notebook's grid-search helper on the training split.
gridsearch_cv(clf10, X_train, y_train, grid10)

In [None]:
# Uses the X_train/y_train and X_test/y_test split defined in this notebook.
model10 = MLPClassifier(random_state=seed, alpha=0.0001, solver='adam', hidden_layer_sizes=50, activation='logistic').fit(X_train, y_train)
clf10 = print_auc(model10, X_train, y_train, X_test, y_test, 'MLPClassifier')
df10 = pd.DataFrame(data=[['MLPClassifier', clf10['f1'], clf10['auc'], clf10['auc_pr'], clf10['recall_sc'][0], clf10['recall_sc'][1], clf10['precision_sc'][0], clf10['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df10

## **LogisticRegression**

In [None]:
clf11 = LogisticRegression(class_weight='balanced')
# NOTE(review): with the default solver, the 'l1' and 'elasticnet' candidates are
# presumably rejected by scikit-learn, and the string 'none' is version-dependent —
# confirm against the installed release.
grid11 = {'penalty' : ['none', 'l1', 'l2', 'elasticnet'],
          'C':[0.001, 0.009, 0.01, .09, 1, 5, 10, 25]}

# Tune with the notebook's grid-search helper on the training split.
gridsearch_cv(clf11, X_train, y_train, grid11)

In [None]:
# Uses the X_train/y_train and X_test/y_test split defined in this notebook.
model11 = LogisticRegression(class_weight='balanced', C=0.001, penalty='l2').fit(X_train, y_train)
clf11 = print_auc(model11, X_train, y_train, X_test, y_test, 'LogisticRegression')
df11 = pd.DataFrame(data=[['LogisticRegression', clf11['f1'], clf11['auc'], clf11['auc_pr'], clf11['recall_sc'][0], clf11['recall_sc'][1], clf11['precision_sc'][0], clf11['precision_sc'][1]]],
                          columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
df11

In [None]:
# Collect the one-row summaries under their own name so the dataset frame `df`
# is not overwritten by a list.
results_frames = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11]
results_styler = pd.concat(results_frames, ignore_index=True).style

# Styler.hide_index() is absent from recent pandas releases; prefer
# Styler.hide(axis="index") and fall back only where hide() does not exist.
if hasattr(results_styler, "hide"):
    df_finale = results_styler.hide(axis="index")
else:
    df_finale = results_styler.hide_index()
df_finale

In [None]:
plt.rcParams["figure.figsize"] = [16, 9]

# (legend label, metrics dict returned by print_auc), in plotting order.
# Each curve and its AUC come from the same dict so the legend cannot drift
# out of sync with the line it describes.
roc_curves = [
    ("LGBMClassifier", clf6),
    ("LR", clf11),
    ("SVM", clf9),
    ("GaussianNB", clf5),
    ("ExtraTrees", clf1),
    ("KNN", clf7),
    ("DT", clf4),
    ("MLP", clf10),
    ("RF", clf8),
    ("AdaBoost", clf3),
    ("XGB", clf2),
]
for curve_label, curve_metrics in roc_curves:
    plt.plot(curve_metrics["fpr"], curve_metrics["tpr"],
             label=curve_label + ", auc=" + str(curve_metrics["auc"]))

# x carries the false-positive rate and y the true-positive rate.
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.legend(loc=4)
plt.show()