<a href="https://colab.research.google.com/github/mounsifelatouch/cdd/blob/master/notebooks/4_cdd_ml_part_4_model_building%20-%20RandomOverSampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Bioinformatics Project - Computational Drug Discovery [Part 4] Classification Models Building**

**MOUNSIF EL ATOUCH**

In this Jupyter notebook, we will be building a machine learning model using the ChEMBL bioactivity data.

In **Part 4**, we will be building classification models

---

## **1. Installing librairies**

## **2. Importing libraries**

In [1]:
import os
import pandas as pd

from scipy.spatial.distance import *
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

seed = 123

from sklearn.model_selection import *
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import *

from scipy.stats import randint, uniform

from scipy import interp
from sklearn.metrics import RocCurveDisplay

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC  
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [4]:
def print_auc(clf, X_train, y_train, X_test, y_test, model_name) :
    
    clf = clf.fit(X_train, y_train)
    y_pred_proba = clf.predict_proba(X_test)[::,1]
    fpr, tpr, thr = roc_curve(y_test,  y_pred_proba)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    #plt.plot(fpr, tpr, label=model_name+" auc="+str(roc_auc))
    #plt.legend(loc=4)
    #plt.title("ROC curve")

    #plt.show()
    # Data to plot precision - recall curve
    precision, recall, thresholds_log = precision_recall_curve(y_test, y_pred_proba)
    # Use AUC function to calculate the area under the curve of precision recall curve
    auc_precision_recall = auc(recall, precision)
    
    #plt.plot(recall, precision, label=model_name+" auc_prc="+str(auc_precision_recall))
    #plt.xlabel("Recall")
    #plt.ylabel("Precision")
    #plt.legend(loc=4)
    #plt.title("Precision-Recall Curve")

    #plt.show()
    precision_sc=precision_score(y_test, clf.predict(X_test), average=None)
    recall_sc = recall_score(y_test, clf.predict(X_test), average=None)
    test_score = accuracy_score(y_test, clf.predict(X_test)) * 100
    train_score = accuracy_score(y_train, clf.predict(X_train)) * 100
    f1 = f1_score(y_test, clf.predict(X_test))
    #print_score(clf, X_train, y_train, X_test, y_test, train=True)
    #print_score(clf, X_train, y_train, X_test, y_test, train=False)
    metriques = {"fpr" : fpr, "recall_sc" : recall_sc, "precision_sc" : precision_sc, "tpr" : tpr,
                 "test_score" : test_score, "train_score" : train_score, "f1" : f1,
                 "auc_pr" : auc_precision_recall, "auc": roc_auc}
    return metriques

In [5]:
def metrics(X_train, X_test, y_train, y_test, model):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    print("Training set accuracy: {:.2f}".format(model.score(X_train, y_train)))
    print("Test set accuracy: {:.2f}".format(model.score(X_test, y_test)))
    print("Precision: {:.2f}".format(precision_score(y_test, y_pred)))
    print("Recall: {:.2f}".format(recall_score(y_test, y_pred)))
    print("F1 score: {:.2f}".format(f1_score(y_test, y_pred)))
    print("ROC AUC score: {:.2f}".format(roc_auc_score(y_test, y_proba)))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print("True Negatives: {:.0f}".format(tn))
    print("False Positives: {:.0f}".format(fp))
    print("False Negatives: {:.0f}".format(fn))
    print("TruePositives: {:.0f}".format(tp))
    print("-------------------------------------")
    return

## **4. Load the data set**

In [6]:
! wget https://raw.githubusercontent.com/mounsifelatouch/cdd/master/data/bioactivity_data_PubchemFingerprinter.csv

--2023-06-06 13:09:29--  https://raw.githubusercontent.com/mounsifelatouch/cdd/master/data/bioactivity_data_PubchemFingerprinter.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1090920 (1.0M) [text/plain]
Saving to: ‘bioactivity_data_PubchemFingerprinter.csv’


2023-06-06 13:09:29 (110 MB/s) - ‘bioactivity_data_PubchemFingerprinter.csv’ saved [1090920/1090920]



In [7]:
df = pd.read_csv('bioactivity_data_PubchemFingerprinter.csv')

## **5. Input features**

### **5.1. Input features**

In [8]:
X = df.drop('activity', axis=1)

### **5.2. Output features**

In [9]:
y = df['activity']

In [10]:
# count the number of instances in each class
counts = y.value_counts()

# calculate the ratio of the negative class to the positive class
imbalance_ratio = counts[0] / counts[1]

### **5.3. Let's examine the data dimension**

In [11]:
X.shape, y.shape

((612, 881), (612,))

### **5.4. Remove low variance features**

In [12]:
# Create a VarianceThreshold object
selector = VarianceThreshold(threshold=.01)

# Fit the selector to the data and transform the data
selector.fit_transform(X)

# Get the indices of the selected features
selected_indices = selector.get_support(indices=True)

# Get the names of the selected features
selected_features = X.columns[selected_indices]

X_reduced = X[selected_features]

In [13]:
X.shape, X_reduced.shape

((612, 881), (612, 428))

## **6. Data split (80/20 ratio)**

In [14]:
n = np.arange(len(X_reduced))
idx_train, idx_test = train_test_split(n, stratify=y, test_size=.2, random_state=seed)

In [15]:
X1_train, y1_train = X_reduced.loc[idx_train], y.loc[idx_train]
X1_test, y1_test = X_reduced.loc[idx_test], y.loc[idx_test]

In [16]:
X1_train.shape, y1_train.shape

((489, 428), (489,))

In [17]:
X1_test.shape, y1_test.shape

((123, 428), (123,))



---



### **Random Oversampling**

In [18]:
from imblearn.over_sampling import RandomOverSampler

# create an oversampler object with a 1:1 ratio of positive to negative samples
ros = RandomOverSampler(sampling_strategy='minority', random_state=seed)

# fit and apply the oversampler to the training data
X1_train_ros, y1_train_ros = ros.fit_resample(X1_train, y1_train)

In [19]:
X = X1_train_ros.reset_index(drop=True)
y = y1_train_ros.reset_index(drop=True)
X1_test = X1_test.reset_index(drop=True)
y1_test = y1_test.reset_index(drop=True)

In [20]:
n_samples, n_classes = X.shape[0], 2
class_weights = dict(zip(np.unique(y), n_samples / (n_classes * np.bincount(y))))

## **7. Modeling**

## **Logistic Regression**

In [21]:
model1 = LogisticRegression(random_state=seed).fit(X1_train, y1_train)

In [22]:
clf = print_auc(model1, X1_train, y1_train, X1_test, y1_test, 'Logistic Regression')
results_df = pd.DataFrame(data=[['Logistic Regression', clf['f1'], clf['auc'], clf['auc_pr'], clf['recall_sc'][0], clf['recall_sc'][1], clf['precision_sc'][0], clf['precision_sc'][1]]], 
                          columns=['Model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])

results_df.style.hide_index()

Model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
Logistic Regression,0.888889,0.953297,0.932885,0.967033,0.875,0.956522,0.903226


## **SVC**

In [23]:
model2 = SVC(random_state=seed, probability=True).fit(X1_train, y1_train)

In [24]:
clf = print_auc(model2, X1_train, y1_train, X1_test, y1_test, 'SVM')
results_df2 = pd.DataFrame(data=[['SVM', clf['f1'], clf['auc'], clf['auc_pr'], clf['recall_sc'][0], clf['recall_sc'][1], clf['precision_sc'][0], clf['precision_sc'][1]]],  
                          columns=['Model', 'f1_score', 'auc','auc_pr','recall_classe(0)','recall_classe(1)','precision_classe(0)','precision_classe(1)'])
results_df = results_df.append(results_df2)

results_df.style.hide_index()

Model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
Logistic Regression,0.888889,0.953297,0.932885,0.967033,0.875,0.956522,0.903226
SVM,0.847458,0.966346,0.927824,0.978022,0.78125,0.927083,0.925926


## **Gradient Boosting Classifier**

In [25]:
model3 = GradientBoostingClassifier(random_state=seed).fit(X1_train, y1_train)

In [26]:
clf = print_auc(model3, X1_train, y1_train, X1_test, y1_test, 'Gradient Boosting')
results_df3 = pd.DataFrame(data=[['Gradient Boosting', clf['f1'], clf['auc'], clf['auc_pr'], clf['recall_sc'][0], clf['recall_sc'][1], clf['precision_sc'][0], clf['precision_sc'][1]]], 
                          columns=['Model', 'f1_score', 'auc','auc_pr','recall_classe(0)','recall_classe(1)','precision_classe(0)','precision_classe(1)'])
results_df = results_df.append(results_df3)

results_df.style.hide_index()

Model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
Logistic Regression,0.888889,0.953297,0.932885,0.967033,0.875,0.956522,0.903226
SVM,0.847458,0.966346,0.927824,0.978022,0.78125,0.927083,0.925926
Gradient Boosting,0.875,0.959478,0.938167,0.956044,0.875,0.956044,0.875


## **Random Forest**

In [27]:
model4 = RandomForestClassifier(random_state=seed).fit(X1_train, y1_train)

In [28]:
clf = print_auc(model4, X1_train, y1_train, X1_test, y1_test, 'Random Forest')
results_df3 = pd.DataFrame(data=[['Random Forest', clf['f1'], clf['auc'], clf['auc_pr'], clf['recall_sc'][0], clf['recall_sc'][1], clf['precision_sc'][0], clf['precision_sc'][1]]], 
                          columns=['Model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
results_df = results_df.append(results_df3, ignore_index=True)

results_df.style.hide_index()

Model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
Logistic Regression,0.888889,0.953297,0.932885,0.967033,0.875,0.956522,0.903226
SVM,0.847458,0.966346,0.927824,0.978022,0.78125,0.927083,0.925926
Gradient Boosting,0.875,0.959478,0.938167,0.956044,0.875,0.956044,0.875
Random Forest,0.852459,0.965659,0.927233,0.967033,0.8125,0.93617,0.896552


## **KNN**

In [29]:
model5 = KNeighborsClassifier().fit(X1_train, y1_train)

In [30]:
clf = print_auc(model5, X1_train, y1_train, X1_test, y1_test, 'KNeibors Classifier')
results_df4 = pd.DataFrame(data=[['KNeibors Classifier', clf['f1'], clf['auc'], clf['auc_pr'], clf['recall_sc'][0], clf['recall_sc'][1], clf['precision_sc'][0], clf['precision_sc'][1]]], 
                          columns=['Model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
results_df = results_df.append(results_df4, ignore_index=True)

results_df.style.hide_index()

Model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
Logistic Regression,0.888889,0.953297,0.932885,0.967033,0.875,0.956522,0.903226
SVM,0.847458,0.966346,0.927824,0.978022,0.78125,0.927083,0.925926
Gradient Boosting,0.875,0.959478,0.938167,0.956044,0.875,0.956044,0.875
Random Forest,0.852459,0.965659,0.927233,0.967033,0.8125,0.93617,0.896552
KNeibors Classifier,0.833333,0.955357,0.917124,0.967033,0.78125,0.926316,0.892857


## **ExtraTrees**

In [31]:
model6 = ExtraTreesClassifier(random_state=seed).fit(X1_train, y1_train)

In [32]:
clf = print_auc(model6, X1_train, y1_train, X1_test, y1_test, 'ExtraTrees Classifier')
results_df5 = pd.DataFrame(data=[['ExtraTrees Classifier', clf['f1'], clf['auc'], clf['auc_pr'], clf['recall_sc'][0], clf['recall_sc'][1], clf['precision_sc'][0], clf['precision_sc'][1]]], 
                          columns=['Model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
results_df = results_df.append(results_df5, ignore_index=True)

results_df.style.hide_index()

Model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
Logistic Regression,0.888889,0.953297,0.932885,0.967033,0.875,0.956522,0.903226
SVM,0.847458,0.966346,0.927824,0.978022,0.78125,0.927083,0.925926
Gradient Boosting,0.875,0.959478,0.938167,0.956044,0.875,0.956044,0.875
Random Forest,0.852459,0.965659,0.927233,0.967033,0.8125,0.93617,0.896552
KNeibors Classifier,0.833333,0.955357,0.917124,0.967033,0.78125,0.926316,0.892857
ExtraTrees Classifier,0.852459,0.963942,0.901118,0.967033,0.8125,0.93617,0.896552


## **AdaBoost**

In [33]:
model7 = AdaBoostClassifier(random_state=seed).fit(X1_train, y1_train)

In [34]:
clf = print_auc(model7, X1_train, y1_train, X1_test, y1_test, 'AdaBoost Classifier')
results_df6 = pd.DataFrame(data=[['AdaBoost Classifier', clf['f1'], clf['auc'], clf['auc_pr'], clf['recall_sc'][0], clf['recall_sc'][1], clf['precision_sc'][0], clf['precision_sc'][1]]], 
                          columns=['Model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
results_df = results_df.append(results_df6, ignore_index=True)

results_df.style.hide_index()

Model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
Logistic Regression,0.888889,0.953297,0.932885,0.967033,0.875,0.956522,0.903226
SVM,0.847458,0.966346,0.927824,0.978022,0.78125,0.927083,0.925926
Gradient Boosting,0.875,0.959478,0.938167,0.956044,0.875,0.956044,0.875
Random Forest,0.852459,0.965659,0.927233,0.967033,0.8125,0.93617,0.896552
KNeibors Classifier,0.833333,0.955357,0.917124,0.967033,0.78125,0.926316,0.892857
ExtraTrees Classifier,0.852459,0.963942,0.901118,0.967033,0.8125,0.93617,0.896552
AdaBoost Classifier,0.8,0.932521,0.875056,0.923077,0.8125,0.933333,0.787879


## **Decision Tree**

In [35]:
model8 = DecisionTreeClassifier(random_state=seed).fit(X1_train, y1_train)

In [36]:
clf = print_auc(model8, X1_train, y1_train, X1_test, y1_test, 'DecisionTree Classifier')
results_df7 = pd.DataFrame(data=[['DecisionTree Classifier', clf['f1'], clf['auc'], clf['auc_pr'], clf['recall_sc'][0], clf['recall_sc'][1], clf['precision_sc'][0], clf['precision_sc'][1]]], 
                          columns=['Model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
results_df = results_df.append(results_df7, ignore_index=True)

results_df.style.hide_index()

Model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
Logistic Regression,0.888889,0.953297,0.932885,0.967033,0.875,0.956522,0.903226
SVM,0.847458,0.966346,0.927824,0.978022,0.78125,0.927083,0.925926
Gradient Boosting,0.875,0.959478,0.938167,0.956044,0.875,0.956044,0.875
Random Forest,0.852459,0.965659,0.927233,0.967033,0.8125,0.93617,0.896552
KNeibors Classifier,0.833333,0.955357,0.917124,0.967033,0.78125,0.926316,0.892857
ExtraTrees Classifier,0.852459,0.963942,0.901118,0.967033,0.8125,0.93617,0.896552
AdaBoost Classifier,0.8,0.932521,0.875056,0.923077,0.8125,0.933333,0.787879
DecisionTree Classifier,0.818182,0.883413,0.839259,0.923077,0.84375,0.94382,0.794118


## **LGBM**

In [37]:
model10 = lgb.LGBMClassifier(random_state=seed).fit(X1_train, y1_train)

In [38]:
clf = print_auc(model10, X1_train, y1_train, X1_test, y1_test, 'LGBN')
results_df9 = pd.DataFrame(data=[['LGBM', clf['f1'], clf['auc'], clf['auc_pr'], clf['recall_sc'][0], clf['recall_sc'][1], clf['precision_sc'][0], clf['precision_sc'][1]]], 
                          columns=['Model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
results_df = results_df.append(results_df9, ignore_index=True)

results_df.style.hide_index()

Model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
Logistic Regression,0.888889,0.953297,0.932885,0.967033,0.875,0.956522,0.903226
SVM,0.847458,0.966346,0.927824,0.978022,0.78125,0.927083,0.925926
Gradient Boosting,0.875,0.959478,0.938167,0.956044,0.875,0.956044,0.875
Random Forest,0.852459,0.965659,0.927233,0.967033,0.8125,0.93617,0.896552
KNeibors Classifier,0.833333,0.955357,0.917124,0.967033,0.78125,0.926316,0.892857
ExtraTrees Classifier,0.852459,0.963942,0.901118,0.967033,0.8125,0.93617,0.896552
AdaBoost Classifier,0.8,0.932521,0.875056,0.923077,0.8125,0.933333,0.787879
DecisionTree Classifier,0.818182,0.883413,0.839259,0.923077,0.84375,0.94382,0.794118
LGBM,0.830769,0.958791,0.933096,0.934066,0.84375,0.944444,0.818182


## **XGB**

In [39]:
model11 = xgb.XGBClassifier(objective='binary:logistic', random_state=seed).fit(X1_train, y1_train)

In [40]:
clf = print_auc(model11, X1_train, y1_train, X1_test, y1_test, 'XGB Classifer')
results_df10 = pd.DataFrame(data=[['XGB Classifer', clf['f1'], clf['auc'], clf['auc_pr'], clf['recall_sc'][0], clf['recall_sc'][1], clf['precision_sc'][0], clf['precision_sc'][1]]], 
                          columns=['Model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
results_df = results_df.append(results_df10, ignore_index=True)

results_df.style.hide_index()

Model,f1_score,auc,auc_pr,recall_classe(0),recall_classe(1),precision_classe(0),precision_classe(1)
Logistic Regression,0.888889,0.953297,0.932885,0.967033,0.875,0.956522,0.903226
SVM,0.847458,0.966346,0.927824,0.978022,0.78125,0.927083,0.925926
Gradient Boosting,0.875,0.959478,0.938167,0.956044,0.875,0.956044,0.875
Random Forest,0.852459,0.965659,0.927233,0.967033,0.8125,0.93617,0.896552
KNeibors Classifier,0.833333,0.955357,0.917124,0.967033,0.78125,0.926316,0.892857
ExtraTrees Classifier,0.852459,0.963942,0.901118,0.967033,0.8125,0.93617,0.896552
AdaBoost Classifier,0.8,0.932521,0.875056,0.923077,0.8125,0.933333,0.787879
DecisionTree Classifier,0.818182,0.883413,0.839259,0.923077,0.84375,0.94382,0.794118
LGBM,0.830769,0.958791,0.933096,0.934066,0.84375,0.944444,0.818182
XGB Classifer,0.857143,0.959135,0.926096,0.956044,0.84375,0.945652,0.870968


In [41]:
results_df.to_csv('results_dfROS.csv', index=False)