<a href="https://colab.research.google.com/github/mounsifelatouch/cdd/blob/master/notebooks/4_cdd_ml_part_4_models_building_SMOTE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Bioinformatics Project - Computational Drug Discovery [Part 4] Classification Models Building**

**MOUNSIF EL ATOUCH**

In this Jupyter notebook, we will be building a machine learning model using the ChEMBL bioactivity data.

In **Part 4**, we will build and compare several classification models, tuned with `RandomizedSearchCV`, on a training set balanced with SMOTE.

---

## **Importing libraries**

In [None]:
import os
import pandas as pd

from scipy.spatial.distance import *
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

seed = 42

from sklearn.model_selection import *
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import *

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC  
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

## **Functions**

In [None]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for one split.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    train : bool, default True
        Truthy -> report on the training split, otherwise on the test split.
        (The previous ``elif train==False`` silently printed nothing for falsy
        non-bool values such as ``None``.)

    Returns
    -------
    None. Everything is written to stdout.
    """
    # Select the split once instead of duplicating the whole report per branch.
    if train:
        split_name, X_eval, y_eval = "Train", X_train, y_train
    else:
        split_name, X_eval, y_eval = "Test", X_test, y_test

    pred = clf.predict(X_eval)
    clf_report = pd.DataFrame(classification_report(y_eval, pred, output_dict=True))
    print(f"{split_name} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(y_eval, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(y_eval, pred)}\n")

In [None]:
def print_auc(clf, X_train, y_train, X_test, y_test, model_name):
    """Fit ``clf``, plot its ROC and precision-recall curves and collect metrics.

    The estimator is (re)fitted on the training data inside this function, so
    callers may pass an unfitted estimator.

    Parameters
    ----------
    clf : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    model_name : str
        Label used in the plot legends.

    Returns
    -------
    dict with keys ``fpr``, ``tpr`` (ROC curve points), ``recall_sc``,
    ``precision_sc`` (per-class arrays), ``test_score``, ``train_score``
    (accuracy in percent), ``f1``, ``auc_pr`` and ``auc``.

    Notes
    -----
    Column 1 of ``predict_proba`` is used as the positive-class score, so this
    presumably expects a binary problem whose positive label sorts last.
    """
    clf = clf.fit(X_train, y_train)

    # Score of the second class column, used for both threshold curves.
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    # Hard predictions are computed once and reused by every metric below.
    y_pred_test = clf.predict(X_test)
    y_pred_train = clf.predict(X_train)

    # ROC curve
    fpr, tpr, thr = roc_curve(y_test, y_pred_proba)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label=model_name+" auc="+str(roc_auc))
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc=4)
    plt.title("ROC curve")
    plt.show()

    # Precision-recall curve and the area under it
    precision, recall, thresholds_log = precision_recall_curve(y_test, y_pred_proba)
    auc_precision_recall = auc(recall, precision)
    plt.plot(recall, precision, label=model_name+" auc_prc="+str(auc_precision_recall))
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend(loc=4)
    plt.title("Precision-Recall Curve")
    plt.show()

    # average=None returns one value per class (indexed by callers as [0] and [1]).
    precision_sc = precision_score(y_test, y_pred_test, average=None)
    recall_sc = recall_score(y_test, y_pred_test, average=None)
    test_score = accuracy_score(y_test, y_pred_test) * 100
    train_score = accuracy_score(y_train, y_pred_train) * 100
    f1 = f1_score(y_test, y_pred_test)

    print_score(clf, X_train, y_train, X_test, y_test, train=True)
    print_score(clf, X_train, y_train, X_test, y_test, train=False)

    metriques = {"fpr" : fpr, "recall_sc" : recall_sc, "precision_sc" : precision_sc, "tpr" : tpr,
                 "test_score" : test_score, "train_score" : train_score, "f1" : f1,
                 "auc_pr" : auc_precision_recall, "auc": roc_auc}
    return metriques

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap."""
    # heatmap() draws on the current axes and hands them back, so the labels
    # can be set on that object directly.
    heatmap_ax = sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, cmap='Blues', fmt='g')
    heatmap_ax.set_xlabel('Predicted')
    heatmap_ax.set_ylabel('True')

    plt.show()

## **Load the data set**

In [None]:
! wget https://raw.githubusercontent.com/mounsifelatouch/cdd/master/data/bioactivity_data_PubchemFingerprinter.csv

In [None]:
# Load the downloaded table; it holds the feature columns plus the 'activity' label column.
df = pd.read_csv('bioactivity_data_PubchemFingerprinter.csv')

In [None]:
# Class distribution of the label, to see how imbalanced the data set is.
df['activity'].value_counts()

## **Input and output features**

### **Input features**

In [None]:
# Feature matrix: every column except the 'activity' label.
X = df.drop(columns='activity')

### **Output features**

In [None]:
# Target vector: the 'activity' label column.
y = df['activity']

In [None]:
# count the number of instances in each class
counts = y.value_counts()

# calculate the ratio of the negative class to the positive class
# (presumably the labels are the integers 0 and 1, so counts[0] / counts[1] are
# label lookups - TODO confirm). The ratio is computed on the full data set,
# before the train/test split and before SMOTE.
imbalance_ratio = counts[0] / counts[1]

### **Let's examine the data dimension**

In [None]:
# (rows, features) of X and (rows,) of y before any feature filtering.
X.shape, y.shape

### **Remove low variance features**

In [None]:
# Keep only the features whose variance exceeds 0.01.
selector = VarianceThreshold(threshold=.01)

# Only the fitted support mask is needed, so fit() is enough; the array that
# fit_transform() used to build was discarded immediately.
# NOTE(review): the selector is fitted on all rows, i.e. before the train/test split.
selector.fit(X)

# Get the indices of the selected features
selected_indices = selector.get_support(indices=True)

# Get the names of the selected features
selected_features = X.columns[selected_indices]

# Subset by column name so X stays a DataFrame with its original column labels.
X = X[selected_features]

In [None]:
# Shape after dropping the low-variance features.
X.shape

## **Data split (80/20 ratio)**

In [None]:
# Split row *positions* (0 .. len(X)-1) rather than the data itself:
# 80 % train / 20 % test, stratified on y so both parts keep the class ratio.
n = np.arange(len(X))
idx_train, idx_test = train_test_split(n, stratify=y, test_size=.2, random_state=seed)

In [None]:
# idx_train / idx_test hold row positions (they come from np.arange(len(X))),
# so select positionally with .iloc. .loc only gave the same rows because the
# frame happens to carry a default 0..n-1 index.
X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
X_test, y_test = X.iloc[idx_test], y.iloc[idx_test]

In [None]:
# Size of the training split (before oversampling).
X_train.shape, y_train.shape

In [None]:
# Size of the held-out test split.
X_test.shape, y_test.shape



---



## **Data balancing**

### **SMOTE**

In [None]:
from imblearn.over_sampling import SMOTE

# Oversampler for the training set, seeded so the synthetic samples are reproducible.
sm = SMOTE(random_state=seed)

# Resample the training split only; X_test / y_test are not passed in and stay untouched.
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

In [None]:
# From here on X_train / y_train refer to the SMOTE-resampled training data.
# Caution: re-running the SMOTE cell after this one would resample the already
# resampled data, because these names are rebound here.
X_train = X_train_sm.reset_index(drop=True)
y_train = y_train_sm.reset_index(drop=True)
# The test split is only re-indexed to 0..n-1; its rows are unchanged.
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
# Per-class weight = n_samples / (n_classes * class_count), keyed by class label.
# n_classes is hard-coded to 2 (binary problem); np.bincount assumes the labels
# are non-negative integers.
# NOTE(review): this is computed on the SMOTE-resampled y_train, so the weights
# are presumably all ~1.0 - confirm whether the pre-SMOTE counts were intended.
n_samples, n_classes = X_train.shape[0], 2
class_weights = dict(zip(np.unique(y_train), n_samples / (n_classes * np.bincount(y_train))))

# **RandomizedSearchCV**
* cv = 5
* n_iter = 100

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [None]:
# Collects best_params_ of every search below, in the order the model sections are run.
best_params = []

## **ExtraTreesClassifier**

In [None]:
# Define the search space (scipy's randint(low, high) samples integers in [low, high))
param_dist = {'n_estimators': randint(100, 1000),
              'max_depth': randint(10, 100),
              'max_features': ['sqrt', 'log2'],
              'min_samples_split' : randint(2, 10),
              'min_samples_leaf' : randint(1, 10),
              'class_weight' : ['balanced', None, class_weights]}

# Create the ExtraTreesClassifier model
clf1 = ExtraTreesClassifier(random_state=seed)

# Create the randomized search object.
# random_state seeds the candidate sampling; without it every run drew a
# different set of 100 candidates and could report different best_params_.
random_search = RandomizedSearchCV(estimator=clf1, param_distributions=param_dist, n_iter=100,
                                   scoring='roc_auc', cv=5, n_jobs=-1, random_state=seed)

# Fit the randomized search object to the data.
# NOTE(review): X_train was oversampled with SMOTE before this call, so the
# validation folds contain synthetic samples and the CV AUC is likely optimistic;
# consider putting SMOTE and the estimator in an imblearn Pipeline.
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print(random_search.best_params_)
best_params.append(random_search.best_params_)

In [None]:
# Rebuild the estimator from the best hyperparameters of the search cell directly
# above (random_search is reused by every model section, so run the two together).
clf_best = ExtraTreesClassifier(**random_search.best_params_, random_state=seed)
# print_auc() fits the estimator it receives, so fitting here as well only
# trained the same model twice.
model1 = clf_best

# clf1 is rebound from the search estimator to print_auc's metrics dict.
clf1 = print_auc(model1, X_train, y_train, X_test, y_test, 'ExtraTreesClassifier')
df1 = pd.DataFrame(
    data=[['ExtraTreesClassifier', clf1['f1'], clf1['auc'], clf1['auc_pr'],
           clf1['recall_sc'][0], clf1['recall_sc'][1], clf1['precision_sc'][0], clf1['precision_sc'][1]]],
    columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
# NOTE(review): Styler.hide_index() was removed in pandas 2.0; use .hide(axis="index") there.
df1.style.hide_index()

## **XGBClassifier**

In [None]:
# Define the search space.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], NOT [loc, scale]:
# uniform(0.5, 1) therefore produced values up to 1.5, which are invalid for
# subsample / colsample_bytree (both must lie in (0, 1]). Those candidates
# failed and were scored as NaN. uniform(0.5, 0.5) covers [0.5, 1.0].
param_dist = {'learning_rate' : uniform(1e-5, 1),
              'max_depth' : randint(3, 25),
              'gamma' : uniform(0.1, 1),
              'subsample': uniform(0.5, 0.5),
              'colsample_bytree': uniform(0.5, 0.5),
              # NOTE(review): imbalance_ratio comes from the pre-SMOTE data, while
              # X_train is already rebalanced - confirm this option is still wanted.
              'scale_pos_weight': [1, imbalance_ratio]}

# Create the XGBClassifier model
clf2 = xgb.XGBClassifier(objective='binary:logistic')

# Create the randomized search object (seeded so the sampled candidates are reproducible)
random_search = RandomizedSearchCV(estimator=clf2, param_distributions=param_dist, n_iter=100,
                                   scoring='roc_auc', cv=5, n_jobs=-1, random_state=seed)

# Fit the randomized search object to the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print(random_search.best_params_)
best_params.append(random_search.best_params_)

In [None]:
# Rebuild the estimator from the best hyperparameters of the search cell directly above.
clf_best = xgb.XGBClassifier(**random_search.best_params_, random_state=seed)
# print_auc() fits the estimator it receives, so fitting here as well only
# trained the same model twice.
model2 = clf_best

# clf2 is rebound from the search estimator to print_auc's metrics dict.
clf2 = print_auc(model2, X_train, y_train, X_test, y_test, 'XGBClassifier')
df2 = pd.DataFrame(
    data=[['XGBClassifier', clf2['f1'], clf2['auc'], clf2['auc_pr'],
           clf2['recall_sc'][0], clf2['recall_sc'][1], clf2['precision_sc'][0], clf2['precision_sc'][1]]],
    columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
# NOTE(review): Styler.hide_index() was removed in pandas 2.0; use .hide(axis="index") there.
df2.style.hide_index()

## **AdaBoostClassifier**

In [None]:
# Define the search space.
# NOTE(review): 'base_estimator' was renamed to 'estimator' in scikit-learn 1.2
# and removed in 1.4; rename the key when the environment is upgraded.
param_dist = {'n_estimators': randint(50, 200),
              'learning_rate': uniform(1e-5, 1),
              'base_estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=3)]}

# Create the AdaBoostClassifier model
clf3 = AdaBoostClassifier(random_state=seed)

# Create the randomized search object (seeded so the sampled candidates are reproducible)
random_search = RandomizedSearchCV(estimator=clf3, param_distributions=param_dist, n_iter=100,
                                   scoring='roc_auc', cv=5, n_jobs=-1, random_state=seed)

# Fit the randomized search object to the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print(random_search.best_params_)
best_params.append(random_search.best_params_)

In [None]:
# Rebuild the estimator from the best hyperparameters of the search cell directly above.
clf_best = AdaBoostClassifier(**random_search.best_params_, random_state=seed)
# print_auc() fits the estimator it receives, so fitting here as well only
# trained the same model twice.
model3 = clf_best

# clf3 is rebound from the search estimator to print_auc's metrics dict.
clf3 = print_auc(model3, X_train, y_train, X_test, y_test, 'AdaBoostClassifier')
df3 = pd.DataFrame(
    data=[['AdaBoostClassifier', clf3['f1'], clf3['auc'], clf3['auc_pr'],
           clf3['recall_sc'][0], clf3['recall_sc'][1], clf3['precision_sc'][0], clf3['precision_sc'][1]]],
    columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
# NOTE(review): Styler.hide_index() was removed in pandas 2.0; use .hide(axis="index") there.
df3.style.hide_index()

## **DecisionTreeClassifier**

In [None]:
# Define the search space.
# 'auto' was dropped from max_features: for DecisionTreeClassifier it is an alias
# of 'sqrt' (so it only duplicated that option), it was deprecated in
# scikit-learn 1.1 - the release that introduced the 'log_loss' criterion used
# here - and newer releases reject it.
param_dist = {'min_samples_split': randint(2, 20),
              'max_depth': randint(3, 10),
              'min_samples_leaf': randint(2, 10),
              'criterion' : ['gini', 'entropy', 'log_loss'],
              'max_features': ['sqrt', 'log2']}

# Create the DecisionTreeClassifier model
clf4 = DecisionTreeClassifier(random_state=seed)

# Create the randomized search object (seeded so the sampled candidates are reproducible)
random_search = RandomizedSearchCV(estimator=clf4, param_distributions=param_dist, n_iter=100,
                                   scoring='roc_auc', cv=5, n_jobs=-1, random_state=seed)

# Fit the randomized search object to the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print(random_search.best_params_)
best_params.append(random_search.best_params_)

In [None]:
# Rebuild the estimator from the best hyperparameters of the search cell directly above.
clf_best = DecisionTreeClassifier(**random_search.best_params_, random_state=seed)
# print_auc() fits the estimator it receives, so fitting here as well only
# trained the same model twice.
model4 = clf_best

# clf4 is rebound from the search estimator to print_auc's metrics dict.
clf4 = print_auc(model4, X_train, y_train, X_test, y_test, 'DecisionTreeClassifier')
df4 = pd.DataFrame(
    data=[['DecisionTreeClassifier', clf4['f1'], clf4['auc'], clf4['auc_pr'],
           clf4['recall_sc'][0], clf4['recall_sc'][1], clf4['precision_sc'][0], clf4['precision_sc'][1]]],
    columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
# NOTE(review): Styler.hide_index() was removed in pandas 2.0; use .hide(axis="index") there.
df4.style.hide_index()

## **GaussianNB**

In [None]:
# Define the search space: uniform(loc, scale) samples var_smoothing from [1e-9, 1e-9 + 1e-6]
param_dist = {'var_smoothing': uniform(1e-9, 1e-6)}

# Create the GaussianNB model
clf5 = GaussianNB()

# Create the randomized search object (seeded so the sampled candidates are reproducible)
random_search = RandomizedSearchCV(estimator=clf5, param_distributions=param_dist, n_iter=100,
                                   scoring='roc_auc', cv=5, n_jobs=-1, random_state=seed)

# Fit the randomized search object to the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print(random_search.best_params_)
best_params.append(random_search.best_params_)

In [None]:
# Rebuild the estimator from the best hyperparameters of the search cell directly above.
clf_best = GaussianNB(**random_search.best_params_)
# print_auc() fits the estimator it receives, so fitting here as well only
# trained the same model twice.
model5 = clf_best

# clf5 is rebound from the search estimator to print_auc's metrics dict.
clf5 = print_auc(model5, X_train, y_train, X_test, y_test, 'GaussianNB')
df5 = pd.DataFrame(
    data=[['GaussianNB', clf5['f1'], clf5['auc'], clf5['auc_pr'],
           clf5['recall_sc'][0], clf5['recall_sc'][1], clf5['precision_sc'][0], clf5['precision_sc'][1]]],
    columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
# NOTE(review): Styler.hide_index() was removed in pandas 2.0; use .hide(axis="index") there.
df5.style.hide_index()

## **LGBMClassifier**

In [None]:
# Define the search space.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], NOT [loc, scale]:
# uniform(0.8, 1) therefore produced values up to 1.8, which are invalid for
# subsample / colsample_bytree (fractions that must not exceed 1).
# uniform(0.8, 0.2) covers the intended [0.8, 1.0].
# NOTE(review): LightGBM only applies 'subsample' when subsample_freq > 0, which
# is not set here - confirm whether row subsampling is actually wanted.
param_dist = {'num_leaves': randint(10, 30),
              'max_depth': randint(3, 10),
              'learning_rate': uniform(1e-5, 1),
              'n_estimators': randint(50, 200),
              'min_child_samples': randint(10, 30),
              'subsample': uniform(0.8, 0.2),
              'colsample_bytree': uniform(0.8, 0.2),
              'class_weight' : ['balanced', None, class_weights]}

# Create the LGBMClassifier model
clf6 = lgb.LGBMClassifier()

# Create the randomized search object (seeded so the sampled candidates are reproducible)
random_search = RandomizedSearchCV(estimator=clf6, param_distributions=param_dist, n_iter=100,
                                   scoring='roc_auc', cv=5, n_jobs=-1, random_state=seed)

# Fit the randomized search object to the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print(random_search.best_params_)
best_params.append(random_search.best_params_)

In [None]:
# Rebuild the estimator from the best hyperparameters of the search cell directly above.
clf_best = lgb.LGBMClassifier(**random_search.best_params_)
# print_auc() fits the estimator it receives, so fitting here as well only
# trained the same model twice.
model6 = clf_best

# clf6 is rebound from the search estimator to print_auc's metrics dict.
clf6 = print_auc(model6, X_train, y_train, X_test, y_test, 'LGBMClassifier')
df6 = pd.DataFrame(
    data=[['LGBMClassifier', clf6['f1'], clf6['auc'], clf6['auc_pr'],
           clf6['recall_sc'][0], clf6['recall_sc'][1], clf6['precision_sc'][0], clf6['precision_sc'][1]]],
    columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
# NOTE(review): Styler.hide_index() was removed in pandas 2.0; use .hide(axis="index") there.
df6.style.hide_index()

## **KNeighborsClassifier**

In [None]:
# Define the search space (randint(3, 7) samples n_neighbors from 3..6).
# NOTE(review): leaf_size is presumably irrelevant while algorithm is fixed to
# 'brute' (it configures the tree-based algorithms) - confirm and drop if so.
param_dist = {'n_neighbors': randint(3, 7),
              'weights': ['uniform', 'distance'],
              'p': [1, 2],
              'algorithm': ['brute'],
              'leaf_size': randint(10, 50)}

# Create the KNeighborsClassifier model
clf7 = KNeighborsClassifier()

# Create the randomized search object (seeded so the sampled candidates are reproducible)
random_search = RandomizedSearchCV(estimator=clf7, param_distributions=param_dist, n_iter=100,
                                   scoring='roc_auc', cv=5, n_jobs=-1, random_state=seed)

# Fit the randomized search object to the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print(random_search.best_params_)
best_params.append(random_search.best_params_)

In [None]:
# Rebuild the estimator from the best hyperparameters of the search cell directly above.
clf_best = KNeighborsClassifier(**random_search.best_params_)
# print_auc() fits the estimator it receives, so fitting here as well only
# trained the same model twice.
model7 = clf_best

# The model label was misspelt 'KNeiborsClassifier'; it appears in the plot
# legends and in the 'model' column of the results table.
# clf7 is rebound from the search estimator to print_auc's metrics dict.
clf7 = print_auc(model7, X_train, y_train, X_test, y_test, 'KNeighborsClassifier')
df7 = pd.DataFrame(
    data=[['KNeighborsClassifier', clf7['f1'], clf7['auc'], clf7['auc_pr'],
           clf7['recall_sc'][0], clf7['recall_sc'][1], clf7['precision_sc'][0], clf7['precision_sc'][1]]],
    columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
# NOTE(review): Styler.hide_index() was removed in pandas 2.0; use .hide(axis="index") there.
df7.style.hide_index()

## **RandomForestClassifier**

In [None]:
# Define the search space (scipy's randint(low, high) samples integers in [low, high))
param_dist = {'n_estimators': randint(50, 1000),
              'max_depth': randint(3, 10),
              'min_samples_split': randint(2, 10),
              'min_samples_leaf': randint(1, 5),
              'max_features': ['sqrt', 'log2', None],
              'criterion': ['gini', 'entropy', 'log_loss'],
              'class_weight' : ['balanced', None, class_weights]}

# Create the RandomForestClassifier model
clf8 = RandomForestClassifier(random_state=seed)

# Create the randomized search object (seeded so the sampled candidates are reproducible)
random_search = RandomizedSearchCV(estimator=clf8, param_distributions=param_dist, n_iter=100,
                                   scoring='roc_auc', cv=5, n_jobs=-1, random_state=seed)

# Fit the randomized search object to the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print(random_search.best_params_)
best_params.append(random_search.best_params_)

In [None]:
# Rebuild the estimator from the best hyperparameters of the search cell directly above.
clf_best = RandomForestClassifier(**random_search.best_params_, random_state=seed)
# print_auc() fits the estimator it receives, so fitting here as well only
# trained the same model twice.
model8 = clf_best

# clf8 is rebound from the search estimator to print_auc's metrics dict.
clf8 = print_auc(model8, X_train, y_train, X_test, y_test, 'RandomForestClassifier')
df8 = pd.DataFrame(
    data=[['RandomForestClassifier', clf8['f1'], clf8['auc'], clf8['auc_pr'],
           clf8['recall_sc'][0], clf8['recall_sc'][1], clf8['precision_sc'][0], clf8['precision_sc'][1]]],
    columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
# NOTE(review): Styler.hide_index() was removed in pandas 2.0; use .hide(axis="index") there.
df8.style.hide_index()

## **SVC**

In [None]:
# Define the search space.
# NOTE(review): 'degree' is only used by the 'poly' kernel; 'kernel' is not part
# of the search, so with SVC's default kernel this dimension has no effect.
param_dist = {'C': uniform(0.1, 100),
              'degree': randint(2, 5),
              'gamma': ['scale', 'auto'],
              'class_weight': [None, 'balanced', class_weights]}

# Create the SVC model (probability=True enables predict_proba, at extra fitting cost)
clf9 = SVC(probability=True)

# Create the randomized search object.
# cv was 50 here, unlike the cv = 5 stated for this section and used by every
# other search; it meant 100 x 50 = 5000 SVC fits. Aligned to cv=5 and seeded.
random_search = RandomizedSearchCV(estimator=clf9, param_distributions=param_dist, n_iter=100,
                                   scoring='roc_auc', cv=5, n_jobs=-1, random_state=seed)

# Fit the randomized search object to the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print(random_search.best_params_)
best_params.append(random_search.best_params_)

In [None]:
# Rebuild the estimator from the best hyperparameters of the search cell directly above.
clf_best = SVC(**random_search.best_params_, probability=True)
# print_auc() fits the estimator it receives, so fitting here as well only
# trained the same model twice.
model9 = clf_best

# clf9 is rebound from the search estimator to print_auc's metrics dict.
clf9 = print_auc(model9, X_train, y_train, X_test, y_test, 'SVM')
df9 = pd.DataFrame(
    data=[['SVM', clf9['f1'], clf9['auc'], clf9['auc_pr'],
           clf9['recall_sc'][0], clf9['recall_sc'][1], clf9['precision_sc'][0], clf9['precision_sc'][1]]],
    columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
# NOTE(review): Styler.hide_index() was removed in pandas 2.0; use .hide(axis="index") there.
df9.style.hide_index()

## **LogisticRegression**

In [None]:
# Define the search space.
# Of the solvers searched here only 'liblinear' supports the l1 penalty; 'lbfgs',
# 'newton-cg' and 'sag' accept l2 only. A single flat dict paired l1 with those
# solvers, so such candidates failed to fit and were silently scored as NaN.
# A list of dicts keeps every sampled candidate valid.
lr_shared_space = {'C': uniform(0.1, 100),
                   'max_iter': randint(100, 500),
                   'class_weight': [None, 'balanced', class_weights]}
param_dist = [
    {**lr_shared_space, 'solver': ['liblinear'], 'penalty': ['l1', 'l2']},
    {**lr_shared_space, 'solver': ['lbfgs', 'newton-cg', 'sag'], 'penalty': ['l2']},
]

# Create the LogisticRegression model
clf10 = LogisticRegression()

# Create the randomized search object (seeded so the sampled candidates are reproducible)
random_search = RandomizedSearchCV(estimator=clf10, param_distributions=param_dist, n_iter=100,
                                   scoring='roc_auc', cv=5, n_jobs=-1, random_state=seed)

# Fit the randomized search object to the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print(random_search.best_params_)
best_params.append(random_search.best_params_)

In [None]:
# Rebuild the estimator from the best hyperparameters of the search cell directly above.
clf_best = LogisticRegression(**random_search.best_params_, random_state=seed)
# print_auc() fits the estimator it receives, so fitting here as well only
# trained the same model twice.
model10 = clf_best

# clf10 is rebound from the search estimator to print_auc's metrics dict.
clf10 = print_auc(model10, X_train, y_train, X_test, y_test, 'LogisticRegression')
df10 = pd.DataFrame(
    data=[['LogisticRegression', clf10['f1'], clf10['auc'], clf10['auc_pr'],
           clf10['recall_sc'][0], clf10['recall_sc'][1], clf10['precision_sc'][0], clf10['precision_sc'][1]]],
    columns=['model', 'f1_score', 'auc', 'auc_pr', 'recall_classe(0)', 'recall_classe(1)', 'precision_classe(0)', 'precision_classe(1)'])
# NOTE(review): Styler.hide_index() was removed in pandas 2.0; use .hide(axis="index") there.
df10.style.hide_index()

In [None]:
# Stack the ten one-row result frames (one per model) into a single comparison table.
# Each frame has index 0, so the concatenated index repeats 0; it is hidden for display.
df_sm = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10]
df_finale_sm = pd.concat(df_sm)
# NOTE(review): Styler.hide_index() was removed in pandas 2.0; use .hide(axis="index") there.
df_finale_sm.style.hide_index()

In [None]:
# Persist the comparison table to the working directory, without the index column.
df_finale_sm.to_csv('df_finale_sm.csv', index=False)

In [None]:
# Alias (not a copy) of the list of best hyperparameters collected by the searches above.
smote_ = best_params

In [None]:
plt.rcParams["figure.figsize"] = [16, 8]

# (legend label, metrics dict returned by print_auc) in the original plotting order.
# Each curve now reports its OWN AUC: previously the legend paired most curves
# with another model's AUC (e.g. the LGBM curve showed clf10's, i.e. LR's).
roc_results = [
    ("LGBMClassifier", clf6),
    ("SVM", clf9),
    ("GaussianNB", clf5),
    ("ExtraTrees", clf1),
    ("KNN", clf7),
    ("DT", clf4),
    ("LR", clf10),
    ("RF", clf8),
    ("AdaBoost", clf3),
    ("XGB", clf2),
]
for roc_label, roc_metrics in roc_results:
    plt.plot(roc_metrics["fpr"], roc_metrics["tpr"],
             label=roc_label + ", auc=" + str(roc_metrics["auc"]))

# fpr is plotted on x and tpr on y; the axis labels were swapped before.
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.legend(loc=4)
# Save before show(), while the figure still exists.
plt.savefig('smote_data.png', dpi=300)
plt.show()