<a href="https://colab.research.google.com/github/mehdimerbah/CompDrugDiscovery/blob/main/models/ClassificationModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries and Data Import

In [37]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, SimpleRNN
import json


data = pd.read_csv('https://raw.githubusercontent.com/mehdimerbah/CompDrugDiscovery/main/data/classification_model_data.csv')
data.head(10)

Unnamed: 0,SubFP1,SubFP2,SubFP3,SubFP4,SubFP5,SubFP6,SubFP7,SubFP8,SubFP9,SubFP10,...,SubFP299,SubFP300,SubFP301,SubFP302,SubFP303,SubFP304,SubFP305,SubFP306,SubFP307,activity_class
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,active
1,1,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,inactive
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,active
3,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,active
4,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,active
5,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,0,0,0,0,1,inactive
6,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,active
7,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,active
8,1,0,0,0,0,0,0,0,0,0,...,1,1,1,1,0,0,0,0,1,active
9,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,active


In [7]:
features = data.drop(columns = ['activity_class'])
targets = data.activity_class

In [8]:
def remove_low_variance(input_data, threshold=0.1):
    selection = VarianceThreshold(threshold)
    selection.fit(input_data)
    return input_data[input_data.columns[selection.get_support(indices=True)]]


features = remove_low_variance(features, threshold=0.1)
features

Unnamed: 0,SubFP1,SubFP2,SubFP3,SubFP20,SubFP38,SubFP49,SubFP85,SubFP88,SubFP96,SubFP100,SubFP135,SubFP137,SubFP171,SubFP181,SubFP182,SubFP183,SubFP184,SubFP279,SubFP280,SubFP287
0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,1
1,1,0,0,0,0,1,0,1,0,0,1,1,0,1,1,0,1,0,0,1
2,0,0,0,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0,0,1
3,0,0,0,0,0,1,0,1,0,0,1,1,0,0,1,0,1,0,0,1
4,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,1,1,0,0,0,1,1,1,0,0,1,1,0,0,1,0,1,0,0,1
264,1,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,1,0,0,1
265,1,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1
266,1,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1


# Models Building and Training

In [23]:
def get_metrics(predicted,true):
    metrics = dict()
    metrics['accuracy'] = round(accuracy_score(predicted, true), 5)
    metrics['precision'] = round(precision_score(predicted, true, average = 'weighted'), 5)
    metrics['recall'] = round(recall_score(predicted, true, average = 'weighted'), 5)
    metrics['f1'] = round(f1_score(predicted, true, average = 'weighted'), 5)
    
    return metrics

## Random Forest Classifier

In [9]:
## Splitting the Data
X_training_set, X_validation_set, y_training_set, y_validation_set = train_test_split(features, targets, test_size=0.2, random_state=42)


In [16]:
RF_model = RandomForestClassifier(n_estimators=500, random_state=42)
RF_model.fit(X_training_set, y_training_set)


RandomForestClassifier(n_estimators=500, random_state=42)

In [18]:
y_training_pred = RF_model.predict(X_training_set)
y_validation_pred = RF_model.predict(X_validation_set)


In [19]:
mcc_test = matthews_corrcoef(y_validation_set, y_validation_pred)
mcc_test

0.7774957785358391

In [25]:
metrics = get_metrics(y_validation_pred, y_validation_set)
print(metrics)

{'accuracy': 0.88889, 'precision': 0.89226, 'recall': 0.88889, 'f1': 0.88935}


## LSTM-RNN Classifier

In [32]:
# initializing model
LSTM_model = Sequential()
# adding embedding layer
LSTM_model.add(Embedding(20, 10, input_length=20))
LSTM_model.add(LSTM(150))
LSTM_model.add(Dense(1, activation='sigmoid'))
LSTM_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
LSTM_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 20, 100)           2000      
                                                                 
 lstm_2 (LSTM)               (None, 150)               150600    
                                                                 
 dense_2 (Dense)             (None, 1)                 151       
                                                                 
Total params: 152,751
Trainable params: 152,751
Non-trainable params: 0
_________________________________________________________________


In [39]:
## testing

RNN_model = Sequential([
    Embedding(20, 10, input_length=20),
    SimpleRNN(32),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])
RNN_model.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 20, 10)            200       
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                1376      
                                                                 
 dense_3 (Dense)             (None, 10)                330       
                                                                 
 dense_4 (Dense)             (None, 1)                 11        
                                                                 
Total params: 1,917
Trainable params: 1,917
Non-trainable params: 0
_________________________________________________________________


In [44]:
from sklearn.svm import SVC, LinearSVC

clf = LinearSVC()
clf.fit(X_training_set, y_training_set)

LinearSVC()

In [46]:
y_SVM_pred = clf.predict(X_validation_set)

In [47]:
get_metrics(y_SVM_pred, y_validation_set)

{'accuracy': 0.90741, 'f1': 0.90809, 'precision': 0.91438, 'recall': 0.90741}