In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from helper_functions import smi_tokenizer
import joblib
import lightgbm as lgb


In [2]:
data = pd.read_table('F:/bbb/data/B3DB/B3DB_classification.tsv')
X = data['SMILES']
X = [smi_tokenizer(smile) for smile in X]
target=data['BBB+/BBB-'].values.tolist()
label_encoder = LabelEncoder()
y_classification = label_encoder.fit_transform(target)

In [3]:
def predict_metrics(model, xtest, ytest):
    ypred = model.predict(xtest)
    auc = metrics.roc_auc_score(ytest, ypred)
    acc = metrics.accuracy_score(ytest, ypred)
    tot = metrics.classification_report(ytest, ypred, output_dict=True)
    tn, fp, fn, tp = metrics.confusion_matrix(ytest, ypred).ravel()
    specificity = tn / (tn+fp)
    sensitivity = tp / (tp+fn)

    return auc, acc,tot, specificity, sensitivity

In [4]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y_classification, test_size = 0.2, random_state = 42)
xtrain, xvalid, ytrain, yvalid = train_test_split(xtrain, ytrain, test_size = 0.16, random_state = 42)

In [5]:
model_randomforset = Pipeline([('tfidf', TfidfVectorizer(lowercase = False, analyzer = "char")), ('clf', RandomForestClassifier())])
model_svm = Pipeline([('tfidf', TfidfVectorizer(lowercase = False, analyzer = "char")), ('clf', SVC())])
model_extratree = Pipeline([('tfidf', TfidfVectorizer(lowercase = False, analyzer = "char")), ('clf', ExtraTreesClassifier())])
model_naive = Pipeline([('tfidf', TfidfVectorizer(lowercase = False, analyzer = "char")), ('clf', MultinomialNB())])
model_light = Pipeline([('tfidf', TfidfVectorizer(lowercase = False, analyzer = "char")), ('clf', lgb.LGBMClassifier())])

In [6]:
model_randomforset.fit(xtrain, ytrain)
model_svm.fit(xtrain, ytrain)
model_extratree.fit(xtrain, ytrain)
model_naive.fit(xtrain, ytrain)
model_light.fit(xtrain, ytrain)

Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='char', lowercase=False)),
                ('clf', LGBMClassifier())])

## Lightgbm results

In [7]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_light, xtest, ytest)
print(auc, acc, specificity, sensitivity)

0.8287692699457405 0.850832266325224 0.9070929070929071 0.750445632798574


In [8]:
result

{'0': {'precision': 0.8664122137404581,
  'recall': 0.9070929070929071,
  'f1-score': 0.8862859931673988,
  'support': 1001},
 '1': {'precision': 0.8190661478599222,
  'recall': 0.750445632798574,
  'f1-score': 0.7832558139534883,
  'support': 561},
 'accuracy': 0.850832266325224,
 'macro avg': {'precision': 0.8427391808001901,
  'recall': 0.8287692699457405,
  'f1-score': 0.8347709035604436,
  'support': 1562},
 'weighted avg': {'precision': 0.8494076407833642,
  'recall': 0.850832266325224,
  'f1-score': 0.8492821964074732,
  'support': 1562}}

In [9]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_light, xvalid, yvalid)
print(auc, acc, specificity, sensitivity)

0.8540853086307633 0.876 0.9340659340659341 0.7741046831955923


In [10]:
result

{'0': {'precision': 0.8788774002954209,
  'recall': 0.9340659340659341,
  'f1-score': 0.9056316590563166,
  'support': 637},
 '1': {'precision': 0.8699690402476781,
  'recall': 0.7741046831955923,
  'f1-score': 0.8192419825072887,
  'support': 363},
 'accuracy': 0.876,
 'macro avg': {'precision': 0.8744232202715495,
  'recall': 0.8540853086307632,
  'f1-score': 0.8624368207818027,
  'support': 1000},
 'weighted avg': {'precision': 0.8756436655980903,
  'recall': 0.876,
  'f1-score': 0.8742722064690195,
  'support': 1000}}

## SVM results

In [11]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_svm, xtest, ytest)
print(auc, acc, specificity, sensitivity)

0.7574190515366985 0.7900128040973111 0.8731268731268731 0.6417112299465241


In [12]:
result

{'0': {'precision': 0.8130232558139535,
  'recall': 0.8731268731268731,
  'f1-score': 0.8420038535645472,
  'support': 1001},
 '1': {'precision': 0.7392197125256673,
  'recall': 0.6417112299465241,
  'f1-score': 0.6870229007633587,
  'support': 561},
 'accuracy': 0.7900128040973111,
 'macro avg': {'precision': 0.7761214841698104,
  'recall': 0.7574190515366985,
  'f1-score': 0.764513377163953,
  'support': 1562},
 'weighted avg': {'precision': 0.7865163494216817,
  'recall': 0.7900128040973111,
  'f1-score': 0.786341680375388,
  'support': 1562}}

In [13]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_svm, xvalid, yvalid)
print(auc, acc, specificity, sensitivity)

0.7844904013735182 0.813 0.8885400313971743 0.6804407713498623


In [14]:
result

{'0': {'precision': 0.8299120234604106,
  'recall': 0.8885400313971743,
  'f1-score': 0.8582259287338894,
  'support': 637},
 '1': {'precision': 0.7767295597484277,
  'recall': 0.6804407713498623,
  'f1-score': 0.7254038179148311,
  'support': 363},
 'accuracy': 0.813,
 'macro avg': {'precision': 0.8033207916044192,
  'recall': 0.7844904013735183,
  'f1-score': 0.7918148733243602,
  'support': 1000},
 'weighted avg': {'precision': 0.8106067891329608,
  'recall': 0.813,
  'f1-score': 0.8100115025065712,
  'support': 1000}}

## Random Forest model

In [15]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_randomforset, xtest, ytest)
print(auc, acc, specificity, sensitivity)

0.8311296546590665 0.8578745198463509 0.926073926073926 0.7361853832442068


In [16]:
result

{'0': {'precision': 0.8623255813953489,
  'recall': 0.926073926073926,
  'f1-score': 0.8930635838150289,
  'support': 1001},
 '1': {'precision': 0.8480492813141683,
  'recall': 0.7361853832442068,
  'f1-score': 0.7881679389312977,
  'support': 561},
 'accuracy': 0.8578745198463509,
 'macro avg': {'precision': 0.8551874313547586,
  'recall': 0.8311296546590664,
  'f1-score': 0.8406157613731633,
  'support': 1562},
 'weighted avg': {'precision': 0.8571981778450657,
  'recall': 0.8578745198463509,
  'f1-score': 0.8553897958638297,
  'support': 1562}}

In [17]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_randomforset, xvalid, yvalid)
print(auc, acc, specificity, sensitivity)

0.8481756338899197 0.873 0.9387755102040817 0.7575757575757576


In [18]:
result

{'0': {'precision': 0.8717201166180758,
  'recall': 0.9387755102040817,
  'f1-score': 0.9040060468631896,
  'support': 637},
 '1': {'precision': 0.8757961783439491,
  'recall': 0.7575757575757576,
  'f1-score': 0.8124076809453471,
  'support': 363},
 'accuracy': 0.873,
 'macro avg': {'precision': 0.8737581474810124,
  'recall': 0.8481756338899196,
  'f1-score': 0.8582068639042684,
  'support': 1000},
 'weighted avg': {'precision': 0.8731997270245679,
  'recall': 0.873,
  'f1-score': 0.8707558400350128,
  'support': 1000}}

## ExtraTreeClassifier

In [19]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_extratree, xtest, ytest)
print(auc, acc, specificity, sensitivity)

0.8239603533721181 0.8501920614596671 0.9170829170829171 0.7308377896613191


In [20]:
result

{'0': {'precision': 0.8587464920486436,
  'recall': 0.9170829170829171,
  'f1-score': 0.8869565217391305,
  'support': 1001},
 '1': {'precision': 0.8316430020283976,
  'recall': 0.7308377896613191,
  'f1-score': 0.777988614800759,
  'support': 561},
 'accuracy': 0.8501920614596671,
 'macro avg': {'precision': 0.8451947470385206,
  'recall': 0.823960353372118,
  'f1-score': 0.8324725682699448,
  'support': 1562},
 'weighted avg': {'precision': 0.8490121399991186,
  'recall': 0.8501920614596671,
  'f1-score': 0.847820160796476,
  'support': 1562}}

In [21]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_extratree, xvalid, yvalid)
print(auc, acc, specificity, sensitivity)

0.8487681150018813 0.873 0.9372056514913658 0.7603305785123967


In [22]:
result

{'0': {'precision': 0.8728070175438597,
  'recall': 0.9372056514913658,
  'f1-score': 0.9038607115821348,
  'support': 637},
 '1': {'precision': 0.8734177215189873,
  'recall': 0.7603305785123967,
  'f1-score': 0.812960235640648,
  'support': 363},
 'accuracy': 0.873,
 'macro avg': {'precision': 0.8731123695314236,
  'recall': 0.8487681150018813,
  'f1-score': 0.8584104736113913,
  'support': 1000},
 'weighted avg': {'precision': 0.873028703086831,
  'recall': 0.873,
  'f1-score': 0.8708638388153751,
  'support': 1000}}

## Naive Bayes 

In [23]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_naive, xtest, ytest)
print(auc, acc, specificity, sensitivity)

0.5039568274862394 0.6434058898847631 0.999000999000999 0.008912655971479501


In [24]:
result

{'0': {'precision': 0.6426735218508998,
  'recall': 0.999000999000999,
  'f1-score': 0.7821666014861165,
  'support': 1001},
 '1': {'precision': 0.8333333333333334,
  'recall': 0.008912655971479501,
  'f1-score': 0.01763668430335097,
  'support': 561},
 'accuracy': 0.6434058898847631,
 'macro avg': {'precision': 0.7380034275921166,
  'recall': 0.5039568274862393,
  'f1-score': 0.39990164289473373,
  'support': 1562},
 'weighted avg': {'precision': 0.711149933017126,
  'recall': 0.6434058898847631,
  'f1-score': 0.5075819129204753,
  'support': 1562}}

In [25]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_naive, xvalid, yvalid)
print(auc, acc, specificity, sensitivity)

0.5027548209366391 0.639 1.0 0.005509641873278237


In [26]:
result

{'0': {'precision': 0.6382765531062125,
  'recall': 1.0,
  'f1-score': 0.7792048929663609,
  'support': 637},
 '1': {'precision': 1.0,
  'recall': 0.005509641873278237,
  'f1-score': 0.010958904109589041,
  'support': 363},
 'accuracy': 0.639,
 'macro avg': {'precision': 0.8191382765531062,
  'recall': 0.5027548209366391,
  'f1-score': 0.39508189853797493,
  'support': 1000},
 'weighted avg': {'precision': 0.7695821643286573,
  'recall': 0.639,
  'f1-score': 0.5003315990113527,
  'support': 1000}}

## saving all the models

In [27]:
joblib.dump(model_randomforset, 'F:/bbb/model/sklearn_lgb_models/randomforest.pkl')
joblib.dump(model_svm, 'F:/bbb/model/sklearn_lgb_models/svm.pkl')
joblib.dump(model_extratree, 'F:/bbb/model/sklearn_lgb_models/extratree.pkl')
joblib.dump(model_naive, 'F:/bbb/model/sklearn_lgb_models/naive.pkl')
joblib.dump(model_light, 'F:/bbb/model/sklearn_lgb_models/lgb_classifier.pkl')

['F:/bbb/model/sklearn_lgb_models/lgb_classifier.pkl']