In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from helper_functions import smi_tokenizer
import joblib

In [2]:
def predict_metrics(model, xtest, ytest):
    ypred = model.predict(xtest)
    auc = metrics.roc_auc_score(ytest, ypred)
    acc = metrics.accuracy_score(ytest, ypred)
    tot = metrics.classification_report(ytest, ypred, output_dict=True)
    tn, fp, fn, tp = metrics.confusion_matrix(ytest, ypred).ravel()
    specificity = tn / (tn+fp)
    sensitivity = tp / (tp+fn)

    return auc, acc,tot, specificity, sensitivity

In [3]:
data = pd.read_table('F:/bbb/data/B3DB/B3DB_classification.tsv')
X = data['SMILES']
X = [smi_tokenizer(smile) for smile in X]
target=data['BBB+/BBB-'].values.tolist()
label_encoder = LabelEncoder()
y_classification = label_encoder.fit_transform(target)

In [4]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y_classification, test_size = 0.2, random_state = 42)
xtrain, xvalid, ytrain, yvalid = train_test_split(xtrain, ytrain, test_size = 0.16, random_state = 42)

In [5]:
model_randomforset = Pipeline([('tfidf', TfidfVectorizer(lowercase = False, analyzer = "char")), ('clf', RandomForestClassifier())])
model_svm = Pipeline([('tfidf', TfidfVectorizer(lowercase = False, analyzer = "char")), ('clf', SVC())])
model_extratree = Pipeline([('tfidf', TfidfVectorizer(lowercase = False, analyzer = "char")), ('clf', ExtraTreesClassifier())])
model_naive = Pipeline([('tfidf', TfidfVectorizer(lowercase = False, analyzer = "char")), ('clf', MultinomialNB())])

In [6]:
model_randomforset.fit(xtrain, ytrain)
model_svm.fit(xtrain, ytrain)
model_extratree.fit(xtrain, ytrain)
model_naive.fit(xtrain, ytrain)

Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='char', lowercase=False)),
                ('clf', MultinomialNB())])

## SVM results

In [7]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_svm, xtest, ytest)
print(auc, acc, specificity, sensitivity)

0.7574190515366985 0.7900128040973111 0.8731268731268731 0.6417112299465241


In [8]:
result

{'0': {'precision': 0.8130232558139535,
  'recall': 0.8731268731268731,
  'f1-score': 0.8420038535645472,
  'support': 1001},
 '1': {'precision': 0.7392197125256673,
  'recall': 0.6417112299465241,
  'f1-score': 0.6870229007633587,
  'support': 561},
 'accuracy': 0.7900128040973111,
 'macro avg': {'precision': 0.7761214841698104,
  'recall': 0.7574190515366985,
  'f1-score': 0.764513377163953,
  'support': 1562},
 'weighted avg': {'precision': 0.7865163494216817,
  'recall': 0.7900128040973111,
  'f1-score': 0.786341680375388,
  'support': 1562}}

In [9]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_svm, xvalid, yvalid)
print(auc, acc, specificity, sensitivity)

0.7844904013735182 0.813 0.8885400313971743 0.6804407713498623


In [10]:
result

{'0': {'precision': 0.8299120234604106,
  'recall': 0.8885400313971743,
  'f1-score': 0.8582259287338894,
  'support': 637},
 '1': {'precision': 0.7767295597484277,
  'recall': 0.6804407713498623,
  'f1-score': 0.7254038179148311,
  'support': 363},
 'accuracy': 0.813,
 'macro avg': {'precision': 0.8033207916044192,
  'recall': 0.7844904013735183,
  'f1-score': 0.7918148733243602,
  'support': 1000},
 'weighted avg': {'precision': 0.8106067891329608,
  'recall': 0.813,
  'f1-score': 0.8100115025065712,
  'support': 1000}}

## Random Forest model

In [11]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_randomforset, xtest, ytest)
print(auc, acc, specificity, sensitivity)

0.8201798201798202 0.8463508322663252 0.913086913086913 0.7272727272727273


In [12]:
result

{'0': {'precision': 0.8566073102155577,
  'recall': 0.913086913086913,
  'f1-score': 0.88394584139265,
  'support': 1001},
 '1': {'precision': 0.8242424242424242,
  'recall': 0.7272727272727273,
  'f1-score': 0.7727272727272727,
  'support': 561},
 'accuracy': 0.8463508322663252,
 'macro avg': {'precision': 0.8404248672289909,
  'recall': 0.8201798201798202,
  'f1-score': 0.8283365570599613,
  'support': 1562},
 'weighted avg': {'precision': 0.8449833018730942,
  'recall': 0.8463508322663252,
  'f1-score': 0.84400114419593,
  'support': 1562}}

In [13]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_randomforset, xvalid, yvalid)
print(auc, acc, specificity, sensitivity)

0.8418810626602835 0.868 0.9372056514913658 0.7465564738292011


In [14]:
result

{'0': {'precision': 0.8664731494920174,
  'recall': 0.9372056514913658,
  'f1-score': 0.9004524886877827,
  'support': 637},
 '1': {'precision': 0.8713826366559485,
  'recall': 0.7465564738292011,
  'f1-score': 0.8041543026706232,
  'support': 363},
 'accuracy': 0.868,
 'macro avg': {'precision': 0.868927893073983,
  'recall': 0.8418810626602835,
  'f1-score': 0.852303395679203,
  'support': 1000},
 'weighted avg': {'precision': 0.8682552933325244,
  'recall': 0.868,
  'f1-score': 0.8654962471635538,
  'support': 1000}}

## ExtraTreeClassifier

In [15]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_extratree, xtest, ytest)
print(auc, acc, specificity, sensitivity)

0.8283089459560048 0.852752880921895 0.9150849150849151 0.7415329768270945


In [16]:
result

{'0': {'precision': 0.8633364750235627,
  'recall': 0.9150849150849151,
  'f1-score': 0.8884578079534433,
  'support': 1001},
 '1': {'precision': 0.8303393213572854,
  'recall': 0.7415329768270945,
  'f1-score': 0.7834274952919021,
  'support': 561},
 'accuracy': 0.852752880921895,
 'macro avg': {'precision': 0.8468378981904241,
  'recall': 0.8283089459560048,
  'f1-score': 0.8359426516226727,
  'support': 1562},
 'weighted avg': {'precision': 0.8514853846222941,
  'recall': 0.852752880921895,
  'f1-score': 0.8507356534059884,
  'support': 1562}}

In [17]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_extratree, xvalid, yvalid)
print(auc, acc, specificity, sensitivity)

0.8436433696173955 0.871 0.9434850863422292 0.743801652892562


In [18]:
result

{'0': {'precision': 0.8659942363112392,
  'recall': 0.9434850863422292,
  'f1-score': 0.9030803906836964,
  'support': 637},
 '1': {'precision': 0.8823529411764706,
  'recall': 0.743801652892562,
  'f1-score': 0.8071748878923767,
  'support': 363},
 'accuracy': 0.871,
 'macro avg': {'precision': 0.8741735887438549,
  'recall': 0.8436433696173955,
  'f1-score': 0.8551276392880365,
  'support': 1000},
 'weighted avg': {'precision': 0.8719324461773182,
  'recall': 0.871,
  'f1-score': 0.8682666931704472,
  'support': 1000}}

## Naive Bayes 

In [19]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_naive, xtest, ytest)
print(auc, acc, specificity, sensitivity)

0.5039568274862394 0.6434058898847631 0.999000999000999 0.008912655971479501


In [20]:
result

{'0': {'precision': 0.6426735218508998,
  'recall': 0.999000999000999,
  'f1-score': 0.7821666014861165,
  'support': 1001},
 '1': {'precision': 0.8333333333333334,
  'recall': 0.008912655971479501,
  'f1-score': 0.01763668430335097,
  'support': 561},
 'accuracy': 0.6434058898847631,
 'macro avg': {'precision': 0.7380034275921166,
  'recall': 0.5039568274862393,
  'f1-score': 0.39990164289473373,
  'support': 1562},
 'weighted avg': {'precision': 0.711149933017126,
  'recall': 0.6434058898847631,
  'f1-score': 0.5075819129204753,
  'support': 1562}}

In [21]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_naive, xvalid, yvalid)
print(auc, acc, specificity, sensitivity)

0.5027548209366391 0.639 1.0 0.005509641873278237


In [22]:
result

{'0': {'precision': 0.6382765531062125,
  'recall': 1.0,
  'f1-score': 0.7792048929663609,
  'support': 637},
 '1': {'precision': 1.0,
  'recall': 0.005509641873278237,
  'f1-score': 0.010958904109589041,
  'support': 363},
 'accuracy': 0.639,
 'macro avg': {'precision': 0.8191382765531062,
  'recall': 0.5027548209366391,
  'f1-score': 0.39508189853797493,
  'support': 1000},
 'weighted avg': {'precision': 0.7695821643286573,
  'recall': 0.639,
  'f1-score': 0.5003315990113527,
  'support': 1000}}

## saving all the models

In [23]:
joblib.dump(model_randomforset, 'F:/bbb/model/sklearn_models/randomforest.pkl')
joblib.dump(model_svm, 'F:/bbb/model/sklearn_models/svm.pkl')
joblib.dump(model_extratree, 'F:/bbb/model/sklearn_models/extratree.pkl')
joblib.dump(model_naive, 'F:/bbb/model/sklearn_models/naive.pkl')

['F:/bbb/model/sklearn_models/naive.pkl']