In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from helper_functions import smi_tokenizer
import joblib

In [2]:
def predict_metrics(model, xtest, ytest):
    ypred = model.predict(xtest)
    auc = metrics.roc_auc_score(ytest, ypred)
    acc = metrics.accuracy_score(ytest, ypred)
    tot = metrics.classification_report(ytest, ypred, output_dict=True)
    tn, fp, fn, tp = metrics.confusion_matrix(ytest, ypred).ravel()
    specificity = tn / (tn+fp)
    sensitivity = tp / (tp+fn)

    return auc, acc,tot, specificity, sensitivity

In [3]:
data = pd.read_table('F:/bbb/data/B3DB/B3DB_classification.tsv')
X = data['SMILES']
X = [smi_tokenizer(smile) for smile in X]
target=data['BBB+/BBB-'].values.tolist()
label_encoder = LabelEncoder()
y_classification = label_encoder.fit_transform(target)

In [4]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y_classification, test_size = 0.2, random_state = 42)
xtrain, xvalid, ytrain, yvalid = train_test_split(xtrain, ytrain, test_size = 0.16, random_state = 42)

In [5]:
model_randomforset = Pipeline([('tfidf', TfidfVectorizer(lowercase = False, analyzer = "char")), ('clf', RandomForestClassifier())])
model_svm = Pipeline([('tfidf', TfidfVectorizer(lowercase = False, analyzer = "char")), ('clf', SVC())])
model_extratree = Pipeline([('tfidf', TfidfVectorizer(lowercase = False, analyzer = "char")), ('clf', ExtraTreesClassifier())])
model_naive = Pipeline([('tfidf', TfidfVectorizer(lowercase = False, analyzer = "char")), ('clf', MultinomialNB())])

In [6]:
model_randomforset.fit(xtrain, ytrain)
model_svm.fit(xtrain, ytrain)
model_extratree.fit(xtrain, ytrain)
model_naive.fit(xtrain, ytrain)

Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='char', lowercase=False)),
                ('clf', MultinomialNB())])

## SVM results

In [7]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_svm, xtest, ytest)
print(auc, acc, specificity, sensitivity)

0.7574190515366985 0.7900128040973111 0.8731268731268731 0.6417112299465241


In [8]:
result

{'0': {'precision': 0.8130232558139535,
  'recall': 0.8731268731268731,
  'f1-score': 0.8420038535645472,
  'support': 1001},
 '1': {'precision': 0.7392197125256673,
  'recall': 0.6417112299465241,
  'f1-score': 0.6870229007633587,
  'support': 561},
 'accuracy': 0.7900128040973111,
 'macro avg': {'precision': 0.7761214841698104,
  'recall': 0.7574190515366985,
  'f1-score': 0.764513377163953,
  'support': 1562},
 'weighted avg': {'precision': 0.7865163494216817,
  'recall': 0.7900128040973111,
  'f1-score': 0.786341680375388,
  'support': 1562}}

## Random Forest model

In [9]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_svm, xtest, ytest)
print(auc, acc, specificity, sensitivity)

0.7574190515366985 0.7900128040973111 0.8731268731268731 0.6417112299465241


In [10]:
result

{'0': {'precision': 0.8130232558139535,
  'recall': 0.8731268731268731,
  'f1-score': 0.8420038535645472,
  'support': 1001},
 '1': {'precision': 0.7392197125256673,
  'recall': 0.6417112299465241,
  'f1-score': 0.6870229007633587,
  'support': 561},
 'accuracy': 0.7900128040973111,
 'macro avg': {'precision': 0.7761214841698104,
  'recall': 0.7574190515366985,
  'f1-score': 0.764513377163953,
  'support': 1562},
 'weighted avg': {'precision': 0.7865163494216817,
  'recall': 0.7900128040973111,
  'f1-score': 0.786341680375388,
  'support': 1562}}

## ExtraTreeClassifier

In [11]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_svm, xtest, ytest)
print(auc, acc, specificity, sensitivity)

0.7574190515366985 0.7900128040973111 0.8731268731268731 0.6417112299465241


In [12]:
result

{'0': {'precision': 0.8130232558139535,
  'recall': 0.8731268731268731,
  'f1-score': 0.8420038535645472,
  'support': 1001},
 '1': {'precision': 0.7392197125256673,
  'recall': 0.6417112299465241,
  'f1-score': 0.6870229007633587,
  'support': 561},
 'accuracy': 0.7900128040973111,
 'macro avg': {'precision': 0.7761214841698104,
  'recall': 0.7574190515366985,
  'f1-score': 0.764513377163953,
  'support': 1562},
 'weighted avg': {'precision': 0.7865163494216817,
  'recall': 0.7900128040973111,
  'f1-score': 0.786341680375388,
  'support': 1562}}

## Naive Bayes 

In [13]:
auc,acc,result, specificity, sensitivity = predict_metrics(model_svm, xtest, ytest)
print(auc, acc, specificity, sensitivity)

0.7574190515366985 0.7900128040973111 0.8731268731268731 0.6417112299465241


In [14]:
result

{'0': {'precision': 0.8130232558139535,
  'recall': 0.8731268731268731,
  'f1-score': 0.8420038535645472,
  'support': 1001},
 '1': {'precision': 0.7392197125256673,
  'recall': 0.6417112299465241,
  'f1-score': 0.6870229007633587,
  'support': 561},
 'accuracy': 0.7900128040973111,
 'macro avg': {'precision': 0.7761214841698104,
  'recall': 0.7574190515366985,
  'f1-score': 0.764513377163953,
  'support': 1562},
 'weighted avg': {'precision': 0.7865163494216817,
  'recall': 0.7900128040973111,
  'f1-score': 0.786341680375388,
  'support': 1562}}

In [15]:
joblib.dump(model_randomforset, 'F:/bbb/model/sklearn_models/randomforest.pkl')
joblib.dump(model_svm, 'F:/bbb/model/sklearn_models/svm.pkl')
joblib.dump(model_extratree, 'F:/bbb/model/sklearn_models/extratree.pkl')
joblib.dump(model_naive, 'F:/bbb/model/sklearn_models/naive.pkl')

['F:/bbb/model/sklearn_models/naive.pkl']