### Comparison between CNNFP and ECFP

Logistic regression on CNNFP and ECFP features, treating each of the n terms as a separate binary (one-vs-all) classification problem. Performance is evaluated using 10-fold cross-validated ROC AUC.

In [1]:
import os
import sys
# Put the parent of the current working directory on the import path
# (presumably so the project-local `preprocess` imports below resolve).
# The membership check keeps re-running this cell from adding duplicates.
parent_path = os.path.abspath(os.pardir)
if parent_path not in sys.path:
    sys.path.append(parent_path)

import datetime
import csv
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from preprocess.data_handler import load_data, categorical_labels, load_pickle
from preprocess.smiles_embedder import get_cnn_fingerprint

import tensorflow as tf
from keras import backend as K

# --- Data loading -----------------------------------------------------------
# All input files are read from the directory named by `path`.
path = '../data/'

# `termdict` is later used to index label columns by term (termdict[term]).
termdict = load_pickle(os.path.join(path, 'termdict.pickle'))

# The dataset provides at least a 'SMILES' column and a 'Terms' column.
dataset = load_data(os.path.join(path, 'dataset.csv'))
smiles = dataset['SMILES']

# Label matrix built from the per-compound terms; later cells slice it as
# labels[:, termdict[term]] to obtain one binary target per term.
labels = categorical_labels(dataset['Terms'], termdict)

# --- CNN embedding ----------------------------------------------------------
# CNN-based fingerprint features computed from the SMILES strings.
cnn_fp_data = get_cnn_fingerprint(smiles)

Using TensorFlow backend.


Embedding smiles...
Embedding complete - 4.079897165298462 seconds, 9174 smiles


In [2]:
# Load the ECFP fingerprints and their labels. Both pickles are indexed by the
# same compound-id keys (every key of the data pickle is looked up in the
# labels pickle below).
ecfp_d = load_pickle('../data/ecfp-data.pickle')
ecfp_l = load_pickle('../data/ecfp-labels.pickle')

# Walk the ids in sorted order once, so that row i of the feature matrix and
# row i of the label matrix always refer to the same compound.
ordered_cids = sorted(ecfp_d.keys())
ecfp_data = np.array([ecfp_d[cid] for cid in ordered_cids])
ecfp_labels = np.array([ecfp_l[cid] for cid in ordered_cids])

10-fold CV binary classification for each term. Only the first 10 terms are shown as an example; a spreadsheet with the complete results is available [here](https://docs.google.com/spreadsheets/d/1jQb9JPWqfxbhlN5P0_UiaNMoUKFBfXTwrfCdsTz0_Xs/edit?usp=sharing).

In [4]:
# Per-term comparison of CNNFP vs ECFP features: for each term, fit a
# one-vs-all logistic regression on each feature set and report the mean
# cross-validated ROC AUC as a tab-separated row.

# Tunables, named so the slice size and fold count are not confused with
# each other (both were the bare literal 10).
N_TERMS_SHOWN = 10  # only the first few terms (alphabetical order) are printed
N_FOLDS = 10        # number of cross-validation folds

terms = sorted(termdict.keys())
print('%s\t%s\t%s' % ('Term', 'CNNFP', 'ECFP'))

# The estimator is loop-invariant: cross_val_score fits clones of the object
# it is given, so a single unfitted instance can be shared by every call.
logreg = LogisticRegression()

for t in terms[:N_TERMS_SHOWN]:
    # Column of the label matrices holding the binary target for this term.
    term_col = termdict[t]
    y = labels[:, term_col]
    y_ecfp = ecfp_labels[:, term_col]

    # NOTE(review): a term with fewer positive examples than N_FOLDS may yield
    # folds without a positive sample, where ROC AUC is undefined - confirm
    # this cannot happen before running over the full term list.
    auc_cnnfp = cross_val_score(logreg, cnn_fp_data, y, cv=N_FOLDS, scoring='roc_auc', n_jobs=-1)
    auc_ecfp = cross_val_score(logreg, ecfp_data, y_ecfp, cv=N_FOLDS, scoring='roc_auc', n_jobs=-1)

    # Mean AUC across folds for each feature set.
    print('%s\t%s\t%s' % (t, auc_cnnfp.mean(), auc_ecfp.mean()))

Term	CNNFP	ECFP
5-alpha Reductase Inhibitors	0.999626185266	0.957731143392
Adjuvants, Anesthesia	0.980391181953	0.964977661844
Adjuvants, Immunologic	0.996582366921	0.827809812083
Adrenergic Agents	0.999626514092	0.977944930708
Adrenergic Uptake Inhibitors	0.999652941911	0.952155247785
Adrenergic alpha-1 Receptor Antagonists	0.999945295405	0.987745851119
Adrenergic alpha-2 Receptor Agonists	0.998869557939	0.962814147864
Adrenergic alpha-Agonists	0.99982475356	0.973018461405
Adrenergic alpha-Antagonists	0.998239899539	0.950806465262
Adrenergic beta-1 Receptor Agonists	0.999726954924	0.978087551006
