In [1]:


%matplotlib inline

import random
import math
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans

from sklearn.base import BaseEstimator
from sklearn.model_selection import ShuffleSplit
from sklearn import datasets, neighbors
import matplotlib.pyplot as plt
import matplotlib as mpl

import pandas as pd
import sklearn.metrics as metrics
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem,DataStructs,Draw,PandasTools,Descriptors
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.Draw import IPythonConsole
from rdkit import RDLogger
from functools import partial
from modAL.models import ActiveLearner, Committee
from modAL.uncertainty import uncertainty_sampling
from modAL.batch import uncertainty_batch_sampling
from modAL.uncertainty import classifier_uncertainty
from modAL.uncertainty import margin_sampling
from modAL.disagreement import max_disagreement_sampling
from modAL.expected_error import expected_error_reduction
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from IPython.display import Image
uncharger = rdMolStandardize.Uncharger()
PandasTools.molRepresentation='svg'



__TODO__: Pass dataset as parameter

In [4]:
def standardize(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        mol = rdMolStandardize.FragmentParent(mol)
        mol = uncharger.uncharge(mol)
    return mol
    
def data_setup():
    mol_data = pd.read_csv("./data/Beta-secretase_1.balanced.tsv", header=None,sep='\t')
    mol_data["MOL"] = mol_data[mol_data.columns[0]].apply(standardize)
    mol_data.rename(columns={mol_data.columns[0]: "SMILES", mol_data.columns[1]: "Label"}, inplace=True)
    mol_data.drop_duplicates(['SMILES'],inplace=True)
    mol_data.reset_index(drop=True, inplace=True)
    return mol_data

def create_fingerprints(morgan_radius, morgan_n_bits, mol_data):
    fps = [AllChem.GetMorganFingerprintAsBitVect(m, morgan_radius, nBits=morgan_n_bits) for m in mol_data["MOL"]]
    X_morgan = np.asarray(fps)
    Y_labels = mol_data["Label"].to_numpy()
    return X_morgan, Y_labels

In [5]:
# https://github.com/rdkit/rdkit/issues/2320
def disable_rdkit_logging():
    """
    Disables RDKit whiny logging.
    """
    import rdkit.rdBase as rkrb
    import rdkit.RDLogger as rkl
    logger = rkl.logger()
    logger.setLevel(rkl.ERROR)
    rkrb.DisableLog('rdApp.error')

In [None]:
""" 
AL with expected error reduction
"""
disable_rdkit_logging()

N_QUERIES = 20
N_LABELS_PER_QUERY = 100
query_list = []
#Each element is the qu
auc_list = []
count = 0
n_labels = 50
batch_size = 300
ss = ShuffleSplit(n_splits = 5, train_size = n_labels, random_state=1)
mol_data = data_setup()
X_morgan, Y_labels = create_fingerprints(2, 1024, mol_data)
SCORE = list() # spara intermediate accuracy värden osv
for labeled_index, unlabeled_index in ss.split(X_morgan):
    X_labeled, X_pool = X_morgan[labeled_index], X_morgan[unlabeled_index]    
    y_labeled, y_pool = Y_labels[labeled_index], Y_labels[unlabeled_index]
    count+=1
    print('Iteration {i}:'.format(i=count))
    learner = ActiveLearner(
    estimator=RandomForestClassifier(random_state=1),
    #query_strategy=uncertainty_sampling,
    X_training=X_labeled, y_training=y_labeled
    )
    predictions = learner.predict(X_morgan)
    unqueried_score = learner.score(X_morgan, Y_labels)
    probas_ = learner.predict_proba(X_morgan)
    fpr, tpr, _ = metrics.roc_curve(Y_labels, probas_[:,1],pos_label = 1)
    auc_list.append(metrics.auc(fpr,tpr))
    query_list.append(unqueried_score)
    performance_history = [unqueried_score]
    for index in range(N_QUERIES):
    
        #query_indices, query_instances 
        query_indices = np.random.choice(np.arange(len(X_pool)), size=300,replace=False)
        query_indices = expected_error_reduction(learner=learner,X=X_pool[query_indices], n_instances=batch_size)
        X, y = X_pool[query_indices].reshape(batch_size, -1), y_pool[query_indices].reshape(batch_size,)
        learner.teach(X=X, y=y)
        X_pool, y_pool = np.delete(X_pool, query_indices, axis=0), np.delete(y_pool, query_indices)
        model_accuracy = learner.score(X_morgan, Y_labels)
        probas_ = learner.predict_proba(X_morgan)
        fpr, tpr, _ = metrics.roc_curve(Y_labels, probas_[:,1],pos_label = 1)
        auc_list.append(metrics.auc(fpr,tpr))        
        print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1, acc=model_accuracy))
        performance_history.append(model_accuracy)
        query_list.append(model_accuracy)


Iteration 1:
Accuracy after query 1: 0.9200
Accuracy after query 2: 0.9305
Accuracy after query 3: 0.9386
Accuracy after query 4: 0.9425
Accuracy after query 5: 0.9488
Accuracy after query 6: 0.9525
Accuracy after query 7: 0.9541
Accuracy after query 8: 0.9569
Accuracy after query 9: 0.9594
Accuracy after query 10: 0.9609
Accuracy after query 11: 0.9635
Accuracy after query 12: 0.9645
