In [None]:
#default_exp applicability_domain

In [None]:
%reload_ext autoreload
%autoreload 2
%load_ext watermark
%watermark -a 'Marcos Santana' -d -p sophosdata,numpy,pandas,descriptastorus,joblib,rdkit,tqdm,fastcore -v



Author: Marcos Santana

Python implementation: CPython
Python version       : 3.8.8
IPython version      : 7.22.0

sophosdata     : 1.3
numpy          : 1.20.1
pandas         : 1.1.3
descriptastorus: 2.3.0.5
joblib         : 0.17.0
rdkit          : 2022.03.1
tqdm           : 4.59.0
fastcore       : 1.4.2



In [None]:
#export
import pandas as pd
import numpy as np
from rdkit import Chem
from scipy.spatial import distance

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# from rdkit.Chem import Draw
# import mols2grid
# import joblib
# from sophosdata.all import *

[14:25:22] Initializing Normalizer
  from pandas import MultiIndex, Int64Index


# Load data

In [None]:
trainset = pd.read_csv('../Pf-PHEN_23_05_22_processed_dev_pIC50_5_classification_2022-05-23_trainset.csv')
testset = pd.read_csv('../Pf-PHEN_23_05_22_processed_dev_pIC50_5_classification_2022-05-23_testset.csv')

In [None]:
# Rebind rather than mutate in place so that re-running this cell is idempotent.
trainset = trainset.reset_index(drop=True)
testset = testset.reset_index(drop=True)

# Featurize

In [None]:
# NOTE(review): no visible cell imports `Fingerprinter`; the only candidate is the
# commented-out `from sophosdata.all import *` above - confirm its origin and restore the
# import, otherwise this cell fails on a fresh kernel.
fingerprinter = Fingerprinter('ecfp')

In [None]:
Xtrain = fingerprinter.generate_features(trainset['processed_smiles'].values)
Xtest = fingerprinter.generate_features(testset['processed_smiles'].values)
X_sample = Xtest[0].reshape(1, Xtest.shape[1])

In [None]:
X_sample_2 = fingerprinter.generate_features([testset['processed_smiles'].values[0]])

In [None]:
X_sample.shape

(1, 1024)

In [None]:
np.testing.assert_array_equal(X_sample, X_sample_2)

# Applicability domain

## K-nearest neighbours (Z-kNN)

**AD threshold**

In [None]:
def calculate_similarity_from_array(fp1, fp2=None, metric='jaccard', z=0.5):

    """Computes the pairwise distances between two arrays of fingerprints.

    Despite the name, the values returned are distances (as produced by
    `scipy.spatial.distance.cdist`), not similarities.

    Arguments
    -----------------------------------------------------------------------------

    fp1 : numpy.array
        An array of fingerprints, one fingerprint per row.

    fp2 : numpy.array
        Second array of fingerprints. If None, defaults to `fp1`.

    metric : str or callable, optional
        The distance metric to use. Any metric accepted by
        `scipy.spatial.distance.cdist`, e.g. 'jaccard', 'dice', 'euclidean',
        'cosine', 'hamming', 'cityblock', ...

    z : float
        Not used by this function; kept so the signature stays compatible with callers.


    Returns
    -----------------------------------------------------------------------------

    simi_matrix : numpy.array
        Pairwise distances with singleton dimensions squeezed out: shape
        (len(fp1), len(fp2)) in general, 1-D when one of the inputs has a single
        row and 0-d when both do.


    """

    # Compare `fp1` against itself when no second array is supplied.
    other = fp1 if fp2 is None else fp2
    return distance.cdist(fp1, other, metric=metric).squeeze()

def calculate_ad_threhold(X, metric='jaccard', z=0.5):

    """Calculates the applicability domain threshold from the pairwise distances of `X`.

    The threshold is `mean(D) + z * std(D)`, where `D` holds the distances between
    every pair of rows of `X` as returned by `calculate_similarity_from_array(X)`.

    Arguments
    -----------------------------------------------------------------------------

    X : numpy.array
        An array of fingerprints, one fingerprint per row.

    metric : str or callable, optional
        The distance metric to use. Any metric accepted by
        `scipy.spatial.distance.cdist`, e.g. 'jaccard', 'dice', 'euclidean', ...

    z : float
        Significance threshold, i.e. the multiplier applied to the standard deviation.
        See original publication for more details: https://pubs.acs.org/doi/10.1021/ci060132x


    Returns
    -----------------------------------------------------------------------------

    ad_threshold : float
        The distance threshold used to define the applicability domain.


    """

    # Statistics are taken over every entry of the X-vs-X matrix, self-pairs included.
    pairwise = calculate_similarity_from_array(X, metric=metric).squeeze()
    spread = np.std(pairwise)
    centre = np.mean(pairwise)
    return (z*spread) + centre




In [None]:
ad_threhold = calculate_ad_threhold(Xtrain)

ad_threhold


0.9194734400235668

### **Get k-nearest neighbours**

In [None]:
def get_knn(fp, ref_fps, k:int=10):

    """Computes query-to-reference distances and ranks the references for every query.

    Arguments
    -----------------------------------------------------------------------------
    fp : numpy.array
        Query fingerprints, one per row.

    ref_fps : numpy.array
        Reference fingerprints, one per row.

    k : int
        Not used here: the complete ranking is returned and callers slice the
        first `k` columns themselves.


    Returns
    -----------------------------------------------------------------------------
    distances : numpy.array
        Matrix of shape (len(fp), len(ref_fps)) with pairwise distances.

    neighbours : numpy.array
        Matrix of the same shape; row i lists the reference indices ordered from
        the closest to the farthest from query i.


    """

    # The helper squeezes singleton axes, so restore the 2-D layout explicitly.
    n_query, n_ref = len(fp), len(ref_fps)
    dist_matrix = calculate_similarity_from_array(fp, ref_fps).reshape(n_query, n_ref)
    # Sorting along the last axis keeps the (n_query, n_ref) shape.
    ranking = np.argsort(dist_matrix, axis=-1)
    return dist_matrix, ranking

In [None]:
distances, neighbours = get_knn(Xtest, Xtrain)

In [None]:
distances[0].shape

(2648,)

In [None]:
distances_sample, neighbours_sample = get_knn(X_sample, Xtrain)
distances_sample.shape

(1, 2648)

In [None]:
np.testing.assert_equal(distances_sample[0], distances[0])

In [None]:
assert distances.shape[0] == Xtest.shape[0]
assert distances.shape[1] == Xtrain.shape[0]

In [None]:
# For every query row, pick the distances of its 10 closest reference fingerprints.
kk = [row[order[:10]] for row, order in zip(distances, neighbours)]

In [None]:
kk[1]

array([0.33653846, 0.37037037, 0.42      , 0.42857143, 0.46078431,
       0.47572816, 0.48543689, 0.49557522, 0.51351351, 0.52777778])


### **Calculate AD**

In [None]:
def calculate_applicability_domain(fp, ref_fp, z:float, k:int=10):

    """Calculates the mean distance between each query and its k nearest reference fingerprints.

    Arguments
    -----------------------------------------------------------------------------

    fp : numpy.array
        Query fingerprints, one per row.

    ref_fp : numpy.array
        Reference fingerprints, one per row.

    z : float
        Significance threshold. See original publication for more details: https://pubs.acs.org/doi/10.1021/ci060132x
        Currently not used by this function; kept for signature compatibility.

    k : int
        Number of k nearest neighbours


    Returns
    -----------------------------------------------------------------------------

    avg_distance : numpy.array
        One value per query: the average distance to its `k` nearest reference fingerprints.


    """

    # Bind the distances computed for *these* arguments; the previous misspelt
    # name made the function read a same-named notebook global instead.
    distances, neighbours = get_knn(fp, ref_fp, k=k)
    # Gather, per row, the distances of the k closest references and average them.
    avg_distance = np.take_along_axis(distances, neighbours[:, :k], 1).mean(-1)
    return avg_distance

In [None]:
avg_distance = np.take_along_axis(distances_sample, neighbours_sample[:, :10], 1).mean(1)
avg_distance>=0.5

array([ True])

In [None]:
ad_threhold

0.9194734400235668

In [None]:
# NOTE(review): the computed distance threshold is passed as `z`, which the docstring
# describes as the significance multiplier. The function body never reads `z`, so the
# averages returned here do not depend on this value - confirm what was intended.
calculate_applicability_domain(Xtest, Xtrain, z=ad_threhold)

array([0.61098674, 0.45142961, 0.68963638, 0.61702224, 0.7618961 ,
       0.79018336, 0.75093942, 0.63094869, 0.38533987, 0.68428936,
       0.43575212, 0.71685246, 0.74289203, 0.74459016, 0.64661555,
       0.61538241, 0.73866228, 0.7275278 , 0.73962316, 0.80795721,
       0.74442247, 0.68261528, 0.6825706 , 0.75174508, 0.51730006,
       0.50708319, 0.58156801, 0.55192986, 0.741607  , 0.44439823,
       0.77118182, 0.77366232, 0.42336297, 0.50471898, 0.58877224,
       0.70764503, 0.78797507, 0.79787541, 0.80716715, 0.43177211,
       0.44788453, 0.44924766, 0.44195935, 0.50961282, 0.43075815,
       0.47368817, 0.43531288, 0.62075708, 0.78179879, 0.7819906 ,
       0.4340867 , 0.70932196, 0.76597742, 0.80265408, 0.76783991,
       0.75218678, 0.7557019 , 0.74439282, 0.73959934, 0.74670102,
       0.77886472, 0.79106291, 0.77540064, 0.77397191, 0.7580933 ,
       0.78490432, 0.74573406, 0.78189105, 0.4331026 , 0.40482028,
       0.45940859, 0.81010921, 0.25432034, 0.39463257, 0.35970

In [None]:
# # setup the grid
# grid = mols2grid.MolGrid(trainset.iloc[neighbours], smiles_col="processed_smiles", size=(200, 180), name="Hs-HEPG2")
# grid.display(subset=["ID","img"], n_cols=4, n_rows=3)

# Refactor

In [None]:
#export
class BaseDomain:
    """Base class whose `calculate_applicability_domain` hook is overridden by subclasses."""

    def calculate_applicability_domain(self):
        """Placeholder implementation: does nothing and returns None."""
        return None

In [None]:
#export
class kNNDomain(BaseDomain):

    """Calculates the applicability domain using the k-nearest neighbours approach

    A query is flagged as inside the domain when its mean distance to the `k` closest
    reference fingerprints is not greater than `ad_threshold`, where
    `ad_threshold = mean(D) + z * std(D)` and `D` is the pairwise distance matrix of `Xref`.

    Attributes
    -----------------------------------------------------------------------------

    Xref : numpy.array
        Reference fingerprints, one per row.

    metric : str or callable, optional
        The distance metric to use. Any metric accepted by
        `scipy.spatial.distance.cdist`, e.g. 'euclidean', 'jaccard', 'dice',
        'cosine', 'hamming', 'cityblock', ...

    z : float
        Significance threshold used to derive `ad_threshold`.
        See original publication for more details: https://pubs.acs.org/doi/10.1021/ci060132x

    ad_threshold : float
        Distance threshold computed from `Xref` at construction time; it can be
        overwritten through the property setter.


    """

    def __init__(self, Xref:np.array, metric='euclidean', z:float=0.5):

        """Stores the reference set and derives the distance threshold from it."""

        self.Xref = Xref
        self.metric = metric
        self.z = z
        self.ad_threshold = self.calculate_ad_threhold(metric=metric, z=z)

    @property
    def ad_threshold(self):
        return self._ad_threshold

    @ad_threshold.setter
    def ad_threshold(self, v):
        self._ad_threshold = v

    def calculate_similarity_from_array(self, fp1:np.array, fp2:np.array=None, metric:str=None, z=0.5):

        """Computes the pairwise distances between two arrays of fingerprints.

        Despite the name, the values returned are distances, not similarities.

        Arguments
        -----------------------------------------------------------------------------

        fp1 : numpy.array
            An array of fingerprints.

        fp2 : numpy.array
            Second array of fingerprints. If None, defaults to `fp1`.

        metric : str or callable, optional
            The distance metric to use. If None, defaults to `self.metric`.

        z : float
            Not used by this method; kept so the signature stays compatible with callers.


        Returns
        -----------------------------------------------------------------------------

        simi_matrix : numpy.array
            Pairwise distances with singleton dimensions squeezed out.


        """

        # cdist rejects `metric=None`, so fall back to the metric chosen at construction.
        metric = self.metric if metric is None else metric
        fp2 = fp1 if fp2 is None else fp2
        return distance.cdist(fp1, fp2, metric=metric).squeeze()

    def calculate_ad_threhold(self, X:np.array=None,  metric:str=None, z=0.5):

        """Calculates the applicability domain threshold from the pairwise distances of `X`.

        Arguments
        -----------------------------------------------------------------------------

        X : numpy.array
            An array of fingerprints. If None, defaults to `self.Xref`.

        metric : str or callable, optional
            The distance metric to use. If None, defaults to `self.metric`.

        z : float
            Significance threshold, i.e. the multiplier applied to the standard deviation.
            See original publication for more details: https://pubs.acs.org/doi/10.1021/ci060132x


        Returns
        -----------------------------------------------------------------------------

        ad_threshold : float
            The distance threshold used to define the applicability domain.


        """

        X = self.Xref if X is None else X
        simi_matrix = self.calculate_similarity_from_array(X, metric=metric).squeeze()

        # NOTE(review): the statistics cover every entry of the X-vs-X matrix,
        # self-pairs on the diagonal included - confirm this is intended.
        std_distances = np.std(simi_matrix) # std of distances
        avg_distances = np.mean(simi_matrix) # average of distances

        # Applicability threshold
        return (z*std_distances) + avg_distances

    def get_knn(self, fp:np.array, ref_fp:np.array=None, k:int=10):

        """Computes query-to-reference distances and ranks the references for every query.

        Arguments
        -----------------------------------------------------------------------------
        fp : numpy.array
            Query fingerprints, one per row.

        ref_fp : numpy.array
            Reference fingerprints. If None, defaults to `self.Xref`.

        k : int
            Not used here: the complete ranking is returned and callers slice the
            first `k` columns themselves.


        Returns
        -----------------------------------------------------------------------------
        distances : numpy.array
            Matrix of shape (len(fp), len(ref_fp)) with pairwise distances.

        neighbours : numpy.array
            Matrix of the same shape; row i lists the reference indices ordered
            from the closest to the farthest from query i.


        """

        ref_fp = self.Xref if ref_fp is None else ref_fp
        # The distance helper squeezes singleton axes, so restore the 2-D layout.
        distances = self.calculate_similarity_from_array(fp, ref_fp, metric=self.metric).reshape(len(fp), len(ref_fp))
        # Sorting along the last axis keeps the (n_query, n_ref) shape.
        neighbours = np.argsort(distances, axis=-1)
        return distances, neighbours

    def calculate_applicability_domain(self, fp:np.array, ref_fp:np.array=None, k:int=10):

        """Flags which queries fall inside the applicability domain.

        Arguments
        -----------------------------------------------------------------------------

        fp : numpy.array
            Query fingerprints, one per row.

        ref_fp : numpy.array
            Reference fingerprints. If None, defaults to `self.Xref`.

        k : int
            Number of k nearest neighbours. Must be >= 1.

        Returns
        -----------------------------------------------------------------------------

        avg_distance : numpy.array
            One value per query: the average distance to its `k` nearest reference fingerprints.

        in_domain : numpy.array
            Boolean array, True where `avg_distance <= self.ad_threshold`.


        """
        assert k >=1, "k must be >= 1."
        ref_fp = self.Xref if ref_fp is None else ref_fp
        distances, neighbours = self.get_knn(fp, ref_fp, k=k)
        # Gather, per row, the distances of the k closest references and average them.
        avg_distance = np.take_along_axis(distances, neighbours[:, :k], 1).mean(-1)
        return (avg_distance,avg_distance<=self.ad_threshold)

In [None]:
ad_knn = kNNDomain(Xref=Xtrain, metric='euclidean')

In [None]:
ad_knn.ad_threshold

9.958452035589074

In [None]:
avg_distance = ad_knn.calculate_applicability_domain(Xtest, k=1)

In [None]:
avg_distance[1]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

# OneClass SVM

In [None]:
from sklearn.svm import OneClassSVM

In [None]:
model = OneClassSVM()

In [None]:
model.fit(Xtrain)

In [None]:
# X_sample is already 2-D (1, n_features); adding another axis makes it 3-D, which predict rejects.
preds = model.predict(X_sample)

ValueError: Found array with dim 3. OneClassSVM expected <= 2.

In [None]:
preds

In [None]:
#export
class SVMDomain:

    """Calculates the applicability domain using a one-class SVM outlier detector

    Attributes
    -----------------------------------------------------------------------------

    Xref : numpy.array
        Reference fingerprints

    svm_model : estimator
        Object exposing `set_params`, `fit` and `predict`. If None is given, a
        default `sklearn.svm.OneClassSVM()` is created. The model is not fitted
        on construction; call `train_model` first.


    """

    def __init__(self, Xref:np.array, svm_model=None):

        """Stores the reference set and the (unfitted) outlier-detection model."""

        if svm_model is None:
            # Imported here because a name bound by an import in the class body is
            # not visible as a bare name inside methods.
            from sklearn.svm import OneClassSVM
            svm_model = OneClassSVM()

        self.Xref = Xref
        self.svm_model = svm_model

    @property
    def svm_model(self):
        return self._svm_model

    @svm_model.setter
    def svm_model(self, v):
        self._svm_model = v

    def train_model(self, X:np.array=None, params=None):

        """Train a One class SVM model for outlier detection.

        Arguments
        -----------------------------------------------------------------------------

        X : numpy.array
            An array of fingerprints for the training set. If None, defaults to `self.Xref`.


        params : dict
           A dictionary of parameters forwarded to the model's `set_params`.
           If None, the model's current parameters are left unchanged.

        Returns
        -----------------------------------------------------------------------------

        svm_model : Fitted estimator


        """

        X = self.Xref if X is None else X
        # Avoid a shared mutable default argument.
        params = {} if params is None else params
        self.svm_model.set_params(**params)
        self.svm_model.fit(X)
        return self.svm_model

    def calculate_applicability_domain(self, Xtest:np.array=None):

        """Perform classification on samples in Xtest.

        Arguments
        -----------------------------------------------------------------------------

        Xtest : numpy.array
            An array of fingerprints. If None, defaults to `self.Xref`.

        Returns
        -----------------------------------------------------------------------------

        class_label : numpy.array
            The output of the model's `predict`; for a one-class model, +1 or -1 per sample.


        """

        Xtest = self.Xref if Xtest is None else Xtest
        return self.svm_model.predict(Xtest)

In [None]:
svm_domain = SVMDomain(Xtrain)

In [None]:
svm_domain.svm_model

In [None]:
svm_domain.train_model()

In [None]:
# X_sample is already 2-D (1, n_features); an extra leading axis would make it 3-D.
svm_domain.calculate_applicability_domain(X_sample)

In [None]:
#hide
from nbdev.export import notebook2script
notebook2script(fname='AD.ipynb')

Converted AD.ipynb.
