TODO:

- Look at 2-medoids splitting. Estimate the mean Tanimoto distance between the two splits, and compare it with the mean Tanimoto distance between activity-based splits.

- Run a version of the paper's experiment that does exactly the same thing, but with no activity splitting (as a benchmark).

In [2]:
import numpy as np
import pandas as pd

In [3]:
# From https://github.com/letiantian/kmedoids/blob/master/kmedoids.py

def kMedoids(D, k, tmax=100, random_state=None):
    """Cluster points into ``k`` groups by alternating assignment / medoid update.

    Parameters
    ----------
    D : numpy.ndarray
        Two-dimensional matrix of pairwise distances; ``D[i, j]`` is the
        distance between points ``i`` and ``j``.
    k : int
        Number of medoids (clusters) to find.
    tmax : int, optional
        Maximum number of assignment/update iterations.
    random_state : None, int or numpy.random.RandomState, optional
        Source of randomness for the initialisation. ``None`` (the default)
        uses NumPy's global generator, exactly as before; an int or a
        ``RandomState`` makes the result reproducible.

    Returns
    -------
    M : numpy.ndarray
        Indices of the ``k`` medoids; ``M[kappa]`` is the medoid of cluster
        ``kappa``.
    C : dict
        Maps each cluster label ``0..k-1`` to the array of member indices.

    Raises
    ------
    Exception
        If ``k`` exceeds the number of points, or the number of points left
        after discarding zero-distance duplicates.
    """
    # determine dimensions of distance matrix D (unpacking also requires D to be 2-D)
    _, n = D.shape

    if k > n:
        raise Exception('too many medoids')

    # pick the random source; the module-level functions are the historical default
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # find a set of valid initial cluster medoid indices since we
    # can't seed different clusters with two points at the same location
    invalid_medoid_inds = set()
    rs, cs = np.where(D == 0)
    # the rows, cols must be shuffled because we will keep the first duplicate below.
    # An index *array* is shuffled: a Python 3 ``range`` is immutable and cannot be.
    index_shuf = np.arange(len(rs))
    rng.shuffle(index_shuf)
    rs = rs[index_shuf]
    cs = cs[index_shuf]
    for r, c in zip(rs, cs):
        # if there are two points with a distance of 0...
        # keep the first one for cluster init
        if r < c and r not in invalid_medoid_inds:
            invalid_medoid_inds.add(c)
    valid_medoid_inds = list(set(range(n)) - invalid_medoid_inds)

    if k > len(valid_medoid_inds):
        raise Exception('too many medoids (after removing {} duplicate points)'.format(
            len(invalid_medoid_inds)))

    # randomly initialize an array of k medoid indices
    M = np.array(valid_medoid_inds)
    rng.shuffle(M)
    M = np.sort(M[:k])

    # working copy that receives the updated medoids of each iteration
    Mnew = np.copy(M)

    # initialize a dictionary to represent clusters
    C = {}
    for t in range(tmax):
        # determine clusters, i. e. arrays of data indices
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]
        # update cluster medoids: the member with the smallest mean distance
        # to the other members of its cluster
        for kappa in range(k):
            members = C[kappa]
            if members.size == 0:
                # nothing was assigned to this cluster; keep its previous medoid
                # instead of failing on argmin of an empty array
                continue
            mean_dist = np.mean(D[np.ix_(members, members)], axis=1)
            Mnew[kappa] = members[np.argmin(mean_dist)]
        # NOTE: the original called ``np.sort(Mnew)`` here and discarded the
        # result (np.sort returns a copy), so it had no effect. Mnew is kept in
        # cluster-label order so that M[kappa] stays the medoid of C[kappa].
        # check for convergence
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # iteration budget exhausted (or tmax == 0): make the memberships
        # consistent with the final medoids
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]

    # return results
    return M, C


In [6]:
from sklearn.metrics.pairwise import pairwise_distances
import joblib

In [7]:
# Pull in data for a single target name
def get_data(tgt_name='COX-2'):
    """Load the four stored objects belonging to one target.

    Every file lives in ``datasets/<tgt_name>/`` and is named after the
    target; all of them are read with ``joblib.load``.

    Parameters
    ----------
    tgt_name : str
        Name of the target, which is also the name of its sub-directory.

    Returns
    -------
    tuple
        ``(preds, resps, smiles, dy)`` read from ``<tgt_name>_predsu.npy``,
        ``<tgt_name>_respu.npy``, ``<tgt_name>.smiu`` and ``<tgt_name>.dyu``
        respectively. (Presumably predictors, responses and SMILES strings;
        the content of ``dy`` cannot be told from here -- TODO confirm.)
    """
    # common path prefix shared by all four files of this target
    stem = 'datasets/' + tgt_name + '/' + tgt_name
    suffixes = ('_predsu.npy', '_respu.npy', '.smiu', '.dyu')
    preds, resps, smiles, dy = [joblib.load(stem + suffix) for suffix in suffixes]
    return preds, resps, smiles, dy

### First let's look at the smallest dataset - A2a.

In [8]:
# Unpack predictors (p), responses (r) and SMILES (sm) for A2a; the fourth return value (dy) is not used here
p, r, sm, _ = get_data('A2a')

In [10]:
# Dimensions of the predictor array (rows x columns)
p.shape

(199, 128)

In [11]:
# Full matrix of pairwise Jaccard distances between the rows of p
# NOTE(review): 'jaccard' is a boolean metric -- presumably p holds 0/1 fingerprint bits; confirm
D = pairwise_distances(np.asarray(p), metric='jaccard')



In [12]:
# The distance matrix has one row and one column per row of p
D.shape

(199, 199)

In [13]:
# Eyeball the distance matrix (zero diagonal, symmetric off-diagonal entries)
D

array([[ 0.        ,  0.66153846,  0.75      , ...,  0.73529412,
         0.72      ,  0.734375  ],
       [ 0.66153846,  0.        ,  0.79365079, ...,  0.68181818,
         0.5942029 ,  0.73846154],
       [ 0.75      ,  0.79365079,  0.        , ...,  0.75      ,
         0.71212121,  0.79310345],
       ..., 
       [ 0.73529412,  0.68181818,  0.75      , ...,  0.        ,
         0.73684211,  0.71428571],
       [ 0.72      ,  0.5942029 ,  0.71212121, ...,  0.73684211,
         0.        ,  0.66176471],
       [ 0.734375  ,  0.73846154,  0.79310345, ...,  0.71428571,
         0.66176471,  0.        ]])

In [16]:
# Split the rows of p into two clusters; M holds the two medoid row indices,
# C maps each cluster label to the row indices of its members.
# kMedoids initialises randomly, so the result can differ between runs.
M, C = kMedoids(D, 2)

# Show the predictor row of each medoid
print('medoids:')
for point_idx in M:
    print( p[point_idx] )

print('')
# (disabled) dump every point together with its cluster label:
#print('clustering result:')
#for label in C:
#    for point_idx in C[label]:
#        print('label {0}: {1}'.format(label, p[point_idx]))

medoids:
[1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1
 1 0 1 1 0 0 0 0 1 1 1 1 0 1 0 0 1 1 1 0 0 0 0 0 1 0 1 1 0 1 0 1 1 0 0 0 1
 0 1 0 1 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 1 1 1 0 0
 0 1 1 1 0 0 0 0 1 0 0 1 1 0 1 1 1]
[1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0
 1 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0
 1 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 1 1 1 0 0 1 1 0 0 0 0 0
 0 1 0 1 0 0 1 1 1 1 1 1 0 0 0 0 1]



In [18]:
# Number of members in cluster 0 (123 in the recorded run; labels and sizes
# can change between runs because kMedoids initialises randomly)
C[0].shape

(123,)

In [19]:
# Number of members in cluster 1 (76 in the recorded run; labels and sizes
# can change between runs because kMedoids initialises randomly)
C[1].shape

(76,)

In [38]:
from scipy.spatial.distance import pdist

In [47]:
# Jaccard distance between the two cluster medoids (rows M[0] and M[1] of p)
print('Distance between cluster centres: {:.4f}'.format(pairwise_distances(p[M[0]].reshape(1,-1), 
                                                                p[M[1]].reshape(1,-1), 
                                                                metric='jaccard')[0,0]))

# Mean pairwise Jaccard distance over all rows of p
print('Average pairwise distance:        {:.4f}'.format(np.mean(pdist(p, metric='jaccard'))))
# Mean pairwise distance inside each of the two k-medoids clusters
print('Average pairwise distance (C0):   {:.4f}'.format(np.mean(pdist(p[C[0]], metric='jaccard'))))
print('Average pairwise distance (C1):   {:.4f}'.format(np.mean(pdist(p[C[1]], metric='jaccard'))))
# Reference split: cut the rows half-way by response value. argsort is ascending,
# so idcs[:idx_end] (C0A) is the lower-response half and idcs[idx_end:] (C1A) the
# higher-response half; with an odd number of rows C1A gets the extra one.
idcs = np.argsort(r)
idx_end = int(r.shape[0]/2)
print('Average pairwise distance (C0A):  {:.4f}'.format(np.mean(pdist(p[idcs[:idx_end]], metric='jaccard'))))
print('Average pairwise distance (C1A):  {:.4f}'.format(np.mean(pdist(p[idcs[idx_end:]], metric='jaccard'))))

Distance between cluster centres: 0.7051
Average pairwise distance:        0.7253
Average pairwise distance (C0):   0.6978
Average pairwise distance (C1):   0.7077
Average pairwise distance (C0A):  0.7581
Average pairwise distance (C1A):  0.6713


### Now let's do it for everything...

In [49]:
import os
from glob import glob

# Every sub-directory of datasets/ is one target. Taking the basename (instead of
# stripping the literal 'datasets/' prefix) also works with OS-specific path
# separators, and stray non-directory entries are skipped because get_data
# could not load them.
targets = [os.path.basename(path) for path in glob('datasets/*') if os.path.isdir(path)]
# Order the targets from the smallest to the largest dataset (length of the
# predictor array). NOTE: this loads every dataset once just to measure it.
sorted_targets = sorted(targets, key=lambda x: len(get_data(x)[0]))

In [50]:
# Accumulator for the per-target summary: one list per output column.
#   CC_dist            distance between the two medoids
#   Avg_dist           mean pairwise distance over the whole dataset
#   C0_dist / C1_dist  mean pairwise distance inside each k-medoids cluster
#   C0A_dist / C1A_dist  mean pairwise distance inside each half of the response-sorted split
results = {'name' : [], 'CC_dist' : [], 'Avg_dist' : [], 
           'C0_dist' : [], 'C1_dist' : [], 'C0A_dist' : [], 'C1A_dist' :[]}

In [51]:
# Per-target comparison of a 2-medoids split with a half-way split on the response.
# The accumulator is rebuilt here so that re-running this cell starts from empty
# lists instead of appending a second copy of every row.
results = {'name' : [], 'CC_dist' : [], 'Avg_dist' : [],
           'C0_dist' : [], 'C1_dist' : [], 'C0A_dist' : [], 'C1A_dist' :[]}

# NOTE: kMedoids draws its initial medoids from NumPy's global random generator, so
# cluster labels (and possibly the split itself) can differ between runs unless the
# generator is seeded beforehand.
for tgt in sorted_targets:
    p, r, _, _ = get_data(tgt)
    D = pairwise_distances(np.asarray(p), metric='jaccard')
    M, C = kMedoids(D, 2)

    # response-based reference split: argsort is ascending, so the first half of
    # idcs is the lower-response half (C0A) and the rest the higher-response half (C1A)
    idcs = np.argsort(r)
    idx_end = int(r.shape[0]/2)

    row = {
        'name': tgt,
        # the medoid-to-medoid distance is already an entry of D
        'CC_dist': D[M[0], M[1]],
        'Avg_dist': np.mean(pdist(p, metric='jaccard')),
        'C0_dist': np.mean(pdist(p[C[0]], metric='jaccard')),
        'C1_dist': np.mean(pdist(p[C[1]], metric='jaccard')),
        'C0A_dist': np.mean(pdist(p[idcs[:idx_end]], metric='jaccard')),
        'C1A_dist': np.mean(pdist(p[idcs[idx_end:]], metric='jaccard')),
    }

    # append only once the whole row has been computed, so a failure part-way
    # through a target cannot leave the column lists with different lengths
    for column, value in row.items():
        results[column].append(value)

In [56]:
# Present the collected summary as a table with a fixed column order
summary_columns = ['name', 'CC_dist', 'Avg_dist', 'C0_dist', 'C1_dist', 'C0A_dist', 'C1A_dist']
summary_table = pd.DataFrame.from_dict(results)
summary_table[summary_columns]

Unnamed: 0,name,CC_dist,Avg_dist,C0_dist,C1_dist,C0A_dist,C1A_dist
0,A2a,0.705128,0.725285,0.707729,0.697805,0.758064,0.671323
1,Dopamine,0.666667,0.687367,0.670254,0.633486,0.698943,0.65292
2,Dihydrofolate,0.571429,0.650236,0.60245,0.627717,0.668422,0.622257
3,Carbonic,0.702128,0.714966,0.688403,0.718835,0.737423,0.658226
4,ABL1,0.628571,0.655916,0.572591,0.636682,0.661175,0.63355
5,opioid,0.689655,0.679006,0.670102,0.647386,0.69311,0.640824
6,Cannabinoid,0.6,0.678897,0.660299,0.657782,0.698942,0.644796
7,Androgen,0.640625,0.720247,0.735152,0.697496,0.726467,0.705101
8,COX-1,0.630769,0.726508,0.732315,0.690919,0.717391,0.731438
9,Monoamine,0.64,0.747634,0.738982,0.716815,0.74745,0.74209
