In [1]:
# HCASE Experiments Quantify Overlap
#
# Author: Gergely Zahoranszky-Kohalmi, PhD
#
# Email: gergely.zahoranszky-kohalmi@nih.gov
#
# Organization: National Center for Advancing Translational Sciences (NCATS/NIH)
#

In [2]:

import pandas as pd

import math

import matplotlib
#matplotlib.use('agg')
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Config section



# Input tables read with a tab separator by the import cell; each is later
# filtered on an 'hc_order' column and grouped on 'bucket_id'.
# Embeddings against the "natprod" reference (presumably a natural-product
# scaffold set -- confirm against the upstream embedding step).
fname_embedding_canvass_natprod = '../data/canvass_emb_hcase_natprod_bms_dim_2.tab'
fname_embedding_drugs_natprod = '../data/drugs_emb_hcase_natprod_bms_dim_2.tab'

# Embeddings against the "chembl" reference.
# NOTE(review): the drugs file name below lacks the '_24_1_bms_dim_2' suffix
# that the matching canvass file carries -- confirm it is the intended file.
fname_embedding_canvass_chembl = '../data/canvass_emb_hcase_chembl_24_1_bms_dim_2.tab'
fname_embedding_drugs_chembl = '../data/drugs_emb_hcase_chembl.tab'

# Output: tab-separated table of the Tanimoto overlaps computed by the workflow cell.
fname_out_overlaps = '../data/quantified_overlaps.tab'


In [4]:
# Import datasets

# All four embedding tables are tab-separated, so they are read with the same
# call, in the same order as the file names are declared in the config cell.
(
    df_embedded_canvass_natprod,
    df_embedded_drugs_natprod,
    df_embedded_canvass_chembl,
    df_embedded_drugs_chembl,
) = [
    pd.read_csv(tab_path, sep = '\t')
    for tab_path in (
        fname_embedding_canvass_natprod,
        fname_embedding_drugs_natprod,
        fname_embedding_canvass_chembl,
        fname_embedding_drugs_chembl,
    )
]

In [5]:
# Functions

def aggregate_dim (df):
    """Count the rows that fall into each bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'bucket_id' and 'id'.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'bucket_id', with the columns 'bucket_id' and
        'dim_count'; 'dim_count' is the number of non-null 'id' values in
        that bucket.
    """
    per_bucket = df.groupby(['bucket_id'], as_index = False).agg({'id': 'count'})

    return per_bucket.rename(columns = {'id': 'dim_count'})


def get_embedding_vector (df, n_dim):
    """Expand per-bucket counts into a dense vector of length ``n_dim``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'bucket_id' and 'dim_count' (the shape
        produced by ``aggregate_dim``).
    n_dim : int
        Length of the returned vector; positions are bucket ids 0..n_dim-1.

    Returns
    -------
    list
        Element ``i`` is the 'dim_count' of the first row whose 'bucket_id'
        equals ``i``, or 0 when no such row exists. Rows whose 'bucket_id'
        lies outside ``range(n_dim)`` are ignored.
    """
    # Build the lookup in a single pass instead of re-filtering the whole
    # frame for every bucket index (n_dim reaches 65536 in this notebook).
    count_by_bucket = {}
    for bucket_id, dim_count in zip(df['bucket_id'].tolist(), df['dim_count'].tolist()):
        # setdefault keeps the first row seen for a bucket, matching the
        # "take the first match" behaviour if a bucket id ever repeats.
        count_by_bucket.setdefault(bucket_id, dim_count)

    return [count_by_bucket.get(i, 0) for i in range(n_dim)]
        
def process_vectors (vec1, vec2):
    """Return the dot product of two equal-length numeric sequences.

    Parameters
    ----------
    vec1, vec2 : sequence of numbers
        Must have the same length.

    Returns
    -------
    number
        Sum of the element-wise products; 0 for two empty sequences.

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    """
    if len(vec1) != len(vec2):
        # Raise instead of calling sys.exit: 'sys' is not imported in this
        # notebook, and a helper should not terminate the kernel.
        raise ValueError (
            '[ERROR] Mismatch in vector lengths: %d vs %d.' % (len(vec1), len(vec2))
        )

    return sum(x * y for x, y in zip(vec1, vec2))

In [6]:
# Workflow


# Column order of the overlap table as it is written to disk.
overlap_columns = ['dataset', 'reference_set', 'phc_order', 'tanimoto_sim']


def quantify_overlap (df_canvass_all, df_drugs_all, orders, reference_set):
    """Tanimoto overlap of the drug and CANVASS sets for each curve order.

    For every order in ``orders`` both frames are restricted to rows with
    that 'hc_order', the rows are counted per 'bucket_id', and the two count
    vectors (length ``(2 ** order) ** 2``) are compared with the count-based
    Tanimoto coefficient ``c / (a + b - c)``, where ``c`` is the dot product
    of the two vectors and ``a``, ``b`` are their self dot products.

    Parameters
    ----------
    df_canvass_all, df_drugs_all : pandas.DataFrame
        Must contain the columns 'hc_order', 'bucket_id' and 'id'.
    orders : iterable of int
        Orders to evaluate.
    reference_set : str
        Label stored in the 'reference_set' column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per order with the columns listed in ``overlap_columns``.

    Raises
    ------
    ZeroDivisionError
        If neither frame has any counted row at some order, since the
        coefficient is undefined there.
    """
    records = []

    for order in orders:
        # The count vector has (2 ** order) ** 2 positions.
        n_dim = (2 ** order) ** 2
        print (n_dim)
        print ("z: %d" % (order))

        df_canvass = df_canvass_all[df_canvass_all['hc_order'] == order]
        df_drug = df_drugs_all[df_drugs_all['hc_order'] == order]

        drug_ev = get_embedding_vector (aggregate_dim(df_drug), n_dim)
        canvass_ev = get_embedding_vector (aggregate_dim(df_canvass), n_dim)

        c = process_vectors (drug_ev, canvass_ev)
        a = process_vectors (drug_ev, drug_ev)
        b = process_vectors (canvass_ev, canvass_ev)

        T = float(c / (a + b - c))

        records.append({
            'dataset': 'drugbank_canvass',
            'reference_set': reference_set,
            'phc_order': int(order),
            'tanimoto_sim': T
        })

        print (T)

    return pd.DataFrame (records, columns = overlap_columns)


# The ChEMBL reference is evaluated for orders 2..8, the natprod reference for orders 2..5.
df_overlap_chembl = quantify_overlap (
    df_embedded_canvass_chembl, df_embedded_drugs_chembl, range (2, 9), 'chembl'
)
df_overlap_natprod = quantify_overlap (
    df_embedded_canvass_natprod, df_embedded_drugs_natprod, range (2, 6), 'natprod'
)

# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index
# yields the same stacked table with a fresh 0..n-1 index.
df_all = pd.concat ([df_overlap_chembl, df_overlap_natprod], ignore_index = True)

df_all.to_csv (fname_out_overlaps, sep = '\t', index = False)

print (df_all)

16
z: 2
0.20423860961025234
64
z: 3
0.1608551701493082
256
z: 4
0.11384857689278884
1024
z: 5
0.08642522329236725
4096
z: 6
0.059257756018365845
16384
z: 7
0.052472223684903375
65536
z: 8
0.051918433056077275
16
z: 2
0.18360033318482072
64
z: 3
0.10412149993690996
256
z: 4
0.07852931235212378
1024
z: 5
0.07578927974521958
             dataset reference_set  phc_order  tanimoto_sim
0   drugbank_canvass        chembl          2      0.204239
1   drugbank_canvass        chembl          3      0.160855
2   drugbank_canvass        chembl          4      0.113849
3   drugbank_canvass        chembl          5      0.086425
4   drugbank_canvass        chembl          6      0.059258
5   drugbank_canvass        chembl          7      0.052472
6   drugbank_canvass        chembl          8      0.051918
7   drugbank_canvass       natprod          2      0.183600
8   drugbank_canvass       natprod          3      0.104121
9   drugbank_canvass       natprod          4      0.078529
10  drugbank_can

In [7]:
# References:

# Ref: https://forum.knime.com/t/tanimoto-similarity-using-count-based-fingerprints/12176/3
# Ref: https://pubs.acs.org/doi/full/10.1021/ci9800211
# Ref: https://jcheminf.biomedcentral.com/articles/10.1186/s13321-015-0069-3?optIn=false