In [1]:
# HCASE Experiments Chebyshev Kendall
#
# Author: Gergely Zahoranszky-Kohalmi, PhD
#
# Email: gergely.zahoranszky-kohalmi@nih.gov
#
# Organization: National Center for Advancing Translational Sciences (NCATS/NIH)
#

In [2]:

import pandas as pd


from dchebyshev import determine_correlation_glob_kendall, determine_correlation_local_kendall
import numpy as np




In [3]:
# Config section

# Sampling parameters handed to compute_local_stat_kendall.
sample_size = 100
random_seed = 55555

# Input: 2D HCASE embeddings of the DrugBank ("drugs") and CANVASS compound
# sets, one pair of files per reference scaffold set (ChEMBL, NatProd).
fname_embedding_drugs_chembl = '../data/drugs_emb_hcase_chembl.tab'
fname_embedding_canvass_chembl = '../data/canvass_emb_hcase_chembl_24_1_bms_dim_2.tab'

fname_embedding_drugs_natprod = '../data/drugs_emb_hcase_natprod_bms_dim_2.tab'
fname_embedding_canvass_natprod = '../data/canvass_emb_hcase_natprod_bms_dim_2.tab'

# Output: tab-separated Kendall correlation tables.
fname_out_local_corr = '../data/chebyshev_stat_kendall.tab'     # sample-wise (mean / std)
fname_out_global_corr = '../data/chebyshev_stat_kendall_full.tab'  # whole data set


In [4]:
# Import datasets

# Every embedding file is a tab-separated table; read them in one pass,
# keeping the same order as the individual assignments they replace.
(
    df_embedded_canvass_natprod,
    df_embedded_drugs_natprod,
    df_embedded_canvass_chembl,
    df_embedded_drugs_chembl,
) = (
    pd.read_csv (fname, sep = '\t')
    for fname in (
        fname_embedding_canvass_natprod,
        fname_embedding_drugs_natprod,
        fname_embedding_canvass_chembl,
        fname_embedding_drugs_chembl,
    )
)

In [5]:
# Functions


def compute_global_stat_kendall (df_embedding, dataset_name, max_z):
    """Collect one Kendall correlation value per ``hc_order`` level.

    For every order z in 2..max_z (inclusive) the rows of ``df_embedding``
    whose ``hc_order`` column equals z are copied and passed to
    ``determine_correlation_glob_kendall``; the correlation value it returns
    is recorded for that order.

    Parameters
    ----------
    df_embedding : pandas.DataFrame
        Embedding table; must contain an ``hc_order`` column.
    dataset_name : str
        Label written to the ``dataset`` column and to the progress messages.
    max_z : int
        Highest ``hc_order`` value to evaluate.

    Returns
    -------
    pandas.DataFrame
        Columns ``dataset``, ``phc_order`` and ``kendall.corr``, one row per
        evaluated order.
    """
    z_levels = list(range(2, max_z + 1))
    correlations = []

    for z in z_levels:
        print ("[*] Processing dataset: %s at parameter z: %d" %(dataset_name, z))

        at_this_order = df_embedding['hc_order'] == z
        subset = df_embedding[at_this_order].copy()

        # The helper also hands back a data frame; only the correlation is kept.
        _, correlation = determine_correlation_glob_kendall (subset)
        correlations.append(correlation)

    return pd.DataFrame({
        'dataset': dataset_name,
        'phc_order': z_levels,
        'kendall.corr': correlations,
    })



def compute_local_stat_kendall (df_embedding, dataset_name, max_z, sample_size, random_seed):
    """Summarise sample-wise Kendall correlations for each ``hc_order`` level.

    For every order z in 2..max_z (inclusive) the rows of ``df_embedding``
    whose ``hc_order`` column equals z are copied and handed repeatedly to
    ``determine_correlation_local_kendall`` for as long as at least
    ``sample_size`` rows remain in the frame it hands back. Each call yields
    one correlation value; their mean and standard deviation (NumPy default,
    ``ddof=0``) are recorded for that order.

    Parameters
    ----------
    df_embedding : pandas.DataFrame
        Embedding table; must contain an ``hc_order`` column.
    dataset_name : str
        Label written to the ``dataset`` column and to the progress messages.
    max_z : int
        Highest ``hc_order`` value to evaluate.
    sample_size : int
        Minimum number of remaining rows required to draw another sample;
        forwarded to the helper.
    random_seed : int
        Forwarded unchanged to the helper on every call.

    Returns
    -------
    pandas.DataFrame
        Columns ``dataset``, ``phc_order``, ``kendall.corr.mean`` and
        ``kendall.corr.std``, one row per evaluated order. Both statistics are
        NaN for an order that has fewer than ``sample_size`` rows.

    Raises
    ------
    RuntimeError
        If the helper returns a frame that is not smaller than the one it was
        given, because the sampling loop could then never terminate.
    """
    ord_values = []
    corr_mean_values = []
    corr_std_values = []

    for i in range(2, max_z + 1):
        ord_values.append(i)

        print ("[*] Processing dataset: %s at parameter z: %d" %(dataset_name, i))
        df = df_embedding[df_embedding['hc_order'] == i].copy()

        corr_values = []
        while (df.shape[0] >= sample_size):
            n_rows_before = df.shape[0]
            (df, cv) = determine_correlation_local_kendall (df, sample_size, random_seed)

            # The loop only ends when the helper consumes rows; fail loudly
            # instead of spinning forever if it does not.
            if df.shape[0] >= n_rows_before:
                raise RuntimeError(
                    "determine_correlation_local_kendall did not reduce the number of "
                    "rows (%d -> %d); cannot draw further samples" % (n_rows_before, df.shape[0])
                )

            corr_values.append(cv)

        if corr_values:
            corr_values = np.array(corr_values)
            corr_mean_values.append(corr_values.mean())
            corr_std_values.append(corr_values.std())
        else:
            # Too few rows for even one sample: report NaN explicitly rather
            # than via NumPy's empty-array RuntimeWarnings.
            corr_mean_values.append(np.nan)
            corr_std_values.append(np.nan)

    df_res = pd.DataFrame({'dataset': dataset_name, 'phc_order': ord_values, 'kendall.corr.mean': corr_mean_values, 'kendall.corr.std': corr_std_values})

    return (df_res)



In [6]:
# Workflow

# Perform global stats

print ('[*] Computing global stats .. ')

# One entry per data set: (embedding, label, highest hc_order to evaluate).
# The ChEMBL-based embeddings are evaluated up to order 8, the NatProd-based
# ones up to order 5.
global_runs = [
    (df_embedded_drugs_chembl, 'drugbank_chembl_bms_full', 8),
    (df_embedded_drugs_natprod, 'drugbank_natprod_bms_full', 5),
    (df_embedded_canvass_chembl, 'canvass_chembl_bms_full', 8),
    (df_embedded_canvass_natprod, 'canvass_natprod_bms_full', 5),
]

global_results = []
for df_embedding, dataset_name, max_z in global_runs:
    global_results.append (compute_global_stat_kendall (df_embedding, dataset_name, max_z))

# Keep the per-data-set result names available for later cells.
(
    df_drug_chembl_glob_corr,
    df_drug_natprod_glob_corr,
    df_canvass_chembl_glob_corr,
    df_canvass_natprod_glob_corr,
) = global_results

# DataFrame.append was removed in pandas 2.0; a single concat builds the same
# table with a fresh 0..n-1 index.
df_all_glob = pd.concat (global_results, ignore_index = True)

df_all_glob.to_csv (fname_out_global_corr, sep = '\t', index = False)

print (' .. done')

[*] Computing global stats .. 
[*] Processing dataset: drugbank_chembl_bms_full at parameter z: 2


  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


[*] Processing dataset: drugbank_chembl_bms_full at parameter z: 3
[*] Processing dataset: drugbank_chembl_bms_full at parameter z: 4
[*] Processing dataset: drugbank_chembl_bms_full at parameter z: 5
[*] Processing dataset: drugbank_chembl_bms_full at parameter z: 6
[*] Processing dataset: drugbank_chembl_bms_full at parameter z: 7
[*] Processing dataset: drugbank_chembl_bms_full at parameter z: 8
[*] Processing dataset: drugbank_natprod_bms_full at parameter z: 2
[*] Processing dataset: drugbank_natprod_bms_full at parameter z: 3
[*] Processing dataset: drugbank_natprod_bms_full at parameter z: 4
[*] Processing dataset: drugbank_natprod_bms_full at parameter z: 5
[*] Processing dataset: canvass_chembl_bms_full at parameter z: 2
[*] Processing dataset: canvass_chembl_bms_full at parameter z: 3
[*] Processing dataset: canvass_chembl_bms_full at parameter z: 4
[*] Processing dataset: canvass_chembl_bms_full at parameter z: 5
[*] Processing dataset: canvass_chembl_bms_full at parameter z

In [7]:
# Perform local stats

print ('[*] Computing local stats .. ')

# One entry per data set: (embedding, label, highest hc_order to evaluate).
# The ChEMBL-based embeddings are evaluated up to order 8, the NatProd-based
# ones up to order 5.
local_runs = [
    (df_embedded_drugs_chembl, 'drugbank_chembl_bms', 8),
    (df_embedded_drugs_natprod, 'drugbank_natprod_bms', 5),
    (df_embedded_canvass_chembl, 'canvass_chembl_bms', 8),
    (df_embedded_canvass_natprod, 'canvass_natprod_bms', 5),
]

local_results = []
for df_embedding, dataset_name, max_z in local_runs:
    local_results.append (compute_local_stat_kendall (df_embedding, dataset_name, max_z, sample_size, random_seed))

# Keep the per-data-set result names available for later cells.
(
    df_drug_chembl_local_corr,
    df_drug_natprod_local_corr,
    df_canvass_chembl_local_corr,
    df_canvass_natprod_local_corr,
) = local_results

# DataFrame.append was removed in pandas 2.0; a single concat builds the same
# table with a fresh 0..n-1 index.
df_all_local = pd.concat (local_results, ignore_index = True)

df_all_local.to_csv (fname_out_local_corr, sep = '\t', index = False)

print (' .. done')

[*] Computing local stats .. 
[*] Processing dataset: drugbank_chembl_bms at parameter z: 2
[*] Processing dataset: drugbank_chembl_bms at parameter z: 3
[*] Processing dataset: drugbank_chembl_bms at parameter z: 4
[*] Processing dataset: drugbank_chembl_bms at parameter z: 5
[*] Processing dataset: drugbank_chembl_bms at parameter z: 6
[*] Processing dataset: drugbank_chembl_bms at parameter z: 7
[*] Processing dataset: drugbank_chembl_bms at parameter z: 8
[*] Processing dataset: drugbank_natprod_bms at parameter z: 2
[*] Processing dataset: drugbank_natprod_bms at parameter z: 3
[*] Processing dataset: drugbank_natprod_bms at parameter z: 4
[*] Processing dataset: drugbank_natprod_bms at parameter z: 5
[*] Processing dataset: canvass_chembl_bms at parameter z: 2
[*] Processing dataset: canvass_chembl_bms at parameter z: 3
[*] Processing dataset: canvass_chembl_bms at parameter z: 4
[*] Processing dataset: canvass_chembl_bms at parameter z: 5
[*] Processing dataset: canvass_chembl_b

In [8]:
# References:

# Ref: https://forum.knime.com/t/tanimoto-similarity-using-count-based-fingerprints/12176/3
# Ref: https://pubs.acs.org/doi/full/10.1021/ci9800211
# Ref: https://jcheminf.biomedcentral.com/articles/10.1186/s13321-015-0069-3?optIn=false

# Ref: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.chebyshev.html
# Ref: https://www.geeksforgeeks.org/python-pandas-dataframe-corr/


