In [None]:
# HCASE Embedding
#
# Author: Gergely Zahoranszky-Kohalmi, PhD
#
# Email: gergely.zahoranszky-kohalmi@nih.gov
#
# Organization: National Center for Advancing Translational Sciences (NCATS/NIH)
#


In [None]:
import hcase
import pandas as pd
from rdkit.rdBase import BlockLogs

import os

import pdb

In [None]:
# Configuration
#
# All user-tunable settings of the workflow are collected in this cell.

# Input tables (read further down as tab-separated files).
fname_structures = '../data/STD_drugbank_approved_structures_v5.txt'
fname_reference_scaffolds = '../data/scaffolds_chembl_24.tab'

# Output tables.
fname_out_embedding = '../data/drugs_emb_hcase_chembl.tab'
fname_out_space = '../data/hc_space.tab'

# Forwarded to hcase.embed as its third argument.
n_dim = 2

# False: build the space with hcase.train and write it to fname_out_space.
# True : skip training and read the space back from fname_out_space.
use_precomputed_reference_spaces = False

# Optional row subsampling of both input tables (workflow development only).
do_downsampling = False
sample_size = 2000
rnd_seed = 55555

# Number of CPU cores to use; -1 means "all available cores".
NR_CORES = 10

In [None]:
# Resolve the CPU settings from the configured NR_CORES value:
#   NR_CORES == -1 -> use every core reported by the OS
#   anything else  -> use exactly the configured number of cores
#
# Produces two names:
#   NR_CORES - concrete core count (the one handed to hcase.train / hcase.embed)
#   NR_JOBS  - job count "for AI/ML"; -1 stands for "all cores"
#
# NOTE: in the -1 case NR_CORES is overwritten with the detected count, so
# re-running this cell on its own afterwards takes the `else` branch.
# Re-run the config cell first if the "all cores" behaviour is wanted again.

if NR_CORES == -1:
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to a single core so NR_CORES is always an integer.
    NR_CORES = os.cpu_count() or 1
    NR_JOBS = -1    # this is for AI/ML

    print (f'[*] Number of cores for fingerprint generation: {NR_CORES}, Number of cores for AI/ML: ALL (max: {NR_CORES})')

else:

    NR_JOBS = NR_CORES    # this is for AI/ML
    print (f'[*] Number of cores for fingerprint generation: {NR_CORES}, Number of cores for AI/ML: {NR_JOBS}')


In [None]:
# Load the input tables
#
# Both files are parsed as tab-separated text; their (rows, columns) shapes
# are printed as a quick sanity check, reference scaffolds first.

df_ref_scaffolds = pd.read_csv(fname_reference_scaffolds, sep="\t")
df_structures = pd.read_csv(fname_structures, sep="\t")

print(df_ref_scaffolds.shape, df_structures.shape, sep="\n")

In [None]:
# Harmonise column headers of the structure table:
#   'Structure' -> 'structure', 'ID' -> 'id'

df_structures = df_structures.rename(columns=dict(Structure='structure', ID='id'))

In [None]:
# Reduce datasets for workflow development (optional)
#
# When do_downsampling is set, both tables are replaced by a random subset of
# at most sample_size rows, drawn reproducibly with rnd_seed.

if do_downsampling:

    # DataFrame.sample draws without replacement by default and raises
    # ValueError if n exceeds the number of rows, so n is capped at the
    # size of each table.
    df_ref_scaffolds = df_ref_scaffolds.sample (
        n = min (sample_size, len (df_ref_scaffolds)),
        random_state = rnd_seed,
    )
    df_structures = df_structures.sample (
        n = min (sample_size, len (df_structures)),
        random_state = rnd_seed,
    )



In [None]:
# Show the structure table as the cell output, i.e. its state after the
# column rename and the optional downsampling.
df_structures

In [None]:
# Obtain the ChEMBL scaffold space (df_hcase_space)
#
# Either reload a previously saved space from fname_out_space, or train a new
# one from the reference scaffolds and save it to that same file.
# NOTE(review): BlockLogs is presumably used to silence RDKit log output while
# the block runs - confirm against the RDKit documentation.
with BlockLogs():
    if use_precomputed_reference_spaces:
        # Reuse the table written by an earlier run.
        df_hcase_space = pd.read_csv(fname_out_space, sep='\t')
    else:
        df_hcase_space = hcase.train(df_ref_scaffolds, NR_CORES)

        # Saved without the index column, matching how it is read back above.
        df_hcase_space.to_csv(fname_out_space, sep='\t', index=False)



In [10]:

# Embed structures into HCASE space
#
# Calls hcase.embed with the scaffold space, the structure table, n_dim and
# NR_CORES, and stores the result in df_embedded.
# NOTE(review): BlockLogs is presumably used to silence RDKit log output while
# the block runs - confirm against the RDKit documentation.
with BlockLogs():
    # %time is an IPython line magic that reports how long the statement took;
    # this line therefore only runs under IPython/Jupyter, not plain Python.
    %time df_embedded = hcase.embed (df_hcase_space, df_structures, n_dim, NR_CORES)

In [None]:
# Persist the embedding as a tab-separated file.
# The DataFrame index is written as the first column (pandas' default, spelled
# out here).
# NOTE(review): the scaffold space file is saved with index = False; confirm
# that consumers of this file expect the extra leading index column.
df_embedded.to_csv(fname_out_embedding, sep='\t', index=True)

In [None]:
# Show the embedding returned by hcase.embed as the cell output.
df_embedded

In [None]:
#df_embedded.plot.scatter(x='Dim_1', y='Dim_2')

In [None]:
# References

# ChatGPT 4.0 Palantir Instance
# ChatGPT 4o www.openai.com