In [11]:
# HCASE Embedding
#
# Author: Gergely Zahoranszky-Kohalmi, PhD
#
# Email: gergely.zahoranszky-kohalmi@nih.gov
#
# Organization: National Center for Advancing Translational Sciences (NCATS/NIH)
#

In [12]:
import hcase
import pandas as pd
from rdkit.rdBase import BlockLogs
import cupy as cp
import multiprocessing

In [None]:
# Config section
# Input/output locations. The files are read and written as tab-separated
# tables further down in this notebook.

# Input: reference scaffolds used to train the HCASE space.
fname_reference_scaffolds = '../data/scaffolds_chembl_24.tab'
# Input: structures to be embedded into the space.
fname_structures = '../data/STD_drugbank_approved_structures_v5.txt'
# Output (or input, when a precomputed space is reused): the trained space.
fname_out_space = '../data/hc_space.tab'
# Output: the embedded structures.
fname_out_embedding = '../data/drugs_emb_hcase_chembl.tab'

def get_available_cores():
    """
    Report the machine's CPU core count and a conservative worker count.

    Returns:
        tuple[int, int]: ``(total_cores, smart_default)``. ``total_cores`` is
        ``multiprocessing.cpu_count()``. ``smart_default`` is one less than
        the number of cores this process is allowed to run on, never below 1.
    """
    import os  # local import so the function does not rely on another cell

    total_cores = multiprocessing.cpu_count()

    # cpu_count() counts every core on the machine. When the process is pinned
    # to a subset of cores, sizing the worker pool from the machine-wide count
    # oversubscribes the cores that are actually usable, so prefer the
    # affinity mask on platforms that expose it.
    if hasattr(os, "sched_getaffinity"):
        usable_cores = len(os.sched_getaffinity(0))
    else:
        usable_cores = total_cores

    smart_default = max(1, usable_cores - 1)  # Ensure at least 1 core is used
    return total_cores, smart_default

# --- Embedding parameters (passed to hcase.embed) ---
n_dim = 2
# Row-wise mode: closest_scaffold is computed via pandarallel and NumPy,
# which is noted as slower than batching on the GPU.
row_based = False
# Use the CuPy code path only when a CUDA device is available.
use_cupy = cp.cuda.is_available()
# Keep this moderate: too large a batch leads to an out-of-memory error.
batch_size = 500

# --- Workflow switches ---
# When True, load the saved reference space instead of training a new one.
use_precomputed_reference_spaces = False
# When True, both datasets are reduced to `sample_size` rows using `rnd_seed`.
do_downsampling = False
sample_size = 2000
rnd_seed = 55555

# --- Parallelism ---
total_cores, smart_default = get_available_cores()
hcase.initialize_pandarallel(n_cores = smart_default)


In [14]:
# Load the two tab-separated input tables:
# the reference scaffolds and the structures to embed.

df_ref_scaffolds = pd.read_csv(
    fname_reference_scaffolds,
    sep="\t",
)
df_structures = pd.read_csv(
    fname_structures,
    sep="\t",
)


In [15]:
# Normalise column names of the structures table to lower case.
# Re-running this cell is harmless: once renamed, the old names no longer match.

df_structures = df_structures.rename(
    columns={
        'Structure': 'structure',
        'ID': 'id',
    }
)

In [16]:
# Reduce datasets for workflow development (optional)
#
# Each table is sampled independently with the same seed. The requested size
# is capped at the table length because DataFrame.sample raises a ValueError
# when n exceeds the number of rows and sampling is without replacement.

if do_downsampling:

    df_ref_scaffolds = df_ref_scaffolds.sample (
        n = min (sample_size, len (df_ref_scaffolds)),
        random_state = rnd_seed,
    )
    df_structures = df_structures.sample (
        n = min (sample_size, len (df_structures)),
        random_state = rnd_seed,
    )



In [None]:
# Show the structures table as this cell's output (sanity check before training/embedding).
df_structures

In [None]:
# Create ChEMBL Scaffold space, or reload a previously saved one.
# The step runs inside RDKit's BlockLogs context manager.
with BlockLogs():
    if use_precomputed_reference_spaces:
        # Reuse the space file written by an earlier run.
        df_hcase_space = pd.read_csv(fname_out_space, sep='\t')
    else:
        # Train on the reference scaffolds, then persist the result so later
        # runs can set use_precomputed_reference_spaces = True.
        df_hcase_space = hcase.train(df_ref_scaffolds)
        df_hcase_space.to_csv(fname_out_space, sep='\t', index=False)



In [None]:
# Embed structures into HCASE space and save the result as a tab-separated file.
with BlockLogs():
    df_embedded = hcase.embed(
        df_hcase_space,
        df_structures,
        n_dim,
        row_based=row_based,
        use_cupy=use_cupy,
        batch_size=batch_size,
    )
    # NOTE(review): the DataFrame index is written here, whereas the space file
    # is saved with index=False. Confirm that downstream readers expect it.
    df_embedded.to_csv(fname_out_embedding, sep='\t')

In [20]:
# References

# ChatGPT 4.0 Palantir Instance
# ChatGPT 4o www.openai.com