In [1]:
# HCASE Embedding
#
# Author: Gergely Zahoranszky-Kohalmi, PhD
#
# Email: gergely.zahoranszky-kohalmi@nih.gov
#
# Organization: National Center for Advancing Translational Sciences (NCATS/NIH)
#

In [2]:
import hcase
import pandas as pd
from rdkit.rdBase import BlockLogs
import cupy as cp

In [3]:
# Config section
#
# Every tunable used by the later cells lives here: input/output paths,
# embedding dimensionality, development-time downsampling and GPU options.

# Input tables (read as tab-separated files in the "Import datasets" cell)
fname_reference_scaffolds = '../data/scaffolds_chembl_24.tab'
fname_structures = '../data/STD_drugbank_approved_structures_v5.txt'
# Output tables (written as tab-separated files): the ordered scaffold space
# and the final embedding of the structures
fname_out_space = '../data/hc_space.tab'
fname_out_embedding = '../data/drugs_emb_hcase_chembl.tab'


# Passed to hcase.embed; also the number of Dim_<k> columns in the embedding table
n_dim = 2
# False: recompute the scaffold space and write it to fname_out_space.
# True: read the scaffold space back from fname_out_space instead.
use_precomputed_reference_spaces = False
do_downsampling = True # TEMP: development shortcut; when True both input tables are randomly subsampled
# random_state used for the subsampling, so reruns draw the same rows
rnd_seed = 55555
# Number of rows requested from each table when do_downsampling is True
sample_size = 2000
# True when CuPy reports CUDA as available; forwarded to hcase.embed
use_cupy = cp.cuda.is_available()
# Forwarded to hcase.embed
batch_size = 500 # if too high, an out-of-memory error will occur


In [4]:
# Import datasets
#
# Both inputs are tab-delimited text files; the paths come from the config cell.

df_ref_scaffolds = pd.read_csv(
    fname_reference_scaffolds,
    delimiter='\t',
)
df_structures = pd.read_csv(
    fname_structures,
    delimiter='\t',
)


In [5]:
# Clean-up datasets
#
# Lower-case the two column names that the later cells read
# ('structure' and 'id'); all other columns are left as they are.

df_structures = df_structures.rename(
    {'Structure': 'structure', 'ID': 'id'},
    axis='columns',
)

In [6]:
# Reduce datasets for workflow development (optional)
#
# Each table is subsampled independently with the same seed, so reruns pick
# the same rows. The requested size is capped at the number of rows actually
# present: DataFrame.sample raises ValueError when n exceeds the row count
# (sampling is without replacement by default), which would otherwise abort
# the notebook whenever an input holds fewer than `sample_size` rows.

if do_downsampling:

    df_ref_scaffolds = df_ref_scaffolds.sample (
        n = min (sample_size, len (df_ref_scaffolds)),
        random_state = rnd_seed,
    )
    df_structures = df_structures.sample (
        n = min (sample_size, len (df_structures)),
        random_state = rnd_seed,
    )



In [7]:
# Narrow the reference table to the four pattern columns, then keep only
# the rows whose pattern type is 'scaffold'.
df_ref_scaffolds = (
    df_ref_scaffolds
    .loc[:, ['pattern_id', 'structure', 'ptype', 'hash']]
    .query("ptype == 'scaffold'")
)

In [8]:
with BlockLogs():
    if use_precomputed_reference_spaces:
        # Reuse the scaffold space stored at fname_out_space
        df_hcase_space = pd.read_csv(fname_out_space, sep='\t')

    else:
        # hcase.order_scaffolds_np takes plain arrays, so pull the two
        # columns it needs out of the reference table first
        structures = df_ref_scaffolds['structure'].values
        pattern_ids = df_ref_scaffolds['pattern_id'].values

        # The call yields four values, unpacked in this order
        structures, order, scaffold_ids, scaffold_keys = hcase.order_scaffolds_np(
            structures,
            pattern_ids,
        )

        # Re-assemble the results as a labelled table
        df_hcase_space = pd.DataFrame(
            dict(
                structure=structures,
                order=order,
                scaffold_id=scaffold_ids,
                scaffold_key=scaffold_keys,
            )
        )

        # Persist the table so a later run can load it instead of recomputing
        df_hcase_space.to_csv(fname_out_space, sep='\t', index=False)



In [None]:

# Columns of the scaffold space handed to the embedding
ref_fingerprints = df_hcase_space['scaffold_key'].values
df_space_order = df_hcase_space['order'].values  # an array, despite the df_ prefix

# Columns of the structure table to be embedded
structures = df_structures['structure'].values
ids = df_structures['id'].values

with BlockLogs():
    embedded_data = hcase.embed(
        ref_fingerprints,
        structures,
        ids,
        n_dim,
        df_space_order,
        use_cupy=use_cupy,
        batch_size=batch_size,
    )

    # Six fixed columns followed by one Dim_<k> column per embedding dimension
    df_embedded_final = pd.DataFrame(
        embedded_data,
        columns=[
            'id',
            'structure',
            'scaffold_key',
            'sk_struct',
            'closest_order',
            'bucket_id',
            *[f"Dim_{i+1}" for i in range(n_dim)],
        ],
    )

    # Write the embedding as a tab-separated file without the index column
    df_embedded_final.to_csv(fname_out_embedding, sep='\t', index=False)
