In [10]:
# HCASE Embedding Reduced
#

# Author: Gergely Zahoranszky-Kohalmi, PhD
#
# Email: gergely.zahoranszky-kohalmi@nih.gov
#
# Organization: National Center for Advancing Translational Sciences (NCATS/NIH)
#
# Workflow:

# 1. Select 90% of reference scaffolds.
# 2. Merge the resultant set with cehrry-picked scaffold, and deduplicate (this is to assure consistency of analysis).
# 3. Perform HCASE of drugs in reduced ChEMBL space
# 4. Generate plots for Cherry-picked scaffolds on each HCASE embedding.
# 5. Generate comparative plot between HCASE embeddings of drugs in full and reduced ChEBML space
# on the largest zommon z parameter.


In [11]:
import hcase
import pandas as pd

In [12]:
# Config section



fname_reference_scaffolds = '../../data/hc_space.tab'
fname_cherry_picked_scaffolds = '../../data/cherrypicked_scaffolds.tab'
fname_structures = '../../data/STD_drugbank_approved_structures_v5.txt'

fname_out_reduced_reduced_ref_scaffolds = '../../data/red_ref_scaffolds.tab'
fname_out_space = '../../data/red_hc_space.tab'
fname_out_embedding = '../../data/red_drugs_emb_hcase_chembl.tab'

n_dim = 2



use_precomputed_reference_spaces = False

rnd_seed_reduced = 12345
reduced_fraction = 0.9

do_downsampling = False
rnd_seed = 55555
sample_size = 200




In [13]:
# Import datasets

df_ref_scaffolds = pd.read_csv (fname_reference_scaffolds, sep = '\t')
df_cp = pd.read_csv (fname_cherry_picked_scaffolds, sep = '\t')
df_structures = pd.read_csv (fname_structures, sep = '\t')

print (df_ref_scaffolds.head)
print (df_cp.head)

<bound method NDFrame.head of                                                structure  order  \
0                                                C=C1CC1      1   
1                                                  C1CN1      2   
2                                             C1C[NH2+]1      3   
3                                                  C1CS1      4   
4                                                 C1=CC1      5   
...                                                  ...    ...   
55613  O=C1CCSSC[C@@H]2NC(=O)[C@@H]3CSSC[C@@H](C(=O)N...  55614   
55614  O=C(NCC(=O)N1Cc2ccccc2C[C@H]1C(=O)N1CC2CCCCC2[...  55615   
55615  S=P(N/N=C/c1ccc(OP2(Oc3ccc(/C=N/NP(=S)(Oc4cccc...  55616   
55616  O=C(CNC(=O)CNC(=O)CNC(=O)CNC(=O)CNC(=O)CNC(=O)...  55617   
55617  O=C(CNC(=O)[C@@H]1C2CCCCC2CN1C(=O)[C@@H]1Cc2cc...  55618   

           scaffold_id                                       scaffold_key  
0          scaffold.10  3 0 0 1 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...  
1           s

In [14]:
# Clean-up datasets

df_ref_scaffolds = df_ref_scaffolds[['structure', 'scaffold_id', 'scaffold_key']].copy()

df_cp = df_cp[['structure', 'scaffold_id', 'scaffold_key', 'color']].copy()



df_structures = df_structures.rename (columns = {'Structure': 'structure', 'ID': 'id'})

In [19]:
# Functions

def smiles2inchikey (smiles):
    if smiles == 'NA':
        inchi = 'NA'
    else:
        try:
            mol = Chem.MolFromSmiles(smiles)
            inchi = Chem.MolToInchi (mol)
        except:
            inchi = 'NA'

    if inchi == 'NA':
        inchikey = 'NA'
    else:
        try:
            inchikey = Chem.InchiToInchiKey (inchi)
        except:
            inchikey = 'NA'
    
    return (inchikey)    



def generate_reduced_reference_scaffold_set (df_scaffolds_orig, df_cp_scaffolds, reduced_fraction, rnd_seed_reduced):
   
    # Reduce reference set to 90%, randomly
    df_ref_scaffolds = df_scaffolds_orig.sample(frac = reduced_fraction, random_state = rnd_seed_reduced)

    # Merge with Cherry-Picked scaffolds to maintain consistency so that the embeddings of the
    # Cherry-Picked scasffolds can be compared in the original and in the reduced space
    df_ref_scaffolds = df_ref_scaffolds[['structure', 'scaffold_id', 'scaffold_key']].copy()
    

    #df_cp_scaffolds = df_cp_scaffolds[['structure', 'scaffold_id', 'scaffold_key', 'color']].copy()
    
    df_cp_scaffolds_orig = df_cp_scaffolds.copy()
    df_cp_scaffolds = df_cp_scaffolds_orig[['structure', 'scaffold_id', 'scaffold_key']].copy()
    df_cp_color = df_cp_scaffolds_orig[['scaffold_id', 'color']].copy()

    
    
    # append reduced scaffold set and cherry-picked scaffolds
    # df = df_ref_scaffolds.append (df_cp_scaffolds, ignore_index = True)
    df = pd.concat([df_ref_scaffolds, df_cp_scaffolds], ignore_index = True)
    
    
    # deduplicate sets

    df = df.groupby (['scaffold_id'], as_index = False).agg('first')

    df = df.reset_index(drop = True)
    
    
    
    # split deduplicated set into cherry-picked and not cherry-picked partitions

    df_in_cp = df[df['scaffold_id'].isin(list(df_cp_scaffolds['scaffold_id']))].copy()
   
    
    df_not_in_cp = df[~df['scaffold_id'].isin(list(df_cp_scaffolds['scaffold_id']))].copy()



    # assign colors: assign color -1 to non-cherry-picked partition

    
    df_in_cp = df_in_cp.merge (df_cp_color, on = 'scaffold_id', how = 'inner')
    df_not_in_cp['color'] = -1   
    
    
    #df_in_cp = df_in_cp.astype({'color': int})

 
    

    # append cherry-picked and non-cherry-picked partitions


    # df = df_in_cp.append (df_not_in_cp, ignore_index = True)
    df = pd.concat([df_in_cp, df_not_in_cp], ignore_index = True)
    df = df[['structure', 'scaffold_id', 'scaffold_key', 'color']].copy()



    
    return (df)


In [20]:
# Reduce reference scaffold set

# Reduce reference set to 90%, randomly
df_ref_scaffolds = generate_reduced_reference_scaffold_set (df_ref_scaffolds, df_cp, reduced_fraction, rnd_seed_reduced)

    
# Data structure adjustment

# This is just to make the data structure compliant which HCASE which at this point expects a set of scaffolds,
# but here a set if compounds are provided. It's not an issue in the workflow, as the train method of hcase 
# actually generates the unique BM-scaffold set from the compounds. 
df_ref_scaffolds['hash'] = df_ref_scaffolds.apply (lambda x: smiles2inchikey(x['structure']), axis = 1)
df_ref_scaffolds['ptype'] = 'scaffold'


df_ref_scaffolds.to_csv (fname_out_reduced_reduced_ref_scaffolds, sep = '\t', index = False)


df_ref_scaffolds = df_ref_scaffolds.rename (columns = {'scaffold_id': 'pattern_id'})

In [21]:
# Reduce datasets for workflow development (optional)

if do_downsampling:

    df_ref_scaffolds = df_ref_scaffolds.sample (n = sample_size, random_state = rnd_seed)
    df_structures = df_structures.sample (n = sample_size, random_state = rnd_seed)



In [22]:
# Create ChEMBL Scaffold space
print (df_ref_scaffolds.head)

if not use_precomputed_reference_spaces:
    
    df_hcase_space = hcase.train (df_ref_scaffolds)
    df_hcase_space.to_csv (fname_out_space, sep = '\t', index = False)

else:
    
    df_hcase_space = pd.read_csv (fname_out_space, sep = '\t')


<bound method NDFrame.head of                                         structure       pattern_id  \
0                  O=C(CCc1ccccc1)NCC(=O)N1CCCCC1  scaffold.100896   
1                  O=C(CNC(=O)c1ccccc1)NCC1CCCCC1  scaffold.100938   
2             O=C(N/C=C/c1ccccc1)NC1C(=O)N2CCSC12  scaffold.101000   
3                O=C(NC(=O)c1ccccc1)NC1CN2CCC1CC2  scaffold.101307   
4               O=C1Nc2ccccc2C(=O)/C1=C/Nc1ccccc1  scaffold.102134   
...                                           ...              ...   
50131  c1ccc([C@H]2Nc3ccccc3[C@H]3NCC[C@@H]23)cc1   scaffold.99954   
50132                 c1ccc(CCc2nccn2Cc2cccs2)cc1   scaffold.99979   
50133                 c1ccc(CCc2nccn2Cc2ccco2)cc1   scaffold.99980   
50134                         O=C1CCC=NN1c1ccccc1    scaffold.9999   
50135              c1ccc(-c2cc3cncn3c3ccccc23)cc1   scaffold.99992   

                                            scaffold_key  color hash     ptype  
0      18 6 7 2 0 6 1 0 4 0 0 0 4 4 0 0 2 2 0 0 

  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 2659.67it/s]
  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 13148.29it/s]


In [23]:
# Embed structures into HCASE space


    
df_embedded = hcase.embed (df_hcase_space, df_structures, n_dim)




  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 11618.57it/s]
  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 15141.89it/s]
  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 5849.80it/s]
  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 15592.21it/s]
  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 16131.94it/s]
  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 18893.26it/s]
  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 18558.87it/s]
  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 20360.70it/s]
  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 18558.87it/s]
  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 17549.39it/s]
  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 19972.88it/s]
  return bound(*args, **kwds)
100%|██████████| 1/1 [00:00<00:00, 17189.77it/s]
  return bound(*args, **kwds)
100%|██████████| 1/1 [0

In [11]:
df_embedded.to_csv (fname_out_embedding, sep = '\t', index = False)

In [12]:
# References

# Ref: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reset_index.html