In [1]:
# HCASE Embedding Reduced
#

# Author: Gergely Zahoranszky-Kohalmi, PhD
#
# Email: gergely.zahoranszky-kohalmi@nih.gov
#
# Organization: National Center for Advancing Translational Sciences (NCATS/NIH)
#
# Workflow:

# 1. Select 90% of reference scaffolds.
# 2. Merge the resultant set with cehrry-picked scaffold, and deduplicate (this is to assure consistency of analysis).
# 3. Perform HCASE of drugs in reduced ChEMBL space
# 4. Generate plots for Cherry-picked scaffolds on each HCASE embedding.
# 5. Generate comparative plot between HCASE embeddings of drugs in full and reduced ChEBML space
# on the largest zommon z parameter.


In [2]:
import hcase
import pandas as pd

In [3]:
# Config section



fname_reference_scaffolds = '../data/hc_space.tab'
fname_cherry_picked_scaffolds = '../data/cherrypicked_scaffolds.tab'
fname_structures = '../data/STD_drugbank_approved_structures_v5.txt'

fname_out_reduced_reduced_ref_scaffolds = '../data/red_ref_scaffolds.tab'
fname_out_space = '../data/red_hc_space.tab'
fname_out_embedding = '../data/red_drugs_emb_hcase_chembl.tab'

n_dim = 2



use_precomputed_reference_spaces = False

rnd_seed_reduced = 12345
reduced_fraction = 0.9

do_downsampling = False
rnd_seed = 55555
sample_size = 200




In [4]:
# Import datasets

df_ref_scaffolds = pd.read_csv (fname_reference_scaffolds, sep = '\t')
df_cp = pd.read_csv (fname_cherry_picked_scaffolds, sep = '\t')
df_structures = pd.read_csv (fname_structures, sep = '\t')

print (df_ref_scaffolds.head)
print (df_cp.head)

<bound method NDFrame.head of                                                structure  order  \
0                                                C=C1CC1      1   
1                                                  C1CN1      2   
2                                             C1C[NH2+]1      3   
3                                                  C1CS1      4   
4                                                 C1=CC1      5   
...                                                  ...    ...   
55592  O=C1CCSSC[C@@H]2NC(=O)[C@@H]3CSSC[C@@H](C(=O)N...  55593   
55593  O=C(NCC(=O)N1Cc2ccccc2C[C@H]1C(=O)N1CC2CCCCC2[...  55594   
55594  S=P(N/N=C/c1ccc(OP2(Oc3ccc(/C=N/NP(=S)(Oc4cccc...  55595   
55595  O=C(CNC(=O)CNC(=O)CNC(=O)CNC(=O)CNC(=O)CNC(=O)...  55596   
55596  O=C(CNC(=O)[C@@H]1C2CCCCC2CN1C(=O)[C@@H]1Cc2cc...  55597   

           scaffold_id                                       scaffold_key  
0          scaffold.10  3 0 0 1 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...  
1           s

In [5]:
# Clean-up datasets

df_ref_scaffolds = df_ref_scaffolds[['structure', 'scaffold_id', 'scaffold_key']].copy()

df_cp = df_cp[['structure', 'scaffold_id', 'scaffold_key', 'color']].copy()



df_structures = df_structures.rename (columns = {'Structure': 'structure', 'ID': 'id'})

In [6]:
# Functions

def smiles2inchikey (smiles):
    if smiles == 'NA':
        inchi = 'NA'
    else:
        try:
            mol = Chem.MolFromSmiles(smiles)
            inchi = Chem.MolToInchi (mol)
        except:
            inchi = 'NA'

    if inchi == 'NA':
        inchikey = 'NA'
    else:
        try:
            inchikey = Chem.InchiToInchiKey (inchi)
        except:
            inchikey = 'NA'
    
    return (inchikey)    



def generate_reduced_reference_scaffold_set (df_scaffolds_orig, df_cp_scaffolds, reduced_fraction, rnd_seed_reduced):
   
    # Reduce reference set to 90%, randomly
    df_ref_scaffolds = df_scaffolds_orig.sample(frac = reduced_fraction, random_state = rnd_seed_reduced)

    # Merge with Cherry-Picked scaffolds to maintain consistency so that the embeddings of the
    # Cherry-Picked scasffolds can be compared in the original and in the reduced space
    df_ref_scaffolds = df_ref_scaffolds[['structure', 'scaffold_id', 'scaffold_key']].copy()
    

    #df_cp_scaffolds = df_cp_scaffolds[['structure', 'scaffold_id', 'scaffold_key', 'color']].copy()
    
    df_cp_scaffolds_orig = df_cp_scaffolds.copy()
    df_cp_scaffolds = df_cp_scaffolds_orig[['structure', 'scaffold_id', 'scaffold_key']].copy()
    df_cp_color = df_cp_scaffolds_orig[['scaffold_id', 'color']].copy()

    
    
    # append reduced scaffold set and cherry-picked scaffolds
    df = df_ref_scaffolds.append (df_cp_scaffolds, ignore_index = True)
    
    
    # deduplicate sets

    df = df.groupby (['scaffold_id'], as_index = False).agg('first')

    df = df.reset_index(drop = True)
    
    
    
    # split deduplicated set into cherry-picked and not cherry-picked partitions

    df_in_cp = df[df['scaffold_id'].isin(list(df_cp_scaffolds['scaffold_id']))].copy()
   
    
    df_not_in_cp = df[~df['scaffold_id'].isin(list(df_cp_scaffolds['scaffold_id']))].copy()



    # assign colors: assign color -1 to non-cherry-picked partition

    
    df_in_cp = df_in_cp.merge (df_cp_color, on = 'scaffold_id', how = 'inner')
    df_not_in_cp['color'] = -1   
    
    
    #df_in_cp = df_in_cp.astype({'color': int})

 
    

    # append cherry-picked and non-cherry-picked partitions


    df = df_in_cp.append (df_not_in_cp, ignore_index = True)
    df = df[['structure', 'scaffold_id', 'scaffold_key', 'color']].copy()



    
    return (df)


In [7]:
# Reduce reference scaffold set

# Reduce reference set to 90%, randomly
df_ref_scaffolds = generate_reduced_reference_scaffold_set (df_ref_scaffolds, df_cp, reduced_fraction, rnd_seed_reduced)

    
# Data structure adjustment

# This is just to make the data structure compliant which HCASE which at this point expects a set of scaffolds,
# but here a set if compounds are provided. It's not an issue in the workflow, as the train method of hcase 
# actually generates the unique BM-scaffold set from the compounds. 
df_ref_scaffolds['hash'] = df_ref_scaffolds.apply (lambda x: smiles2inchikey(x['structure']), axis = 1)
df_ref_scaffolds['ptype'] = 'scaffold'


df_ref_scaffolds.to_csv (fname_out_reduced_reduced_ref_scaffolds, sep = '\t', index = False)


df_ref_scaffolds = df_ref_scaffolds.rename (columns = {'scaffold_id': 'pattern_id'})

In [8]:
# Reduce datasets for workflow development (optional)

if do_downsampling:

    df_ref_scaffolds = df_ref_scaffolds.sample (n = sample_size, random_state = rnd_seed)
    df_structures = df_structures.sample (n = sample_size, random_state = rnd_seed)



In [9]:
# Create ChEMBL Scaffold space
print (df_ref_scaffolds.head)

if not use_precomputed_reference_spaces:
    
    df_hcase_space = hcase.train (df_ref_scaffolds)
    df_hcase_space.to_csv (fname_out_space, sep = '\t', index = False)

else:
    
    df_hcase_space = pd.read_csv (fname_out_space, sep = '\t')


<bound method NDFrame.head of                                         structure       pattern_id  \
0                  O=C(CCc1ccccc1)NCC(=O)N1CCCCC1  scaffold.100896   
1                  O=C(CNC(=O)c1ccccc1)NCC1CCCCC1  scaffold.100938   
2             O=C(N/C=C/c1ccccc1)NC1C(=O)N2CCSC12  scaffold.101000   
3                O=C(NC(=O)c1ccccc1)NC1CN2CCC1CC2  scaffold.101307   
4               O=C1Nc2ccccc2C(=O)/C1=C/Nc1ccccc1  scaffold.102134   
...                                           ...              ...   
50114                        O=C1CCCC(c2ccccc2)N1    scaffold.9995   
50115  c1ccc([C@H]2Nc3ccccc3[C@H]3NCC[C@@H]23)cc1   scaffold.99954   
50116                 c1ccc(CCc2nccn2Cc2cccs2)cc1   scaffold.99979   
50117                 c1ccc(CCc2nccn2Cc2ccco2)cc1   scaffold.99980   
50118                         O=C1CCC=NN1c1ccccc1    scaffold.9999   

                                            scaffold_key  color hash     ptype  
0      18 6 7 2 0 6 1 0 4 0 0 0 4 4 0 0 2 2 0 0 









































































































[*] Number of scaffolds in input:
50119
[*] Number of unique reference scaffolds:
50119


In [10]:
# Embed structures into HCASE space


    
df_embedded = hcase.embed (df_hcase_space, df_structures, n_dim)




[21:35:17] Explicit valence for atom # 21 N, 4, is greater than permitted
[21:35:17] Explicit valence for atom # 1 N, 4, is greater than permitted
[21:35:17] Explicit valence for atom # 6 N, 4, is greater than permitted
[21:35:17] Explicit valence for atom # 1 N, 4, is greater than permitted
[21:35:17] Explicit valence for atom # 12 N, 4, is greater than permitted
[21:35:17] Explicit valence for atom # 2 N, 4, is greater than permitted
[21:35:17] Explicit valence for atom # 3 O, 3, is greater than permitted
[21:35:17] Explicit valence for atom # 17 N, 4, is greater than permitted
[21:35:17] Explicit valence for atom # 11 N, 4, is greater than permitted
[21:35:17] Explicit valence for atom # 1 N, 4, is greater than permitted
[21:35:17] Explicit valence for atom # 2 N, 4, is greater than permitted
[21:35:17] Explicit valence for atom # 20 N, 4, is greater than permitted
[21:35:17] Explicit valence for atom # 17 N, 4, is greater than permitted
[21:35:17] Explicit valence for atom # 25 N, 

[21:35:19] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6

[21:35:19] SMILES Parse Error: syntax error for input: 'NA'
[21:35:19] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 8 9 10 11 12 13

[21:35:19] SMILES Parse Error: syntax error for input: 'NA'
[21:35:19] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 8 17 18

[21:35:19] SMILES Parse Error: syntax error for input: 'NA'




[21:35:20] Can't kekulize mol.  Unkekulized atoms: 1 3 5 7 8 9

[21:35:20] SMILES Parse Error: syntax error for input: 'NA'
[21:35:20] Can't kekulize mol.  Unkekulized atoms: 1 2 3 20 21 22 24

[21:35:20] SMILES Parse Error: syntax error for input: 'NA'




[21:35:20] Can't kekulize mol.  Unkekulized atoms: 1 2 3 8 9 10 20 21 22

[21:35:20] SMILES Parse Error: syntax error for input: 'NA'
[21:35:20] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 11

[21:35:20] SMILES Parse Error: syntax error for input: 'NA'




[21:35:20] Can't kekulize mol.  Unkekulized atoms: 1 2 3 10

[21:35:20] SMILES Parse Error: syntax error for input: 'NA'
[21:35:20] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 6 7 8 9 10

[21:35:20] SMILES Parse Error: syntax error for input: 'NA'




[21:35:20] Can't kekulize mol.  Unkekulized atoms: 1 3 5 7 8 9

[21:35:20] SMILES Parse Error: syntax error for input: 'NA'




[21:35:20] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 8 16 17

[21:35:20] SMILES Parse Error: syntax error for input: 'NA'
[21:35:20] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 7 8 9

[21:35:20] SMILES Parse Error: syntax error for input: 'NA'




[21:35:21] Can't kekulize mol.  Unkekulized atoms: 1 2 3 10

[21:35:21] SMILES Parse Error: syntax error for input: 'NA'
[21:35:21] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 9

[21:35:21] SMILES Parse Error: syntax error for input: 'NA'




[21:35:21] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 6 7 8

[21:35:21] SMILES Parse Error: syntax error for input: 'NA'
[21:35:21] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 6

[21:35:21] SMILES Parse Error: syntax error for input: 'NA'
[21:35:21] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:21] SMILES Parse Error: syntax error for input: 'NA'
[21:35:21] Can't kekulize mol.  Unkekulized atoms: 1 3 4

[21:35:21] SMILES Parse Error: syntax error for input: 'NA'




[21:35:21] Can't kekulize mol.  Unkekulized atoms: 1 2 3 10

[21:35:21] SMILES Parse Error: syntax error for input: 'NA'
[21:35:21] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 8 9

[21:35:21] SMILES Parse Error: syntax error for input: 'NA'
[21:35:



[21:35:21] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:21] SMILES Parse Error: syntax error for input: 'NA'
[21:35:21] Can't kekulize mol.  Unkekulized atoms: 7 8 9 10 12 13 15

[21:35:21] SMILES Parse Error: syntax error for input: 'NA'
[21:35:22] Can't kekulize mol.  Unkekulized atoms: 10 13 14 15 16 17 18

[21:35:22] SMILES Parse Error: syntax error for input: 'NA'
[21:35:22] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 14 15 16

[21:35:22] SMILES Parse Error: syntax error for input: 'NA'




[21:35:22] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 14 15 16

[21:35:22] SMILES Parse Error: syntax error for input: 'NA'
[21:35:22] Can't kekulize mol.  Unkekulized atoms: 1 2 3 8 9 10 17 18 19

[21:35:22] SMILES Parse Error: syntax error for input: 'NA'
[21:35:22] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5

[21:35:22] SMILES Parse Error: syntax error for input: 'NA'
[21:35:22] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5

[21:35:22] SMILES Parse Error: syntax error for input: 'NA'
[21:35:22] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 10 11 12

[21:35:22] SMILES Parse Error: syntax error for input: 'NA'




[21:35:23] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 7 8 9

[21:35:23] SMILES Parse Error: syntax error for input: 'NA'




[21:35:24] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 14 16

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'
[21:35:24] Can't kekulize mol.  Unkekulized atoms: 1 2 3 10

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'
[21:35:24] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 6

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'
[21:35:24] Can't kekulize mol.  Unkekulized atoms: 1 3 5 7 8 9

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'
[21:35:24] Can't kekulize mol.  Unkekulized atoms: 1 18 19 20 21 22 23

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'




[21:35:24] Can't kekulize mol.  Unkekulized atoms: 1 3 4

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'
[21:35:24] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 6 7 8 9 10

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'
[21:35:24] Can't kekulize mol.  Unkekulized atoms: 1 2 3 11 12 13 20 21 22

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'




[21:35:24] Can't kekulize mol.  Unkekulized atoms: 1 3 4

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'
[21:35:24] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'
[21:35:24] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 9 10 11 13 14 15

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'




[21:35:24] Can't kekulize mol.  Unkekulized atoms: 1 2 20 21 22

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'




[21:35:24] Can't kekulize mol.  Unkekulized atoms: 3 4 5 7 9 13 14 15 16 17 33 34 35 36 37

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'
[21:35:24] Can't kekulize mol.  Unkekulized atoms: 1 3 4

[21:35:24] SMILES Parse Error: syntax error for input: 'NA'
[21:35:25] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 8 9 10

[21:35:25] SMILES Parse Error: syntax error for input: 'NA'
[21:35:25] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 7 8 9

[21:35:25] SMILES Parse Error: syntax error for input: 'NA'
[21:35:25] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5

[21:35:25] SMILES Parse Error: syntax error for input: 'NA'
[21:35:25] Can't kekulize mol.  Unkekulized atoms: 1 3 4

[21:35:25] SMILES Parse Error: syntax error for input: 'NA'
[21:35:25] Can't kekulize mol.  Unkekulized atoms: 1 3 5 7 8 9

[21:35:25] SMILES Parse Error: syntax error for input: 'NA'
[21:35:25] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 14 15 16

[21:35:25] SMILES Parse Error: syntax 



[21:35:25] SMILES Parse Error: syntax error for input: 'NA'
[21:35:25] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 8 9 10

[21:35:25] SMILES Parse Error: syntax error for input: 'NA'
[21:35:25] Can't kekulize mol.  Unkekulized atoms: 1 2 3 11 12 13 14 15 16 17 18 19 20

[21:35:25] SMILES Parse Error: syntax error for input: 'NA'




[21:35:25] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 6 7 8

[21:35:25] SMILES Parse Error: syntax error for input: 'NA'
[21:35:25] Can't kekulize mol.  Unkekulized atoms: 1 2 3 21 22 23 24

[21:35:25] SMILES Parse Error: syntax error for input: 'NA'




[21:35:25] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:25] SMILES Parse Error: syntax error for input: 'NA'
[21:35:25] Can't kekulize mol.  Unkekulized atoms: 1 2 8 9 10 11 12 13 14

[21:35:25] SMILES Parse Error: syntax error for input: 'NA'
[21:35:25] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 8

[21:35:26] SMILES Parse Error: syntax error for input: 'NA'
[21:35:26] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:26] SMILES Parse Error: syntax error for input: 'NA'




[21:35:26] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:26] SMILES Parse Error: syntax error for input: 'NA'
[21:35:26] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 6 7 8 9 10

[21:35:26] SMILES Parse Error: syntax error for input: 'NA'




[21:35:26] Can't kekulize mol.  Unkekulized atoms: 21 22 23 25 26 28 29 30 31 32 60 61 62

[21:35:26] SMILES Parse Error: syntax error for input: 'NA'
[21:35:26] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 16

[21:35:26] SMILES Parse Error: syntax error for input: 'NA'
[21:35:26] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 14 15 16

[21:35:26] SMILES Parse Error: syntax error for input: 'NA'
[21:35:26] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:26] SMILES Parse Error: syntax error for input: 'NA'




[21:35:26] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 8 9 26

[21:35:26] SMILES Parse Error: syntax error for input: 'NA'
[21:35:26] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 7 8 9

[21:35:26] SMILES Parse Error: syntax error for input: 'NA'
[21:35:26] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 9 10 11 12 13 14

[21:35:26] SMILES Parse Error: syntax error for input: 'NA'
[21:35:26] Can't kekulize mol.  Unkekulized atoms: 3 4 5 7 9 13 14 15 16 17 18 19 20 21 22

[21:35:26] SMILES Parse Error: syntax error for input: 'NA'
[21:35:26] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 9

[21:35:26] SMILES Parse Error: syntax error for input: 'NA'
[21:35:27] Can't kekulize mol.  Unkekulized atoms: 1 2 3 8 9 10 17 18 19

[21:35:27] SMILES Parse Error: syntax error for input: 'NA'
[21:35:27] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7 8 16 17 19

[21:35:27] SMILES Parse Error: syntax error for input: 'NA'
[21:35:27] Can't kekulize mol.  Unkekulized atoms: 1 2 







[21:35:27] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:27] SMILES Parse Error: syntax error for input: 'NA'
[21:35:27] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 6 7 8

[21:35:27] SMILES Parse Error: syntax error for input: 'NA'
[21:35:27] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:27] SMILES Parse Error: syntax error for input: 'NA'
[21:35:27] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 14 15 16

[21:35:27] SMILES Parse Error: syntax error for input: 'NA'
[21:35:27] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15 16 18 19 27

[21:35:27] SMILES Parse Error: syntax error for input: 'NA'
[21:35:27] Can't kekulize mol.  Unkekulized atoms: 1 12 13

[21:35:27] SMILES Parse Error: syntax error for input: 'NA'




[21:35:27] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9 11 12 17 18

[21:35:27] SMILES Parse Error: syntax error for input: 'NA'
[21:35:27] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 14 15 16

[21:35:27] SMILES Parse Error: syntax error for input: 'NA'
[21:35:27] Can't kekulize mol.  Unkekulized atoms: 1 3 4

[21:35:27] SMILES Parse Error: syntax error for input: 'NA'




[21:35:28] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 6 7 8

[21:35:28] SMILES Parse Error: syntax error for input: 'NA'
[21:35:28] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:28] SMILES Parse Error: syntax error for input: 'NA'
[21:35:28] Can't kekulize mol.  Unkekulized atoms: 1 2 3 8 9 10 17 18 19

[21:35:28] SMILES Parse Error: syntax error for input: 'NA'
[21:35:28] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:28] SMILES Parse Error: syntax error for input: 'NA'
[21:35:28] Can't kekulize mol.  Unkekulized atoms: 1 3 4

[21:35:28] SMILES Parse Error: syntax error for input: 'NA'
[21:35:28] Can't kekulize mol.  Unkekulized atoms: 1 2 3 10

[21:35:28] SMILES Parse Error: syntax error for input: 'NA'
[21:35:28] Can't kekulize mol.  Unkekulized atoms: 1 2 20 21 22

[21:35:28] SMILES Parse Error: syntax error for input: 'NA'




[21:35:28] Can't kekulize mol.  Unkekulized atoms: 1 2 3 14 15

[21:35:28] SMILES Parse Error: syntax error for input: 'NA'




[21:35:29] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9

[21:35:29] SMILES Parse Error: syntax error for input: 'NA'




[21:35:29] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 11 12 13 14 15 16

[21:35:29] SMILES Parse Error: syntax error for input: 'NA'
[21:35:29] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7 8 10 11 18

[21:35:29] SMILES Parse Error: syntax error for input: 'NA'
[21:35:29] Can't kekulize mol.  Unkekulized atoms: 1 3 5 7 8 9

[21:35:29] SMILES Parse Error: syntax error for input: 'NA'
[21:35:29] Can't kekulize mol.  Unkekulized atoms: 1 2 3

[21:35:29] SMILES Parse Error: syntax error for input: 'NA'
[21:35:29] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 11





[21:35:29] SMILES Parse Error: syntax error for input: 'NA'
[21:35:29] Can't kekulize mol.  Unkekulized atoms: 1 2 3

[21:35:29] SMILES Parse Error: syntax error for input: 'NA'




[21:35:30] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 7 8 9

[21:35:30] SMILES Parse Error: syntax error for input: 'NA'
[21:35:30] Can't kekulize mol.  Unkekulized atoms: 1 2 3 32 33 34 35 36 37 38 39 40

[21:35:30] SMILES Parse Error: syntax error for input: 'NA'
[21:35:30] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5

[21:35:30] SMILES Parse Error: syntax error for input: 'NA'
[21:35:30] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 10 11 12

[21:35:30] SMILES Parse Error: syntax error for input: 'NA'
[21:35:30] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 14 15 16

[21:35:30] SMILES Parse Error: syntax error for input: 'NA'








[21:35:30] Can't kekulize mol.  Unkekulized atoms: 8 9 11

[21:35:30] SMILES Parse Error: syntax error for input: 'NA'
[21:35:30] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 6 7 8

[21:35:30] SMILES Parse Error: syntax error for input: 'NA'




[21:35:30] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 9 10 11 12 13 14

[21:35:30] SMILES Parse Error: syntax error for input: 'NA'
[21:35:30] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 18 19 20 21

[21:35:30] SMILES Parse Error: syntax error for input: 'NA'
[21:35:31] Can't kekulize mol.  Unkekulized atoms: 1 2 3 349 350

[21:35:31] SMILES Parse Error: syntax error for input: 'NA'
[21:35:31] Can't kekulize mol.  Unkekulized atoms: 1 2 3 18





[21:35:31] SMILES Parse Error: syntax error for input: 'NA'




[21:35:31] Can't kekulize mol.  Unkekulized atoms: 1 2 3 22 23 24 26

[21:35:31] SMILES Parse Error: syntax error for input: 'NA'
[21:35:32] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6

[21:35:32] SMILES Parse Error: syntax error for input: 'NA'




[21:35:32] Can't kekulize mol.  Unkekulized atoms: 1 2 3

[21:35:32] SMILES Parse Error: syntax error for input: 'NA'




[21:35:32] Can't kekulize mol.  Unkekulized atoms: 1 2 3 8 9 10 18 19 20

[21:35:32] SMILES Parse Error: syntax error for input: 'NA'
[21:35:32] Can't kekulize mol.  Unkekulized atoms: 10 11 12 15 25

[21:35:32] SMILES Parse Error: syntax error for input: 'NA'




[21:35:33] Can't kekulize mol.  Unkekulized atoms: 9 10 12 13 14 15 16 17 18

[21:35:33] SMILES Parse Error: syntax error for input: 'NA'
[21:35:33] Can't kekulize mol.  Unkekulized atoms: 4 5 7

[21:35:33] SMILES Parse Error: syntax error for input: 'NA'
[21:35:33] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6

[21:35:33] SMILES Parse Error: syntax error for input: 'NA'




[21:35:33] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 13 14

[21:35:33] SMILES Parse Error: syntax error for input: 'NA'
[21:35:33] Can't kekulize mol.  Unkekulized atoms: 1 2 9 10 17

[21:35:33] SMILES Parse Error: syntax error for input: 'NA'




[21:35:33] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 14 20

[21:35:33] SMILES Parse Error: syntax error for input: 'NA'
[21:35:33] Can't kekulize mol.  Unkekulized atoms: 10 11 13 14 15

[21:35:33] SMILES Parse Error: syntax error for input: 'NA'
[21:35:33] Can't kekulize mol.  Unkekulized atoms: 1 2 3 21

[21:35:33] SMILES Parse Error: syntax error for input: 'NA'
[21:35:33] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 6

[21:35:33] SMILES Parse Error: syntax error for input: 'NA'




[21:35:33] Can't kekulize mol.  Unkekulized atoms: 1 2 3 11 12 13 29 30 31

[21:35:33] SMILES Parse Error: syntax error for input: 'NA'
[21:35:33] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15

[21:35:33] SMILES Parse Error: syntax error for input: 'NA'




[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'
[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 8 9 10 20 21 22

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'
[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'




[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 21 22

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'
[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 3 5 7 8 9

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'
[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7 8 16 17

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'
[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 28 29





[21:35:34] SMILES Parse Error: syntax error for input: 'NA'
[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 10

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'
[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 21

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'
[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 3 4

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'




[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 3 4 5 18 19 20 21 22 23

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'
[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 10

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'
[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'
[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 3 5 6 7 8

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'
[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5

[21:35:34] SMILES Parse Error: syntax error for input: 'NA'
[21:35:34] Can't kekulize mol.  Unkekulized atoms: 1 3 10

[21:35:35] SMILES Parse Error: syntax error for input: 'NA'




[21:35:35] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5

[21:35:35] SMILES Parse Error: syntax error for input: 'NA'
[21:35:35] Can't kekulize mol.  Unkekulized atoms: 7 8 9 12

[21:35:35] SMILES Parse Error: syntax error for input: 'NA'




[21:35:35] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6

[21:35:35] SMILES Parse Error: syntax error for input: 'NA'
[21:35:35] Can't kekulize mol.  Unkekulized atoms: 1 2 3 11 12 13 14 15 16

[21:35:35] SMILES Parse Error: syntax error for input: 'NA'
[21:35:35] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 11 12 13 21 22 23

[21:35:35] SMILES Parse Error: syntax error for input: 'NA'




[21:35:35] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6

[21:35:35] SMILES Parse Error: syntax error for input: 'NA'




[21:35:35] Can't kekulize mol.  Unkekulized atoms: 1 2 3 11 12 13 18 19 20

[21:35:35] SMILES Parse Error: syntax error for input: 'NA'
[21:35:36] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 14 15 16

[21:35:36] SMILES Parse Error: syntax error for input: 'NA'


[*] Number of input structures: 1623
[*] Number of structures for which scaffold_key was generated: 1474
[*] Finding closest reference scaffold for structure 1 out of 1474 .
[*] Finding closest reference scaffold for structure 2 out of 1474 .
[*] Finding closest reference scaffold for structure 3 out of 1474 .
[*] Finding closest reference scaffold for structure 4 out of 1474 .
[*] Finding closest reference scaffold for structure 5 out of 1474 .
[*] Finding closest reference scaffold for structure 6 out of 1474 .
[*] Finding closest reference scaffold for structure 7 out of 1474 .
[*] Finding closest reference scaffold for structure 8 out of 1474 .
[*] Finding closest reference scaffold for structure 9 out of 1474 .
[*] Finding closest reference scaffold for structure 10 out of 1474 .
[*] Finding closest reference scaffold for structure 11 out of 1474 .
[*] Finding closest reference scaffold for structure 12 out of 1474 .
[*] Finding closest reference scaffold for structure 13 out of 1

[*] Finding closest reference scaffold for structure 117 out of 1474 .
[*] Finding closest reference scaffold for structure 118 out of 1474 .
[*] Finding closest reference scaffold for structure 119 out of 1474 .
[*] Finding closest reference scaffold for structure 120 out of 1474 .
[*] Finding closest reference scaffold for structure 121 out of 1474 .
[*] Finding closest reference scaffold for structure 122 out of 1474 .
[*] Finding closest reference scaffold for structure 123 out of 1474 .
[*] Finding closest reference scaffold for structure 124 out of 1474 .
[*] Finding closest reference scaffold for structure 125 out of 1474 .
[*] Finding closest reference scaffold for structure 126 out of 1474 .
[*] Finding closest reference scaffold for structure 127 out of 1474 .
[*] Finding closest reference scaffold for structure 128 out of 1474 .
[*] Finding closest reference scaffold for structure 129 out of 1474 .
[*] Finding closest reference scaffold for structure 130 out of 1474 .
[*] Fi

[*] Finding closest reference scaffold for structure 233 out of 1474 .
[*] Finding closest reference scaffold for structure 234 out of 1474 .
[*] Finding closest reference scaffold for structure 235 out of 1474 .
[*] Finding closest reference scaffold for structure 236 out of 1474 .
[*] Finding closest reference scaffold for structure 237 out of 1474 .
[*] Finding closest reference scaffold for structure 238 out of 1474 .
[*] Finding closest reference scaffold for structure 239 out of 1474 .
[*] Finding closest reference scaffold for structure 240 out of 1474 .
[*] Finding closest reference scaffold for structure 241 out of 1474 .
[*] Finding closest reference scaffold for structure 242 out of 1474 .
[*] Finding closest reference scaffold for structure 243 out of 1474 .
[*] Finding closest reference scaffold for structure 244 out of 1474 .
[*] Finding closest reference scaffold for structure 245 out of 1474 .
[*] Finding closest reference scaffold for structure 246 out of 1474 .
[*] Fi

[*] Finding closest reference scaffold for structure 349 out of 1474 .
[*] Finding closest reference scaffold for structure 350 out of 1474 .
[*] Finding closest reference scaffold for structure 351 out of 1474 .
[*] Finding closest reference scaffold for structure 352 out of 1474 .
[*] Finding closest reference scaffold for structure 353 out of 1474 .
[*] Finding closest reference scaffold for structure 354 out of 1474 .
[*] Finding closest reference scaffold for structure 355 out of 1474 .
[*] Finding closest reference scaffold for structure 356 out of 1474 .
[*] Finding closest reference scaffold for structure 357 out of 1474 .
[*] Finding closest reference scaffold for structure 358 out of 1474 .
[*] Finding closest reference scaffold for structure 359 out of 1474 .
[*] Finding closest reference scaffold for structure 360 out of 1474 .
[*] Finding closest reference scaffold for structure 361 out of 1474 .
[*] Finding closest reference scaffold for structure 362 out of 1474 .
[*] Fi

[*] Finding closest reference scaffold for structure 465 out of 1474 .
[*] Finding closest reference scaffold for structure 466 out of 1474 .
[*] Finding closest reference scaffold for structure 467 out of 1474 .
[*] Finding closest reference scaffold for structure 468 out of 1474 .
[*] Finding closest reference scaffold for structure 469 out of 1474 .
[*] Finding closest reference scaffold for structure 470 out of 1474 .
[*] Finding closest reference scaffold for structure 471 out of 1474 .
[*] Finding closest reference scaffold for structure 472 out of 1474 .
[*] Finding closest reference scaffold for structure 473 out of 1474 .
[*] Finding closest reference scaffold for structure 474 out of 1474 .
[*] Finding closest reference scaffold for structure 475 out of 1474 .
[*] Finding closest reference scaffold for structure 476 out of 1474 .
[*] Finding closest reference scaffold for structure 477 out of 1474 .
[*] Finding closest reference scaffold for structure 478 out of 1474 .
[*] Fi

[*] Finding closest reference scaffold for structure 581 out of 1474 .
[*] Finding closest reference scaffold for structure 582 out of 1474 .
[*] Finding closest reference scaffold for structure 583 out of 1474 .
[*] Finding closest reference scaffold for structure 584 out of 1474 .
[*] Finding closest reference scaffold for structure 585 out of 1474 .
[*] Finding closest reference scaffold for structure 586 out of 1474 .
[*] Finding closest reference scaffold for structure 587 out of 1474 .
[*] Finding closest reference scaffold for structure 588 out of 1474 .
[*] Finding closest reference scaffold for structure 589 out of 1474 .
[*] Finding closest reference scaffold for structure 590 out of 1474 .
[*] Finding closest reference scaffold for structure 591 out of 1474 .
[*] Finding closest reference scaffold for structure 592 out of 1474 .
[*] Finding closest reference scaffold for structure 593 out of 1474 .
[*] Finding closest reference scaffold for structure 594 out of 1474 .
[*] Fi

[*] Finding closest reference scaffold for structure 697 out of 1474 .
[*] Finding closest reference scaffold for structure 698 out of 1474 .
[*] Finding closest reference scaffold for structure 699 out of 1474 .
[*] Finding closest reference scaffold for structure 700 out of 1474 .
[*] Finding closest reference scaffold for structure 701 out of 1474 .
[*] Finding closest reference scaffold for structure 702 out of 1474 .
[*] Finding closest reference scaffold for structure 703 out of 1474 .
[*] Finding closest reference scaffold for structure 704 out of 1474 .
[*] Finding closest reference scaffold for structure 705 out of 1474 .
[*] Finding closest reference scaffold for structure 706 out of 1474 .
[*] Finding closest reference scaffold for structure 707 out of 1474 .
[*] Finding closest reference scaffold for structure 708 out of 1474 .
[*] Finding closest reference scaffold for structure 709 out of 1474 .
[*] Finding closest reference scaffold for structure 710 out of 1474 .
[*] Fi

[*] Finding closest reference scaffold for structure 813 out of 1474 .
[*] Finding closest reference scaffold for structure 814 out of 1474 .
[*] Finding closest reference scaffold for structure 815 out of 1474 .
[*] Finding closest reference scaffold for structure 816 out of 1474 .
[*] Finding closest reference scaffold for structure 817 out of 1474 .
[*] Finding closest reference scaffold for structure 818 out of 1474 .
[*] Finding closest reference scaffold for structure 819 out of 1474 .
[*] Finding closest reference scaffold for structure 820 out of 1474 .
[*] Finding closest reference scaffold for structure 821 out of 1474 .
[*] Finding closest reference scaffold for structure 822 out of 1474 .
[*] Finding closest reference scaffold for structure 823 out of 1474 .
[*] Finding closest reference scaffold for structure 824 out of 1474 .
[*] Finding closest reference scaffold for structure 825 out of 1474 .
[*] Finding closest reference scaffold for structure 826 out of 1474 .
[*] Fi

[*] Finding closest reference scaffold for structure 929 out of 1474 .
[*] Finding closest reference scaffold for structure 930 out of 1474 .
[*] Finding closest reference scaffold for structure 931 out of 1474 .
[*] Finding closest reference scaffold for structure 932 out of 1474 .
[*] Finding closest reference scaffold for structure 933 out of 1474 .
[*] Finding closest reference scaffold for structure 934 out of 1474 .
[*] Finding closest reference scaffold for structure 935 out of 1474 .
[*] Finding closest reference scaffold for structure 936 out of 1474 .
[*] Finding closest reference scaffold for structure 937 out of 1474 .
[*] Finding closest reference scaffold for structure 938 out of 1474 .
[*] Finding closest reference scaffold for structure 939 out of 1474 .
[*] Finding closest reference scaffold for structure 940 out of 1474 .
[*] Finding closest reference scaffold for structure 941 out of 1474 .
[*] Finding closest reference scaffold for structure 942 out of 1474 .
[*] Fi

[*] Finding closest reference scaffold for structure 1044 out of 1474 .
[*] Finding closest reference scaffold for structure 1045 out of 1474 .
[*] Finding closest reference scaffold for structure 1046 out of 1474 .
[*] Finding closest reference scaffold for structure 1047 out of 1474 .
[*] Finding closest reference scaffold for structure 1048 out of 1474 .
[*] Finding closest reference scaffold for structure 1049 out of 1474 .
[*] Finding closest reference scaffold for structure 1050 out of 1474 .
[*] Finding closest reference scaffold for structure 1051 out of 1474 .
[*] Finding closest reference scaffold for structure 1052 out of 1474 .
[*] Finding closest reference scaffold for structure 1053 out of 1474 .
[*] Finding closest reference scaffold for structure 1054 out of 1474 .
[*] Finding closest reference scaffold for structure 1055 out of 1474 .
[*] Finding closest reference scaffold for structure 1056 out of 1474 .
[*] Finding closest reference scaffold for structure 1057 out of

[*] Finding closest reference scaffold for structure 1158 out of 1474 .
[*] Finding closest reference scaffold for structure 1159 out of 1474 .
[*] Finding closest reference scaffold for structure 1160 out of 1474 .
[*] Finding closest reference scaffold for structure 1161 out of 1474 .
[*] Finding closest reference scaffold for structure 1162 out of 1474 .
[*] Finding closest reference scaffold for structure 1163 out of 1474 .
[*] Finding closest reference scaffold for structure 1164 out of 1474 .
[*] Finding closest reference scaffold for structure 1165 out of 1474 .
[*] Finding closest reference scaffold for structure 1166 out of 1474 .
[*] Finding closest reference scaffold for structure 1167 out of 1474 .
[*] Finding closest reference scaffold for structure 1168 out of 1474 .
[*] Finding closest reference scaffold for structure 1169 out of 1474 .
[*] Finding closest reference scaffold for structure 1170 out of 1474 .
[*] Finding closest reference scaffold for structure 1171 out of

[*] Finding closest reference scaffold for structure 1272 out of 1474 .
[*] Finding closest reference scaffold for structure 1273 out of 1474 .
[*] Finding closest reference scaffold for structure 1274 out of 1474 .
[*] Finding closest reference scaffold for structure 1275 out of 1474 .
[*] Finding closest reference scaffold for structure 1276 out of 1474 .
[*] Finding closest reference scaffold for structure 1277 out of 1474 .
[*] Finding closest reference scaffold for structure 1278 out of 1474 .
[*] Finding closest reference scaffold for structure 1279 out of 1474 .
[*] Finding closest reference scaffold for structure 1280 out of 1474 .
[*] Finding closest reference scaffold for structure 1281 out of 1474 .
[*] Finding closest reference scaffold for structure 1282 out of 1474 .
[*] Finding closest reference scaffold for structure 1283 out of 1474 .
[*] Finding closest reference scaffold for structure 1284 out of 1474 .
[*] Finding closest reference scaffold for structure 1285 out of

[*] Finding closest reference scaffold for structure 1386 out of 1474 .
[*] Finding closest reference scaffold for structure 1387 out of 1474 .
[*] Finding closest reference scaffold for structure 1388 out of 1474 .
[*] Finding closest reference scaffold for structure 1389 out of 1474 .
[*] Finding closest reference scaffold for structure 1390 out of 1474 .
[*] Finding closest reference scaffold for structure 1391 out of 1474 .
[*] Finding closest reference scaffold for structure 1392 out of 1474 .
[*] Finding closest reference scaffold for structure 1393 out of 1474 .
[*] Finding closest reference scaffold for structure 1394 out of 1474 .
[*] Finding closest reference scaffold for structure 1395 out of 1474 .
[*] Finding closest reference scaffold for structure 1396 out of 1474 .
[*] Finding closest reference scaffold for structure 1397 out of 1474 .
[*] Finding closest reference scaffold for structure 1398 out of 1474 .
[*] Finding closest reference scaffold for structure 1399 out of

In [11]:
df_embedded.to_csv (fname_out_embedding, sep = '\t', index = False)

In [12]:
# References

# Ref: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reset_index.html