In [9]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [10]:
# load the fragment table; the file's first column is a saved row index, so
# it is sliced off right after parsing
file_path = '../../Data/ZINC/PROCESSED/train_recurse_frag_only.smi'
df = pd.read_csv(file_path, sep=',').iloc[:, 1:]
df.head()

Unnamed: 0,smiles,fragments,n_fragments,C,F,N,O,Other,SINGLE,DOUBLE,TRIPLE,Tri,Quad,Pent,Hex,logP,mr,qed,SAS
0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1,*C(C)(C)C *C(=O)Cc1coc2ccc(*)cc12 *N* *c1ccccc1F,4,20,1,1,2,0,18,8,0,0,0,1,2,5.0506,93.6477,0.731901,2.084095
1,CC1CC(C)CC(Nc2cncc(-c3nncn3C)c2)C1,*C1CC(C)CC(C)C1 *N* *c1cncc(-c2nncn2C)c1,3,16,0,5,0,0,18,5,0,0,0,1,2,3.1137,83.8237,0.941112,3.432004
2,N#Cc1ccc(-c2ccc(OC(C(=O)N3CCCC3)c3ccccc3)cc2)cc1,*N1CCCC1 *c1ccccc1 *OC(*)C(*)=O *c1ccc(-c2ccc(...,4,25,0,2,2,0,21,10,1,0,0,1,3,4.96778,112.218,0.626105,2.470633
3,CCOC(=O)C1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c2CCCC...,*CC *O* *C(*)=O *C1CCCN(C(=O)c2nc(-c3ccc(C)cc3...,4,24,0,3,3,0,26,7,0,0,0,1,2,4.00022,115.2155,0.716225,2.822753
4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])C(C#N)C1...,*SC1=C(C#N)C2(CCCCC2)C(C#N)C([O-])=N1 *c1cccc(...,3,20,0,4,2,2,22,6,2,0,0,0,3,3.60956,107.2347,0.809572,4.035182


In [11]:
# flatten the space-separated fragment strings of every molecule into one list
fragments_list = []
for fragments in tqdm(df['fragments']):
    fragments_list.extend(fragments.split(' '))

  0%|          | 0/238556 [00:00<?, ?it/s]

100%|██████████| 238556/238556 [00:00<00:00, 466319.82it/s]


In [18]:
# count how many distinct fragments occur across all molecules
unique_fragments = list(set(fragments_list))
print('Number of unique fragments:', len(unique_fragments))

# count the occurrences of each fragment (Series.value_counts replaces the
# deprecated top-level pd.value_counts)
unique_fragment_count = pd.Series(fragments_list).value_counts()

# keep only the fragments that appear more than N times
N = 100
unique_frags_gt_x = unique_fragment_count[unique_fragment_count>N]
print("Number of fragments that appear more than ",N," times:", len(unique_frags_gt_x))

# the fragments are the index of the counts Series; a set makes the
# membership test explicit and constant-time
frequent_fragments = set(unique_frags_gt_x.index)

# collect the row positions of molecules that contain at least one fragment
# which does not appear more than N times
drop_indices = []
# wrap the column (not the enumerate) so tqdm knows the total and shows progress
for i, fragments in enumerate(tqdm(df['fragments'])):
    fragments = fragments.split(' ')
    if not(all(x in frequent_fragments for x in fragments)):
        drop_indices.append(i)

# drop_indices holds row positions, so translate them to index labels before
# dropping; this stays correct even if df does not carry a default RangeIndex
df_filtered = df.drop(index=df.index[drop_indices]).reset_index(drop=True)
print('Number of molecules after filtering:', len(df_filtered))

Number of unique fragments: 86081
Number of fragments that appear more than  100  times: 643


238556it [00:01, 157959.54it/s]

Number of molecules after filtering: 38023





In [19]:
# preview the first rows of the filtered dataframe
df_filtered.head()

Unnamed: 0,smiles,fragments,n_fragments,C,F,N,O,Other,SINGLE,DOUBLE,TRIPLE,Tri,Quad,Pent,Hex,logP,mr,qed,SAS
0,CCCCC(=O)NC(=S)Nc1ccccc1C(=O)N1CCOCC1,*C(=O)CCCC *NC(=S)N* *c1ccccc1* *C(*)=O *N1CCOCC1,5,17,0,3,3,1,19,6,0,0,0,0,2,2.1622,97.0529,0.7974,2.037915
1,COc1cc(C)ccc1OCC(=O)Nc1nnc(C)s1,*OC *c1ccc(C)cc1* *O* *c1nnc(C)s1 *CC(=O)N*,5,13,0,3,3,1,15,6,0,0,0,1,1,2.18104,76.5017,0.914357,1.912049
2,COc1ccc(C(C)NC(=O)Cc2cccc3ccccc23)cc1,*OC *c1ccc(*)cc1 *C(*)C *N* *CC(*)=O *c1cccc2c...,6,21,0,1,2,0,17,9,0,0,0,0,3,4.2683,97.2037,0.762534,2.121841
3,COc1ccccc1NC(=O)CSc1ccc(-c2ccccc2OC)nn1,*OC *c1ccccc1* *N* *CC(*)=O *S* *OC *c1ccc(*)n...,8,20,0,3,3,1,19,10,0,0,0,0,3,3.8916,106.3677,0.626082,2.00698
4,CCC(NC(=O)C(C)n1cccn1)c1ccc(C)c(F)c1,*n1cccn1 *C(=O)C(*)C *N* *C(*)CC *c1ccc(C)c(F)c1,5,16,1,3,1,0,16,6,0,0,0,1,1,3.15912,79.2497,0.918746,3.006379


In [20]:
# spot check: for the molecule at row 100 of the filtered frame, print each of
# its fragments next to that fragment's count over the unfiltered fragment list
example_fragments = df_filtered['fragments'].iloc[100].split(' ')
for fragment in example_fragments:
    print(fragment, unique_fragment_count[fragment])

*CCCC 1590
*N* 112790
*CS(*)(=O)=O 162
*c1ccc([N+](=O)[O-])cc1 741


In [21]:
# write the filtered dataframe to disk as csv; index=True stores the row index
# as the first column of the file
# NOTE(review): the "100" in the file name mirrors the threshold N used for
# filtering -- keep the two in sync if N changes
output_path = '../../Data/ZINC/PROCESSED/train_frag_gt_100.smi'
df_filtered.to_csv(output_path, index=True)

In [29]:
# gather every fragment occurrence in the filtered molecules
fragment_list = []
for frag in tqdm(df_filtered.fragments):
    fragment_list.extend(frag.split())
# occurrences per fragment, most frequent first; Series.value_counts replaces
# the deprecated top-level pd.value_counts
fragment_counts = pd.Series(fragment_list).value_counts()

100%|██████████| 38023/38023 [00:00<00:00, 550770.56it/s]


In [23]:
# inspect the per-fragment occurrence counts of the filtered dataset
fragment_counts

*N*               30725
*C(*)=O           21236
*O*                8718
*C*                6991
*OC                6103
                  ...  
*c1ncoc1*            20
*COC                 19
*C(=O)C(F)(F)F       19
*CC(C)=O             15
*C(C)Cl               4
Length: 643, dtype: int64

In [38]:

# inverse-log-frequency penalty: the less often a fragment occurs, the larger
# its value (the +1 keeps the log strictly positive for any count >= 1)
log_counts = np.log(fragment_counts + 1)
penalty = np.sum(log_counts) / log_counts
# rescale so the weight vector has an L2 norm of 50; the constant numerator
# above cancels out in this normalisation
penalty_norm = np.linalg.norm(penalty)
penalty_weights = penalty / penalty_norm * 50
penalty_weights

*N*               0.855388
*C(*)=O           0.887099
*O*               0.974138
*C*               0.998428
*OC               1.013986
                    ...   
*c1ncoc1*         2.903118
*COC              2.950400
*C(=O)C(F)(F)F    2.950400
*CC(C)=O          3.187854
*C(C)Cl           5.491737
Length: 643, dtype: float64