In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell execution so edits to imported source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import tqdm
import pandas as pd
import selfies as sf
from rdkit import Chem, RDLogger
# Turn off every RDKit log channel matching 'rdApp.*' for the rest of the session.
RDLogger.DisableLog('rdApp.*')

In [3]:
# Put the parent of the current working directory at the front of sys.path
# (only once) so the `rebadd` import below can be resolved from there.
# NOTE(review): this relies on the kernel's working directory being the
# notebook's folder — confirm when running from elsewhere.
REBADD_LIB_PATH = os.path.abspath(os.pardir)
if REBADD_LIB_PATH not in sys.path:
    sys.path = [REBADD_LIB_PATH] + sys.path

# Scorer classes plus the basic descriptor helper used by calc_properties.
from rebadd.chemutils import GSKScorer, JNKScorer, SAScorer, RAScorer, calc_chem_properties # smi -> (mw, clogp, tpsa, qed)

In [4]:
# Build each scorer once; the instances are called with a SMILES string in
# calc_properties. SAScorer is used with its default settings.
# NOTE(review): n_jobs=4 is presumably a parallel worker count — confirm in
# rebadd.chemutils.
calc_gsk = GSKScorer().set_params(n_jobs=4)
calc_jnk = JNKScorer().set_params(n_jobs=4)
calc_sa  = SAScorer()
calc_ra  = RAScorer().set_params(n_jobs=4)

In [5]:
def calc_properties(smi):
    """Score one SMILES string and return the results as a flat record.

    Parameters
    ----------
    smi : str
        SMILES string to evaluate.

    Returns
    -------
    dict
        Keys, in order: ``smiles``, ``gsk3b``, ``jnk3``, ``sa``, ``ra``,
        ``mw``, ``logp``, ``tpsa``, ``qed``. A value that could not be
        computed keeps its fallback: 10.0 for ``sa`` and 0.0 for the others.

    Notes
    -----
    The RDKit exceptions named in the ``except`` clause are swallowed; any
    other exception propagates to the caller. The scorers run in a fixed
    order, so values obtained before a failure are kept.
    """
    # Start from the fallback record; key order fixes the DataFrame columns.
    # NOTE(review): sa=10. is presumably the worst value on the SA scale — confirm.
    record = {
        'smiles': smi,
        'gsk3b': 0.,
        'jnk3': 0.,
        'sa': 10.,
        'ra': 0.,
        'mw': 0.,
        'logp': 0.,
        'tpsa': 0.,
        'qed': 0.,
    }

    try:
        (record['mw'],
         record['logp'],
         record['tpsa'],
         record['qed']) = calc_chem_properties(smi)
        record['sa'] = calc_sa(smi)
        record['ra'] = calc_ra(smi)
        record['gsk3b'] = calc_gsk(smi)
        record['jnk3'] = calc_jnk(smi)
    except (
        Chem.rdchem.AtomKekulizeException,
        Chem.rdchem.AtomSanitizeException,
        Chem.rdchem.AtomValenceException,
        Chem.rdchem.KekulizeException,
        Chem.rdchem.MolSanitizeException,
    ):
        # Best effort: keep whatever was computed and leave the rest at fallback.
        pass

    return record

In [6]:
# Smoke test: score a single reference SMILES and display the resulting
# record as a one-row table.
drug_smi = "C1=CC=C2C(=C1)C3=NNC4=CC=CC(=C43)C2=O"
pd.DataFrame([calc_properties(drug_smi)])

Unnamed: 0,smiles,gsk3b,jnk3,sa,ra,mw,logp,tpsa,qed
0,C1=CC=C2C(=C1)C3=NNC4=CC=CC(=C43)C2=O,0.74,0.67,2.126782,0.954688,220.063663,2.7743,45.75,0.494951


In [7]:
class CKPTCONFIGS:
    """Where the checkpoint SMILES samples live and which ones to process.

    Attributes are set in ``__init__``:
    ``input_dir`` is the root folder, ``modelnames`` lists the model
    sub-folders to visit, and ``filenames`` lists the sample files read
    from each of them.
    """

    def __init__(self):
        self.input_dir = 'outputs_3_checkpoints'

        # Disabled variants kept for reference: 'gsk3_jnk3_qed_sa', 'gsk3_jnk3'.
        self.modelnames = ['gsk3', 'jnk3']

        # One file per checkpoint step 50, 100, ..., 500 (zero-padded to 4 digits).
        self.filenames = []
        for step in range(50, 550, 50):
            self.filenames.append(f'smi_after.csv.{step:04d}')


ckptconfigs = CKPTCONFIGS()

In [8]:
class OUTPUTCONFIGS:
    """Output location for the computed property tables.

    Constructing an instance creates the directory if it does not exist.

    Parameters
    ----------
    output_dir : str, optional
        Directory to write results into. Defaults to
        ``"outputs_4_calculate_properties"``.

    Raises
    ------
    OSError
        If the directory cannot be created (for example, the path already
        exists as a regular file).
    """

    def __init__(self, output_dir="outputs_4_calculate_properties"):
        self.output_dir = output_dir
        # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
        # check-then-create race, is safe to re-run, and also creates any
        # missing parent directories.
        os.makedirs(self.output_dir, exist_ok=True)


outputconfigs = OUTPUTCONFIGS()

In [9]:
# For every configured model and checkpoint file: read the generated SMILES,
# score each one with calc_properties, and write the resulting table to a file
# of the same name under <output_dir>/<modelname>/.
for modelname in ckptconfigs.modelnames:

    input_dir = os.path.join(ckptconfigs.input_dir, modelname)
    output_dir = os.path.join(outputconfigs.output_dir, modelname)
    # exist_ok=True makes re-runs safe and avoids the exists()/mkdir() race;
    # makedirs also creates the parent directory if it is missing.
    os.makedirs(output_dir, exist_ok=True)

    for filename in ckptconfigs.filenames:

        filepath = os.path.join(input_dir, filename)

        # The input has no header row; only its first column is used.
        smiles_df = pd.read_csv(filepath, header=None, skip_blank_lines=True)
        gen_smiles = smiles_df.iloc[:, 0].values.tolist()

        print(len(gen_smiles))

        # Build all records first, then create the frame once.
        data = [calc_properties(smi) for smi in tqdm.tqdm(gen_smiles)]
        df = pd.DataFrame(data)

        output_filepath = os.path.join(output_dir, filename)
        df.to_csv(output_filepath, index=False)

1000


100%|██████████| 1000/1000 [01:21<00:00, 12.31it/s]


1000


100%|██████████| 1000/1000 [01:21<00:00, 12.20it/s]


1000


100%|██████████| 1000/1000 [01:20<00:00, 12.46it/s]


1000


100%|██████████| 1000/1000 [00:33<00:00, 29.47it/s]


1000


100%|██████████| 1000/1000 [00:16<00:00, 59.06it/s]


1000


100%|██████████| 1000/1000 [00:16<00:00, 60.35it/s]


1000


100%|██████████| 1000/1000 [00:16<00:00, 60.92it/s]


1000


100%|██████████| 1000/1000 [00:16<00:00, 60.04it/s]


1000


100%|██████████| 1000/1000 [00:16<00:00, 59.04it/s]


1000


100%|██████████| 1000/1000 [00:16<00:00, 59.66it/s]


1000


100%|██████████| 1000/1000 [00:16<00:00, 59.47it/s]


1000


100%|██████████| 1000/1000 [00:16<00:00, 59.60it/s]


1000


100%|██████████| 1000/1000 [00:16<00:00, 59.08it/s]


1000


100%|██████████| 1000/1000 [00:17<00:00, 58.18it/s]


1000


100%|██████████| 1000/1000 [00:17<00:00, 56.47it/s]


1000


100%|██████████| 1000/1000 [00:17<00:00, 56.56it/s]


1000


100%|██████████| 1000/1000 [00:18<00:00, 54.53it/s]


1000


100%|██████████| 1000/1000 [00:17<00:00, 55.70it/s]


1000


100%|██████████| 1000/1000 [00:18<00:00, 54.05it/s]


1000


100%|██████████| 1000/1000 [00:18<00:00, 54.82it/s]
