In [1]:
import pandas as pd
from pathlib import Path
from rdkit import Chem

In [2]:
# Location of the raw CSV that is read into `opt_df` below (relative path).
DATA_PATH = "../../data/mol_opt.csv"
# Location the processed DataFrame is pickled to via `opt_df.to_pickle`.
PICKLE_PATH = "../../data/mol_opt.pickle"

In [3]:
# Read the raw CSV into a DataFrame. The handle is opened in a `with` block so
# it is closed as soon as pandas has finished parsing, instead of staying open
# for the lifetime of the kernel.
data_path = Path(DATA_PATH)
with data_path.open("r") as f:
    opt_df = pd.read_csv(f)

In [4]:
# Preview the first rows of the raw table (columns: Input, Output, Set).
opt_df.head()

Unnamed: 0,Input,Output,Set
0,"LogD_change_(-0.5, -0.3]Solubility_low->highCl...",O=C1CCC(O)CCN1C(CSc1ccccc1)Cc1ccccc1,train
1,"LogD_change_(-0.5, -0.3]Solubility_no_changeCl...",COc1ccc(-c2cn(CCO)c(CCN)n2)cc1,train
2,"LogD_change_(0.5, 0.7]Solubility_no_changeClin...",Cc1cc(Nc2ccc(F)cc2)n2nc(C)nc2n1,train
3,"LogD_change_(-0.1, 0.1]Solubility_no_changeCli...",COc1ccc(NC(=O)C2CCN(C(=O)c3ccc(OC)cc3)CC2)cc1,train
4,"LogD_change_(0.3, 0.5]Solubility_no_changeClin...",COCc1ccc(C(C)=O)c2sc(C(=O)Nc3ccc4c(C(=O)OC)ccc...,train


In [5]:
# Report how many rows the raw table contains.
row_count = opt_df.shape[0]
print(f"Total number of rows: {row_count}")

Total number of rows: 198558


In [6]:
# Gather the distinct labels found in the "Set" column and show them.
sets = {label for label in opt_df["Set"].tolist()}
print([*sets])

['validation', 'train', 'test']


In [7]:
# Pull the three raw columns out as plain Python lists, in column order.
inputs, outputs, sets = (
    opt_df[column].tolist() for column in ("Input", "Output", "Set")
)

In [8]:
# Show the first (up to) five raw input strings, one per line.
for preview_idx in range(min(5, len(inputs))):
    print(inputs[preview_idx])

LogD_change_(-0.5, -0.3]Solubility_low->highClint_no_changeO=C(NC1CCC(=O)N(C(CSc2ccccc2)Cc2ccccc2)CC1)OCc1ccccc1
LogD_change_(-0.5, -0.3]Solubility_no_changeClint_low->highCCCNC(=O)CCn1cc(-c2ccc(OC)cc2)nc1CCN
LogD_change_(0.5, 0.7]Solubility_no_changeClint_no_changeCc1cc(Nc2ccc(F)cc2)n2ncnc2n1
LogD_change_(-0.1, 0.1]Solubility_no_changeClint_no_changeCOc1ccc(C(=O)N2CCC(C(=O)Nc3ccc4c(c3)OCO4)CC2)cc1
LogD_change_(0.3, 0.5]Solubility_no_changeClint_no_changeCOCCOCc1cccc2nc(NC(=O)c3sc4c(C(C)=O)ccc(COC)c4c3C)ccc12


In [9]:
# Split every raw input string into its property-token prefix and the trailing
# input SMILES. The raw strings shown above have the form
#   "<LogD token><Solubility token>Clint_<value><SMILES>".
CLINT_MARKER = "Clint_"
# Width of the value that follows "Clint_". The values visible in this
# notebook's output ("no_change", "low->high") are 9 characters long; assumes
# every Clint value in the data has that width — TODO confirm against the full
# token vocabulary.
CLINT_VALUE_LEN = 9

smiles = []
prop_tokens = []
for input_mol in inputs:
    # Split on the last marker: everything after it is "<value><SMILES>".
    head, marker, tail = input_mol.rpartition(CLINT_MARKER)
    if not marker:
        # Without the marker the fixed-width slicing below would silently
        # produce a meaningless prefix/SMILES pair, so fail loudly instead.
        raise ValueError(
            f"No {CLINT_MARKER!r} token found in input: {input_mol!r}"
        )
    smiles.append(tail[CLINT_VALUE_LEN:])
    prop_tokens.append(head + marker + tail[:CLINT_VALUE_LEN])

In [10]:
# Sanity check: gluing each property prefix back onto its SMILES must
# reproduce the raw input string exactly.
for position in range(len(inputs)):
    assert prop_tokens[position] + smiles[position] == inputs[position]

In [11]:
# Shorten the "validation" split label to "valid"; other labels pass through.
split_aliases = {"validation": "valid"}
sets = [split_aliases.get(label, label) for label in sets]

In [12]:
# Parse every input and output SMILES string with RDKit.
input_mols = list(map(Chem.MolFromSmiles, smiles))
output_mols = list(map(Chem.MolFromSmiles, outputs))

In [13]:
# Keep only the entries for which parsing produced None.
invalid_inputs = list(filter(lambda mol: mol is None, input_mols))
invalid_outputs = list(filter(lambda mol: mol is None, output_mols))

In [14]:
# Report the number of unparsable inputs, then outputs, one count per line.
print(len(invalid_inputs), len(invalid_outputs), sep="\n")

0
0


In [15]:
# Assemble the processed columns (tokens, SMILES, parsed molecules, split
# label) and rebuild `opt_df` from them, replacing the raw table.
data_dict = dict(
    property_tokens=prop_tokens,
    input_smiles=smiles,
    input_mols=input_mols,
    output_smiles=outputs,
    output_mols=output_mols,
    set=sets,
)
opt_df = pd.DataFrame(data_dict)

In [16]:
# Preview the first rows of the rebuilt table with the split-out columns.
opt_df.head()

Unnamed: 0,property_tokens,input_smiles,input_mols,output_smiles,output_mols,set
0,"LogD_change_(-0.5, -0.3]Solubility_low->highCl...",O=C(NC1CCC(=O)N(C(CSc2ccccc2)Cc2ccccc2)CC1)OCc...,<rdkit.Chem.rdchem.Mol object at 0x000002AF332...,O=C1CCC(O)CCN1C(CSc1ccccc1)Cc1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x000002AFD13...,train
1,"LogD_change_(-0.5, -0.3]Solubility_no_changeCl...",CCCNC(=O)CCn1cc(-c2ccc(OC)cc2)nc1CCN,<rdkit.Chem.rdchem.Mol object at 0x000002AF332...,COc1ccc(-c2cn(CCO)c(CCN)n2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000002AFD13...,train
2,"LogD_change_(0.5, 0.7]Solubility_no_changeClin...",Cc1cc(Nc2ccc(F)cc2)n2ncnc2n1,<rdkit.Chem.rdchem.Mol object at 0x000002AF332...,Cc1cc(Nc2ccc(F)cc2)n2nc(C)nc2n1,<rdkit.Chem.rdchem.Mol object at 0x000002AFD13...,train
3,"LogD_change_(-0.1, 0.1]Solubility_no_changeCli...",COc1ccc(C(=O)N2CCC(C(=O)Nc3ccc4c(c3)OCO4)CC2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000002AF33C...,COc1ccc(NC(=O)C2CCN(C(=O)c3ccc(OC)cc3)CC2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000002AFD13...,train
4,"LogD_change_(0.3, 0.5]Solubility_no_changeClin...",COCCOCc1cccc2nc(NC(=O)c3sc4c(C(C)=O)ccc(COC)c4...,<rdkit.Chem.rdchem.Mol object at 0x000002AF333...,COCc1ccc(C(C)=O)c2sc(C(=O)Nc3ccc4c(C(=O)OC)ccc...,<rdkit.Chem.rdchem.Mol object at 0x000002AFD13...,train


In [17]:
# Gather the distinct labels in the "set" column to confirm the rename.
sets = {label for label in opt_df["set"].tolist()}
print([*sets])

['valid', 'train', 'test']


In [18]:
# Persist the processed table, including the RDKit Mol columns, to PICKLE_PATH.
opt_df.to_pickle(PICKLE_PATH)

In [3]:
# Produce inputs as text file for use with predict.py script

In [6]:
# Text file that receives the test-set inputs, one per line (relative path).
INPUT_TEXT_PATH = "mol_opt_test.txt"

In [4]:
# Load the DataFrame pickled at PICKLE_PATH.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
df = pd.read_pickle(PICKLE_PATH)

In [5]:
# Preview the first rows of the reloaded table.
df.head()

Unnamed: 0,property_tokens,input_smiles,input_mols,output_smiles,output_mols,set
0,"LogD_change_(-0.5, -0.3]Solubility_low->highCl...",O=C(NC1CCC(=O)N(C(CSc2ccccc2)Cc2ccccc2)CC1)OCc...,<rdkit.Chem.rdchem.Mol object at 0x00000123A13...,O=C1CCC(O)CCN1C(CSc1ccccc1)Cc1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x00000123CA6...,train
1,"LogD_change_(-0.5, -0.3]Solubility_no_changeCl...",CCCNC(=O)CCn1cc(-c2ccc(OC)cc2)nc1CCN,<rdkit.Chem.rdchem.Mol object at 0x00000123A13...,COc1ccc(-c2cn(CCO)c(CCN)n2)cc1,<rdkit.Chem.rdchem.Mol object at 0x00000123CA6...,train
2,"LogD_change_(0.5, 0.7]Solubility_no_changeClin...",Cc1cc(Nc2ccc(F)cc2)n2ncnc2n1,<rdkit.Chem.rdchem.Mol object at 0x00000123A00...,Cc1cc(Nc2ccc(F)cc2)n2nc(C)nc2n1,<rdkit.Chem.rdchem.Mol object at 0x00000123CA6...,train
3,"LogD_change_(-0.1, 0.1]Solubility_no_changeCli...",COc1ccc(C(=O)N2CCC(C(=O)Nc3ccc4c(c3)OCO4)CC2)cc1,<rdkit.Chem.rdchem.Mol object at 0x00000123A13...,COc1ccc(NC(=O)C2CCN(C(=O)c3ccc(OC)cc3)CC2)cc1,<rdkit.Chem.rdchem.Mol object at 0x00000123CA6...,train
4,"LogD_change_(0.3, 0.5]Solubility_no_changeClin...",COCCOCc1cccc2nc(NC(=O)c3sc4c(C(C)=O)ccc(COC)c4...,<rdkit.Chem.rdchem.Mol object at 0x00000123A13...,COCc1ccc(C(C)=O)c2sc(C(=O)Nc3ccc4c(C(=O)OC)ccc...,<rdkit.Chem.rdchem.Mol object at 0x00000123CA6...,train


In [7]:
# Extract the columns needed to build the prediction inputs as plain lists.
property_tokens, smiles, sets = (
    df[column].tolist() for column in ("property_tokens", "input_smiles", "set")
)

In [8]:
# Keep only the rows whose split label is "test"; the three lists come from
# the same DataFrame, so pairing them positionally is row-aligned.
test_prop_tokens = [
    props for props, label in zip(property_tokens, sets) if label == "test"
]
test_smiles = [smi for smi, label in zip(smiles, sets) if label == "test"]

In [9]:
# Rebuild each model input by concatenating the property tokens and SMILES.
test_inputs = list(map("{}{}".format, test_prop_tokens, test_smiles))

In [11]:
# Report the test-set size and confirm all three test lists have equal length.
n_test_inputs = len(test_inputs)
print(f"Length of test set: {n_test_inputs}")
assert n_test_inputs == len(test_prop_tokens)
assert len(test_prop_tokens) == len(test_smiles)

Length of test set: 19856


In [12]:
# Join the test inputs into one newline-separated string (no trailing newline).
output_str = str.join("\n", test_inputs)

In [13]:
# Write the newline-separated test inputs to INPUT_TEXT_PATH. The encoding is
# pinned so the bytes on disk do not depend on the platform's default locale
# encoding. The cell's value is the number of characters written.
p = Path(INPUT_TEXT_PATH)
p.write_text(output_str, encoding="utf-8")

2109089