In [28]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to project code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

# QM9

In [None]:
import torch
from rdkit import Chem as rdChem
from tqdm import tqdm
from source.utils.mol_utils import preprocess_mol
from source.utils.mol2pyg import mol2pyg
from source.utils.npz_utils import save_npz, save_pyg_as_npz
import numpy as np

# Raw QM9 inputs read by `process`: the SDF with the molecules, the CSV with the
# per-molecule targets, and the text file listing the entries to leave out.
# NOTE(review): `csv` shadows the standard-library module of the same name in this
# namespace; it is kept because `process` reads it by this name.
sdf = '/home/nobilm@usi.ch/pretrain_paper/data/qm9/raw/gdb9.sdf'
csv = '/home/nobilm@usi.ch/pretrain_paper/data/qm9/raw/gdb9.sdf.csv'
to_be_skip = '/home/nobilm@usi.ch/pretrain_paper/data/qm9/raw/uncharacterized.txt'

# Unit-conversion factors; going by the names, Hartree -> eV and kcal/mol -> eV.
HAR2EV = 27.211386246
KCALMOL2EV = 0.04336414

# One multiplicative factor per target column (19 in total), laid out as runs of
# identical factors. The order matches the column order `process` produces after it
# moves the first three CSV target columns to the end.
conversion = torch.tensor(
    [1.0] * 2
    + [HAR2EV] * 3
    + [1.0]
    + [HAR2EV] * 5
    + [1.0]
    + [KCALMOL2EV] * 4
    + [1.0] * 3
)

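# TODO: add a script that downloads the raw files referenced above (gdb9.sdf, gdb9.sdf.csv, uncharacterized.txt); the commented-out sketch below is a starting point.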
# def download(self) -> None:
#     try:
#         import rdkit  # noqa
#         file_path = download_url(self.raw_url, self.raw_dir)
#         extract_zip(file_path, self.raw_dir)
#         os.unlink(file_path)

#         file_path = download_url(self.raw_url2, self.raw_dir)
#         os.rename(osp.join(self.raw_dir, '3195404'),
#                     osp.join(self.raw_dir, 'uncharacterized.txt'))
#     except ImportError:
#         path = download_url(self.processed_url, self.raw_dir)
#         extract_zip(path, self.raw_dir)
#         os.unlink(path)

def process(save_folder):
    """Convert the raw QM9 files into one ``.npz`` file per molecule.

    Reads the targets from the module-level ``csv`` path, the entries to leave out
    from ``to_be_skip`` and the molecules from ``sdf``. Every molecule that is not
    listed as skipped and that ``preprocess_mol`` does not reject (returns ``None``)
    is converted with ``mol2pyg``, gets its target row attached as ``data.y`` and is
    written to ``{save_folder}/mol_{i}``, where ``i`` is the 0-based position of the
    molecule in the SDF file.

    Args:
        save_folder: Output directory. It is created (including parents) when it
            does not exist yet.

    Raises:
        OSError: If an input file cannot be opened or ``save_folder`` cannot be
            created.
    """
    import os  # local import so the function does not depend on the notebook's import cell

    # Create the output directory up front instead of failing on the first save.
    os.makedirs(save_folder, exist_ok=True)

    with open(csv) as f:
        # [1:-1] drops the first line (header) and the last split element, which is
        # assumed to be the empty string left by a trailing newline.
        rows = f.read().split('\n')[1:-1]

    # Columns 1..19 of each CSV row are the numeric targets; column 0 is not read.
    target = [[float(x) for x in line.split(',')[1:20]] for line in rows]
    y = torch.tensor(target, dtype=torch.float)
    # Move the first three target columns to the end so the column order matches
    # the layout of `conversion`.
    y = torch.cat([y[:, 3:], y[:, :3]], dim=-1)
    y = y * conversion.view(1, -1)

    with open(to_be_skip) as f:
        # The first whitespace-separated token of each kept line is an id; "- 1"
        # presumably turns a 1-based id into a 0-based SDF index (confirm against
        # the file). [9:-2] trims the leading and trailing non-data lines.
        # A set gives O(1) membership tests inside the per-molecule loop.
        skip = {int(x.split()[0]) - 1 for x in f.read().split('\n')[9:-2]}

    suppl = rdChem.SDMolSupplier(sdf, removeHs=False, sanitize=False)
    for i, mol in enumerate(tqdm(suppl)):

        if i in skip:
            continue

        mol = preprocess_mol(mol)

        if mol is None:
            continue

        smiles = rdChem.MolToSmiles(mol, isomericSmiles=True)
        data = mol2pyg(mol, smiles)
        # Keep a leading batch dimension: the stored target has shape (1, n_targets).
        data.y = y[i].unsqueeze(0)
        save_pyg_as_npz(data, f'{save_folder}/mol_{i}')

In [12]:
# process('/storage_common/nobilm/pretrain_paper/guacamol/EXPERIMENTS/qm9ftTEST/all')

# MoleculeNet

In [3]:
# load + scaffold split
from source.utils import parse_csv
from source.utils.mol_utils import drop_disconnected_components, preprocess_mol, visualize_3d_mols
from source.utils.mol2pyg import mols2pyg_list_with_targets
from source.utils.npz_utils import save_npz
from source.utils.conforge_conformer_generation import generate_conformers, get_conformer_generator
from source.utils.data_splitting_utils import scaffold_splitter
from collections import defaultdict

In [None]:
# Parse freesolv.csv into `out`; the last line displays its keys as the cell output.
# Later cells read out['smiles'] and out['calc'].
# NOTE(review): [1,-1] is presumably a column selection — confirm against parse_csv.
# NOTE(review): 'moelculenet' looks like a misspelling of 'moleculenet'; check the
# directory name on disk before changing the path.
path = '/home/nobilm@usi.ch/pretrain_paper/data/moelculenet/freesolv.csv'
out = parse_csv(path, [1,-1])
out.keys()

In [5]:
# Collect, per kept molecule, its cleaned SMILES, its first generated conformer and
# its target value. Entries are dropped when preprocessing rejects the molecule or
# when no conformer could be generated.
filtered = defaultdict(list)
for s, y in zip(out['smiles'], out['calc']):
    s = drop_disconnected_components(s)

    # The preprocessed molecule only gates the entry; conformers are generated
    # from the SMILES string itself.
    mol = preprocess_mol(rdChem.MolFromSmiles(s))
    if not mol:
        continue

    conformers = generate_conformers(s, get_conformer_generator(1))
    if not conformers:
        continue

    filtered['smiles'].append(s)
    filtered['mols'].append(conformers[0])
    filtered['y'].append(y)

In [6]:
# type(filtered['mols'][0]) #visualize_3d_mols([filtered['mols'][0]])

In [7]:
# Convert the kept conformers, together with their SMILES and targets, into the
# records that are saved as npz below (presumably PyG data objects, going by the helper's name).
pyg_mol_fixed_fields = mols2pyg_list_with_targets(mols=filtered['mols'], smiles=filtered['smiles'], ys=filtered['y'])

In [8]:
from source.utils.data_splitting_utils import create_data_folders
# all_dir, _, _, _ = create_data_folders(path)

In [None]:
# NOTE: `path` is rebound here from the input CSV to the output folder of the dataset;
# the following cell passes this value to scaffold_splitter.
path = '/storage_common/nobilm/pretrain_paper/guacamol/EXPERIMENTS/freesolv'
save_npz(pyg_mol_fixed_fields, path)

In [12]:
# Run the scaffold split on the dataset folder at `path`.
# NOTE(review): the meaning of the 'tmp' argument is not visible here; later cells read
# <path>/test and <path>/train, presumably produced by this call — confirm.
scaffold_splitter(path, 'tmp')

In [None]:
# Sanity check: open one saved test-split molecule and print the name and shape of
# every array stored in the npz archive. `asd` is reused by the next cell.
asd = np.load('/storage_common/nobilm/pretrain_paper/guacamol/EXPERIMENTS/freesolv/test/mol_77.npz')
for k, v in asd.items():
    print(k, v.shape)

In [None]:
# Display the 'graph_labels' array stored in the archive loaded in the previous cell.
asd['graph_labels']

# Conformer generation from a train folder of single-molecule .npz files

In [17]:
path = '/storage_common/nobilm/pretrain_paper/guacamol/EXPERIMENTS/freesolv/train'