In [None]:
!pip install -qU rdkit
!pip install -qU datasets

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw
import datasets
from tqdm import tqdm

In [None]:
def is_canonical_smiles(smiles):
    """Return whether *smiles* is already RDKit's non-isomeric canonical form.

    The string is parsed with RDKit and written back with
    ``isomericSmiles=False, canonical=True``; the result is compared to the
    input by exact string equality.

    Args:
        smiles: SMILES string to check.

    Returns:
        True if the input is identical to the regenerated SMILES, False
        otherwise (including when RDKit cannot parse the input).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; passing
        # that on to MolToSmiles would fail with an opaque argument error.
        return False
    canonical_smiles = Chem.MolToSmiles(mol, isomericSmiles=False, canonical=True)
    return smiles == canonical_smiles

def save_molecule_image(smiles_string, image_path, image_format="PNG"):
    """Render the molecule described by a SMILES string and write it to disk.

    Args:
        smiles_string: SMILES string parsed with ``Chem.MolFromSmiles``.
        image_path: Destination passed to the image's ``save`` method.
        image_format: Format passed to ``save``; defaults to ``"PNG"``.
    """
    rendered = Draw.MolToImage(Chem.MolFromSmiles(smiles_string))
    rendered.save(image_path, image_format)

def canonical_to_isomeric(smiles_string):
    """Re-write *smiles_string* with RDKit as an isomeric SMILES.

    The string is parsed and written back with ``isomericSmiles=True`` and
    ``canonical=False``, i.e. RDKit's canonical atom ordering is not
    requested for the output.

    Args:
        smiles_string: SMILES string to convert.

    Returns:
        The SMILES string produced by ``Chem.MolToSmiles``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles_string``.
    """
    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        # MolFromSmiles returns None on a parse failure; fail with a message
        # that names the offending input instead of an opaque argument error.
        raise ValueError(f"Invalid SMILES string: {smiles_string!r}")
    return Chem.MolToSmiles(mol, isomericSmiles=True, canonical=False)

def isomeric_to_canonical(smiles_string):
    """Re-write *smiles_string* as RDKit's non-isomeric canonical SMILES.

    The string is parsed and written back with ``isomericSmiles=False`` and
    ``canonical=True``.

    Args:
        smiles_string: SMILES string to convert.

    Returns:
        The SMILES string produced by ``Chem.MolToSmiles``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles_string``.
    """
    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        # MolFromSmiles returns None on a parse failure; fail with a message
        # that names the offending input instead of an opaque argument error.
        raise ValueError(f"Invalid SMILES string: {smiles_string!r}")
    return Chem.MolToSmiles(mol, isomericSmiles=False, canonical=True)

In [None]:
from datasets import load_dataset

# Load the LPM-24 training dataset by its repository id. The export loop
# further down reads its 'split_train' and 'split_valid' entries.
dataset = load_dataset("language-plus-molecules/LPM-24_train")

In [None]:
!mkdir "LPM-24_transform"
!mkdir "LPM-24_transform/images"

In [None]:
# Export each dataset split: one CSV of (id, canonical, isomeric, caption)
# rows plus one rendered PNG per molecule.
import pandas as pd  # only needed here, to write the collected columns as CSV

splits = ['split_train', 'split_valid']

for split in splits:
    dataset_split = dataset[split]
    total = len(dataset_split)  # loop-invariant; avoid recomputing per sample
    # Column-wise accumulator, turned into a DataFrame once the split is done.
    df = {'id': [], 'canonical': [], 'isomeric': [], 'caption': []}
    pbar = tqdm(range(total))
    for i in pbar:
        pbar.set_description(f'{split} | {i}/{total}')
        sample = dataset_split[i]
        molecule, caption = sample['molecule'], sample['caption']
        # Keep the dataset's own string in whichever column it already
        # matches and derive the other representation from it.
        if is_canonical_smiles(molecule):
            canonical_smiles = molecule
            isomeric_smiles = canonical_to_isomeric(molecule)
        else:
            canonical_smiles = isomeric_to_canonical(molecule)
            isomeric_smiles = molecule

        df['id'].append(i)
        df['canonical'].append(canonical_smiles)
        df['isomeric'].append(isomeric_smiles)
        df['caption'].append(caption)

        # Use the loop index `i`, not the builtin `id`, so each sample gets
        # its own file. Ids restart at 0 for every split and all images share
        # one directory, so the split name is part of the file name to stop
        # one split from overwriting the other's images.
        save_molecule_image(
            molecule,
            image_path=f"/content/LPM-24_transform/images/{split}_{i}.png",
        )

    # A plain dict has no to_csv(); build a DataFrame from the columns first.
    pd.DataFrame(df).to_csv(f'/content/LPM-24_transform/{split}.csv')