<a href="https://colab.research.google.com/github/nyee88/Brilliant/blob/main/OCSR-CLIP/OCSR-CLIP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install RDKit (Colab)
!pip install rdkit -q

import os, json
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, Draw

# 1) Small built-in molecule set, mapping a short name to its SMILES string.
#    The names double as output file stems, so keep them filesystem-friendly.
#    Swap in any other entries as needed.
smiles_list = dict(
    benzene="c1ccccc1",
    thiophene="c1ccsc1",
    pyridine="c1ccncc1",
    aniline="Nc1ccccc1",
    benzoic_acid="OC(=O)c1ccccc1",
    furan="c1ccoc1",
    # Fused bicyclic entry, kept purely as an extra test case.
    DPP_core="O=C1C(C2=CN1)=CNC2=O",
    ethyl_acetate="CCOC(=O)C",
    acetic_acid="CC(=O)O",
    toluene="Cc1ccccc1",
)

# 2) Output folder for the image/graph pairs; created if it is missing and
#    left untouched if it already exists.
out_dir = "clip_molecule_pairs"
os.makedirs(out_dir, exist_ok=True)

# 3) Helper to build a simple graph representation
def mol_to_graph_dict(mol):
    """Describe an RDKit molecule as a plain dict of atoms, bonds and adjacency.

    Args:
        mol: The molecule object to describe; it must provide ``GetAtoms()``
            and ``GetBonds()`` and be accepted by ``Chem.GetAdjacencyMatrix``.

    Returns:
        A dict with three keys:

        * ``"atoms"``: one dict per atom (index, symbol, atomic number,
          degree, aromaticity flag, ring-membership flag, formal charge).
        * ``"bonds"``: one dict per bond (begin/end atom indices, the bond
          type rendered as a string, ring-membership flag).
        * ``"adjacency"``: the adjacency matrix converted to nested lists.
    """
    atom_records = [
        {
            "idx": a.GetIdx(),
            "symbol": a.GetSymbol(),
            "atomic_num": a.GetAtomicNum(),
            "degree": a.GetDegree(),
            "is_aromatic": a.GetIsAromatic(),
            "is_in_ring": a.IsInRing(),
            "formal_charge": a.GetFormalCharge(),
        }
        for a in mol.GetAtoms()
    ]

    bond_records = [
        {
            "begin": b.GetBeginAtomIdx(),
            "end": b.GetEndAtomIdx(),
            # Stringified so the value is JSON-friendly (e.g. SINGLE, DOUBLE, AROMATIC).
            "bond_type": str(b.GetBondType()),
            "is_in_ring": b.IsInRing(),
        }
        for b in mol.GetBonds()
    ]

    # Nested lists rather than an array, for convenience when serializing.
    adjacency = Chem.GetAdjacencyMatrix(mol).tolist()

    return {"atoms": atom_records, "bonds": bond_records, "adjacency": adjacency}

# 4) For every named SMILES: parse, sanitize, draw a PNG and save the graph as
#    JSON. Entries that fail to parse or sanitize are reported and skipped so a
#    single bad molecule does not abort the whole run.
for name, smi in smiles_list.items():
    print(f"Processing {name}: {smi}")

    # MolFromSmiles signals an unparsable string by returning None.
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        print(f"  [!] Failed to parse SMILES for {name}, skipping.")
        continue

    # Deliberately broad catch: this is a best-effort batch, and any
    # sanitization failure should only skip the current molecule.
    try:
        Chem.SanitizeMol(mol)
    except Exception as e:
        print(f"  [!] Sanitization failed for {name}: {e}")
        continue

    # Compute 2D coordinates for nicer drawings
    AllChem.Compute2DCoords(mol)

    # 4a) Save 2D structure PNG
    png_path = os.path.join(out_dir, f"{name}.png")
    Draw.MolToFile(mol, png_path, size=(300, 300))
    print(f"  [+] Saved image to {png_path}")

    # 4b) Save graph representation as JSON, next to the image and sharing
    #     its name stem so the two files can be paired up later.
    graph = mol_to_graph_dict(mol)
    graph_path = os.path.join(out_dir, f"{name}_graph.json")
    # Explicit UTF-8 so the file does not depend on the platform's default
    # text encoding.
    with open(graph_path, "w", encoding="utf-8") as f:
        json.dump({
            "name": name,
            "smiles": smi,
            "graph": graph,
        }, f, indent=2)
    print(f"  [+] Saved graph to {graph_path}")

# Report the folder actually used instead of a hard-coded copy of its name, so
# the message stays correct if out_dir is changed.
print(f"\nDone. Check the '{out_dir}' folder for PNGs + JSON graphs.")

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 36.4/36.4 MB 39.1 MB/s eta 0:00:00
Processing benzene: c1ccccc1
  [+] Saved image to clip_molecule_pairs/benzene.png
  [+] Saved graph to clip_molecule_pairs/benzene_graph.json
Processing thiophene: c1ccsc1
  [+] Saved image to clip_molecule_pairs/thiophene.png
  [+] Saved graph to clip_molecule_pairs/thiophene_graph.json
Processing pyridine: c1ccncc1
  [+] Saved image to clip_molecule_pairs/pyridine.png
  [+] Saved graph to clip_molecule_pairs/pyridine_graph.json
Processing aniline: Nc1ccccc1
  [+] Saved image to clip_molecule_pairs/aniline.png
  [+] Saved graph to clip_molecule_pairs/aniline_graph.json
Processing benzoic_acid: OC(=O)c1ccccc1
  [+] Saved image to clip_molecule_pairs/benzoic_acid.png
  [+] Saved graph to clip_molecule_pairs/benzoic_acid_graph.json
Processing furan: c1ccoc1
  [+] Saved image to clip_molecule_pairs/furan.png
  [+] Saved graph to clip_molecule_pairs/furan_graph.json


[20:14:35] Can't kekulize mol.  Unkekulized atoms: 12 13 14 15 19 20 21 22 27 28 29
