In [1]:
%load_ext dotenv
%dotenv

import os
import base64
import tempfile
from pathlib import Path

N_SAMPLES = 100
MODEL_NAME = "gemini-2.5-pro"
RUN_ID = base64.b64encode(os.urandom(32))[:8].decode().replace("/","").replace("\\","")
RAW_OUTPUT_DIR = f"./data/raw/{RUN_ID}/"
OUTPUT_PATH = Path(f"./data/collected-{MODEL_NAME}-N{N_SAMPLES}-{RUN_ID}.csv")
TMP_OPSIN_PATH = Path(tempfile.gettempdir()) / "py2opsin_input.txt"

In [2]:
# sample zinc dataset
from datasets.iterable_dataset import IterableDataset
from datasets import load_dataset

dataset: IterableDataset = load_dataset(
    "haydn-jones/ZINC20", split="test", streaming=True
)

samples = dataset.shuffle(seed=225).take(N_SAMPLES)["smiles"]
original_smiles = list(samples)

Resolving data files:   0%|          | 0/787 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/96 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/93 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/787 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/96 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/93 [00:00<?, ?it/s]

In [3]:
from batchata import Batch

batch = (
    Batch(
        results_dir=RAW_OUTPUT_DIR,
        max_parallel_batches=10,
        items_per_batch=10,
    )
    .set_default_params(model=MODEL_NAME)
    .add_cost_limit(10.0)
)

prompt_template = "Write the IUPAC name of this molecule:\n\n%s\n\nWrite the IUPAC name only. Do not write any comments."
for smiles in original_smiles:
    messages = [{"role": "user", "content": prompt_template % smiles}]
    batch.add_job(messages, temperature=0, max_tokens=65536)

In [4]:
# dry run the batch
try:
    run = batch.run(dry_run=True)
except KeyError as e:
    print(e)
    print(f"Counting tokens for model {MODEL_NAME} is not supported in `tokencost`")

2026-02-01 16:36:37 - batchata.core.batch_run - INFO - Created temporary state file: /tmp/tmpa6x9n7no.json
2026-02-01 16:36:37 - batchata.core.batch_run - INFO - === DRY RUN MODE ===
2026-02-01 16:36:37 - batchata.core.batch_run - INFO - This will show cost estimates without executing jobs
2026-02-01 16:36:37 - batchata.core.batch_run - INFO - Analyzing 100 pending jobs...
2026-02-01 16:36:37 - batchata.core.batch_run - INFO - 
Job breakdown:
2026-02-01 16:36:37 - batchata.core.batch_run - INFO - 
GeminiProvider (100 jobs):
2026-02-01 16:36:39 - batchata.core.batch_run - INFO -   Batch 1: 10 jobs, estimated cost: $3.2772
2026-02-01 16:36:39 - batchata.core.batch_run - INFO -     - job-3749dfa2: direct messages (citations: False)
2026-02-01 16:36:39 - batchata.core.batch_run - INFO -     - job-5d82a222: direct messages (citations: False)
2026-02-01 16:36:39 - batchata.core.batch_run - INFO -     - job-d2b6fb02: direct messages (citations: False)
2026-02-01 16:36:39 - batchata.core.batch

In [5]:
try:
    run = batch.run(print_status=True)
except KeyError:
    print(f"Counting tokens is not supported in `tokencost` for model {MODEL_NAME}")

2026-02-01 16:36:49 - batchata.core.batch_run - INFO - Created temporary state file: /tmp/tmpygo6jqgl.json
2026-02-01 16:36:49 - batchata.core.batch_run - INFO - Starting batch run


Output()

2026-02-01 16:36:59 - batchata.core.batch_run - INFO - Estimating cost for batch of 10 jobs...
2026-02-01 16:36:59 - batchata.core.batch_run - INFO - Estimating cost for batch of 10 jobs...
2026-02-01 16:36:59 - batchata.core.batch_run - INFO - Estimating cost for batch of 10 jobs...
2026-02-01 16:36:59 - batchata.core.batch_run - INFO - Estimating cost for batch of 10 jobs...
2026-02-01 16:36:59 - batchata.core.batch_run - INFO - Estimating cost for batch of 10 jobs...
2026-02-01 16:36:59 - batchata.core.batch_run - INFO - Estimating cost for batch of 10 jobs...
2026-02-01 16:36:59 - batchata.core.batch_run - INFO - Estimating cost for batch of 10 jobs...
2026-02-01 16:36:59 - batchata.core.batch_run - INFO - Estimating cost for batch of 10 jobs...
2026-02-01 16:36:59 - batchata.core.batch_run - INFO - Estimating cost for batch of 10 jobs...
2026-02-01 16:36:59 - batchata.core.batch_run - INFO - Estimating cost for batch of 10 jobs...
2026-02-01 16:36:59 - batchata.core.batch_run - IN

2026-02-01 16:42:05 - batchata.core.batch_run - INFO - Getting results for batch batches/g42tfmz526mmi2k3193kydchsehgmq185ugj
2026-02-01 16:42:05 - batchata.core.batch_run - INFO - ✓ Batch batches/g42tfmz526mmi2k3193kydchsehgmq185ugj completed: 10 success, 0 failed, cost: $0.002413
2026-02-01 16:42:05 - batchata.core.batch_run - INFO - ✓ Job job-a176e5cf completed successfully
2026-02-01 16:42:05 - batchata.core.batch_run - INFO - ✓ Job job-0945233a completed successfully
2026-02-01 16:42:05 - batchata.core.batch_run - INFO - ✓ Job job-3766449f completed successfully
2026-02-01 16:42:05 - batchata.core.batch_run - INFO - ✓ Job job-6a261567 completed successfully
2026-02-01 16:42:05 - batchata.core.batch_run - INFO - ✓ Job job-9b903333 completed successfully
2026-02-01 16:42:05 - batchata.core.batch_run - INFO - ✓ Job job-e827fcdd completed successfully
2026-02-01 16:42:05 - batchata.core.batch_run - INFO - ✓ Job job-14d4307a completed successfully
2026-02-01 16:42:05 - batchata.core.ba

2026-02-01 16:42:12 - batchata.core.batch_run - INFO - Getting results for batch batches/mqvg2t2e81njr2vaqh0ksyvf01a70v1rfzyd
2026-02-01 16:42:12 - batchata.core.batch_run - INFO - ✓ Batch batches/mqvg2t2e81njr2vaqh0ksyvf01a70v1rfzyd completed: 10 success, 0 failed, cost: $0.002359
2026-02-01 16:42:12 - batchata.core.batch_run - INFO - ✓ Job job-3749dfa2 completed successfully
2026-02-01 16:42:12 - batchata.core.batch_run - INFO - ✓ Job job-5d82a222 completed successfully
2026-02-01 16:42:12 - batchata.core.batch_run - INFO - ✓ Job job-d2b6fb02 completed successfully
2026-02-01 16:42:12 - batchata.core.batch_run - INFO - ✓ Job job-92192c6b completed successfully
2026-02-01 16:42:12 - batchata.core.batch_run - INFO - ✓ Job job-b804a4d7 completed successfully
2026-02-01 16:42:12 - batchata.core.batch_run - INFO - ✓ Job job-ef03ae94 completed successfully
2026-02-01 16:42:12 - batchata.core.batch_run - INFO - ✓ Job job-dca569c7 completed successfully
2026-02-01 16:42:12 - batchata.core.ba

2026-02-01 16:42:28 - batchata.core.batch_run - INFO - Getting results for batch batches/oe0xl9miop2s1759wr2tk3bl99oarrrvmnrc
2026-02-01 16:42:28 - batchata.core.batch_run - INFO - ✓ Batch batches/oe0xl9miop2s1759wr2tk3bl99oarrrvmnrc completed: 10 success, 0 failed, cost: $0.002186
2026-02-01 16:42:28 - batchata.core.batch_run - INFO - ✓ Job job-6e490f28 completed successfully
2026-02-01 16:42:28 - batchata.core.batch_run - INFO - ✓ Job job-99da4fc2 completed successfully
2026-02-01 16:42:28 - batchata.core.batch_run - INFO - ✓ Job job-0e885bfe completed successfully
2026-02-01 16:42:28 - batchata.core.batch_run - INFO - ✓ Job job-0609cafa completed successfully
2026-02-01 16:42:28 - batchata.core.batch_run - INFO - ✓ Job job-34f109d0 completed successfully
2026-02-01 16:42:28 - batchata.core.batch_run - INFO - ✓ Job job-4e9895ea completed successfully
2026-02-01 16:42:28 - batchata.core.batch_run - INFO - ✓ Job job-75357aa7 completed successfully
2026-02-01 16:42:28 - batchata.core.ba

In [6]:
results = run.results()
for k, v in results.items():
    print(k, len(v))

completed 30
failed 70
cancelled 0


In [7]:
# convert warnings to exceptions
import warnings

warnings.filterwarnings("error")

from py2opsin import py2opsin

completed = results["completed"]
iupacs = []
smiles = []
errors = []
for job in completed:
    response = job.raw_response
    try:
        smi = py2opsin(response, "SMILES", tmp_fpath=str(TMP_OPSIN_PATH))
        err = ""
    except RuntimeWarning as e:
        smi = ""
        err = (
            str(e)
            .replace("\n", " ")
            .removeprefix("OPSIN raised the following error(s) while parsing:  > ")
        )
    iupacs.append(response)
    errors.append(err)
    smiles.append(smi)

In [9]:
import polars as pl

if len(original_smiles) > len(iupacs):
    original_smiles = original_smiles[:len(iupacs)]

df = pl.DataFrame(
    {
        "ZINC SMILES": original_smiles,
        "Generated IUPAC": iupacs,
        "OPSIN SMILES": smiles,
        "OPSIN Errors": errors,
    }
)
df.write_csv(OUTPUT_PATH)