In [1]:
%load_ext dotenv
%dotenv

import os
import base64
import tempfile
from pathlib import Path

# --- Run configuration -------------------------------------------------------
# Number of molecules sampled from the dataset and sent to the model.
N_SAMPLES = 100
# Model identifier passed to the batch runner.
MODEL_NAME = "gemini-2.5-pro"
# Random per-run identifier used in output paths. Five random bytes encode to
# exactly eight base32 characters (a-z, 2-7) with no padding, so the ID always
# has a fixed length and is safe in file names. The previous base64-based ID
# could contain "+" and became shorter whenever a "/" had to be stripped.
RUN_ID = base64.b32encode(os.urandom(5)).decode("ascii").lower()
# Directory handed to the batch runner as its results directory.
RAW_OUTPUT_DIR = f"./data/raw/{RUN_ID}/"
# CSV file that receives the collected table at the end of the notebook.
OUTPUT_PATH = Path(f"./data/collected-{MODEL_NAME}-N{N_SAMPLES}-{RUN_ID}.csv")
# Scratch file passed to py2opsin as its temporary input path.
TMP_OPSIN_PATH = Path(tempfile.gettempdir()) / "py2opsin_input.txt"

In [2]:
# sample zinc dataset
from datasets.iterable_dataset import IterableDataset
from datasets import load_dataset

# Stream the test split of the ZINC20 dataset instead of downloading it whole.
dataset: IterableDataset = load_dataset(
    "haydn-jones/ZINC20",
    streaming=True,
    split="test",
)

# Shuffle with a fixed seed, keep the first N_SAMPLES rows, and select the
# "smiles" column from them.
shuffled_head = dataset.shuffle(seed=225).take(N_SAMPLES)
samples = shuffled_head["smiles"]
# Materialise the streamed column into a plain list for reuse in later cells.
original_smiles = [*samples]

Resolving data files:   0%|          | 0/787 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/96 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/93 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/787 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/96 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/93 [00:00<?, ?it/s]

In [3]:
from batchata import Batch

# Build the batch runner: results go to RAW_OUTPUT_DIR, at most 10 batches run
# in parallel, and each batch holds 20 jobs.
batch = Batch(
    results_dir=RAW_OUTPUT_DIR,
    max_parallel_batches=10,
    items_per_batch=20,
)
# Apply the default model and a cost limit of 100.0 to the whole run.
batch = batch.set_default_params(model=MODEL_NAME).add_cost_limit(100.0)

prompt_template = "Write the IUPAC name of this molecule:\n\n%s\n\nWrite the IUPAC name only. Do not write any comments."
# Queue one single-message job per sampled molecule.
for zinc_smiles in original_smiles:
    user_message = {"role": "user", "content": prompt_template % zinc_smiles}
    batch.add_job([user_message], temperature=0, max_tokens=65536)

In [4]:
# Dry run: ask the batch runner for its plan and cost estimate without
# executing any job. A KeyError here is treated as a non-fatal failure of the
# cost estimate, so the cell reports it and carries on.
try:
    run = batch.run(dry_run=True)
except KeyError as missing_key:
    print(
        missing_key,
        f"Counting tokens for model {MODEL_NAME} is not supported in `tokencost`",
        sep="\n",
    )

2026-02-02 11:58:09 - batchata.core.batch_run - INFO - Created temporary state file: /var/folders/5k/k24pr0ns75zcrcwfppb15tph0000gn/T/tmpjc9jmxwl.json
2026-02-02 11:58:09 - batchata.core.batch_run - INFO - === DRY RUN MODE ===
2026-02-02 11:58:09 - batchata.core.batch_run - INFO - This will show cost estimates without executing jobs
2026-02-02 11:58:09 - batchata.core.batch_run - INFO - Analyzing 100 pending jobs...
2026-02-02 11:58:09 - batchata.core.batch_run - INFO - 
Job breakdown:
2026-02-02 11:58:09 - batchata.core.batch_run - INFO - 
GeminiProvider (100 jobs):
2026-02-02 11:58:11 - batchata.core.batch_run - INFO -   Batch 1: 20 jobs, estimated cost: $6.5543
2026-02-02 11:58:11 - batchata.core.batch_run - INFO -     - job-3b2595d4: direct messages (citations: False)
2026-02-02 11:58:11 - batchata.core.batch_run - INFO -     - job-27b93610: direct messages (citations: False)
2026-02-02 11:58:11 - batchata.core.batch_run - INFO -     - job-d07b4c5e: direct messages (citations: Fals

In [5]:
# Execute the batch for real, printing live status while it runs.
try:
    run = batch.run(print_status=True)
except KeyError:
    print(f"Counting tokens is not supported in `tokencost` for model {MODEL_NAME}")
    # Re-raise instead of swallowing: if this call fails, `run` still refers to
    # the earlier dry-run object (or is undefined), and the following cells
    # would otherwise process stale results without any visible error.
    raise

2026-02-02 11:58:17 - batchata.core.batch_run - INFO - Created temporary state file: /var/folders/5k/k24pr0ns75zcrcwfppb15tph0000gn/T/tmp8ed8xmpp.json
2026-02-02 11:58:17 - batchata.core.batch_run - INFO - Starting batch run


Output()

2026-02-02 11:58:23 - batchata.core.batch_run - INFO - Estimating cost for batch of 20 jobs...
2026-02-02 11:58:23 - batchata.core.batch_run - INFO - Estimating cost for batch of 20 jobs...
2026-02-02 11:58:23 - batchata.core.batch_run - INFO - Estimating cost for batch of 20 jobs...
2026-02-02 11:58:23 - batchata.core.batch_run - INFO - Estimating cost for batch of 20 jobs...
2026-02-02 11:58:23 - batchata.core.batch_run - INFO - Estimating cost for batch of 20 jobs...
2026-02-02 11:58:24 - batchata.core.batch_run - INFO - Total estimated cost: $6.5543, remaining budget: $100.0000
2026-02-02 11:58:24 - batchata.core.batch_run - INFO - Creating batch with 20 jobs...
2026-02-02 11:58:24 - batchata.core.batch_run - INFO - Total estimated cost: $6.5543, remaining budget: $93.4457
2026-02-02 11:58:24 - batchata.core.batch_run - INFO - Creating batch with 20 jobs...
2026-02-02 11:58:24 - batchata.core.batch_run - INFO - Total estimated cost: $6.5543, remaining budget: $86.8914
2026-02-02 11

2026-02-02 12:14:09 - batchata.core.batch_run - INFO - Getting results for batch batches/e2un68bi5sj04krvp5ut1my011o6l9bqkqc7
2026-02-02 12:14:09 - batchata.core.batch_run - INFO - ✓ Batch batches/e2un68bi5sj04krvp5ut1my011o6l9bqkqc7 completed: 20 success, 0 failed, cost: $0.004755
2026-02-02 12:14:09 - batchata.core.batch_run - INFO - ✓ Job job-46a707c1 completed successfully
2026-02-02 12:14:09 - batchata.core.batch_run - INFO - ✓ Job job-c7fbc34b completed successfully
2026-02-02 12:14:09 - batchata.core.batch_run - INFO - ✓ Job job-92f961f9 completed successfully
2026-02-02 12:14:09 - batchata.core.batch_run - INFO - ✓ Job job-0a7d3816 completed successfully
2026-02-02 12:14:09 - batchata.core.batch_run - INFO - ✓ Job job-64d6e08d completed successfully
2026-02-02 12:14:09 - batchata.core.batch_run - INFO - ✓ Job job-5ab6da07 completed successfully
2026-02-02 12:14:09 - batchata.core.batch_run - INFO - ✓ Job job-44e6164f completed successfully
2026-02-02 12:14:09 - batchata.core.ba

2026-02-02 12:14:35 - batchata.core.batch_run - INFO - Getting results for batch batches/ojrk84ue2mwoy3g2m90e7uxgfz7mf43c6tny
2026-02-02 12:14:35 - batchata.core.batch_run - INFO - ✓ Batch batches/ojrk84ue2mwoy3g2m90e7uxgfz7mf43c6tny completed: 20 success, 0 failed, cost: $0.004716
2026-02-02 12:14:35 - batchata.core.batch_run - INFO - ✓ Job job-3b2595d4 completed successfully
2026-02-02 12:14:35 - batchata.core.batch_run - INFO - ✓ Job job-27b93610 completed successfully
2026-02-02 12:14:35 - batchata.core.batch_run - INFO - ✓ Job job-d07b4c5e completed successfully
2026-02-02 12:14:35 - batchata.core.batch_run - INFO - ✓ Job job-d0955fd4 completed successfully
2026-02-02 12:14:35 - batchata.core.batch_run - INFO - ✓ Job job-1e55cdfd completed successfully
2026-02-02 12:14:35 - batchata.core.batch_run - INFO - ✓ Job job-969d3305 completed successfully
2026-02-02 12:14:35 - batchata.core.batch_run - INFO - ✓ Job job-50dc8edb completed successfully
2026-02-02 12:14:35 - batchata.core.ba

2026-02-02 12:18:51 - batchata.core.batch_run - INFO - Getting results for batch batches/xqaeibz8j7gpbcxd6dajby8t88aw9z7x24pn
2026-02-02 12:18:51 - batchata.core.batch_run - ERROR - ✗ Batch execution failed: 'NoneType' object is not iterable
2026-02-02 12:18:51 - batchata.core.batch_run - ERROR - ✗ Job job-6c77853c failed: 'NoneType' object is not iterable
2026-02-02 12:18:51 - batchata.core.batch_run - ERROR - ✗ Job job-01434b5f failed: 'NoneType' object is not iterable
2026-02-02 12:18:51 - batchata.core.batch_run - ERROR - ✗ Job job-ff02798b failed: 'NoneType' object is not iterable
2026-02-02 12:18:51 - batchata.core.batch_run - ERROR - ✗ Job job-231fc855 failed: 'NoneType' object is not iterable
2026-02-02 12:18:51 - batchata.core.batch_run - ERROR - ✗ Job job-7166f76a failed: 'NoneType' object is not iterable
2026-02-02 12:18:51 - batchata.core.batch_run - ERROR - ✗ Job job-d7ec9fa3 failed: 'NoneType' object is not iterable
2026-02-02 12:18:51 - batchata.core.batch_run - ERROR - 

2026-02-02 12:18:59 - batchata.core.batch_run - INFO - Getting results for batch batches/s6cca5nmvcdyt25aabfsj2dcy2ultkekha8v
2026-02-02 12:18:59 - batchata.core.batch_run - ERROR - ✗ Batch execution failed: 'NoneType' object is not iterable
2026-02-02 12:18:59 - batchata.core.batch_run - ERROR - ✗ Job job-991a1f82 failed: 'NoneType' object is not iterable
2026-02-02 12:18:59 - batchata.core.batch_run - ERROR - ✗ Job job-b601d3f9 failed: 'NoneType' object is not iterable
2026-02-02 12:18:59 - batchata.core.batch_run - ERROR - ✗ Job job-4c0fcb07 failed: 'NoneType' object is not iterable
2026-02-02 12:18:59 - batchata.core.batch_run - ERROR - ✗ Job job-e1a141fc failed: 'NoneType' object is not iterable
2026-02-02 12:18:59 - batchata.core.batch_run - ERROR - ✗ Job job-a3631e66 failed: 'NoneType' object is not iterable
2026-02-02 12:18:59 - batchata.core.batch_run - ERROR - ✗ Job job-8d5f9d26 failed: 'NoneType' object is not iterable
2026-02-02 12:18:59 - batchata.core.batch_run - ERROR - 

2026-02-02 12:23:41 - batchata.core.batch_run - INFO - Getting results for batch batches/ljtyk3zd5u0lxk2itpzgg04dwouki7tuzwua
2026-02-02 12:23:41 - batchata.core.batch_run - INFO - ✓ Batch batches/ljtyk3zd5u0lxk2itpzgg04dwouki7tuzwua completed: 20 success, 0 failed, cost: $0.004466
2026-02-02 12:23:41 - batchata.core.batch_run - INFO - ✓ Job job-9926bbd5 completed successfully
2026-02-02 12:23:41 - batchata.core.batch_run - INFO - ✓ Job job-a9bcd599 completed successfully
2026-02-02 12:23:41 - batchata.core.batch_run - INFO - ✓ Job job-1d8ac056 completed successfully
2026-02-02 12:23:41 - batchata.core.batch_run - INFO - ✓ Job job-16f87d7e completed successfully
2026-02-02 12:23:41 - batchata.core.batch_run - INFO - ✓ Job job-b736422f completed successfully
2026-02-02 12:23:41 - batchata.core.batch_run - INFO - ✓ Job job-f2c42746 completed successfully
2026-02-02 12:23:41 - batchata.core.batch_run - INFO - ✓ Job job-b83023eb completed successfully
2026-02-02 12:23:41 - batchata.core.ba

In [6]:
# Fetch the job results grouped by status and print how many jobs fall under
# each status.
results = run.results()
for status, status_jobs in results.items():
    print(status, len(status_jobs))

completed 60
failed 40
cancelled 0


In [7]:
import warnings

from py2opsin import py2opsin

# Convert every generated IUPAC name back to SMILES with OPSIN.
# Three parallel lists are built, one entry per completed job:
#   iupacs - the raw model response
#   smiles - the SMILES returned by py2opsin, or "" when a RuntimeWarning was raised
#   errors - the flattened warning text, or "" when the call succeeded
completed = results["completed"]
iupacs = []
smiles = []
errors = []

# Convert warnings to exceptions only while this loop runs, so that a
# RuntimeWarning raised during the py2opsin call can be caught below.
# `catch_warnings()` restores the previous filters on exit; a bare global
# `filterwarnings("error")` would keep turning every later warning in the
# kernel into an exception.
with warnings.catch_warnings():
    warnings.filterwarnings("error")
    for job in completed:
        response = job.raw_response
        try:
            smi = py2opsin(response, "SMILES", tmp_fpath=str(TMP_OPSIN_PATH))
            err = ""
        except RuntimeWarning as e:
            smi = ""
            # Flatten the message to one line and drop the fixed preamble.
            err = (
                str(e)
                .replace("\n", " ")
                .removeprefix("OPSIN raised the following error(s) while parsing:  > ")
            )
        iupacs.append(response)
        errors.append(err)
        smiles.append(smi)

In [8]:
import polars as pl

# Pair every completed job with the SMILES it was actually asked about.
# `results["completed"]` leaves out failed jobs and is not guaranteed to follow
# submission order, so cutting `original_smiles` down to its first len(iupacs)
# entries would attach generated names to the wrong molecules (and would also
# mutate `original_smiles`, making this cell non-idempotent).
# NOTE(review): relies on batchata exposing `batch.jobs` in submission order
# with an `.id` per job, and `job_id` on each result — confirm against the
# installed batchata version.
assert len(batch.jobs) == len(original_smiles), "one job per sampled SMILES expected"
smiles_by_job_id = {
    submitted.id: zinc_smiles
    for submitted, zinc_smiles in zip(batch.jobs, original_smiles)
}
completed_smiles = [smiles_by_job_id[result.job_id] for result in completed]

# All columns must describe the same completed jobs, row for row.
assert len(completed_smiles) == len(iupacs) == len(smiles) == len(errors)

df = pl.DataFrame(
    {
        "ZINC SMILES": completed_smiles,
        "Generated IUPAC": iupacs,
        "OPSIN SMILES": smiles,
        "OPSIN Errors": errors,
    }
)
# Make sure the target directory exists before writing the CSV.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.write_csv(OUTPUT_PATH)