# Enumeration of a Template Reaction Over a CSV File

This example demonstrates how to enumerate a template reaction over a spreadsheet file in Python. This operation can also be achieved using the online ORD editor, but sometimes it is useful to run the enumeration locally. In this case we also run the enumeration over 2 separate templates to account for differences in which analysis was conducted. The 2 resulting datasets are subsequently merged into a single dataset.

## Setup

In [1]:
# from the enumeration script
import docopt
import numpy as np

from ord_schema import message_helpers, templating, validations
from ord_schema.logging import get_logger
from ord_schema.proto import dataset_pb2, reaction_pb2

# Module-level logger; enumerate_dataset uses it to report progress.
logger = get_logger(__name__)

In [2]:
# code modified from enumeration script in ord-schema
def enumerate_dataset(template, data, output):
    """Enumerate a reaction template over a spreadsheet and write the Dataset.

    The dataset name and description are not parameters: they are read from
    the module-level variables ``name`` and ``description``, which must be
    defined before this function is called.

    Args:
        template: Path to the text file containing the reaction template.
        data: Path to the spreadsheet, passed to
            ``templating.read_spreadsheet``.
        output: Destination path, passed to ``message_helpers.write_message``.
    """
    # Only the read needs the file handle; release it before the
    # longer-running enumeration and write steps.
    with open(template) as f:
        template_string = f.read()
    df = templating.read_spreadsheet(data)
    logger.info(
        "generating new Dataset from %s and %s",
        template,
        data,
    )
    dataset = templating.generate_dataset(
        name=name,
        description=description,
        template_string=template_string,
        df=df,
        validate=True,
    )
    logger.info("writing new Dataset to %s", output)
    message_helpers.write_message(dataset, output)

## Enumeration

In [3]:
# Specify the dataset name and description.
# enumerate_dataset reads these two variables as globals, so every dataset
# enumerated below is given the same name and description.
name = 'Ligand screening for Aza-Heck cyclization in total synthesis of Impatien A'
description = '24 ligands are screened for the aza-Heck cyclization in a total synthesis of Impatien A.'

In [4]:
# Enumerate the main template over aza_heck.xlsx, producing dataset_1.pbtxt
enumerate_dataset(
    template='./template_v2.1.pbtxt',
    data='./aza_heck.xlsx',
    output='dataset_1.pbtxt',
)

INFO 2025-01-20 15:25:15,270 1297433464.py:6: generating new Dataset from ./template_v2.1.pbtxt and ./aza_heck.xlsx
INFO 2025-01-20 15:25:15,616 1297433464.py:18: writing new Dataset to dataset_1.pbtxt


In [5]:
# Enumerate the NMR template over the subset of reactions that also have
# NMR data, producing dataset_2.pbtxt
enumerate_dataset(
    template='./template_nmr_v2.1.pbtxt',
    data='./aza_heck_w_nmr.xlsx',
    output='dataset_2.pbtxt',
)

INFO 2025-01-20 15:25:16,863 1297433464.py:6: generating new Dataset from ./template_nmr_v2.1.pbtxt and ./aza_heck_w_nmr.xlsx
INFO 2025-01-20 15:25:16,957 1297433464.py:18: writing new Dataset to dataset_2.pbtxt


## Merge the dataset files

In [6]:
# Load the primary dataset (the file written by the first enumeration above)
# as a Dataset message and report how many reactions it contains.
pb = './dataset_1.pbtxt'
primary = message_helpers.load_message(pb, dataset_pb2.Dataset)
print(f"Dataset has {len(primary.reactions)} reactions")

Dataset has 19 reactions


In [7]:
# Load the dataset whose reactions will be added to the primary dataset
# (the file written by the second enumeration above) and report its size.
pb_2 = './dataset_2.pbtxt'
secondary = message_helpers.load_message(pb_2, dataset_pb2.Dataset)
print(f"Dataset has {len(secondary.reactions)} reactions")

Dataset has 5 reactions


In [8]:
# Merge into a fresh copy so that `primary` is left untouched. Plain
# assignment (merged_dataset = primary) only aliases the message, so
# re-running this cell would append the secondary reactions again each time.
merged_dataset = dataset_pb2.Dataset()
merged_dataset.CopyFrom(primary)
merged_dataset.reactions.extend(secondary.reactions)

In [9]:
# Check the size of the merged dataset: it should equal the sum of the two
# reaction counts printed above.
print(f"Dataset has {len(merged_dataset.reactions)} reactions")

Dataset has 24 reactions


In [10]:
# Write the merged dataset to ./merged_dataset.pbtxt
message_helpers.write_message(merged_dataset, './merged_dataset.pbtxt')

## Review the Results

In [11]:
# Re-check the size of the merged dataset before reviewing its contents
print(f"Dataset has {len(merged_dataset.reactions)} reactions")

Dataset has 24 reactions


In [12]:
# Validate the merged dataset with ord_schema's validations module
# NOTE(review): presumably this reports or raises on schema violations -
# confirm against the ord_schema documentation.
validations.validate_dataset(merged_dataset)

In [13]:
# Inspect a randomly chosen reaction from the merged set.
# np.random.randint(n) draws an index from [0, n), so any reaction may be
# picked; the index is unseeded and will differ between runs.
random_index = np.random.randint(len(merged_dataset.reactions))
print(f"Reaction entry {random_index}:")
# Bare expression as the last line of the cell so the notebook displays it.
merged_dataset.reactions[random_index]

Reaction entry 21:


identifiers {
  type: CUSTOM
  details: "Sample number"
  value: "B5"
}
inputs {
  key: "Reactant"
  value {
    components {
      identifiers {
        type: SMILES
        value: "COc1cc(C(=O)NOc2ccccc2)c(C2=C(C)c3ccc4c(c3C2)OCO4)cc1OC"
      }
      amount {
        mass {
          value: 1.49
          units: MILLIGRAM
        }
      }
      reaction_role: REACTANT
      is_limiting: true
    }
    components {
      identifiers {
        type: NAME
        value: "acetonitrile"
      }
      identifiers {
        type: SMILES
        details: "NAME resolved by the PubChem API"
        value: "CC#N"
      }
      amount {
        volume {
          value: 150
          units: MICROLITER
        }
      }
      reaction_role: SOLVENT
      preparations {
        type: DRIED
        details: "dried on alumina"
      }
    }
    addition_order: 3
    addition_speed {
      type: ALL_AT_ONCE
    }
    addition_device {
      type: PIPETTE
    }
  }
}
inputs {
  key: "Ligand"
  value