# Generate the JSON input file for the AWS batch inference

In [None]:
import json
import boto3
from botocore.exceptions import ClientError
import pandas as pd
from typing import Any, Dict

In [None]:
def generate_prompt(note_text: str) -> str:
    """Return the user-turn prompt that wraps one clinical note.

    The prompt lists the three allowed labels, embeds ``note_text`` under a
    "Clinical Note" heading, and ends with the expected JSON output format.

    Args:
        note_text: Text of the clinical note to classify.

    Returns:
        The assembled prompt, with no leading or trailing whitespace.
    """
    prompt_lines = [
        "Review the following clinical note and assign the appropriate label:",
        "",
        "### Labels",
        "",
        "- **Positive**",
        "- **Negative**",
        "- **Absent / Insufficient Evidence**",
        "",
        "### Clinical Note",
        "",
        # Formatted rather than concatenated so non-string values render the
        # same way they would inside a single f-string template.
        f"{note_text}",
        "",
        "### Output Format",
        "",
        "```jsonc",
        "{",
        '"Label": "<Positive | Negative | Absent / Insufficient Evidence>",',
        '"Reasoning": "<one concise sentence or short clause>"',
        "}",
        "```",
    ]
    return "\n".join(prompt_lines)

def generate_json(
    record_id: str,
    system_prompt: str,
    note_text: str,
) -> Dict[str, Any]:
    """Build one batch-inference record in the Anthropic messages format.

    Originally written for AWS batch inference with Claude Sonnet 3.5.

    Args:
        record_id: Identifier stored under ``recordId`` so the output line
            can be matched back to its note.
        system_prompt: Text placed in the ``system`` field of the request.
        note_text: Clinical note; wrapped by ``generate_prompt`` to form the
            single user message.

    Returns:
        A dict with ``recordId`` and ``modelInput`` keys, ready to be
        serialized as one JSON line.
    """
    # The request carries exactly one user turn holding the rendered prompt.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": generate_prompt(note_text)},
        ],
    }

    model_input = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 10000,
        "system": system_prompt,
        "temperature": 0.3,
        "top_p": 0.6,
        "messages": [user_turn],
    }

    return {"recordId": record_id, "modelInput": model_input}

In [None]:
# System prompt shared by every record: it is passed as `system_prompt` to
# generate_json(), which stores it in the request's "system" field.
# The literal is sent to the model as-is (only the outer whitespace is removed
# by .strip()), so any edit to the text below changes the model's instructions.
# NOTE(review): the last row of the "### Labels" table has no closing `|`,
# unlike the two rows above it -- confirm whether that is intentional.
system_prompt = """
Your role is to evaluate clinical notes to decide whether they contain **documented evidence of actual feeding behavior** involving breast milk, and assign one of three labels.

### Labels

| Label | Definition | Typical Triggers | Typical Non-Triggers |
|-------|------------|------------------|----------------------|
| **Positive** | Explicit documentation that the infant/patient **is currently receiving breast milk**, whether exclusively or mixed with other feeds. | “Breastfeeds every 3 h”, “Takes 30 mL expressed breast milk by bottle”, “Review of nutrition: breast and bottle” | Statements that only recommend or plan breastfeeding (*Absent / Insufficient Evidence*) |
| **Negative** | Evidence that the infant/patient **is not currently receiving breast milk**. This includes exclusive formula feeding **or** statements that breastfeeding is historical or has been stopped/declined. | “Feeds Enfamil 60 mL q3h”, “Continue formula ad lib”, “Previously breastfed, now formula only”, “Mother stopped breastfeeding 2 weeks ago” | Any direct documentation of current breast-milk intake |
| **Absent / Insufficient Evidence** | No clear evidence of current feeding with breast milk **or** formula; context is hypothetical, educational, uncertain, intentional, future plans, prescription, or ambiguous. Includes negated/historical/discontinued exclusive formula and unclear bottle-feeding statements. | “Last feed 2 h ago via bottle” (no milk type specified), “Breastfeeding is highly recommended”, “Family plans to breastfeed after discharge”, “Discussed exclusive formula feeding vs breastfeeding options” | Explicit statements of current exclusive formula feeding (*Negative*) or documented breast-milk intake (*Positive*) 

### Context Rules

1. **Actual behavior vs. guidance**  
   Recommendations, anticipatory guidance, or best-practice statements **do not count** as evidence of what was actually fed.  

2. **Nutrition-specific sections** often contain the relevant evidence. Pay extra attention to headings such as **“Review of Nutrition”, “Current Feeding”, “Feeding Difficulties” (but do not ignore other sections).**

3. **Pumping**
    - If pumping is mentioned, look for evidence in addition to pumping that suggests the intent is to feed the patient or to build or maintain a milk supply. If there is additional information that suggests the intent of pumping is to feed the baby or build a milk supply (i.e., *not* dump it), classify the note as **positive**.
    - If pumping is mentioned in conjunction with a lactation consultation, classify the note as **positive**.

4. **Context matters**  
   - Historical: 
      - If *breast milk* is mentioned ONLY as part of **past feeding behavior**, choose **Negative**
      - If *formula* is mentioned ONLY as part of **past feeding behavior**, (e.g., 'sent home on') choose **Absent / Insufficient Evidence**
   - Hypothetical: 
      - If *breast milk* is mentioned ONLY as a **recommendation, plan, or intended future behavior**, choose **Absent / Insufficient Evidence**.  
      - If *formula* is mentioned ONLY as a **recommendation, plan, or intended future behavior**, choose **Absent / Insufficient Evidence**.  

### Steps

1. **Read the clinical note** thoroughly.  
2. **Identify evidence** of current or past feeding behavior.  
3. **Apply the context rules** to exclude purely hypothetical or advisory text.  
4. **Assign the appropriate label**.  
5. **Output** a JSON object.

### Output Format

```jsonc
{
  "Label": "<Positive | Negative | Absent / Insufficient Evidence>",
  "Reasoning": "<one concise sentence or short clause>"
}
```

### Examples

| **Input** | **Expected JSON Output** | **Explanation** |
|-----------|--------------------------|------------------|
| *“The infant was fed breast milk exclusively during the stay.”* | `{"Label":"Positive","Reasoning":"Exclusive breast-milk feeding documented."}` | Direct evidence of breast milk as actual feeding behavior. |
| *“The infant was given formula in addition to breastfeeding.”* | `{"Label":"Positive","Reasoning":"Both formula and breast milk documented; breast milk present."}` | Breast milk is part of current feeding behavior. |
| *“Last feed 2 h ago via bottle.”* | `{"Label":"Absent / Insufficient Evidence","Reasoning":"Ambiguous bottle feeding."}` | Feeding is documented but the content of the bottle is ambiguous. |
| *“Nutrition: breastfeeding is highly recommended.”* | `{"Label":"Absent / Insufficient Evidence","Reasoning":"Statement is guidance, not evidence of actual feeding."}` | This is a best-practice recommendation, not documentation of feeding. |
| *“Review of nutrition: breast and bottle.”* | `{"Label":"Positive","Reasoning":"Explicit mention of breast feeding behavior."}` | "Breast" indicates breastfeeding was part of actual feeding behavior, regardless of the ambiguity in “bottle.” |
| *"The mom pumped breast milk this morning."* | `{"Label":"Absent / Insufficient Evidence","Reasoning":"The note does not mention the infant receiving breast milk."}` | "While the mom pumped this morning, it is unclear whether the milk was given to the baby or dumped." |
| *"Primary feeding method: PO bottle, Formula"* | `{"Label":"Negative","Reasoning":"Explicit mention of formula feeding."}` | "Direct evidence of formula feeding." |
| *"Bottlefed and eating solids"* | `{"Label":"Absent / Insufficient Evidence","Reasoning":"Ambiguous bottle feeding."}` | "Feeding is documented but the content of the bottle is ambiguous." |
| *"Neosure 3 oz per feeding, 7-8 times per day"* | `{"Label":"Negative","Reasoning":"Explicit mention of formula feeding."}` | "Direct evidence of formula feeding (Neosure)." |
| *"The patient was sent home on Neosure."* | `{"Label":"Absent / Insufficient Evidence","Reasoning":"Historical mention of formula feeding does not provide positive or negative information regarding current feeding behavior."}` | "Historical evidence of formula feeding is criteria for Absent / Insufficient Evidence." |
| *"The patient was discharged on formula."* | `{"Label":"Absent / Insufficient Evidence","Reasoning":"Historical mention of formula feeding does not provide positive or negative information regarding current feeding behavior."}` | "Historical evidence of formula feeding is criteria for Absent / Insufficient Evidence." |
| *"Discontinued similac advance"* | `{"Label":"Absent / Insufficient Evidence","Reasoning":"Mention of discontinued formula feeding indicates no positive or negative information regarding current feeding behavior."}` | "Evidence of discontinuing/stopping formula is criteria for Absent / Insufficient Evidence." |
| *"Plan: Script for Isomil"* | `{"Label":"Absent / Insufficient Evidence","Reasoning":"Prescribed mention of formula feeding."}` | "A script written for formula is a prescription and represents an intention of future plans, not current behavior." |

### Edge cases

1. Human milk fortifier (or fortified human milk) is considered evidence of breast milk and should be labeled **Positive**.
2. Breast milk and formula can both be administered via feeding tubes; do **not** assume NPO (nothing by mouth) is **Absent / Insufficient Evidence**.
3. Both donor milk and mother's own milk should be labeled **Positive**.

> **Note**: Only output the JSON object, and nothing else.
""".strip()



# Load data

In [None]:
# Read the notes workbook and expose the row position as a regular `row_ix`
# column; that column is later used as the record id of each request.
man_df = pd.read_excel(
    "/Volumes/RISIDataServices_MPrint_NCH/MPRINT_LACTATE_BF_1_20250424.xlsx"
).reset_index(names=["row_ix"])
man_df

In [None]:
# Write the batch input as JSON Lines: one request object per note, one per line.
with open("output_all_notes.jsonl", "w", encoding="utf-8") as f:
    for row in man_df.itertuples():
        output = generate_json(
            record_id=row.row_ix,
            system_prompt=system_prompt,
            note_text=row.NOTE_TEXT,
        )
        # Serialize first, then emit the record and its line terminator together.
        f.write(json.dumps(output) + "\n")

# STOP HERE -- Next step is in AWS

- Next, go to AWS and use the batch inference feature in Amazon Bedrock to tag the notes in parallel.  
- It will finish in a few minutes -- save the resulting `.out` file.  
- Run the following code cell after changing the path so that it points to that `.out` file.  

In [None]:
# Load the batch output: one JSON document per line.
# The encoding is given explicitly (the input file was written as UTF-8) so the
# read does not depend on the platform's default encoding, and blank lines are
# dropped because they are not valid JSON records.
with open("/Users/cxg042/Documents/git/ods-preglac/dev_cg/output_all_notes.jsonl.out", 'r', encoding="utf-8") as f: # <- change this path to the path of the output file from AWS
    results = [line for line in f if line.strip()]

def organize_batch_results(batch_results):
    """Parse every batch-output line and stack the per-record frames.

    Args:
        batch_results: Iterable of raw text lines from the batch output
            file. Blank / whitespace-only lines are ignored; every other
            line is handed to ``parse_batch_results``.

    Returns:
        A DataFrame holding the concatenated frames returned by
        ``parse_batch_results``, or an empty DataFrame when there is
        nothing to parse.
    """
    # tqdm is only a progress bar and is not imported at the top of the file,
    # so treat it as optional instead of failing with a NameError.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    lines = [line for line in batch_results if line.strip()]
    if not lines:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()

    return pd.concat([parse_batch_results(line) for line in progress(lines)])

def _load_model_json(text: str) -> Dict[str, Any]:
    """Decode the model's reply, tolerating text around the JSON object."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # The reply may be wrapped in a code fence or a short preamble;
        # fall back to the outermost {...} span before giving up.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(text[start:end + 1])


def parse_batch_results(
    single_result: str,
    input_price_per_million: float = 3,
    output_price_per_million: float = 15,
) -> pd.DataFrame:
    """Turn one line of batch output into a one-row DataFrame.

    Args:
        single_result: One JSON-encoded output record. It must contain
            ``recordId`` and a ``modelOutput`` with ``usage`` token counts
            and a first ``content`` item whose ``text`` is the model's JSON
            answer (``Label`` and ``Reasoning`` keys).
        input_price_per_million: Price applied per million input tokens.
        output_price_per_million: Price applied per million output tokens.

    Returns:
        A single-row DataFrame with columns ``recordID``, ``Label``,
        ``Reasoning``, ``input_cost`` and ``output_cost``.

    Raises:
        json.JSONDecodeError: If the record or the model's answer is not
            valid JSON.
        KeyError: If an expected field is missing from the record.
    """
    res = json.loads(single_result)
    model_output = res['modelOutput']
    usage = model_output['usage']

    # Decode the model's answer once and read both fields from it.
    answer = _load_model_json(model_output['content'][0]['text'])

    return pd.DataFrame([
        {
            "recordID": res['recordId'],
            "Label": answer['Label'],
            "Reasoning": answer['Reasoning'],
            "input_cost": (usage['input_tokens'] / 1_000_000) * input_price_per_million,
            "output_cost": (usage['output_tokens'] / 1_000_000) * output_price_per_million,
        }
    ])

def standardize_label(label):
    """Map a model label to its short lowercase form.

    "Absent / Insufficient Evidence" becomes "absent"; any other label is
    simply lowercased.
    """
    is_absent = label == "Absent / Insufficient Evidence"
    return "absent" if is_absent else label.lower()

# Parse every line of the batch output into a single results table.
res = organize_batch_results(results)

# Cast the record ids to int and shift them down by one in a single assignment.
# NOTE(review): confirm that the -1 shift matches the ids that were actually
# submitted in the batch input file.
res["recordID"] = res["recordID"].astype(int) - 1

# display the results
res