In [59]:
# !pip install openpyxl

In [None]:
import os
from pathlib import Path
from openai import OpenAI

# Load OPENAI_API_KEY from environment or fallback to .env
def read_dotenv_value(dotenv_path, name):
    """Return the first non-empty value assigned to ``name`` in a .env-style file.

    Lines that start with ``#`` or contain no ``=`` are skipped. An optional
    leading ``export `` on the key and one pair of matching surrounding quotes
    on the value are removed, so ``export NAME="value"`` yields ``value``.

    Args:
        dotenv_path: ``pathlib.Path`` of the file to read.
        name: Variable name to look up.

    Returns:
        The value as a string, or ``None`` when no non-empty assignment exists.
    """
    for raw_line in dotenv_path.read_text().splitlines():
        line = raw_line.strip()
        if line.startswith('#') or '=' not in line:
            continue
        # Split on the first '=' only, so values may themselves contain '='.
        key, value = line.split('=', 1)
        key = key.strip()
        if key.startswith('export '):
            key = key[len('export '):].strip()
        value = value.strip()
        # A quoted value would otherwise keep its quotes and be an invalid key.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            value = value[1:-1].strip()
        if key == name and value:
            return value
    return None


env_path = Path('.env')
if env_path.is_file():
    dotenv_api_key = read_dotenv_value(env_path, 'OPENAI_API_KEY')
    if dotenv_api_key:
        # setdefault: a key already present in the environment takes precedence.
        os.environ.setdefault('OPENAI_API_KEY', dotenv_api_key)

# Stop early when neither the environment nor .env supplied a key.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if OPENAI_API_KEY is None or OPENAI_API_KEY == '':
    raise RuntimeError('Set OPENAI_API_KEY in your environment or .env')

# OpenAI client used by the cells below, authenticated with the key above.
client = OpenAI(api_key=OPENAI_API_KEY)


In [None]:
# import pymupdf4llm
# md_text = pymupdf4llm.to_markdown("6832_SercePehlevan_2020.pdf")

# import pathlib
# pathlib.Path("output.md").write_bytes(md_text.encode())

In [19]:
# Upload one study PDF. The context manager closes the local file handle once
# the upload call has returned, instead of leaving it open for the kernel's
# lifetime.
with open("studies/6832_SercePehlevan_2020.pdf", "rb") as pdf_handle:
    file = client.files.create(
        file=pdf_handle,
        purpose="user_data",
    )

In [66]:
from pydantic import BaseModel

class Assessment(BaseModel):
    """Pydantic model holding one answer together with its supporting evidence.

    The field names match the properties of the JSON schema requested in
    ``generate_response_with_chatgpt``.

    NOTE(review): nothing in the visible notebook instantiates this model;
    confirm whether it is still needed.
    """
    # The answer itself; any string is accepted here.
    answer: str
    # Free-text explanation supporting the answer.
    justification: str
    # Supporting quotations; presumably excerpts from the study PDF (confirm).
    citations: list[str]


from typing_extensions import TypedDict, Literal
from typing import List

class RobAnswer(TypedDict):
    """Typed-dict shape with the same three keys as ``Assessment``.

    Unlike ``Assessment``, ``answer`` is restricted to five literal codes.

    NOTE(review): the codes presumably mean Yes / Probably yes / No information /
    Probably no / No (RoB 2 signalling-question responses); confirm. Nothing in
    the visible notebook references this type.
    """
    # One of the five allowed response codes.
    answer: Literal["Y", "PY", "NI", "PN", "N"]
    # Free-text explanation supporting the answer.
    justification: str
    # Supporting quotations; presumably excerpts from the study PDF (confirm).
    citations: List[str]

def generate_response_with_chatgpt(prompt, file_id, model="gpt-4.1", api_client=None):
    """Ask the model one question about an uploaded file and return its JSON answer.

    The request sends ``prompt`` and the uploaded file as a single user message
    and asks for a strict JSON-schema response with the keys ``answer``,
    ``justification`` and ``citations``.

    Args:
        prompt: Prompt text sent as the ``input_text`` part of the message.
        file_id: ID of a previously uploaded file, sent as the ``input_file`` part.
        model: Model name passed to the API. Defaults to ``"gpt-4.1"``, the
            value that was previously hard-coded (e.g. ``"gpt-4o-2024-08-06"``
            can be passed instead).
        api_client: Object exposing ``responses.create(...)``. Defaults to the
            notebook-level ``client``; pass a stub to exercise the function
            without network access.

    Returns:
        The response's ``output_text``: a JSON document as a string. Callers
        decode it themselves (for example with ``json.loads``).
    """
    if api_client is None:
        # Resolved at call time so the notebook-level client is only needed
        # when no explicit client is supplied.
        api_client = client

    response_schema = {
        "type": "object",
        "properties": {
            "answer": {"type": "string"},
            "justification": {"type": "string"},
            "citations": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["answer", "justification", "citations"],
        "additionalProperties": False,
    }

    response = api_client.responses.create(
        model=model,
        input=[
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": prompt},
                    {"type": "input_file", "file_id": file_id},
                ],
            }
        ],
        text={
            "format": {
                "type": "json_schema",
                "name": "response_details",
                "schema": response_schema,
                "strict": True,
            }
        },
    )

    return response.output_text

In [67]:
# with open("prompts/domain_1_randomization/question_1.txt", "r", encoding="utf-8") as f:
#     content = f.read()

# answer = generate_response_with_chatgpt(content, file.id)

In [10]:
import re
from collections import defaultdict
from pathlib import Path

# Map domain (variants via folder name) -> question code -> prompt question file path
pattern = re.compile(r"^domain_?(\d+)(?:_(.+))?$")


def question_code_sort_key(code):
    """Sort key that orders question codes numerically per dotted part.

    Plain string sorting places "2.10" before "2.2". Each part is tagged so
    that numeric parts compare as integers and come before non-numeric parts
    (such as "unknown"), without ever comparing an int with a str.
    """
    return tuple(
        (0, int(part)) if part.isdecimal() else (1, part)
        for part in code.split('.')
    )


def discover_prompt_question_files(root='prompts'):
    """Collect ``question_*.txt`` prompt files below ``root``, grouped by domain.

    Only files whose parent folder name matches ``pattern`` (for example
    ``domain_1_randomization``) are kept. The question code is
    ``"<domain number>.<second underscore-separated part of the file stem>"``.
    If two files in one folder produce the same code, the one visited last wins.

    Args:
        root: Directory to search recursively (str or ``Path``).

    Returns:
        A plain dict ``{domain_key: {question_code: path_str}}`` with domains
        in sorted order and question codes in numeric order.
    """
    found = defaultdict(dict)
    for prompt_path in Path(root).rglob('question_*.txt'):
        match = pattern.match(prompt_path.parent.name)
        if not match:
            continue
        domain_id, variant = match.groups()

        stem_parts = prompt_path.stem.split('_')
        qnum = stem_parts[1] if len(stem_parts) > 1 else 'unknown'
        question_code = f"{domain_id}.{qnum}"

        domain_key = f"domain_{domain_id}" if not variant else f"domain_{domain_id}_{variant}"
        found[domain_key][question_code] = str(prompt_path)

    # Sort questions for stable output
    return {
        domain: dict(sorted(questions.items(), key=lambda item: question_code_sort_key(item[0])))
        for domain, questions in sorted(found.items())
    }


prompt_question_files = discover_prompt_question_files('prompts')

print('Prompt question files by domain (variants split via folder name):')
for domain, questions in prompt_question_files.items():
    print(domain)
    for code, f in questions.items():
        print(f" - {code}: {f}")


Prompt question files by domain (variants split via folder name):
domain_1_randomization
 - 1.1: prompts/domain_1_randomization/question_1.txt
 - 1.2: prompts/domain_1_randomization/question_2.txt
 - 1.3: prompts/domain_1_randomization/question_3.txt
domain_2_adhering
 - 2.1: prompts/domain_2_adhering/question_1.txt
 - 2.2: prompts/domain_2_adhering/question_2.txt
 - 2.3: prompts/domain_2_adhering/question_3.txt
 - 2.4: prompts/domain_2_adhering/question_4.txt
 - 2.5: prompts/domain_2_adhering/question_5.txt
 - 2.6: prompts/domain_2_adhering/question_6.txt
domain_2_assigment
 - 2.1: prompts/domain_2_assigment/question_1.txt
 - 2.2: prompts/domain_2_assigment/question_2.txt
 - 2.3: prompts/domain_2_assigment/question_3.txt
 - 2.4: prompts/domain_2_assigment/question_4.txt
 - 2.5: prompts/domain_2_assigment/question_5.txt
 - 2.6: prompts/domain_2_assigment/question_6.txt
 - 2.7: prompts/domain_2_assigment/question_7.txt
domain_3_missing_data
 - 3.1: prompts/domain_3_missing_data/question

In [None]:
# Upload each PDF in studies/, walk every domain's signalling questions via its spec, and save the responses to one Excel file per PDF
import json
from pathlib import Path
import pandas as pd
import time

from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
from rob2.common import Response
from rob2.domains import get_domain_specs

def clean_excel(val):
    """Remove characters matched by openpyxl's ILLEGAL_CHARACTERS_RE from ``val``.

    Strings are returned with every match deleted. Lists and dicts are rebuilt
    with their elements / values cleaned recursively (dict keys are kept as
    they are). Any other value is returned unchanged.
    """
    if isinstance(val, dict):
        return {key: clean_excel(item) for key, item in val.items()}
    if isinstance(val, list):
        return list(map(clean_excel, val))
    if isinstance(val, str):
        return ILLEGAL_CHARACTERS_RE.sub("", val)
    return val

# Load domain specs once
DOMAIN_SPECS = get_domain_specs()

# Pause after every model request, in seconds (presumably to stay under API
# rate limits; confirm before lowering).
REQUEST_DELAY_SECONDS = 15

pdf_paths = sorted(Path('studies').glob('*.pdf'))

if not pdf_paths:
    print('No PDF files found in studies/.')
else:
    # The output folder does not depend on the PDF, so create it once up front.
    output_dir = Path('outputs')
    output_dir.mkdir(exist_ok=True)

    for pdf_path in pdf_paths:
        print(f"Processing {pdf_path.name}")
        # Upload inside a context manager so the local PDF handle is closed as
        # soon as the upload call returns (previously one handle leaked per PDF).
        with open(pdf_path, "rb") as pdf_handle:
            file = client.files.create(
                file=pdf_handle,
                purpose="user_data",
            )

        rows = []

        for domain_key, domain_prompts in prompt_question_files.items():
            # Answers collected so far for this domain; the spec uses them to
            # decide which question comes next.
            state = {}
            spec = DOMAIN_SPECS.get(domain_key)
            if spec is None:
                print(f"Skipping {domain_key}: no spec registered.")
                continue

            get_next_question = spec.get_next_question
            questions = spec.questions

            question_code = get_next_question(state)
            while question_code:
                print(f"{domain_key} -> {question_code}")

                prompt_ref = domain_prompts.get(question_code, '')
                prompt_path = Path(prompt_ref)
                # Path('') is the current directory, which "exists"; requiring a
                # real file keeps a missing prompt from raising inside read_text().
                prompt_text = (
                    prompt_path.read_text(encoding='utf-8')
                    if prompt_ref and prompt_path.is_file()
                    else ''
                )
                if not prompt_text:
                    print('Prompt not found for this question code.')
                    # Without a prompt there is nothing meaningful to ask, and an
                    # answer is required to advance the state, so stop this domain.
                    break

                question_text = questions.get(question_code, '')

                response_raw = generate_response_with_chatgpt(prompt_text, file.id)
                response = json.loads(response_raw)

                answer = response.get("answer", "")
                justification = response.get("justification", "")
                citations = response.get("citations", [])

                print(answer)
                print(justification)
                print(citations)

                state[question_code] = Response(answer)  # store string
                if isinstance(citations, list):
                    citations_text = "; ".join(str(citation) for citation in citations)
                else:
                    citations_text = str(citations)
                rows.append({
                    "file_name": pdf_path.name,
                    "domain": domain_key,
                    "question_code": question_code,
                    "question_text": question_text,
                    "prompt_path": str(prompt_path),
                    "answer": answer,
                    "justification": clean_excel(justification),
                    "citations": clean_excel(citations_text),
                })

                time.sleep(REQUEST_DELAY_SECONDS)

                question_code = get_next_question(state)

        output_file = output_dir / f"{pdf_path.stem}_responses.xlsx"
        pd.DataFrame(rows).to_excel(output_file, index=False)
        print(f"Saved {len(rows)} rows to {output_file}")


In [None]:
# Run a single domain for one PDF (requires setup cells above for client/prompts)
import json
import time
from pathlib import Path
import pandas as pd

from rob2.common import Response

# Set the PDF to evaluate and the target domain key
single_pdf = Path("studies/990_Chen_2021.pdf")
single_domain = "domain_4_measurement"

# Pause after every model request, in seconds (presumably to stay under API
# rate limits; confirm before lowering).
REQUEST_DELAY_SECONDS = 15

if "prompt_question_files" not in globals() or "DOMAIN_SPECS" not in globals():
    raise RuntimeError("Run the prompt/domain setup cells first (cells 6 and 7).")

if not single_pdf.exists():
    raise FileNotFoundError(f"PDF not found: {single_pdf}")

if single_domain not in prompt_question_files:
    raise ValueError(f"Unknown domain key: {single_domain}")

spec = DOMAIN_SPECS.get(single_domain)
if spec is None:
    raise ValueError(f"No spec registered for domain: {single_domain}")

domain_prompts = prompt_question_files[single_domain]
# Answers collected so far; the spec uses them to decide the next question.
state = {}
rows = []

# Upload inside a context manager so the local PDF handle is closed as soon as
# the upload call returns.
with open(single_pdf, "rb") as pdf_handle:
    file = client.files.create(
        file=pdf_handle,
        purpose="user_data",
    )

question_code = spec.get_next_question(state)
while question_code:
    print(f"{single_domain} -> {question_code}")

    prompt_ref = domain_prompts.get(question_code, "")
    prompt_path = Path(prompt_ref)
    # Path("") is the current directory, which "exists"; requiring a real file
    # keeps a missing prompt from raising inside read_text().
    prompt_text = (
        prompt_path.read_text(encoding="utf-8")
        if prompt_ref and prompt_path.is_file()
        else ""
    )
    if not prompt_text:
        print("Prompt not found for this question code.")
        # Without a prompt there is nothing meaningful to ask, and an answer is
        # required to advance the state, so stop the walk here.
        break

    question_text = spec.questions.get(question_code, "")

    response_raw = generate_response_with_chatgpt(prompt_text, file.id)
    response = json.loads(response_raw)

    answer = response.get("answer", "")
    justification = response.get("justification", "")
    citations = response.get("citations", [])

    print(answer)
    print(justification)
    print(citations)

    state[question_code] = Response(answer)
    if isinstance(citations, list):
        citations_text = "; ".join(str(citation) for citation in citations)
    else:
        citations_text = str(citations)
    rows.append({
        "file_name": single_pdf.name,
        "domain": single_domain,
        "question_code": question_code,
        "question_text": question_text,
        "prompt_path": str(prompt_path),
        "answer": answer,
        "justification": clean_excel(justification),
        "citations": clean_excel(citations_text),
    })

    time.sleep(REQUEST_DELAY_SECONDS)
    question_code = spec.get_next_question(state)

output_dir = Path("outputs")
output_dir.mkdir(exist_ok=True)
output_file = output_dir / f"{single_pdf.stem}_{single_domain}_responses.xlsx"
pd.DataFrame(rows).to_excel(output_file, index=False)
print(f"Saved {len(rows)} rows to {output_file}")


domain_4_measurement -> 4.1
N
The outcome measurement methods were appropriate for the outcomes being assessed. The primary outcome—infant crying/fussing time—was measured using the validated Barr diary, a recognized tool for this purpose, with caregiver instructions standardized by nurses. Stool consistency and frequency were also measured using established scales, and parental quality of life was assessed with a validated questionnaire (PedsQLTM). Standardized, validated, and accepted approaches were applied consistently across groups at pre-specified timepoints.
['"The validated Barr diary (17) was used to record the infant colicky full force crying/fussing time (mins/day), number of episodes of colicky full force crying/fussing/day, stool consistency, and stool frequency. Stool consistency on diapers was scored as 0 for watery stool, 1 for loose stool, 2 for formed stool, and 3 for hard stool as per Amsterdam consistency subscale (18). PedsQLTM, a 15-item validated questionnaire, w