In [59]:
# !pip install openpyxl

In [11]:
import os
from pathlib import Path
from openai import OpenAI

# Load API_KEY from environment or fallback to .env
env_path = Path('.env')
if env_path.exists():
    for line in env_path.read_text().splitlines():
        if line.strip().startswith('#') or '=' not in line:
            continue
        key, value = line.split('=', 1)
        if key.strip() == 'API_KEY' and value.strip():
            os.environ.setdefault('API_KEY', value.strip())
            break

API_KEY = os.getenv('API_KEY')
if not API_KEY:
    raise RuntimeError('Set API_KEY in your environment or .env')

# create an OpenAI client using the API key
client = OpenAI(api_key=API_KEY)


In [None]:
# import pymupdf4llm
# md_text = pymupdf4llm.to_markdown("6832_SercePehlevan_2020.pdf")

# import pathlib
# pathlib.Path("output.md").write_bytes(md_text.encode())

In [19]:
file = client.files.create(
    file=open("studies/6832_SercePehlevan_2020.pdf", "rb"),
    purpose="user_data",
)

In [None]:
from pydantic import BaseModel

class Assessment(BaseModel):
    answer: str
    justification: str
    citations: list[str]


from typing_extensions import TypedDict, Literal
from typing import List

class RobAnswer(TypedDict):
    answer: Literal["Y", "PY", "NI", "PN", "N"]
    justification: str
    citations: List[str]

def generate_response_with_chatgpt(prompt, file_id):
    response = client.responses.create(
        # model="gpt-4.1",
        model="gpt-4o-2024-08-06",
        input=[
            {
                "role": "user",
                "content": [
                    { "type": "input_text", "text": prompt },
                    {
                        "type": "input_file",
                        "file_id": file_id,
                    }
                ]
            }
        ],
        text={
            "format": {
                "type": "json_schema",
                "name": "response_details",
                "schema": {
                    "type": "object",
                    "properties": {
                        "answer": {"type": "string"},
                        "justification": {"type": "string"},
                        "citations": {"type": "array", "items": {"type": "string"}},
                    },
                    "required": ["answer", "justification", "citations"],
                    "additionalProperties": False,
                },
                "strict": True,
            }
        },
    )

    return response.output_text

In [45]:
with open("prompts/domain_1_randomization/question_1.txt", "r", encoding="utf-8") as f:
    content = f.read()

answer = generate_response_with_chatgpt(content, file.id)

'{ "answer": "Y", "justification": "The study used a method involving random numbers generated by a computer to determine the allocation sequence, which is considered a truly random method. The assignments were placed in consecutively numbered sealed envelopes, indicating careful implementation of this randomization process.", "citations": ["\\"Se asignó aleatoriamente a los recién nacidos al grupo del estudio o al grupo de referencia en las primeras horas de vida con base en números consecutivos generados en el centro informático.\\""] }'

In [10]:
import re
from collections import defaultdict
from pathlib import Path

# Map domain (variants via folder name) -> question code -> prompt question file path
prompt_question_files = defaultdict(dict)
pattern = re.compile(r"^domain_?(\d+)(?:_(.+))?$")
for prompt_path in Path('prompts').rglob('question_*.txt'):
    match = pattern.match(prompt_path.parent.name)
    if not match:
        continue
    domain_id, variant = match.groups()

    stem_parts = prompt_path.stem.split('_')
    qnum = stem_parts[1] if len(stem_parts) > 1 else 'unknown'
    question_code = f"{domain_id}.{qnum}"

    domain_key = f"domain_{domain_id}" if not variant else f"domain_{domain_id}_{variant}"
    prompt_question_files[domain_key][question_code] = str(prompt_path)

# Sort questions for stable output
prompt_question_files = {
    domain: {code: path for code, path in sorted(questions.items())}
    for domain, questions in sorted(prompt_question_files.items())
}

print('Prompt question files by domain (variants split via folder name):')
for domain, questions in prompt_question_files.items():
    print(domain)
    for code, f in questions.items():
        print(f" - {code}: {f}")


Prompt question files by domain (variants split via folder name):
domain_1_randomization
 - 1.1: prompts/domain_1_randomization/question_1.txt
 - 1.2: prompts/domain_1_randomization/question_2.txt
 - 1.3: prompts/domain_1_randomization/question_3.txt
domain_2_adhering
 - 2.1: prompts/domain_2_adhering/question_1.txt
 - 2.2: prompts/domain_2_adhering/question_2.txt
 - 2.3: prompts/domain_2_adhering/question_3.txt
 - 2.4: prompts/domain_2_adhering/question_4.txt
 - 2.5: prompts/domain_2_adhering/question_5.txt
 - 2.6: prompts/domain_2_adhering/question_6.txt
domain_2_assigment
 - 2.1: prompts/domain_2_assigment/question_1.txt
 - 2.2: prompts/domain_2_assigment/question_2.txt
 - 2.3: prompts/domain_2_assigment/question_3.txt
 - 2.4: prompts/domain_2_assigment/question_4.txt
 - 2.5: prompts/domain_2_assigment/question_5.txt
 - 2.6: prompts/domain_2_assigment/question_6.txt
 - 2.7: prompts/domain_2_assigment/question_7.txt
domain_3_missing_data
 - 3.1: prompts/domain_3_missing_data/question

In [61]:
# Read PDF text, load a domain module, walk signalling questions, and store responses to Excel
import importlib
import json
from pathlib import Path
import pandas as pd
from rob2 import questions as question_bank

pdf_path = Path('studies/6832_SercePehlevan_2020.pdf')

file = client.files.create(
    file=open(pdf_path, "rb"),
    purpose="user_data",
)

# Map domain key -> question dictionary from questions.py
QUESTION_BANKS = {
    "domain_1_randomization": question_bank.DOMAIN1_RANDOMIZATION_QUESTIONS,
    "domain_2_assigment": question_bank.DOMAIN2_ASSIGMENT_QUESTIONS,
    "domain_2_adhering": question_bank.DOMAIN2_ADHERENCE_QUESTIONS,
    "domain_3_missing_data": question_bank.DOMAIN3_MISSING_DATA_QUESTIONS,
    "domain_4_measurement": question_bank.DOMAIN4_MEASUREMENT_QUESTIONS,
    "domain_5_reporting": question_bank.DOMAIN5_REPORTING_QUESTIONS,
}

rows = []

for domain_key in prompt_question_files:
    state = {}
    domain_prompts = prompt_question_files[domain_key]
    domain_module = importlib.import_module(f'rob2.{domain_key}')
    get_next_question = next(
        getattr(domain_module, name)
        for name in dir(domain_module)
        if name.startswith('get_next_question_domain')
    )

    question_code = get_next_question(state)
    while question_code:
        prompt_path = Path(domain_prompts.get(question_code, ''))
        prompt_text = prompt_path.read_text(encoding='utf-8') if prompt_path.exists() else ''
        question_text = QUESTION_BANKS.get(domain_key, {}).get(question_code, '')

        print(f"{domain_key} -> {question_code}")
        if not prompt_text:
            print('Prompt not found for this question code.')

        response_raw = generate_response_with_chatgpt(prompt_text, file.id)
        response = json.loads(response_raw)

        answer = response.get("answer", "")
        justification = response.get("justification", "")
        citations = response.get("citations", [])

        print(answer)
        print(justification)
        print(citations)

        state[question_code] = answer  # store string
        rows.append({
            "file_name": pdf_path.name,
            "domain": domain_key,
            "question_code": question_code,
            "question_text": question_text,
            "prompt_path": str(prompt_path),
            "answer": answer,
            "justification": justification,
            "citations": "; ".join(citations) if isinstance(citations, list) else str(citations),
        })

        question_code = get_next_question(state)
        break
    break

output_dir = Path('outputs')
output_dir.mkdir(exist_ok=True)
output_file = output_dir / 'responses.xlsx'
pd.DataFrame(rows).to_excel(output_file, index=False)
print(f"Saved {len(rows)} rows to {output_file}")


domain_1_randomization -> 1.1
Y
The study used a truly random method to generate the allocation sequence, as described in the document. The allocation was based on computer-generated random numbers, ensuring the randomness of the process.
['"Se asignó aleatoriamente a los recién nacidos al grupo del estudio o al grupo de referencia en las primeras horas de vida con base en números consecutivos generados en el centro informático..."']
Saved 1 rows to outputs/responses.xlsx
