# Setup and Imports

In [60]:
import re
import fitz  # PyMuPDF
import base64
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a local .env file into the process environment before
# the API client is created (presumably this is where the OpenAI credentials
# come from -- confirm against the project's .env).
load_dotenv()
client = OpenAI()
# Read the Markdown file that holds the numbered prompts. The encoding is
# given explicitly so the result does not depend on the platform's default
# locale encoding.
with open('../docs/prompts.md', 'r', encoding='utf-8') as prompts_file:
    prompts_content = prompts_file.read()
# PDF used by every experiment cell below.
pdf_path = "test_paper/black_spatula.pdf"

## Extraction functions

In [61]:
def extract_images_from_pdf(pdf_path:str) -> list:
    """
    Renders every page of a PDF to a PNG image and packs the images, as
    base64-encoded data URLs, into a single chat message.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: A one-element list holding a "user" message dictionary whose
            "content" list has one "image_url" entry per page, in page order.
    """
    image_parts = []

    # The context manager closes the document even if rendering a page fails;
    # without it the file handle stayed open for the life of the process.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            png_bytes = page.get_pixmap().tobytes("png")
            encoded_png = base64.b64encode(png_bytes).decode('utf-8')
            image_parts.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{encoded_png}"}
            })

    return [{"role": "user", "content": image_parts}]

def extract_text_from_pdf(pdf_path:str) -> list:
    """
    Extracts all text content from a PDF file and wraps it in a chat message.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: A one-element list holding a "user" message dictionary whose
            "content" list contains a single "text" entry with the text of
            all pages joined by newlines.
    """
    # The context manager closes the document once the text has been read;
    # previously the handle was never released.
    with fitz.open(pdf_path) as pdf_document:
        page_texts = [page.get_text() for page in pdf_document]

    full_text = "\n".join(page_texts)
    return [{"role": "user", "content": [{"type": "text", "text": full_text}]}]

def extract_prompt_content(markdown_text:str, prompt_number:int) -> str:
    """
    Looks up the fenced code block that follows a "### Prompt N" heading in
    Markdown text and returns its contents.

    Args:
        markdown_text (str): Markdown text containing the numbered prompts.
        prompt_number (int): Number of the prompt heading to look up.

    Returns:
        str: The code block's contents with surrounding whitespace stripped,
            or a "not found" message when no matching heading followed by a
            code block exists.
    """
    # Heading, optional whitespace, opening fence (with optional info string
    # up to the end of that line), then lazily everything up to the next fence.
    heading_and_block = re.compile(
        rf"### Prompt {prompt_number}\s*```.*?\n(.*?)```", re.DOTALL
    )
    found = heading_and_block.search(markdown_text)
    if found is None:
        return f"Prompt {prompt_number} not found or has no associated code block."
    block_body = found.group(1)
    return block_body.strip()
    
def extract_page_content(pdf_path:str, page_number:int) -> dict:
    """
    Extracts raw text and a base64-encoded PNG rendering from one page of a PDF.

    Args:
        pdf_path (str): Path to the PDF file.
        page_number (int): The page number to extract content from (0-based index).

    Returns:
        dict: A dictionary with keys "page_number", "raw_text" and "image";
            "image" is an "image_url" message part carrying the page as a
            PNG data URL.

    Raises:
        ValueError: If page_number is negative or not less than the number
            of pages in the document.
    """
    # The context manager closes the document on every exit path, including
    # the out-of-range error below; previously the handle was never closed.
    with fitz.open(pdf_path) as pdf_document:
        if page_number < 0 or page_number >= len(pdf_document):
            raise ValueError("Page number out of range.")
        page = pdf_document.load_page(page_number)
        raw_text = page.get_text()
        image_bytes = page.get_pixmap().tobytes("png")

    image_base64 = base64.b64encode(image_bytes).decode('utf-8')
    return {
        "page_number": page_number,
        "raw_text": raw_text,
        "image": {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{image_base64}"}
        }
    }

def make_extract_page_info_message(pdf_path:str, page_number:int, prompts_path:str = '../docs/prompts.md') -> list:
    """
    Builds the chat message for prompt 2 for a single PDF page: the prompt
    text with the page's raw text substituted in, followed by the page image.

    Args:
        pdf_path (str): Path to the PDF file.
        page_number (int): The page number to extract content from (0-based index).
        prompts_path (str): Path to the Markdown file containing the prompts.
            Defaults to the location used throughout this notebook.

    Returns:
        list: A one-element list holding a "user" message dictionary whose
            "content" list contains the filled-in prompt text followed by the
            page's "image_url" entry.
    """
    result = extract_page_content(pdf_path, page_number)
    # Explicit encoding so reading the prompts does not depend on the
    # platform's default locale encoding.
    with open(prompts_path, 'r', encoding='utf-8') as prompts_file:
        prompts_content = prompts_file.read()
    # str.format treats every {...} in the prompt as a placeholder, so any
    # literal braces in prompt 2 must be doubled in the Markdown source.
    # NOTE(review): if prompt 2 is missing, extract_prompt_content returns a
    # "not found" message, which is then sent to the model as the prompt.
    prompt2_content = extract_prompt_content(prompts_content, 2).format(raw_text=result["raw_text"])
    return [{
        "role": "user",
        "content": [{"type": "text", "text": prompt2_content}, result["image"]]
    }]

# Tests with prompt 1 (page images or raw text)

In [62]:
# Prompt 1 is the instruction text appended to both input variants.
prompt1_content = extract_prompt_content(prompts_content, 1)

# Build the two alternative inputs for the same PDF: page images and raw text.
image_messages = extract_images_from_pdf(pdf_path)
text_messages = extract_text_from_pdf(pdf_path)

# Append the instruction after the document content; each message gets its
# own dictionary so the two message lists share no mutable state.
for _messages in (image_messages, text_messages):
    _messages[0]["content"].append({"type": "text", "text": prompt1_content})

In [63]:
# Send the raw-text variant of the prompt 1 request and print the reply text
# of the first returned choice.
completion = client.chat.completions.create(model="o1-preview", messages=text_messages)

print(completion.choices[0].message.content)

**Review and Verification of Numerical Calculations in the Article**

After a careful review of the research article, I focused on verifying all arithmetic calculations, unit conversions, numerical comparisons, and quantitative interpretations presented. Below, I detail my findings, including any errors identified and the steps taken to verify the calculations.

---

### **1. Calculation of the U.S. EPA Reference Dose (RfD) for BDE-209 and Its Application**

**Location in Article:**
- **Section:** Health and exposure concerns (Subsection of Results and Discussion)
- **Paragraph:** "Estimation of exposure to BDE-209 from contaminated kitchen utensils indicated users would have a median intake of 34,700 ng/day, exceeding estimates for intake from dust and diet. The detection of FRs in collected household products indicates that recycling, without the necessary transparency and restrictions to ensure safety, is resulting in unexpected exposure to toxic flame retardants in household items.

# Test with prompt 2

In [64]:
# Build the prompt 2 request for page index 7 (0-based).
extract_page_info_messages = make_extract_page_info_message(
    pdf_path=pdf_path,
    page_number=7,
)

In [65]:
# Send the single-page text + image request and print the reply text of the
# first returned choice.
completion = client.chat.completions.create(model="gpt-4o", messages=extract_page_info_messages)

print(completion.choices[0].message.content)

The image displays a page from a document titled "Chemosphere 365 (2024) 143319." It includes text discussing the regulation and concerns about the use of organohalogen flame retardants (FRs) in electronic devices in the EU, New York, and Washington. The text highlights the high levels of FRs found in household products, emphasizing the need for regulatory systems to ensure safer materials. The conclusion section states the importance of eliminating toxic additives in recycled plastic products and mentions the role of authors in producing the document. Acknowledgments and references follow, along with a note on data availability and a link to supplementary data.
