In [24]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment
# (the recorded output of this cell is True).
# Presumably this supplies the OpenAI API key used by ChatOpenAI later — confirm.
load_dotenv()

True

In [25]:
import textwrap

# System-style instruction for converting a document to GitHub-flavored Markdown.
#
# Written as a raw string so the backslash escapes the model is asked to emit
# (`\<`, `\|`) reach the prompt literally; in a normal string `\|` is an
# invalid escape sequence and triggers a warning on newer Python versions.
# The `<` escape is spelled `\<` (a Markdown backslash escape); the earlier
# `/<` spelling does not stop a renderer from treating `<...>` as HTML.
# NOTE(review): table rule 5 asks for literal `<br>` inside cells, which
# conflicts with "escape every `<`" — confirm which should win.
prompt = textwrap.dedent(r"""You are a meticulous document converter. Convert the full input into **clean, valid Markdown** only (no commentary).  

### Global Rules
- Output = Markdown only (no surrounding code fences).  
- Replace all `<` as `\<` to avoid HTML rendering issues.  
- Preserve the document’s original wording; only fix spacing or OCR artifacts.  
- Use proper Markdown syntax:  
  - Headings → `#`, `##`, `###` according to hierarchy.  
  - Bullet points → `-`.  
  - Numbered lists → `1.`, `2.`, etc.  
  - Emphasis → `*italic*`, `**bold**`.  
  - Links → `[text](url)`.  
  - Images without URLs → `![alt](#)`.  
  - Footnotes → `[^1]`.  
- Remove repeated headers/footers and page numbers.  

### Tables (Normalize)
When tabular data is detected:  
1. Expand into rectangular grids with consistent column count.  
2. If first row looks like headers, treat as headers; else create headers (`Column 1`, `Column 2`, …) and keep row as data.  
3. Align numeric columns right (`--:`), text left (`:--`).  
4. Escape `|` inside cells as `\|`.  
5. Replace internal newlines in cells with `<br>`.  
6. Do not use rowspan/colspan — duplicate or leave blank.  

Example format:  
| Header 1   |Header 2 |Header3  |
|:-----------|:--------|:--------|
| text       |123      |center   |

### Output Requirements
- All sections must be converted, with proper Markdown headings and lists.  
- All tables must be normalized into GitHub-flavored Markdown tables.  
- Escape every `<` as `\<`.  
- Return the **entire document** as a single clean Markdown text.""")

In [32]:
from langchain_openai import ChatOpenAI

# Reasoning options passed to the model: minimal effort, no summary requested.
reasoning = dict(effort="minimal", summary=None)

# Chat client for gpt-5-nano, configured to use the Responses API.
# NOTE(review): the recorded output of this cell warns that `reasoning` was
# transferred to model_kwargs, and a later recorded `llm.invoke` call fails with
# a 400 "Unsupported parameter: 'temperature'" error — confirm the installed
# langchain-openai version handles both for this model.
llm = ChatOpenAI(
    model="gpt-5-nano",
    use_responses_api=True,
    reasoning=reasoning,
)

                reasoning was transferred to model_kwargs.
                Please confirm that reasoning is what you intended.
  if await self.run_code(code, result, async_=asy):


In [33]:
from base64 import b64encode

# PDF chunk to send to the model (path is relative to the working directory).
file_path = "./test_dir/chunk_0006_to_0010.pdf"

# Read the raw bytes first, then base64-encode them into a UTF-8 string
# once the file handle has been closed.
with open(file_path, "rb") as file:
    pdf_data = file.read()
pdf_base64 = str(b64encode(pdf_data), "utf-8")

# A developer-role instruction carrying `prompt` as a text part was drafted here
# but is disabled; only the user message below is built.

# User message with two content parts: the text request and the PDF itself,
# attached as a base64-encoded file part.
message = dict(
    role="user",
    content=[
        dict(
            type="text",
            text="Extract headers and footers from all 5 pages and give me what are headers and footers for each page in a list",
        ),
        dict(
            type="file",
            source_type="base64",
            data=pdf_base64,
            mime_type="application/pdf",
            filename="myfile",
        ),
    ],
)


In [34]:
# Send the single user message (text request plus attached PDF) to the model.
# NOTE(review): the recorded run of this cell failed with a 400
# "Unsupported parameter: 'temperature'" error, so outputs shown in later
# cells come from an earlier execution.
result = llm.invoke([message])

BadRequestError: Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature', 'code': None}}

In [31]:
# Display the raw response object (the recorded output is an AIMessage
# including content, reasoning info and response metadata).
result

AIMessage(content=[{'type': 'text', 'text': 'Here are the headers and footers for each page you provided (as they appear at the top and bottom of each page):\n\n- Page 7\n  - Header: Product Transparency and Disclosure\n  - Footer: 7 of 102\n\n- Page 8\n  - Header: Product Transparency and Disclosure\n  - Footer: 8 of 102\n\nNotes:\n- The content you shared includes two consecutive pages (7 and 8) with identical header text “Product Transparency and Disclosure” and page footers indicating the page number within the 102-page document.\n- If you have pages 5–6 and 9–10 as separate images or text, I can extract their headers/footers in the same format as well.', 'annotations': []}], additional_kwargs={'reasoning': {'id': 'rs_68af243c889c8191911f926cd0334a040c992a6f488a671b', 'summary': [], 'type': 'reasoning'}}, response_metadata={'id': 'resp_68af243b880081919830eeafda5782930c992a6f488a671b', 'created_at': 1756308539.0, 'metadata': {}, 'model': 'gpt-5-nano-2025-08-07', 'object': 'response

In [29]:
# Print only the text content of the response.
print(result.text())

Here are the headers and footers for each page you provided (as they appear at the top and bottom of each page):

- Page 7
  - Header: Product Transparency and Disclosure
  - Footer: 7 of 102

- Page 8
  - Header: Product Transparency and Disclosure
  - Footer: 8 of 102

Notes:
- The content you shared includes two consecutive pages (7 and 8) with identical header text “Product Transparency and Disclosure” and page footers indicating the page number within the 102-page document.
- If you have pages 5–6 and 9–10 as separate images or text, I can extract their headers/footers in the same format as well.


In [30]:
# Save the response text to a UTF-8 Markdown file next to the notebook,
# overwriting any previous contents.
with open("./nano_markdown.md", mode="w", encoding="utf-8") as f:
    f.write(result.text())