In [None]:
import os
from mistralai import Mistral
import json
from pathlib import Path


# Root folder of PDFs for batch conversion; only referenced by the
# commented-out os.walk loop at the bottom of this file.
PDFS_FOLDER = '/Users/patrick/dev/btcopilot/btcopilot-sources/bowentheory/FTiCP Chapters'

# Shared Mistral API client used by pdf_2_markdown(). The key is read from the
# MISTRAL_API_KEY environment variable; os.getenv() yields None when it is unset.
client = Mistral(api_key=os.getenv('MISTRAL_API_KEY'))

def pdf_2_markdown(fpath):
    """Convert a PDF to Markdown via Mistral OCR and save it next to the PDF.

    The output path is ``fpath`` with its suffix replaced by ``.md``. When that
    file already exists the PDF is skipped and no API call is made, so the
    function can be re-run over a folder without redoing finished documents.

    Args:
        fpath: Path to the source PDF file.

    Returns:
        None. The Markdown is written to disk and progress is printed.
    """
    output_file = Path(fpath).with_suffix('.md')

    if os.path.exists(output_file):
        print(f"Skipping {fpath} as {output_file} already exists")
        return

    print(f"Uploading {fpath}")
    # Use a context manager so the PDF handle is closed as soon as the upload
    # returns, rather than leaking one open file per converted document.
    with open(fpath, "rb") as pdf_file:
        uploaded_pdf = client.files.upload(
            file={
                "file_name": os.path.basename(fpath),
                "content": pdf_file,
            },
            purpose="ocr",
        )
    print(f"Uploaded PDF: {uploaded_pdf.id}")

    # The OCR endpoint is given a URL, so request a signed URL for the upload.
    signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)
    print(f"Signed URL: {signed_url}")

    response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": signed_url.url,
        },
        include_image_base64=False,
    )

    output_file.parent.mkdir(exist_ok=True)

    # Pages may be plain dicts or objects exposing a ``markdown`` attribute;
    # accept either shape.
    pages_markdown = [
        page.get('markdown', '') if isinstance(page, dict) else page.markdown
        for page in response.pages
    ]

    # Separate pages with a blank line. Joining with a single space would glue
    # a page's leading heading/list/table onto the previous page's last line
    # and break the Markdown structure.
    s_combined = '\n\n'.join(pages_markdown)
    with open(str(output_file), 'w', encoding='utf-8') as f:
        f.write(s_combined)
    print(f"Wrote document to {str(output_file)}")


# Single-file mode: convert one specific PDF. The call is a no-op (apart from
# a "Skipping" message) if the matching .md file already exists.
pdf_2_markdown('/Users/patrick/dev/btcopilot/btcopilot-sources/collective-intelligence/Berdahl & Couzin et al (2013) - Emergent Sensing of Complex Environments by Mobile Animal Groups.pdf')

# Batch mode (disabled): uncomment to convert every file ending in '.pdf'
# found anywhere under PDFS_FOLDER.
# for root, dirs, files in os.walk(PDFS_FOLDER):
#     for f in files:
#         if f.endswith('.pdf'):
#             fpath = os.path.join(root, f)
#             pdf_2_markdown(fpath)