# PDF to Markdown with Claude 3 Haiku
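Make sure you have an `ANTHROPIC_API_KEY` in your environment variables, or you won't be able to run this notebook in its entirety. Note that the code cells below default to the Claude 3 Opus model ID; to use Haiku instead, swap in the Haiku model ID shown in the inline comment next to each `model=` argument.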

Let's import the necessary dependencies.

In [None]:
import os
import base64
from pathlib import Path
from pdf2image import convert_from_path
import anthropic

# Look the key up in the environment; this yields None when the variable is unset.
api_key = os.getenv("ANTHROPIC_API_KEY")

# Single shared client reused by every request made further down in the notebook.
client = anthropic.Client(api_key=api_key)


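First we load the PDF we want to convert to markdown and save each of its pages as a JPEG image. In this case the file is named `convert-me-to-markdown.pdf` and is expected to sit in the same directory as this notebook.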

In [None]:
# Rasterization resolution, defined once so the log message and the actual
# conversion call cannot drift apart.
pdf_dpi = 300

# Folder that receives one JPEG per PDF page.
jpeg_dir = "page_jpegs"

# exist_ok=True replaces the separate "exists?" check, which was race-prone
# and needlessly split directory creation into two steps.
os.makedirs(jpeg_dir, exist_ok=True)

print(f"Converting PDF to images with DPI={pdf_dpi}...")
images = convert_from_path("./convert-me-to-markdown.pdf", dpi=pdf_dpi, fmt='jpeg')
total_pages = len(images)

# Zero-pad page numbers to a common width so that sorting the file names
# alphabetically also sorts them in page order.
digits = len(str(total_pages))

for page_number, image in enumerate(images, start=1):
    image_path = os.path.join(jpeg_dir, f"Page_{str(page_number).zfill(digits)}.jpeg")
    image.save(image_path, "JPEG")
    print(f"Page {page_number} saved as image: {image_path}")

Now that we have JPEG images of the PDF pages, we can use our multi-modal model to convert the images to markdown.

In [None]:
def encode_image_to_base64(image_path):
    """
    Read the file at image_path as raw bytes and return its base64
    encoding as a text string.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()

    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

def image_to_markdown(base64_image):
    """
    Send one page image to the model and return its markdown transcription.

    Args:
        base64_image: Base64-encoded JPEG data for a single page, as a string
            (the request declares the media type as image/jpeg).

    Returns:
        The text of the first content block in the response, or an empty
        string when the response contains no content blocks.
    """
    response = client.messages.create(
        model="claude-3-opus-20240229", # Opus claude-3-opus-20240229 or Haiku claude-3-haiku-20240307
        max_tokens=4096,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/jpeg",
                            "data": base64_image,
                        },
                    },
                    {
                        "type": "text",
                        "text": "Give me the markdown text output from this page in a PDF using formatting to match the structure of the page as close as you can get. Only output the markdown and nothing else. Do not explain the output, just return it. Do not use a single # for a heading. All headings will start with ## or ###. Convert tables to markdown tables. Describe charts as best you can. DO NOT return in a codeblock. Just return the raw text in markdown format."
                    }
                ],
            }
        ],
    )

    # Indexing content[0] on an empty list raises IndexError and would abort
    # the whole page loop. Return "" instead, the same way
    # clean_markdown_content treats an empty response.
    return response.content[0].text if response.content else ""

# Create the output folder; exist_ok=True avoids the separate existence check.
os.makedirs("page_markdowns", exist_ok=True)

# Only pick up JPEG files: the folder may also contain stray entries
# (hidden OS files, sub-directories) that must not be sent to the model as
# image/jpeg data. Sorting by stem relies on the zero-padded page numbers in
# the file names to yield page order.
images = sorted(
    (
        entry
        for entry in Path("page_jpegs").iterdir()
        if entry.is_file() and entry.suffix.lower() in {".jpeg", ".jpg"}
    ),
    key=lambda x: x.stem,
)
for image_path in images:
    print(f"Processing {image_path.name}...")
    base64_image = encode_image_to_base64(str(image_path))
    markdown_content = image_to_markdown(base64_image)
    output_path = Path("page_markdowns") / f"{image_path.stem}.md"
    # Write utf-8 explicitly: the model output may contain non-ASCII
    # characters, and the cleanup stage reads these files back as utf-8.
    # Relying on the platform default encoding can fail or corrupt text.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(markdown_content)
    print(f"Markdown for {image_path.name} saved to {output_path}")

print("All images converted to markdown.")

Now we will clean up the markdown, removing any unnecessary characters and image tags.

In [None]:
def clean_markdown_content(text):
    """
    Sends the markdown text to Claude 3 to remove irrelevant content.

    Args:
        text: Markdown text for one page.

    Returns:
        The cleaned markdown as a string. Returns "" when the input is empty
        or whitespace-only (no request is made) or when the response has no
        content blocks. If the first content block has no text attribute,
        a message is printed and the original text is returned unchanged.
    """
    # Nothing to clean: skip the request entirely rather than sending an
    # empty user message to the API.
    if not text or not text.strip():
        return ""

    response = client.messages.create(
        model="claude-3-opus-20240229", # Opus claude-3-opus-20240229 or Haiku claude-3-haiku-20240307
        max_tokens=4096,
        system="You are tasked with cleaning up the following markdown text. You should return only the cleaned up markdown text. Do not explain your output or reasoning. \n remove any irellevant text from the markdown, returning the cleaned up version of the content. Examples include any images []() or 'click here' or 'Listen to this article' or page numbers or logos.",
        messages=[
            {
                "role": "user",
                "content": text,
            }
        ],
    )

    try:
        blocks = response.content
        cleaned_content = blocks[0].text if blocks else ""
    except AttributeError:
        # Previously the error message itself was returned and ended up being
        # written to disk as the page's content (and it named the wrong
        # stage). Report the problem and keep the uncleaned text instead so
        # the page is not lost.
        print("Error in cleaning markdown. Response format may have changed or is invalid. Keeping the original text.")
        cleaned_content = text

    return cleaned_content

def process_markdown_files(input_directory_path, output_directory_path):
    """
    Iterates through markdown files in the given input directory, cleans their content,
    and saves the cleaned content to a corresponding file in the output directory.

    Args:
        input_directory_path: Directory containing the `*.md` files to clean.
        output_directory_path: Directory that receives the cleaned files; it is
            created (including parents) if it does not exist.

    Returns:
        None. If the input directory does not exist, a message is printed and
        nothing is created or written.
    """
    input_dir = Path(input_directory_path)
    output_dir = Path(output_directory_path)

    # Validate the input before touching the filesystem, so a wrong input
    # path does not leave an empty output directory behind.
    if not input_dir.is_dir():
        print(f"The directory {input_directory_path} does not exist.")
        return

    # Create the output directory if it doesn't exist
    output_dir.mkdir(parents=True, exist_ok=True)

    # Sort the files in alphanumeric order before processing
    sorted_files = sorted(input_dir.glob('*.md'), key=lambda path: path.stem)

    for markdown_file in sorted_files:
        print(f"Processing {markdown_file.name}...")
        content = markdown_file.read_text(encoding='utf-8')

        cleaned_content = clean_markdown_content(content)

        # Each cleaned file keeps the name of the file it was derived from.
        cleaned_file_path = output_dir / markdown_file.name
        cleaned_file_path.write_text(cleaned_content, encoding='utf-8')
        print(f"Cleaned content saved to {cleaned_file_path}")


# Folders for the cleanup stage: raw per-page markdown in, cleaned markdown out.
markdown_files_directory = "page_markdowns"
cleaned_markdown_directory = "cleaned_page_markdowns"

# Run the cleanup pass over every markdown file in the input folder.
process_markdown_files(
    input_directory_path=markdown_files_directory,
    output_directory_path=cleaned_markdown_directory,
)
print("Markdown cleanup process completed.")


Congrats! At this point you have converted a PDF to a bunch of JPEGs, converted those JPEGs to Markdown using a multi-modal model, and cleaned up the markdown using a Large Language Model.

For next steps, you could stitch the markdown together into a single file, chunk up the markdown for ingestion into a Vector Database, use an LLM to generate Knowledge Graphs out of the markdown pages, or anything else you can think of! You could even just copy and paste the markdown into something like ChatGPT and ask questions about the content. The possibilities are endless!