In [None]:
# Load .env

from dotenv import load_dotenv

# Load the .env file before any later cell calls os.getenv(); presumably this is
# where the AZURE_STORAGE_* settings read further down come from — confirm.
load_dotenv()

In [135]:
# PDF Loader

import fitz
import os

# Source document. The extension-less path is reused later as the prefix
# ("folder") of every blob name written for this PDF.
pdf_file_path = "userguide.pdf"
pdf_file_path_no_ext, _pdf_ext = os.path.splitext(pdf_file_path)

# Open the PDF; the resulting document is iterated page by page further down.
pages = fitz.open(pdf_file_path)

In [136]:
# Azure Blob Storage

import os

from azure.storage.blob import BlobServiceClient

# Shared storage-account client, built from the connection string held in the
# AZURE_STORAGE_ACCOUNT_CONNECTION_STRING environment variable.
blob_service_client = BlobServiceClient.from_connection_string(
    os.environ.get("AZURE_STORAGE_ACCOUNT_CONNECTION_STRING")
)


def upload_blob(blob_name, data):
    """Write ``data`` to Azure Blob storage under ``blob_name``.

    The target container is taken from the AZURE_STORAGE_CONTAINER_NAME
    environment variable on every call. An existing blob with the same
    name is overwritten.

    Args:
        blob_name: Name (path) of the blob inside the container.
        data: Payload handed to the blob client as-is; this notebook
            passes both image bytes and Markdown text.
    """
    container_name = os.getenv("AZURE_STORAGE_CONTAINER_NAME")
    target = blob_service_client.get_blob_client(
        container=container_name, blob=blob_name
    )
    target.upload_blob(data, overwrite=True)

In [137]:
# Extract texts, images and upload them to Azure Blob storage

import re


def adjust_formatting(text):
    """Tidy extracted text so it reads well as Markdown.

    Two passes are applied, in this order:

    1. A bullet glyph followed by exactly two newlines becomes a
       Markdown list marker ("- ").
    2. Any run of blank or whitespace-only lines is collapsed to a
       single empty line.

    Args:
        text: Raw extracted content.

    Returns:
        The cleaned-up string.
    """
    # Pass 1: split on the bullet pattern and stitch back with "- "
    with_bullets = "- ".join(text.split("•\n\n"))

    # Pass 2: squeeze consecutive empty lines
    blank_run = re.compile(r"\n\s*\n")
    return blank_run.sub("\n\n", with_bullets)


# Build a Markdown rendition of the PDF and upload it, together with every
# embedded image, to Azure Blob storage.
#
# Relies on names defined in earlier cells: `pages` (the opened PDF),
# `pdf_file_path_no_ext` (blob name prefix) and `upload_blob`.

# Main content
content_parts = []  # text fragments, joined once after the loop
page_counter = 1
image_counter = 1

# Special settings
footer = "User Guide  |"  # any span containing this text is dropped
header_size = 14  # spans with a font size at or above this become "###" headings

# Iterate through each page
for page in pages:
    # Get all blocks in the page
    page_dict = page.get_text("dict", sort=True)

    # Add page header (disabled)
    # content_parts.append(f"\n\n## {pdf_file_path_no_ext} - Page {page_counter}\n\n")

    for block in page_dict["blocks"]:
        # Process Text
        if block["type"] == 0:
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()

                    # Format header. Whitespace-only spans are skipped here so
                    # they do not turn into a bare "### " heading.
                    if text and span["size"] >= header_size:
                        text = f"### {text}\n"

                    # Ignore footer and page number. Empty spans still emit a
                    # newline; adjust_formatting collapses blank runs later.
                    if footer not in text and not text.isdigit():
                        content_parts.append(f"{text}\n")

        # Process Image
        elif block["type"] == 1:
            image_data = block["image"]
            ext = block["ext"]

            # Name the blob after the image's reported format instead of
            # assuming every embedded image is a PNG.
            img_file_name = f"{pdf_file_path_no_ext}/img_{image_counter}.{ext}"

            # Upload image
            upload_blob(img_file_name, image_data)

            # Add image link to content
            content_parts.append(f"![{img_file_name}]({img_file_name})\n\n")

            image_counter += 1

    page_counter += 1

# Assemble the document once (avoids repeated string concatenation)
content = "".join(content_parts)

# Extra formatting
content = adjust_formatting(content)

# Upload content file
content_file_name = f"{pdf_file_path_no_ext}/userguide.md"
upload_blob(content_file_name, content)