# Preprocessing

## Setup

In [1]:
import os
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import ContentFormat
from openai import AzureOpenAI
from preprocessing.preprocessing_with_image import analyze_layout
from chunking.split_documents import split_markdown_headings

In [2]:
# Load variables from a local .env file (if any) into the process environment
# before reading them below. Every lookup yields None when the variable is unset.
load_dotenv()

# --- Azure AI Document Intelligence ---
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
AZURE_DOCUMENT_INTELLIGENCE_KEY = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_KEY")

# --- Azure OpenAI ---
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
aoai_deployment_name = 'gpt-4o'  # name of your GPT-4o model deployment
aoai_api_version = '2024-02-01'  # pinned API version; may need updating later

# --- Azure AI Search ---
AZURE_SEARCH_ENDPOINT = os.environ.get("AZURE_SEARCH_ENDPOINT")
AZURE_SEARCH_ADMIN_KEY = os.environ.get("AZURE_SEARCH_ADMIN_KEY")

# --- Azure Blob Storage ---
BLOB_CONNECTION_STRING = os.environ.get("BLOB_CONNECTION_STRING")

In [3]:
# Source folder containing the PDFs, and the two output folders used by the
# rest of the notebook (one for images, one for generated documents).
# `os` is already imported in the notebook's import cell.
directory_path = '../data/03_business_docs/'
output_dir_images = "../output/03_output/images"
output_dir_documents = "../output/03_output/documents"

# Create the output folders up front; exist_ok=True keeps the cell re-runnable.
os.makedirs(output_dir_images, exist_ok=True)
os.makedirs(output_dir_documents, exist_ok=True)

## Extracting text and images from documents with Document Intelligence

In [None]:
# Collect the PDFs to process:
#   * the extension check is case-insensitive, so "REPORT.PDF" is not skipped;
#   * only regular files are kept (a directory named "x.pdf" is ignored);
#   * the list is sorted because os.listdir returns entries in arbitrary order,
#     which would otherwise make the processing order vary between runs.
pdf_files = sorted(
    name
    for name in os.listdir(directory_path)
    if name.lower().endswith('.pdf')
    and os.path.isfile(os.path.join(directory_path, name))
)

# Analyze each PDF and write the returned content to
# <output_dir_documents>/<pdf name without extension>.md as UTF-8 text.
for pdf_file in pdf_files:
    file_path = os.path.join(directory_path, pdf_file)
    md_content = analyze_layout(
        file_path,
        output_dir_images,
        AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT,
        AZURE_DOCUMENT_INTELLIGENCE_KEY,
        AZURE_OPENAI_ENDPOINT,
        AZURE_OPENAI_API_KEY,
        aoai_deployment_name,
        aoai_api_version,
    )

    # Name the output after the source file, swapping the extension for .md.
    document_stem = os.path.splitext(pdf_file)[0]
    md_path = os.path.join(output_dir_documents, f"{document_stem}.md")
    with open(md_path, 'w', encoding='utf-8') as md_file:
        md_file.write(md_content)