In [None]:
# Path of the PDF that the last cell reads and converts; the output files are
# written next to it with the extension swapped.
FILE_NAME='data/pdf/sample-pdf.pdf'

In [None]:
# Note: `logging` is part of the Python standard library and needs no installation.
#%pip install tiktoken azure-ai-documentintelligence azure-ai-documentanalytics azure-core azure-identity PyPDF2 python-dotenv

import os
import re
import tiktoken
import logging
from io import BytesIO
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import ContentFormat, AnalyzeResult

# Load variables from a local .env file into the process environment
# (the Azure Document Intelligence settings are read via os.getenv later).
load_dotenv()

# Tokenizer used by num_tokens_from_string.
encoding = tiktoken.get_encoding("cl100k_base")

# Root logger shared by every helper in this notebook.
logger = logging.getLogger()

# Send INFO-and-above records both to debug.log and to the console stream.
logging.basicConfig(
    handlers=[logging.FileHandler("debug.log"), logging.StreamHandler()],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

def handle_pdf_locally(uploaded_file, clean=False):
    """
    Extracts the text of a PDF with PyPDF2, without calling a remote service.

    Parameters:
    uploaded_file: The PDF source handed to PdfReader (the notebook passes a BytesIO).
    clean (bool): If True, the extracted text is passed through clean_text.

    Returns:
    str | None: The text of all pages joined with newlines, or None when
    processing failed (the error is logged with its traceback).
    """
    logger.info("Processing document locally")
    try:
        pdf_reader = PdfReader(uploaded_file)
        # Defensive: '\n'.join would fail if a page ever yielded None instead of text.
        text = '\n'.join(page.extract_text() or '' for page in pdf_reader.pages)
        return clean_text(text) if clean else text
    except Exception as e:
        # The message template comes first and the exception is its %-argument;
        # logger.exception also appends the traceback of the active exception.
        logger.exception("Error processing document: %s", e)
        return None

def handle_pdf_remotely(uploaded_file, clean=False):
    """
    Extracts the content of a PDF as Markdown using Azure Document Intelligence.

    The endpoint and key are read from the environment variables
    AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY,
    and the document is analyzed with the "prebuilt-layout" model.

    Parameters:
    uploaded_file: The PDF content sent as the analyze request (the notebook passes a BytesIO).
    clean (bool): If True, the returned content is passed through clean_text.

    Returns:
    str | None: The Markdown content of the analysis result, or None when
    processing failed (the error is logged with its traceback).
    """
    logger.info("Processing PDF document remotely")
    try:
        doc_intelligence_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
        doc_intelligence_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY")
        # Fail with a clear message instead of an opaque SDK error when the
        # configuration is missing; the except below logs it and returns None.
        if not doc_intelligence_endpoint or not doc_intelligence_key:
            raise ValueError(
                "AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and "
                "AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY must be set"
            )

        document_intelligence_client = DocumentIntelligenceClient(
            endpoint=doc_intelligence_endpoint,
            credential=AzureKeyCredential(doc_intelligence_key),
        )
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-layout",
            analyze_request=uploaded_file,
            content_type="application/octet-stream",
            output_content_format=ContentFormat.MARKDOWN,
        )
        # Blocks until the long-running analysis has finished.
        result: AnalyzeResult = poller.result()
        return clean_text(result.content) if clean else result.content
    except Exception as e:
        # The message template comes first and the exception is its %-argument;
        # logger.exception also appends the traceback of the active exception.
        logger.exception("Error processing PDF document remotely: %s", e)
        return None

def read_file_bin(file_name: str) -> BytesIO:
    """
    Reads a file in binary mode and returns its content as an in-memory stream.

    Parameters:
    file_name (str): The name of the file to read.

    Returns:
    BytesIO: The content of the file. If the file does not exist, a stream
    holding the placeholder bytes b"The file does not exist." is returned
    instead of raising, and the failure is logged.
    """
    logger.info("Reading file %s", file_name)
    try:
        with open(file_name, "rb") as file:
            return BytesIO(file.read())
    except FileNotFoundError:
        # The placeholder return is kept for existing callers, but the failure
        # is recorded so the bogus content can be traced back to a missing file.
        logger.error("File %s does not exist; returning placeholder content", file_name)
        return BytesIO(b"The file does not exist.")
    
def save_file(file_name: str, data: str) -> None:
    """
    Stores text on disk as UTF-8, replacing the file if it already exists.

    Parameters:
    file_name (str): Path of the destination file.
    data (str): The text to store.
    """
    logger.info(f"Saving file {file_name}")
    with open(file_name, mode="w", encoding="utf-8") as target:
        target.write(data)
        
def num_tokens_from_string(string: str) -> int:
    """
    Counts the tokens the module-level `encoding` produces for a string.

    Parameters:
    string (str): The text to tokenize.

    Returns:
    int: The number of tokens.
    """
    token_ids = encoding.encode(string)
    return len(token_ids)

def print_chunks_page_content(page_content):
    """
    Prints the number of chunks, then each chunk's size figures and text.

    Parameters:
    page_content: A sized iterable of chunk objects, each exposing its text
    through a `page_content` attribute.
    """
    total = len(page_content)
    print(f"Number of chunks: {total}")
    for position, chunk in enumerate(page_content, start=1):
        body = chunk.page_content
        chars = len(body)
        tokens = num_tokens_from_string(body)
        print(f"Chunk {position} character count: {chars} token number: {tokens}" )
        print(body)
        # Blank line separating consecutive chunks.
        print()

def clean_text(text, remove_comments=False, put_html_tables_on_new_line=True):
    """
    Normalizes whitespace in extracted document text.

    Newlines between <table> and </table> become spaces, runs of newlines and
    runs of spaces are collapsed, and a whitespace character next to tag
    delimiters, punctuation and brackets is removed.

    Parameters:
    text (str): The text to clean.
    remove_comments (bool): If True, HTML comments (<!-- ... -->) are removed.
    put_html_tables_on_new_line (bool): If True, a newline is inserted before every <table> tag.

    Returns:
    str: The cleaned text.
    """
    logger.info("Cleaning text")
    # Flatten table content onto a single line before the general rules run.
    text = re.sub(
        '(?<=<table>)(.*?)(?=</table>)',
        lambda m: m.group(0).replace('\n', ' '),
        text,
        flags=re.DOTALL,
    )
    # Applied strictly in this order: later rules see the output of earlier ones.
    rules = (
        (r'\n+', '\n'),   # collapse runs of newlines
        (r' +', ' '),     # collapse runs of spaces
        (r'\s<', '<'),    # whitespace before a tag opener
        (r'>\s', '>'),    # whitespace after a tag closer
        (r'\s\.', '.'),
        (r'\s,', ','),
        (r'\s!', '!'),
        (r'\s\?', '?'),
        (r'\s:', ':'),
        (r'\s;', ';'),
        (r'\s\)', ')'),
        (r'\(\s', '('),
        (r'\[\s', '['),
        (r'\s\]', ']'),
        (r'\s\}', '}'),
        # Opening brace mirrors the '(' and '[' rules: whitespace is removed
        # inside the bracket, never after a closing one.
        (r'\{\s', '{'),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    if put_html_tables_on_new_line:
        text = text.replace('<table>', '\n<table>')
    if remove_comments:
        text = re.sub(r'<!--(.*?)-->', '', text, flags=re.DOTALL)
    return text


# Process the document with Azure Document Intelligence (DI) and with PyPDF2

In [None]:
# Read the PDF once and run the same in-memory stream through both extractors.
file = read_file_bin(FILE_NAME)
md_file = handle_pdf_remotely(file)
# Rewind so the local extractor starts at the beginning no matter where the
# previous consumer left the stream position.
file.seek(0)
txt_file = handle_pdf_locally(file)

# splitext swaps only the real extension; str.replace(".pdf", ...) would touch
# every occurrence and leave names such as "x.PDF" unchanged, which would
# overwrite the source file.
output_stem = os.path.splitext(FILE_NAME)[0]

# Both handlers return None on failure (after logging the error), so skip the
# write instead of failing inside save_file.
if md_file is not None:
    save_file(output_stem + ".md", md_file)
else:
    logger.warning("No Markdown content produced for %s; skipping save", FILE_NAME)

if txt_file is not None:
    save_file(output_stem + ".txt", txt_file)
else:
    logger.warning("No text content produced for %s; skipping save", FILE_NAME)