# 01_Preprocessing
Tips for data preprocessing to create search indexes for RAG

## Document format examples and extraction tools

### Extracting text from PDF, Word, and HTML files

In [None]:
! pip install PyMuPDF python-docx beautifulsoup4

In [None]:
import fitz  # PyMuPDF
from docx import Document
from bs4 import BeautifulSoup

# PDF to text
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF file.

    Args:
        pdf_path (str): Path to the PDF file to read.

    Returns:
        str: The text of all pages concatenated in page order, with no
            separator inserted between pages.
    """
    # Open the document as a context manager so it is closed (and its file
    # handle released) even if text extraction fails part-way through.
    with fitz.open(pdf_path) as pdf_document:
        # Iterating the document yields its pages in order; join() avoids
        # rebuilding the accumulated string once per page.
        return "".join(page.get_text() for page in pdf_document)


# Word to text
def extract_text_from_word(docx_path):
    """Return the paragraph text of a Word (.docx) file.

    Only the entries of the document's ``paragraphs`` collection are read;
    their text is joined with a newline between consecutive paragraphs.

    Args:
        docx_path (str): Path to the .docx file to read.

    Returns:
        str: The newline-joined paragraph text.
    """
    paragraph_texts = []
    for paragraph in Document(docx_path).paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)


# HTML to text
def extract_text_from_html(html_path):
    """Return the text content of an HTML file.

    The file is read as UTF-8, parsed with BeautifulSoup's 'html.parser'
    backend, and the result of ``get_text()`` on the parsed tree is returned.

    Args:
        html_path (str): Path to the HTML file to read.

    Returns:
        str: The text extracted from the parsed document.
    """
    with open(html_path, 'r', encoding='utf-8') as html_file:
        parsed_html = BeautifulSoup(html_file, 'html.parser')
        return parsed_html.get_text()


In [None]:
# Example usage: extract the text of a sample PDF and show it as the cell output.
# The path is relative to the directory the notebook is run from.
example_extracted_text = extract_text_from_pdf("../data/02_article/Retrieval-Augmented-Generation-for-LLM.pdf")
example_extracted_text

## Document Intelligence sample
### Reference
- https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/unlocking-advanced-document-insights-with-azure-ai-document/ba-p/4109675
- https://github.com/Azure-Samples/document-intelligence-code-samples/blob/main/Python(v4.0)/Retrieval_Augmented_Generation_(RAG)_samples/sample_figure_understanding.ipynb

In [None]:
! pip install python-dotenv openai azure-ai-documentintelligence azure-identity pillow PyMuPDF

In [None]:
import os
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import ContentFormat
from openai import AzureOpenAI

# Load settings from a .env file into the environment (python-dotenv) before reading them below
load_dotenv()

# Azure AI Document Intelligence connection settings; os.getenv returns None when a variable is unset
doc_intelligence_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
doc_intelligence_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")

# Azure OpenAI connection settings
aoai_api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
aoai_api_key= os.getenv("AZURE_OPENAI_API_KEY")
aoai_deployment_name = 'gpt-4o' # your Azure OpenAI model deployment name
aoai_api_version = '2024-02-01' # this might change in the future

In [None]:
import re

# Unify the format of headings in markdown text
def convert_markdown_headings(markdown_text):
    """Rewrite underlined (setext-style) Markdown headings as '#'-prefixed headings.

    A line followed by a line of three or more '=' becomes ``# <line>``, and a
    line followed by a line of three or more '-' becomes ``## <line>``.

    The heading line must contain at least one non-whitespace character.
    A blank line followed by '---' (a horizontal rule) or '===' is therefore
    left untouched instead of being turned into an empty heading.

    Args:
        markdown_text (str): Markdown text to normalize.

    Returns:
        str: The text with underlined headings converted; everything else is
            returned unchanged.
    """
    # "[^\S\n]*\S.*" = optional leading whitespace, one visible character, then
    # the rest of the line. The two parts cannot overlap, so matching stays
    # linear in the line length.

    # Convert "===" headers to "#"
    markdown_text = re.sub(r'^([^\S\n]*\S.*)\n={3,}$', r'# \1', markdown_text, flags=re.MULTILINE)

    # Convert "---" headers to "##"
    markdown_text = re.sub(r'^([^\S\n]*\S.*)\n-{3,}$', r'## \1', markdown_text, flags=re.MULTILINE)

    return markdown_text

In [None]:
def analyze_layout(input_file_path, output_folder):
    """
    Analyzes a document with the Document Intelligence "prebuilt-layout" model and saves the result as Markdown.

    The file is sent to the service as binary content with Markdown requested as the output format.
    Headings in the returned Markdown are unified with convert_markdown_headings, and the result is
    written to "<output_folder>/<input file name without extension>.md".

    Args:
        input_file_path (str): The path to the input document file.
        output_folder (str): The path to the folder where the Markdown file is saved. It is created if it does not exist.

    Returns:
        str: The Markdown content of the document, with unified heading format.

    """
    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=doc_intelligence_endpoint,
        credential=AzureKeyCredential(doc_intelligence_key),
        headers={"x-ms-useragent":"sample-code-figure-understanding/1.0.0"},
    )

    with open(input_file_path, "rb") as f:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-layout", analyze_request=f, content_type="application/octet-stream", output_content_format=ContentFormat.MARKDOWN
        )

    # Wait for the long-running analysis operation to finish
    result = poller.result()
    md_content = convert_markdown_headings(result.content)

    # Create the output folder on first use so the write below does not fail on a fresh checkout
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(input_file_path))[0]
    output_path = os.path.join(output_folder, f"{base_name}.md")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(md_content)

    return md_content

In [None]:
# Example usage: analyze a sample PDF; the Markdown is saved under ../output and also shown as the cell output
analyze_layout("../data/01_aisearch_docs/azure-search-concept.pdf", "../output")

### [TBD] With images
https://github.com/Azure-Samples/document-intelligence-code-samples/blob/main/Python(v4.0)/Retrieval_Augmented_Generation_(RAG)_samples/sample_figure_understanding.ipynb

## [Optional] Text standardization and normalization
- Utilizing LLMs for text standardization and normalization is a highly effective approach.
- It can extend the capabilities of traditional rule-based text transformation.
- However, since LLMs do not guarantee the same output every time, traditional rule-based transformations should be used when output consistency is required.

### Using an LLM such as GPT


In [None]:
import re
import os
from openai import AzureOpenAI
import json

# Azure OpenAI client; the endpoint and API key are read from environment variables
client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"), 
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
  api_version="2024-02-01"
)

# System prompt for proofreading requests: it restricts the model to proofreading only
# (no adding or removing content, no changes to the document structure).
system_message = """
# Your Role
You are an excellent AI assistant for proofreading text data. Your task is to ensure the provided text data is of high quality. You are only allowed to proofread. Adding or removing context from the original document is not allowed. Additionally, you cannot change the structure of the document.

# Examples of Corrections
- Grammar errors and typos
- OCR misrecognitions
- Inconsistencies in terminology and expressions

# Your input
text: 
"""

def correct_text_gpt(text, model="gpt-4o"):
    """Proofread text with an Azure OpenAI chat model.

    The module-level ``system_message`` is sent as the system prompt and
    ``text`` as the user message, using the module-level ``client``.

    Args:
        text (str): The text to proofread.
        model (str): Name of the Azure OpenAI deployment to call. Defaults to
            "gpt-4o", the value that was previously hard-coded.

    Returns:
        The message content of the first choice in the completion response.
    """
    message_text = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model=model,  # for Azure OpenAI this is the deployment name
        messages=message_text,
        temperature=0,
    )
    return completion.choices[0].message.content



### Using traditional rule-based text transformation

In [None]:
# Example of a rule-based function for cleaning text data
def clean_text(text):
    """Clean text with simple rules: drop URLs, HTML tags and repeated lines.

    Args:
        text (str): The text to clean.

    Returns:
        str: The text with http/https URLs and single-line ``<...>`` tags
            removed, and with every repeated non-blank line dropped after its
            first occurrence. Blank lines are always kept so that paragraph
            breaks survive the de-duplication.
    """
    # Remove URLs. Requiring "://" keeps ordinary words that merely start with
    # "http" (e.g. "httpd") from being deleted.
    text = re.sub(r'https?://\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove duplicate lines (first occurrence wins). Blank lines are exempt:
    # de-duplicating them would collapse every paragraph break after the first.
    seen_lines = set()
    kept_lines = []
    for line in text.split("\n"):
        if not line.strip():
            kept_lines.append(line)
        elif line not in seen_lines:
            seen_lines.add(line)
            kept_lines.append(line)
    return "\n".join(kept_lines)