# Ingesting PDF Documents

## Set Up Azure API Keys

In [1]:
from notebookutils.mssparkutils.credentials import getSecret

# Key Vault endpoint that every getSecret lookup in this cell is made against.
KEYVAULT_ENDPOINT = "https://rag-demo-east-us-kv.vault.azure.net/"


def _read_secret(secret_name):
    """Look up `secret_name` with getSecret against KEYVAULT_ENDPOINT."""
    return getSecret(KEYVAULT_ENDPOINT, secret_name)


# --- Azure AI Search ---
AI_SEARCH_NAME = _read_secret("AI-SEARCH-NAME")
AI_SEARCH_API_KEY = _read_secret("AI-SEARCH-API-KEY")
AI_SEARCH_INDEX_NAME = "rag-demo-index"

# --- Azure AI Services ---
AI_SERVICES_NAME = _read_secret("AI-SERVICES-NAME")
AI_SERVICES_API_KEY = _read_secret("AI-SERVICES-API-KEY")
AI_SERVICES_LOCATION = "eastus"

# --- Azure Open AI (if F64 SKU is not used) ---
OPEN_AI_NAME = _read_secret("OPEN-AI-NAME")
OPEN_AI_API_KEY = _read_secret("OPEN-AI-API-KEY")
# Deployment names are left disabled here; uncomment to set them.
#OPEN_AI_EMBEDDING_DEPLOYMENT_NAME = "text-embedding-ada-002" #1536
#OPEN_AI_GPT_DEPLOYMENT_NAME = "gpt-35-turbo-16k" # deploymentName could be one of {gpt-35-turbo, gpt-35-turbo-16k}

StatementMeta(, 70cf1d3a-48b1-4f2b-b369-ace16b74d029, 3, Finished, Available)

## Load and Analyse the Document

In [4]:
import requests
import os

# Source PDF, hosted in the Azure-Samples/azure-search-openai-demo repository.
url = "https://github.com/Azure-Samples/azure-search-openai-demo/raw/main/data/Northwind_Health_Plus_Benefits_Details.pdf"

# Bound the wait on the network and fail immediately on an HTTP error status,
# so that an error page is never saved to the Lakehouse under a .pdf name.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Specify your path here
path = "/lakehouse/default/Files/"

# Ensure the directory exists
os.makedirs(path, exist_ok=True)

# The file name is the last path segment of the URL; later cells reuse `filename`.
filename = url.rsplit("/", 1)[-1]

# Write the downloaded bytes to a file in the specified path
with open(os.path.join(path, filename), "wb") as f:
    f.write(response.content)


StatementMeta(, bb583fda-5187-4728-89fd-62eb47310594, 6, Finished, Available)

In [5]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Location of the downloaded PDF, built from the file name set in the download cell.
document_path = f"Files/{filename}"

# Load the file with Spark's binaryFile reader, keep only the file name and the
# raw bytes, cap the result at 10 rows, and cache it for the cells that follow.
df = (
    spark.read
    .format("binaryFile")
    .load(document_path)
    .select("_metadata.file_name", "content")
    .limit(10)
    .cache()
)

display(df)


StatementMeta(, bb583fda-5187-4728-89fd-62eb47310594, 7, Finished, Available)

SynapseWidget(Synapse.DataFrame, 8e01c208-087f-4ab6-aa2d-f4b725cb9736)

In [6]:
from synapse.ml.services import AnalyzeDocument
from pyspark.sql.functions import col

# Transformer that sends the bytes in the "content" column to the
# "prebuilt-layout" model and stores the response in a "result" column.
# The setter order is kept exactly as it was originally written.
analyze_document = (
    AnalyzeDocument()
    .setPrebuiltModelId("prebuilt-layout")
    .setCustomServiceName(AI_SERVICES_NAME)
    .setSubscriptionKey(AI_SERVICES_API_KEY)
    .setLocation(AI_SERVICES_LOCATION)
    .setImageBytesCol("content")
    .setOutputCol("result")
)

# Fields of the analysis response that are promoted to top-level columns.
extracted_text = col("result.analyzeResult.content")
extracted_paragraphs = col("result.analyzeResult.paragraphs")

# Run the analysis, add the two promoted columns, and cache the outcome.
analyzed_df = (
    analyze_document.transform(df)
    .withColumn("output_content", extracted_text)
    .withColumn("paragraphs", extracted_paragraphs)
    .cache()
)

StatementMeta(, bb583fda-5187-4728-89fd-62eb47310594, 8, Finished, Available)

In [7]:
# Remove the "content" column and rebind the result to the same name,
# so every later use of `analyzed_df` sees the frame without that column.
analyzed_df = analyzed_df.drop("content")
# Render the remaining columns in the notebook output.
display(analyzed_df)

StatementMeta(, bb583fda-5187-4728-89fd-62eb47310594, 9, Finished, Available)

SynapseWidget(Synapse.DataFrame, bb72d8df-1909-43a8-8fdd-32927ac769c0)

## Chunking Text

In [8]:
from synapse.ml.featurize.text import PageSplitter

# Page-length limits handed to the splitter.
CHUNK_MAX_LENGTH = 4000
CHUNK_MIN_LENGTH = 3000

# Splitter that reads the "output_content" column and writes its pages
# to a "chunks" column.
ps = (
    PageSplitter()
    .setInputCol("output_content")
    .setOutputCol("chunks")
    .setMaximumPageLength(CHUNK_MAX_LENGTH)
    .setMinimumPageLength(CHUNK_MIN_LENGTH)
)

# Apply the splitter to the analysed documents and show the result.
splitted_df = ps.transform(analyzed_df)
display(splitted_df)

StatementMeta(, bb583fda-5187-4728-89fd-62eb47310594, 10, Finished, Available)

SynapseWidget(Synapse.DataFrame, 52cd2c97-14f1-40be-ae82-942843ce2ff6)

In [12]:
from pyspark.sql.functions import posexplode, col, concat

# "chunks" holds an array per row; posexplode emits one row per array element
# together with that element's position in the array.
chunk_rows = posexplode(col("chunks")).alias("chunk_index", "chunk")

# Keep the file name next to each chunk and add an identifier formed by
# concatenating file_name and chunk_index (no delimiter is placed between them).
exploded_df = (
    splitted_df
    .select("file_name", chunk_rows)
    .withColumn("unique_id", concat(col("file_name"), col("chunk_index")))
)

# Save the chunks as a Delta table, overwriting any existing contents.
(
    exploded_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("rag_demo.document_chunks")
)

display(exploded_df)

StatementMeta(, bb583fda-5187-4728-89fd-62eb47310594, 14, Finished, Available)

SynapseWidget(Synapse.DataFrame, dfa48590-166e-4d8d-a440-6498eb4af056)