# Ingesting PDF Documents

## Set Up Azure API Keys

In [1]:
from notebookutils.mssparkutils.credentials import getSecret

# Every credential below is read from this Azure Key Vault at run time, so no
# secret value ever appears in the notebook source.
KEYVAULT_ENDPOINT = "https://rag-demo-east-us-kv.vault.azure.net/"


def _read_secret(secret_name):
    """Return the secret called ``secret_name`` from the vault at KEYVAULT_ENDPOINT."""
    return getSecret(KEYVAULT_ENDPOINT, secret_name)


# --- Azure AI Search ---
AI_SEARCH_NAME = _read_secret("AI-SEARCH-NAME")
AI_SEARCH_API_KEY = _read_secret("AI-SEARCH-API-KEY")
AI_SEARCH_INDEX_NAME = "rag-demo-index"

# --- Azure AI Services ---
AI_SERVICES_NAME = _read_secret("AI-SERVICES-NAME")
AI_SERVICES_API_KEY = _read_secret("AI-SERVICES-API-KEY")
AI_SERVICES_LOCATION = "eastus"

# --- Azure OpenAI (only needed when the F64 SKU is not used) ---
OPEN_AI_NAME = _read_secret("OPEN-AI-NAME")
OPEN_AI_API_KEY = _read_secret("OPEN-AI-API-KEY")
# Optional deployment names, left disabled; uncomment to set them explicitly.
#OPEN_AI_EMBEDDING_DEPLOYMENT_NAME = "text-embedding-ada-002" #1536
#OPEN_AI_GPT_DEPLOYMENT_NAME = "gpt-35-turbo-16k" # deploymentName could be one of {gpt-35-turbo, gpt-35-turbo-16k}

StatementMeta(, bf4f7d23-39a0-4ffd-8b88-43988345afea, 3, Finished, Available)

## GitHub: Do not embed secrets in the code
![GitHub secrets commit error](https://github.com/luisdza/rag-fabric-workbook/raw/main/images/github-commit.png)

## Load and Analyse the Document

In [2]:
import requests
import os

# Sample PDF to ingest. Swap to the commented-out URL to ingest the other
# sample document instead.
url = "https://github.com/luisdza/rag-fabric-workbook/raw/main/docs/Northwind_Standard_Benefits_Details.pdf"
#url = "https://github.com/luisdza/rag-fabric-workbook/raw/main/docs/Northwind_Health_Plus_Benefits_Details.pdf"

# Bound the wait so a stalled connection cannot hang the notebook session, and
# fail loudly on an HTTP error instead of saving an error page under a .pdf name.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Destination folder for the download (presumably the local mount of the
# notebook's default lakehouse -- confirm for your workspace).
path = "/lakehouse/default/Files/"

# Ensure the directory exists
os.makedirs(path, exist_ok=True)

# Name the local file after the last path segment of the URL; maxsplit=1 makes
# the "take everything after the final slash" intent explicit.
filename = url.rsplit("/", 1)[-1]
with open(os.path.join(path, filename), "wb") as f:
    f.write(response.content)

StatementMeta(, bf4f7d23-39a0-4ffd-8b88-43988345afea, 4, Finished, Available)

In [3]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Location of the downloaded PDF as handed to the Spark reader.
document_path = f"Files/{filename}"

# Read the file with Spark's "binaryFile" source and keep only the file name
# and the "content" column; capped at 10 rows and cached because the frame is
# reused by later cells.
df = (
    spark.read.format("binaryFile")
    .load(document_path)
    .select("_metadata.file_name", "content")
    .limit(10)
    .cache()
)

display(df)


StatementMeta(, bf4f7d23-39a0-4ffd-8b88-43988345afea, 5, Finished, Available)

SynapseWidget(Synapse.DataFrame, ab289046-6f25-4ee8-b72e-4115a80c1942)

In [4]:
from synapse.ml.services import AnalyzeDocument
from pyspark.sql.functions import col

# Configure the SynapseML AnalyzeDocument transformer: service credentials
# first, then the model to run, then the input/output column wiring.
analyze_document = (
    AnalyzeDocument()
    .setCustomServiceName(AI_SERVICES_NAME)
    .setSubscriptionKey(AI_SERVICES_API_KEY)
    .setLocation(AI_SERVICES_LOCATION)
    .setPrebuiltModelId("prebuilt-layout")
    .setImageBytesCol("content")
    .setOutputCol("result")
)

# Run the analysis, then lift the two fields used downstream out of the nested
# "result" struct into top-level columns.
analyzed_df = analyze_document.transform(df)
analyzed_df = analyzed_df.withColumn("output_content", col("result.analyzeResult.content"))
analyzed_df = analyzed_df.withColumn("paragraphs", col("result.analyzeResult.paragraphs"))
# Cache so later cells reuse this result rather than recomputing the transform.
analyzed_df = analyzed_df.cache()

StatementMeta(, bf4f7d23-39a0-4ffd-8b88-43988345afea, 6, Finished, Available)

In [12]:
# Remove the "content" column (the input column given to AnalyzeDocument above)
# before displaying the analysed frame.
analyzed_df = analyzed_df.drop("content")
display(analyzed_df)

StatementMeta(, bf4f7d23-39a0-4ffd-8b88-43988345afea, 14, Finished, Available)

SynapseWidget(Synapse.DataFrame, 6a259c0b-dd99-4c46-917e-c21b6c94b1f1)

## Chunking Text

In [6]:
from synapse.ml.featurize.text import PageSplitter

# Chunker for the extracted document text: reads "output_content" and emits an
# array column "chunks". The 3000/4000 bounds are the minimum and maximum page
# lengths (characters, per the SynapseML PageSplitter docs -- confirm).
ps = (
    PageSplitter()
    .setInputCol("output_content")
    .setOutputCol("chunks")
    .setMinimumPageLength(3000)
    .setMaximumPageLength(4000)
)

# One row per document still; the chunks are exploded into rows in a later cell.
splitted_df = ps.transform(analyzed_df)

display(splitted_df)

StatementMeta(, bf4f7d23-39a0-4ffd-8b88-43988345afea, 8, Finished, Available)

SynapseWidget(Synapse.DataFrame, 0b0bc29a-cf57-4ccb-882a-9d32ac0229b8)

In [9]:
from pyspark.sql.functions import col, concat, lit, posexplode

# Each "chunks" column contains the chunks for a single document in an array.
# posexplode separates each chunk into its own row and also yields the chunk's
# position in the array (aliased here as "chunk_index").
exploded_df = splitted_df.select(
    "file_name",
    posexplode(col("chunks")).alias("chunk_index", "chunk"),
)

# Add a unique identifier for each chunk: "<file_name>_<chunk_index>".
# The separator keeps ids unambiguous -- without it "a.pdf1" + 0 and
# "a.pdf" + 10 would both become "a.pdf10". The index is cast explicitly
# rather than relying on an implicit int-to-string coercion inside concat.
exploded_df = exploded_df.withColumn(
    "unique_id",
    concat(col("file_name"), lit("_"), col("chunk_index").cast("string")),
)

# Write the exploded_df DataFrame to a Lakehouse table in Microsoft Fabric.
# NOTE: "overwrite" replaces the table's existing contents on every run.
(
    exploded_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("fabric_demo.document_chunks")
)

display(exploded_df)

StatementMeta(, bf4f7d23-39a0-4ffd-8b88-43988345afea, 11, Finished, Available)

SynapseWidget(Synapse.DataFrame, adefcf5e-85da-4cfd-88e3-6c1f1c83577c)