### Install required external libraries 

In [0]:
%pip install transformers==4.30.2 "unstructured[pdf,docx]==0.10.30" langchain==0.1.5 llama-index==0.9.3 databricks-vectorsearch==0.22 pydantic==1.10.9 mlflow==2.10.1
# Restart the Python process after the install so the pinned versions above are the ones imported by the following cells.
dbutils.library.restartPython()

### Initialize our resources and catalog

In [0]:
%run ./00-init $reset_all_data=false

### PDF files are available in a Volume (or DBFS)

In [0]:
# Folder of the Unity Catalog volume that holds the raw PDF documents
volume_folder = "/Volumes/litware_unity_catalog/rag/documents_store/pdf_documents"

# Fetch the folder listing, then render it as a table
raw_pdf_listing = dbutils.fs.ls(volume_folder)
display(raw_pdf_listing)

### Transform the PDFs into text

In [0]:
from unstructured.partition.auto import partition
import re

def extract_doc_text(x : bytes) -> str:
  """Extract the text of a document from its raw bytes.

  The bytes are wrapped in an in-memory buffer and passed to unstructured's
  ``partition``, which returns the document as a sequence of sections. Each
  section's text is cleaned and the sections are joined into a single string.

  Args:
    x: Raw content of the document file.

  Returns:
    The cleaned text of every section, joined with a newline between sections.
  """
  # Imported locally: the notebook only imports io in a later cell, so relying on
  # the global name would make this function depend on cell execution order.
  import io

  # Read the file and split it into sections with unstructured
  sections = partition(file=io.BytesIO(x))

  def clean_section(txt):
    # Drop line breaks inside a section.
    # NOTE(review): nothing is inserted in their place, so words separated only
    # by a newline end up glued together - confirm this is intended.
    txt = re.sub(r'\n', '', txt)
    # Remove a single space placed right before a period ("end ." -> "end.")
    return re.sub(r' ?\.', '.', txt)

  # partition() splits by document section; concatenate everything back together
  # because the caller wants to split by sentence instead.
  return "\n".join(clean_section(s.text) for s in sections)

2025-07-30 12:33:34.350607: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-30 12:33:34.505367: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-30 12:33:35.099253: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [0]:
%sql
-- Create the table that stores each document chunk with its embedding (no-op if the table already exists).
-- Note that Change Data Feed must be enabled on the table before the index can be created on it.
CREATE TABLE IF NOT EXISTS litware_unity_catalog.rag.documents_embedding (
  -- Row identifier, generated automatically unless a value is supplied
  id BIGINT GENERATED BY DEFAULT AS IDENTITY,
  -- Path of the source document the chunk was extracted from
  url STRING,
  -- Text of the chunk
  content STRING,
  -- NOTE(review): declared as STRING, while get_embedding in this notebook returns a list of floats;
  -- confirm whether ARRAY<FLOAT> is intended here before building the index.
  embedding STRING
) TBLPROPERTIES (delta.enableChangeDataFeed = true); 

#### Create the final documents_embedding table containing chunks and embeddings

In [0]:
from pyspark.sql import SparkSession, functions as F
import io
from llama_index.langchain_helpers.text_splitter import SentenceSplitter
from llama_index import Document, set_global_tokenizer
from transformers import AutoTokenizer

# Reduce the Arrow batch size to 10 records per batch, as our PDFs can be big in memory.
# NOTE(review): the UDFs in this notebook are created with F.udf rather than an Arrow/pandas UDF -
# confirm this setting actually affects them.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 10)

# Define the function to read data as chunks
def read_as_chunk(text):
    """Split one raw document into sentence-based text chunks.

    Args:
        text: Raw bytes of the document. Despite the parameter name this is the
            binary file content; it is handed to extract_doc_text for parsing.

    Returns:
        A list with the text of every chunk, or an empty list when the document
        has no content.
    """
    # A missing or empty document has nothing to parse: return no chunks rather
    # than handing an empty buffer to the parser.
    if not text:
        return []

    # Use the llama tokenizer so chunk sizes are measured in model tokens.
    # NOTE(review): the original comment states this keeps chunks below the
    # "BGE 1024 limit" - confirm the actual limit of the embedding endpoint.
    # This runs on every call, i.e. the tokenizer is loaded once per document.
    set_global_tokenizer(
        AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
    )
    # Sentence splitter from llama_index: 500-token chunks with a 50-token overlap
    splitter = SentenceSplitter(chunk_size=500, chunk_overlap=50)
    txt = extract_doc_text(text)
    nodes = splitter.get_nodes_from_documents([Document(text=txt)])
    return [n.text for n in nodes]

# Spark data types live in pyspark.sql.types; reaching them through the functions
# module (F.ArrayType / F.StringType) only works because that module happens to
# import them for its own use.
from pyspark.sql.types import ArrayType, StringType

# Register the UDF: each document yields an array of chunk strings
read_as_chunk_udf = F.udf(read_as_chunk, ArrayType(StringType()))

# Read the raw documents from the 'documents_raw' table
# (catalog and db are not defined in this cell - presumably set by ./00-init; verify)
df = spark.read.table(f"{catalog}.{db}.documents_raw")

# explode() turns the array of chunks into one row per chunk, stored in a new "text" column
df = df.withColumn("text", F.explode(read_as_chunk_udf(df.content)))

# Preview the first two rows of the chunked DataFrame
df.show(2)

+--------------------+-------------------+------+--------------------+--------------------+
|                path|   modificationTime|length|             content|                text|
+--------------------+-------------------+------+--------------------+--------------------+
|dbfs:/Volumes/lit...|2025-07-28 12:03:18|634144|[25 50 44 46 2D 3...|Quarterly Market ...|
|dbfs:/Volumes/lit...|2025-07-28 12:03:18|634144|[25 50 44 46 2D 3...|In summary, Conto...|
+--------------------+-------------------+------+--------------------+--------------------+
only showing top 2 rows



### Computing the chunk embeddings and saving them to our Delta Table


In [0]:
import mlflow.deployments
from pyspark.sql import functions as F
from pyspark.sql.functions import *

def get_embedding(text: str) -> list[float]:
    """Compute the embedding of one text chunk with the databricks-bge-large-en endpoint.

    Args:
        text: The chunk to embed.

    Returns:
        The embedding the endpoint returned for ``text``. Despite the
        annotation, None is returned when ``text`` is missing or blank, or when
        the request fails; the failure is logged instead of being raised.
    """
    # Imported locally so the function carries its own dependency when it is
    # shipped to the Spark workers as a UDF.
    import logging

    # Nothing to embed: skip the endpoint call altogether.
    if text is None or not text.strip():
        return None

    deploy_client = mlflow.deployments.get_deploy_client("databricks")  # Assuming MLflow is configured

    try:
        response = deploy_client.predict(endpoint="databricks-bge-large-en", inputs={"input": [text]})
        # A single input was sent, so the embedding is the first entry of the response data
        return response.data[0]['embedding']
    except Exception as e:
        # Best effort: one failing chunk must not abort the whole job, but the
        # failure is recorded so that missing embeddings can be explained.
        logging.getLogger("get_embedding").warning(
            "Embedding request failed for a %d-character chunk: %s", len(text), e
        )
        return None


# Spark data types live in pyspark.sql.types; F.StringType only resolves because
# the functions module happens to import that class for its own use.
from pyspark.sql.types import StringType

# Wrap get_embedding as a Spark UDF.
# NOTE(review): get_embedding returns a list of floats while the UDF is declared
# as StringType, which matches the STRING `embedding` column of the
# documents_embedding table - confirm whether an array<float> type is intended on both sides.
get_embedding_udf = F.udf(get_embedding, StringType())
df = df.withColumn("embedding", get_embedding_udf(F.col("text")))  # one endpoint call per chunk row

# Preview the first two rows, now including the embedding column
df.show(2)

+--------------------+-------------------+------+--------------------+--------------------+--------------------+
|                path|   modificationTime|length|             content|                text|           embedding|
+--------------------+-------------------+------+--------------------+--------------------+--------------------+
|dbfs:/Volumes/lit...|2025-07-28 12:03:18|634144|[25 50 44 46 2D 3...|Quarterly Market ...|[0.00300407409667...|
|dbfs:/Volumes/lit...|2025-07-28 12:03:18|634144|[25 50 44 46 2D 3...|In summary, Conto...|[0.01162719726562...|
+--------------------+-------------------+------+--------------------+--------------------+--------------------+
only showing top 2 rows



### Store embedding table into Unity Catalog

In [0]:
from pyspark.sql.functions import concat

# Keep only the columns stored in the target table, renamed to its column names
embedding_rows = df.selectExpr('path as url', 'text as content', 'embedding')

# Append the rows to the documents_embedding Delta table
# (catalog and db are not defined in this cell - presumably set by ./00-init; verify)
(
    embedding_rows.write
    .format("delta")
    .mode("append")
    .saveAsTable(f"{catalog}.{db}.documents_embedding")
)