### Install required external libraries 

In [0]:
%pip install transformers==4.30.2 "unstructured[pdf,docx]==0.10.30" langchain==0.1.5 llama-index==0.9.3 databricks-vectorsearch==0.22 pydantic==1.10.9 mlflow==2.10.1
# Restart the Python process so the package versions pinned above are the ones actually loaded.
dbutils.library.restartPython()

Note: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.
Collecting transformers==4.30.2
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.2/7.2 MB 45.6 MB/s eta 0:00:00
Collecting unstructured[docx,pdf]==0.10.30
  Downloading unstructured-0.10.30-py3-none-any.whl (1.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 47.6 MB/s eta 0:00:00
Collecting langchain==0.1.5
  Downloading langchain-0.1.5-py3-none-any.whl (806 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 806.7/806.7 kB 40.7 MB/s eta 0:00:00
Collecting llama-index==0.9.3
  Downloading llama_index-0.9.3-py3-none-any.whl (886 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 886.7/886.7 kB 45.6 MB/s eta 0:00:00
Collecting databricks-vectorsearch==0.22
  Downloading databricks_vectorsearch-0.22-py3-none-any.whl (8.5 kB)
Collecting pydantic==1.10.9
  Downloading pydantic-1.10.9-cp310-cp310-manylinux_2_17_x86_

### Init our resources and catalog

In [0]:
%run ./00-init $reset_all_data=false

### PDF files are available in a Volume (or DBFS)

In [0]:
# Folder of the volume that holds the raw PDF documents
volume_folder = f"/Volumes/{catalog}/{db}/documents_store/pdf_documents"

# Show every file currently stored in that folder
raw_pdf_listing = dbutils.fs.ls(volume_folder)
display(raw_pdf_listing)

path,name,size,modificationTime
dbfs:/Volumes/litware_unity_catalog/rag/documents_store/pdf_documents/Contoso News 1.pdf,Contoso News 1.pdf,1512290,1715020303000
dbfs:/Volumes/litware_unity_catalog/rag/documents_store/pdf_documents/Contoso News 2.pdf,Contoso News 2.pdf,1461847,1715020304000
dbfs:/Volumes/litware_unity_catalog/rag/documents_store/pdf_documents/Contoso Quarterly Financial Report - December 2023.pdf,Contoso Quarterly Financial Report - December 2023.pdf,146640,1715020301000
dbfs:/Volumes/litware_unity_catalog/rag/documents_store/pdf_documents/Contoso Quarterly Financial Report - March 2024.pdf,Contoso Quarterly Financial Report - March 2024.pdf,118041,1715020302000
dbfs:/Volumes/litware_unity_catalog/rag/documents_store/pdf_documents/Contoso Quarterly Market SentimentReport - December 2023.pdf,Contoso Quarterly Market SentimentReport - December 2023.pdf,226345,1715020302000
dbfs:/Volumes/litware_unity_catalog/rag/documents_store/pdf_documents/Contoso Quarterly Market SentimentReport - March 2024.pdf,Contoso Quarterly Market SentimentReport - March 2024.pdf,222272,1715020302000
dbfs:/Volumes/litware_unity_catalog/rag/documents_store/pdf_documents/Marketing Campaign for Contoso to improve customer churn.pdf,Marketing Campaign for Contoso to improve customer churn.pdf,576637,1715020302000


### Transform the PDFs into text

In [0]:
from unstructured.partition.auto import partition
import re

def extract_doc_text(x : bytes) -> str:
  """Extract the cleaned plain text of a document from its raw bytes.

  The bytes are wrapped in an in-memory file and parsed with ``partition``
  from ``unstructured``. The text of every returned section is cleaned and
  the sections are joined with a newline character.

  Args:
    x: Raw content of the document (for example the bytes of a PDF file).

  Returns:
    The cleaned text of all sections, one section per line.
  """
  # Imported here so the function does not depend on another notebook cell
  # having imported ``io`` before it is called.
  import io

  def clean_section(txt):
    # Remove line breaks inside a section, then drop the single space that may precede a period.
    txt = re.sub(r'\n', '', txt)
    return re.sub(r' ?\.', '.', txt)

  # Read the document and extract its sections with unstructured
  sections = partition(file=io.BytesIO(x))
  # Default split is by section of document, concatenate them all together because we want to split by sentence instead.
  return "\n".join(clean_section(s.text) for s in sections)

In [0]:
%sql
-- Target table for the document chunks and their embeddings.
-- Note that we need to enable Change Data Feed on the table to create the index.
-- NOTE(review): the catalog and schema are hard-coded here, while the Python cells build the
-- table name from {catalog}.{db} -- confirm both resolve to the same location.
CREATE TABLE IF NOT EXISTS litware_unity_catalog.rag.documents_embedding_openai (
  id BIGINT GENERATED BY DEFAULT AS IDENTITY,
  url STRING,
  content STRING,
  embedding STRING
) TBLPROPERTIES (delta.enableChangeDataFeed = true); 

#### Create the final documents_embedding table containing chunks and embeddings

In [0]:
from pyspark.sql import SparkSession, functions as F
import io
from llama_index.langchain_helpers.text_splitter import SentenceSplitter
from llama_index import Document, set_global_tokenizer
from transformers import AutoTokenizer

# Reduce the Arrow batch size to 10 records, as each record holds the content of a PDF and can be big in memory.
# NOTE(review): the UDFs in this notebook are registered with F.udf rather than pandas_udf — confirm this setting has an effect here.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 10)

# Define the function to read data as chunks
def read_as_chunk(text):
    """Split one raw document into sentence-based text chunks.

    The document text is obtained with ``extract_doc_text`` and split with
    llama_index's ``SentenceSplitter`` (``chunk_size=500``, ``chunk_overlap=50``).

    Args:
        text: Raw content of the document, passed unchanged to ``extract_doc_text``.

    Returns:
        A list with the text of every chunk, in the order produced by the splitter.
    """
    # Loading the tokenizer is the same for every document, so it is loaded once
    # and kept on the function object instead of being reloaded on each call.
    tokenizer = getattr(read_as_chunk, "_tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
        read_as_chunk._tokenizer = tokenizer
    # Set the llama tokenizer as llama_index's global tokenizer before the splitter is created.
    # NOTE(review): the original comment justified this with the BGE model limit, but the
    # embeddings are computed through a different endpoint later on — confirm the chunk size.
    set_global_tokenizer(tokenizer)
    # Sentence splitter from llama_index to split on sentences
    splitter = SentenceSplitter(chunk_size=500, chunk_overlap=50)
    txt = extract_doc_text(text)
    nodes = splitter.get_nodes_from_documents([Document(text=txt)])
    return [n.text for n in nodes]

# Wrap the chunking function as a Spark UDF returning an array of strings
read_as_chunk_udf = F.udf(read_as_chunk, F.ArrayType(F.StringType()))

# Load the 'documents_raw' table and produce one row per chunk in a new "text" column
df = (
    spark.read.table(f"{catalog}.{db}.documents_raw")
    .withColumn("text", F.explode(read_as_chunk_udf(F.col("content"))))
)

# Preview the first rows of the chunked DataFrame
df.show(n=9)

+--------------------+-------------------+------+--------------------+--------------------+
|                path|   modificationTime|length|             content|                text|
+--------------------+-------------------+------+--------------------+--------------------+
|dbfs:/Volumes/lit...|2024-03-27 21:29:52|634144|[25 50 44 46 2D 3...|Quarterly Market ...|
|dbfs:/Volumes/lit...|2024-03-27 21:29:52|634144|[25 50 44 46 2D 3...|In summary, Conto...|
|dbfs:/Volumes/lit...|2024-03-27 21:29:52|634144|[25 50 44 46 2D 3...|Highlight new ser...|
|dbfs:/Volumes/lit...|2024-03-27 21:29:52|634144|[25 50 44 46 2D 3...|Highly Unsatisfie...|
|dbfs:/Volumes/lit...|2024-03-27 21:29:52|634144|[25 50 44 46 2D 3...|Highly Unsatisfie...|
|dbfs:/Volumes/lit...|2024-03-27 21:29:52|634144|[25 50 44 46 2D 3...|It has been nothi...|
|dbfs:/Volumes/lit...|2024-03-27 21:29:52|634144|[25 50 44 46 2D 3...|I will not be rec...|
|dbfs:/Volumes/lit...|2024-03-27 21:29:52|634144|[25 50 44 46 2D 3...|I would no

### Computing the chunk embeddings and saving them to our Delta Table


In [0]:
import mlflow.deployments
from pyspark.sql import functions as F
from pyspark.sql.functions import *

def get_embedding(text: str, endpoint: str = "text-embedding-ada-002-Azure-OpenAI") -> list[float]:
    """Compute the embedding of a single text through a serving endpoint.

    Args:
        text: The text to embed; it is sent as a one-element ``input`` list.
        endpoint: Name of the serving endpoint to query. Defaults to the endpoint
            that was previously hard-coded (the original code mentioned
            ``databricks-bge-large-en`` as an alternative).

    Returns:
        The ``embedding`` entry of the first item in the response data, or
        ``None`` when the request or the response parsing fails.
    """
    # Imported locally so the function stays self-contained when used as a UDF.
    import logging

    deploy_client = mlflow.deployments.get_deploy_client("databricks")  # Assuming MLflow is configured

    try:
        response = deploy_client.predict(endpoint=endpoint, inputs={"input": [text]})
        return response.data[0]['embedding']  # Extract embedding for the single input
    except Exception as e:
        # Stay best-effort (the row gets a null embedding), but leave a trace of the
        # failure instead of hiding it completely.
        logging.getLogger("get_embedding").warning(
            "Embedding request to endpoint %r failed: %s", endpoint, e
        )
        return None

# Wrap get_embedding as a Spark UDF whose result is declared as a string column
get_embedding_udf = F.udf(get_embedding, F.StringType())

# Add an "embedding" column computed from each chunk of text
df = df.withColumn("embedding", get_embedding_udf(df["text"]))

# Preview two rows to check the new column
df.show(n=2)

+--------------------+-------------------+------+--------------------+--------------------+--------------------+
|                path|   modificationTime|length|             content|                text|           embedding|
+--------------------+-------------------+------+--------------------+--------------------+--------------------+
|dbfs:/Volumes/lit...|2024-03-27 21:29:52|634144|[25 50 44 46 2D 3...|Quarterly Market ...|[-0.013218905, -0...|
|dbfs:/Volumes/lit...|2024-03-27 21:29:52|634144|[25 50 44 46 2D 3...|In summary, Conto...|[-0.013934631, -0...|
+--------------------+-------------------+------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import concat

# Keep the three columns of the target table (renamed to its column names)
# and append them to the Delta table.
(
    df.select(
        F.col("path").alias("url"),
        F.col("text").alias("content"),
        F.col("embedding"),
    )
    .write
    .format("delta")
    .mode("append")
    .saveAsTable(f"{catalog}.{db}.documents_embedding_openai")
)