In [None]:
from notebookutils import mssparkutils
from pyspark.sql.functions import udf, col, from_json, concat_ws, explode, current_timestamp
from pyspark.sql.types import StringType, Row, StructType, StructField, ArrayType, MapType
from pyspark.sql.utils import AnalysisException


from synapse.ml.services import AnalyzeDocument

from delta.tables import *

from synapse.ml.services.openai import OpenAIChatCompletion
import json

In [None]:
# Enable automatic Delta schema merging for this Spark session.
spark.conf.set(
    "spark.databricks.delta.schema.autoMerge.enabled",
    "true",
)

In [None]:
# Fetch all required secrets from Azure Key Vault.
# The vault URL is defined once so that switching vaults is a single edit.
key_vault_url = 'https://keyvaultnew.vault.azure.net/'

# Key and region passed to the AnalyzeDocument transformer.
ai_services_key = mssparkutils.credentials.getSecret(key_vault_url, 'DocIntelligenceKey')
ai_services_location = mssparkutils.credentials.getSecret(key_vault_url, 'DocIntelligenceRegion')
# Key and endpoint URL passed to the OpenAIChatCompletion transformer.
ai_aoai_key = mssparkutils.credentials.getSecret(key_vault_url, 'AOAIKey')
ai_aoai_url = mssparkutils.credentials.getSecret(key_vault_url, 'AOAIURL')

In [None]:
# Input parameter: path of the PDF to process.
# It is passed unchanged to the binaryFile reader in the next cell.
document_path = "Files/PDF/MYPDFFILE.pdf"

In [None]:
# Load the input document(s) as binary rows, keep at most 10 of them and
# cache the result so the raw bytes are not re-read by later actions.
df = spark.read.format("binaryFile").load(document_path).limit(10).cache()

In [None]:
# Document analysis transformer using the prebuilt layout model.
# It reads the file bytes from "content" and writes its response to "result".
analyze_document = (
    AnalyzeDocument()
    .setSubscriptionKey(ai_services_key)
    .setLocation(ai_services_location)
    .setPrebuiltModelId("prebuilt-layout")
    .setPages("1-5")  # for the sake of quick processing, only pages 1-5 of each document are read
    .setImageBytesCol("content")
    .setOutputCol("result")
)

# Run the analysis and surface the extracted text and paragraphs as
# top-level columns; cached because later cells read it more than once.
analyzed_df = (
    analyze_document.transform(df)
    .withColumn("output_content", col("result.analyzeResult.content"))
    .withColumn("paragraphs", col("result.analyzeResult.paragraphs"))
    .cache()
)

In [None]:
# Drop the "content" column (the bytes column fed to AnalyzeDocument);
# the following cells only read output_content.
analyzed_df = analyzed_df.drop("content")

In [None]:
# Template of the JSON document the model is asked to fill in.
# Extend the inner dict with further attributes as needed and keep it
# consistent with the schema used to parse the model output.
json_structure = {
    "myjsonstructure": {
        "id": "",
        "date": "",
        "attribute 1": "",
        "attribute 2": "",
        # ... add further attributes here
    }
}

In [None]:
def make_message(role, content):
    """Build one chat message as a Spark Row.

    Args:
        role: Chat role of the message; also stored in the ``name`` field.
        content: Text of the message.

    Returns:
        A Row with the fields ``role``, ``content`` and ``name`` (in that order).
    """
    fields = {"role": role, "content": content, "name": role}
    return Row(**fields)

In [None]:
# Build one chat conversation (system + user message) per analyzed document.

# The system prompt and the serialized JSON template are identical for every
# document, so they are created once outside the loop.
system_message = make_message(
    "system", "You are a useful assistant supporting with structured extraction of information from texts. Don't add any comments or explaining text. Always only return the expected JSON filled with the content that was asked for. When you are asked to extract a Project Reference number search for a string that is between six and 15 characters long and can contain characters and numbers like 'MGR65002', 'PCM031', 'MDRCOVID19', 'M819943'. When an Audittype is asked this can be one of 'ISRS', 'ISA'"
)
extraction_request = json.dumps(json_structure)

messages = []

# Only the extracted text is needed on the driver; project it before
# collecting so the other analysis columns are not transferred for every row.
for document_row in analyzed_df.select("output_content").collect():
    user_message = make_message(
        "user",
        f"Extract the following information in JSON format: {extraction_request} from the following text: {document_row['output_content']}",
    )
    # Each entry is a one-column row whose single value is the message list.
    messages.append([[system_message, user_message]])

In [None]:
# Turn the collected conversations into a DataFrame with a single
# "messages" column, one row per document.
colname = ["messages"]
chat_df = spark.createDataFrame(data=messages, schema=colname)

In [None]:
# Chat-completion transformer backed by a provisioned AOAI gpt-4-32k
# deployment (used in case Fabric Copilot is not available).
# Reads the conversation from "messages", writes the model answer to
# "chat_completions" and any error to "error".
response = (
    OpenAIChatCompletion()
    .setUrl(ai_aoai_url)
    .setSubscriptionKey(ai_aoai_key)
    .setDeploymentName("gpt-4-32k")
    .setMessagesCol("messages")
    .setOutputCol("chat_completions")
    .setErrorCol("error")
)

In [None]:
# Alternative: use the Fabric built-in AOAI model in case Fabric Copilot is
# available (no explicit AOAI key or URL necessary). Uncomment the block below
# to use it instead of the provisioned deployment.
# It is kept as real comments rather than a string literal so that running
# this cell does not echo the snippet as cell output.
#
# response = (
#     OpenAIChatCompletion()
#     .setDeploymentName("gpt-4-32k")
#     .setMessagesCol("messages")
#     .setErrorCol("error")
#     .setOutputCol("chat_completions")
# )

In [None]:
# Run the chat completion, keep the input messages plus the content of the
# returned choices, and join that content array into one string column.
intermediate_df = (
    response.transform(chat_df)
    .select("messages", "chat_completions.choices.message.content")
    .withColumn("content_str", concat_ws("", col("content")))
)

In [None]:
# Schema used to parse the model's JSON answer. The field list mirrors the
# JSON template sent to the model; extend both together.
# NOTE(review): the schema is an array of structs while the prompt asks for a
# single JSON object — presumably from_json yields a one-element array in that
# case; confirm against the Spark version in use.
myjson_schema = ArrayType(
    StructType([
        StructField("myjsonstructure", StructType([
            StructField("id", StringType(), True),
            StructField("date", StringType(), True),
            StructField("attribute 1", StringType(), True),
            StructField("attribute 2", StringType(), True),
            # ... add further fields here
        ])),
    ])
)

In [None]:
# Parse the model's JSON text into a typed "parsedContent" column.
new_df = intermediate_df.withColumn(
    "parsedContent",
    from_json(col("content_str"), myjson_schema),
)

In [None]:
# Mark the parsed frame for caching; it is read again when parsedContent
# is exploded in the next cell.
new_df.cache()

In [None]:
# Produce one row per element of the parsedContent array.
new_df_exploded = new_df.select(
    explode(col("parsedContent")).alias("parsedContent")
)

In [None]:
# Each entry describes one output table: its name and the columns to select
# from the exploded frame. Nested fields are addressed through the
# "myjsonstructure" struct declared in the parsing schema.
# NOTE(review): output column names containing spaces may be rejected by Delta
# unless column mapping is enabled — confirm, or alias such columns.
new_dfs_info = [
    {
        "newDataFrameName": "df_myjsonstructure",
        "columnNames": [
            "parsedContent.myjsonstructure.id",
            "parsedContent.myjsonstructure.attribute 1",
            "parsedContent.myjsonstructure.attribute 2",
            # ... add further columns here
            current_timestamp().alias("insert_datetime"),
        ],
    },
]

In [None]:
def create_new_dataframes(sourceDataFrame, newDataFrames):
    """Derive several named DataFrames from one source by column selection.

    Args:
        sourceDataFrame: Object exposing ``select(*columns)``; every new
            DataFrame is selected from it.
        newDataFrames: Iterable of dicts, each with the keys
            ``"newDataFrameName"`` (name of the result) and ``"columnNames"``
            (columns handed to ``select``).

    Returns:
        Dict mapping each ``newDataFrameName`` to the selected DataFrame.
        If a name occurs more than once, the last entry wins.
    """
    frames_by_name = {}

    for spec in newDataFrames:
        target_name, selected_columns = spec["newDataFrameName"], spec["columnNames"]
        # Show which columns go into this DataFrame.
        print(selected_columns)
        frames_by_name[target_name] = sourceDataFrame.select(*selected_columns)

    return frames_by_name

In [None]:
# Build the per-table DataFrames from the exploded model output.
new_dfs = create_new_dataframes(
    sourceDataFrame=new_df_exploded,
    newDataFrames=new_dfs_info,
)

In [None]:
output_path = 'Tables/'

# Append each derived DataFrame to its own Delta table under output_path.
# Because the write mode is "append", re-running this cell adds the same rows again.
# The loop variable is deliberately not called `df`, so the input DataFrame
# `df` defined earlier in the notebook is not overwritten.
for df_name, table_df in new_dfs.items():
    # Strip the trailing slash so the joined path has exactly one separator.
    table_path = f"{output_path.rstrip('/')}/{df_name}"
    (
        table_df.write
        .format("delta")
        .option("mergeSchema", "true")
        .mode("append")
        .save(table_path)
    )
