In [0]:
from datetime import datetime

In [0]:
# Catalog and schema used below to build the fully qualified
# (catalog.schema.table) names of the tables this notebook works with.
CATALOG: str = "marcell"
SCHEMA: str = "marine_ai_poc"

In [0]:
# Fully qualified (catalog.schema.table) names of the tables used by this notebook.
# All of them live under the same catalog/schema pair, so build that prefix once.
_TABLE_NAMESPACE = f"{CATALOG}.{SCHEMA}"

# Not referenced by any other cell visible in this notebook.
DOC_REFERENCE_TABLE = _TABLE_NAMESPACE + ".document_pages_parsed"
# Step 1 output: one row per screened chunk.
PARAGRAPH_TARGET_TABLE = _TABLE_NAMESPACE + ".app_paragraph_stage"
# Step 2 output: one row per document part.
DOCUMENT_TARGET_TABLE = _TABLE_NAMESPACE + ".app_document_stage"
# Step 3 output: one row per run.
PROJECT_TARGET_TABLE = _TABLE_NAMESPACE + ".app_project_stage"
# Not referenced by any other cell visible in this notebook.
SUMMARY_FEEDBACK_TABLE = _TABLE_NAMESPACE + ".app_feedback"

In [0]:
# Build an identifier for this execution: <timestamp>_<current user>.
# Every row written by the steps below is tagged with it, so later steps
# (and the final summary lookup) can select just this run's rows.
time_now = datetime.now()
_current_user_row = spark.sql("SELECT current_user()").collect()[0]
current_user = _current_user_row[0]
RUN_ID = f"{time_now:%Y%m%d%H%M%S}_{current_user}"

In [0]:

# Model identifiers passed to ai_query() by the three pipeline steps:
# MODEL_1 -> step 1 (per-chunk screening), MODEL_2 -> step 2 (per-document
# summary), MODEL_3 -> step 3 (project-level summary).
MODEL_1 = "databricks-meta-llama-3-1-8b-instruct"
MODEL_2 = "databricks-meta-llama-3-3-70b-instruct"
MODEL_3 = "databricks-meta-llama-3-3-70b-instruct"

# Default prompts, one per step. Each prompt is embedded in the generated SQL
# as a string literal and concatenated in front of the text sent to the model.

# Step 1: asks for a verbatim extract of marine-impact text, or the sentinel
# <NO IMPACT IDENTIFIED>. Step 2 filters rows on that sentinel, so keep the
# wording of the sentinel in sync with construct_query_2.
prompt_1 = """
You are an environmental policy expert. You are given a paragraph or excerpt from an environmental planning correspondence document. If the text pertains to any impacts on marine life, extract the text in complete sentences. If the text does not discuss topics potentially related to marine impact, output <NO IMPACT IDENTIFIED>. If the excerpt on the page looks like a table of contents, a bibliographical reference, or incomplete, ignore the excerpt and output <NO IMPACT IDENTIFIED>. Do not output anything else, apart from an exact extract of the text, or <NO IMPACT IDENTIFIED>. Here's the text: \n\n
"""

# Step 2: asks for a bulleted, page-referenced list of findings per document.
prompt_2 = """
You are an environment policy expert. You are given excerpts from a decision correspondence, where each excerpt pertains to potential or confirmed impact on marine life. Review the document and identify all instances where the Secretary of State (SoS) concluded that adverse effects on integrity (AEoI) cannot be ruled out beyond reasonable scientific doubt. For each instance, extract the specific impact, the quantitative impacts, affected species, locations and any compensation measures. Present these findings as a bulleted list of coherent and succinct sentences with page number references where this information appears in the document. Here is the text: \n\n
"""

# Step 3: asks for a merged, de-duplicated markdown summary across documents.
prompt_3 = """
You are an environmental policy expert. You are given summary lists of issues pertaining to impact of infrastructure development on marine life, which have been identified in separate documents. Create a summary list of coherent sentences from these issues, merging any similar or duplicate issues into one item in your final list, and including the species, any quantitative impacts if any, compensation measures if any, the location, and the source document and page of the information. Format your content as markdown, putting any locations, species, and quantities into bold, and creating hyperlinks from your document references from the title and URLs provided. Here is the text: \n\n
"""

# Documents to process. This list is passed to construct_query_1 as
# `document_ids` and compared against the `filename` column of the source table.
# NOTE(review): the values are full URLs (with unencoded spaces) while the
# filter column is called `filename` - confirm that column really stores URLs.
pdf_urls = [
    "https://infrastructure.planninginspectorate.gov.uk/wp-content/ipc/uploads/projects/EN010109/EN010109-002394-SADEP NMC Amendment Order- 6 Feb.pdf",
    "https://infrastructure.planninginspectorate.gov.uk/wp-content/ipc/uploads/projects/EN010109/EN010109-002383-MOD 30 August 24.pdf"
]

### CREATE TABLES

In [0]:
# DDL for the step-1 output table: one row per screened chunk.
create_paragraph_table_query = f"""
CREATE TABLE IF NOT EXISTS {PARAGRAPH_TARGET_TABLE} (
    run_id STRING, pdf_url_encoded STRING, title STRING, chunk_text STRING, 
    chunk_pages STRING, model_used STRING, prompt STRING, ai_assessment STRING
)
"""

# DDL for the step-2 output table: one row per document part.
create_document_table_query = f"""
CREATE TABLE IF NOT EXISTS {DOCUMENT_TARGET_TABLE} (
    run_id STRING, pdf_url_encoded STRING, title STRING, document_summary STRING, 
    model_used STRING, prompt STRING
)
"""

# DDL for the step-3 output table: one row per run.
create_project_table_query = f"""
CREATE TABLE IF NOT EXISTS {PROJECT_TARGET_TABLE} (
    run_id STRING, project_impacts STRING, model_used STRING, prompt STRING
)
"""

# Each statement uses IF NOT EXISTS, so re-running this cell is harmless.
for _ddl_statement in (
    create_paragraph_table_query,
    create_document_table_query,
    create_project_table_query,
):
    spark.sql(_ddl_statement)

### FUNCTIONS TO GENERATE SQL QUERIES

In [0]:
def construct_query_1(run_id, paragraph_target_table, prompt, model, document_ids,
                      source_table="marcell.marine_planning.decision_chunks"):
    """Build the step-1 SQL: screen every chunk of the selected documents with an LLM.

    The returned statement inserts one row per source chunk into
    ``paragraph_target_table``, tagging it with ``run_id``, the model name and
    the prompt, and storing the result of ``ai_query(model, prompt || chunk_text)``
    as ``ai_assessment``. Spaces in ``pdf_url`` are replaced by ``%20``.

    Args:
        run_id: Identifier stored in the ``run_id`` column of every inserted row.
        paragraph_target_table: Fully qualified name of the table to insert into.
            Interpolated as-is (identifiers are not escaped).
        prompt: Instruction text prepended to each chunk before calling the model.
        model: Model/endpoint name passed to ``ai_query``.
        document_ids: Iterable of values matched against the ``filename`` column
            of the source table.
        source_table: Fully qualified name of the chunk table to read from.
            Defaults to the table that was previously hard-coded here.
            Interpolated as-is (identifiers are not escaped).

    Returns:
        The SQL text as a string; nothing is executed here.

    All values placed inside SQL string literals are backslash-escaped, so a
    quote in the user-derived run id, the prompt or a document id can no longer
    terminate the literal early. This relies on Spark's default handling of
    backslash escapes in string literals
    (``spark.sql.parser.escapedStringLiterals`` left at ``false``).
    """
    def _esc(value, quote):
        # Escape the backslash first so the escapes added for the quote
        # character are not themselves doubled.
        return str(value).replace("\\", "\\\\").replace(quote, "\\" + quote)

    run_id_sql = _esc(run_id, "'")
    model_sql = _esc(model, "'")
    prompt_sql = _esc(prompt, '"')

    quoted_ids = ["'" + _esc(doc_id, "'") + "'" for doc_id in document_ids]
    # An empty IN () is a syntax error; fall back to a single empty literal,
    # which is what an empty id list produced before this change.
    documents = ", ".join(quoted_ids) or "''"

    return f"""
        INSERT INTO {paragraph_target_table}
        SELECT
            '{run_id_sql}' AS run_id,
            REPLACE(pdf_url, " ", "%20") AS pdf_url_encoded,
            title,
            chunk_text,
            chunk_pages,
            '{model_sql}' AS model_used,
            "{prompt_sql}" AS prompt,
            ai_query('{model_sql}', "{prompt_sql}" || chunk_text) AS ai_assessment
        FROM {source_table}
        WHERE filename IN ({documents})
    """

In [0]:

def construct_query_2(run_id, paragraph_target_table, document_target_table, prompt, model,
                      chunks_per_part=200):
    """Build the step-2 SQL: summarise the flagged chunks of each document.

    The returned statement reads this run's rows from ``paragraph_target_table``,
    drops those whose ``ai_assessment`` contains ``<NO IMPACT IDENTIFIED>``,
    prefixes each remaining assessment with its page reference, splits every
    title into parts of at most ``chunks_per_part`` chunks, concatenates each
    part and sends ``prompt || concatenated text`` to ``ai_query``. One row per
    (url, title part) is inserted into ``document_target_table``.

    Args:
        run_id: Run whose step-1 rows are read; also stored on the output rows.
        paragraph_target_table: Fully qualified name of the step-1 table.
            Interpolated as-is (identifiers are not escaped).
        document_target_table: Fully qualified name of the table to insert into.
            Interpolated as-is (identifiers are not escaped).
        prompt: Instruction text prepended to the concatenated assessments.
        model: Model/endpoint name passed to ``ai_query``.
        chunks_per_part: Maximum number of chunks combined into one model call.
            Defaults to 200, the value that was previously hard-coded.

    Returns:
        The SQL text as a string; nothing is executed here.

    Raises:
        ValueError: If ``chunks_per_part`` is smaller than 1.

    Values placed inside SQL string literals are backslash-escaped so quotes in
    the run id, prompt or model name cannot break the statement (assumes Spark's
    default backslash-escape handling for string literals).

    NOTE(review): rows are numbered ``ORDER BY chunk_pages``; if that column is
    a STRING (as in this notebook's CREATE TABLE statement) the order is
    lexicographic, not numeric. ``collect_list`` is also not guaranteed to keep
    that order. The separator literal contains two double-quote characters that
    look unintended; it is kept unchanged so model input does not change.
    """
    def _esc(value, quote):
        # Escape the backslash first so the escapes added for the quote
        # character are not themselves doubled.
        return str(value).replace("\\", "\\\\").replace(quote, "\\" + quote)

    part_size = int(chunks_per_part)
    if part_size < 1:
        raise ValueError("chunks_per_part must be a positive integer")

    run_id_sql = _esc(run_id, "'")
    prompt_sql = _esc(prompt, '"')
    # The model name appears once in a double-quoted and once in a
    # single-quoted literal, so it needs one escaped form per context.
    model_dq = _esc(model, '"')
    model_sq = _esc(model, "'")

    return f"""
        INSERT INTO {document_target_table}
        WITH impactful_pages AS (
            SELECT
                *,
                "Page(s): " || chunk_pages || "\n\n" || ai_assessment AS ai_assessment_with_pages
            FROM {paragraph_target_table}
            WHERE ai_assessment NOT ILIKE "%<NO IMPACT IDENTIFIED>%" AND run_id = '{run_id_sql}'
        ),
        numbered_impactful_pages AS (
            SELECT
                title,
                pdf_url_encoded,
                ai_assessment_with_pages,
                ROW_NUMBER() OVER (PARTITION BY title ORDER BY chunk_pages) AS row_num
            FROM impactful_pages
        ),
        grouped_impactful_pages AS (
            SELECT
                title || ' (part ' || CEIL(row_num / {part_size}) || ')' AS title_part,
                pdf_url_encoded,
                ai_assessment_with_pages
            FROM numbered_impactful_pages
        )
        SELECT
            '{run_id_sql}' AS run_id,
            pdf_url_encoded,
            title_part AS title,
            ai_query("{model_dq}", system_prompt || concatenated_assessments) AS document_summary,
            '{model_sq}' AS model_used,
            "{prompt_sql}" AS prompt
        FROM (
            SELECT
                pdf_url_encoded,
                title_part,
                "{prompt_sql}" AS system_prompt,
                concat_ws('"\n\n\n ==================" \n\n', collect_list(ai_assessment_with_pages)) AS concatenated_assessments
            FROM grouped_impactful_pages
            GROUP BY pdf_url_encoded, title_part
        );
    """


In [0]:
def construct_query_3(run_id, document_target_table, project_target_table, prompt, model):
    """Build the step-3 SQL: merge this run's document summaries into one project summary.

    The returned statement reads this run's rows from ``document_target_table``,
    renders each as a "Document / URL / Summary" text block, concatenates all
    blocks and sends ``prompt || concatenated text`` to ``ai_query``. The result
    is inserted into ``project_target_table`` as ``project_impacts``.

    Args:
        run_id: Run whose step-2 rows are read; also stored on the output row.
        document_target_table: Fully qualified name of the step-2 table.
            Interpolated as-is (identifiers are not escaped).
        project_target_table: Fully qualified name of the table to insert into.
            Interpolated as-is (identifiers are not escaped).
        prompt: Instruction text prepended to the concatenated summaries.
        model: Model/endpoint name passed to ``ai_query``.

    Returns:
        The SQL text as a string; nothing is executed here.

    Values placed inside SQL string literals are backslash-escaped so quotes in
    the run id, prompt or model name cannot break the statement (assumes Spark's
    default backslash-escape handling for string literals).

    NOTE(review): the aggregation has no GROUP BY, so it presumably still yields
    one row (and one model call on the bare prompt) when the run has no document
    summaries - confirm whether that is wanted. The separator literal contains
    two double-quote characters that look unintended; it is kept unchanged so
    model input does not change.
    """
    def _esc(value, quote):
        # Escape the backslash first so the escapes added for the quote
        # character are not themselves doubled.
        return str(value).replace("\\", "\\\\").replace(quote, "\\" + quote)

    run_id_sql = _esc(run_id, "'")
    prompt_sql = _esc(prompt, '"')
    # The model name appears once in a double-quoted and once in a
    # single-quoted literal, so it needs one escaped form per context.
    model_dq = _esc(model, '"')
    model_sq = _esc(model, "'")

    return f"""
        INSERT INTO {project_target_table}
        WITH doc_summaries AS (
            SELECT *, "Document: " || title || "\n" || "URL: " || pdf_url_encoded || "\n" || "Summary: " || document_summary AS doc_assessment_with_titles
            FROM {document_target_table}
            WHERE run_id = '{run_id_sql}'
        )
        SELECT
            '{run_id_sql}' AS run_id,
            ai_query("{model_dq}", system_prompt || concatenated_doc_assessments) AS project_impacts,
            '{model_sq}' AS model_used, "{prompt_sql}" AS prompt
        FROM (
            SELECT "{prompt_sql}" AS system_prompt,
            concat_ws('"\n\n\n ==================" \n\n', collect_list(doc_assessment_with_titles)) AS concatenated_doc_assessments
            FROM doc_summaries
        );
    """

### RUN FUNCTIONS

Step 1: Filter pages/paragraphs

In [0]:
# Screen every chunk of the listed PDFs with the step-1 model and prompt,
# writing one assessment row per chunk to the paragraph stage table.
query_1 = construct_query_1(
    run_id=RUN_ID,
    paragraph_target_table=PARAGRAPH_TARGET_TABLE,
    prompt=prompt_1,
    model=MODEL_1,
    document_ids=pdf_urls,
)
spark.sql(query_1)


Step 2: Summarise individual documents

In [0]:
# Summarise this run's flagged chunks per document, reading the paragraph
# stage table and writing to the document stage table.
query_2 = construct_query_2(
    run_id=RUN_ID,
    paragraph_target_table=PARAGRAPH_TARGET_TABLE,
    document_target_table=DOCUMENT_TARGET_TABLE,
    prompt=prompt_2,
    model=MODEL_2,
)
spark.sql(query_2)

Step 3: Summarise project

In [0]:
# Merge this run's document summaries into a single project-level summary,
# reading the document stage table and writing to the project stage table.
query_3 = construct_query_3(
    run_id=RUN_ID,
    document_target_table=DOCUMENT_TARGET_TABLE,
    project_target_table=PROJECT_TARGET_TABLE,
    prompt=prompt_3,
    model=MODEL_3,
)
spark.sql(query_3)

### PRINT SUMMARY

In [0]:
# Fetch the project-level summary stored for this run and print it.
# RUN_ID embeds the current user's name, so escape backslashes and single
# quotes before placing it inside the single-quoted SQL literal.
_run_id_literal = RUN_ID.replace("\\", "\\\\").replace("'", "\\'")
_summary_rows = spark.sql(
    f"SELECT project_impacts FROM {PROJECT_TARGET_TABLE} WHERE run_id = '{_run_id_literal}'"
).collect()

# Fail with an explanatory message instead of a bare IndexError when the
# project summary step has not produced a row for this run.
if not _summary_rows:
    raise RuntimeError(
        f"No project summary found in {PROJECT_TARGET_TABLE} for run_id {RUN_ID!r}; "
        "run the project summary step first."
    )

# If the step was executed more than once for this run, the first returned row is shown.
summary = _summary_rows[0]["project_impacts"]
print(summary)