## Setup

In [1]:
import os
from __future__ import annotations
    
from azure.core.credentials import AzureKeyCredential
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings    
from langchain_core.prompts.prompt import PromptTemplate
from langchain.chains.base import Chain
from langchain.chains.llm import LLMChain
from langchain_core.callbacks.manager import CallbackManager, CallbackManagerForChainRun
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts.base import BasePromptTemplate
from pydantic import Field

from langchain_community.chains.graph_qa.prompts import (
    GRAPHDB_QA_PROMPT,
    GRAPHDB_SPARQL_FIX_PROMPT,
    GRAPHDB_SPARQL_GENERATION_PROMPT,
)
from langchain_community.graphs import OntotextGraphDBGraph
from IPython.display import Image, display

from langsmith import Client
from langsmith import traceable

from langchain_community.graphs import OntotextGraphDBGraph
from typing import TYPE_CHECKING, Any, Dict, List, Optional
if TYPE_CHECKING:
    import rdflib

from rdflib.plugins.sparql import prepareQuery
from rdflib import Graph
from rdflib import Namespace
    
import textwrap
import pandas as pd

# Load settings from the .env file one directory above the notebook so the
# os.environ[...] lookups in the configuration cell can find them.
from dotenv import load_dotenv
load_dotenv(os.path.join("..", "Azure OpenAI credentials.env"))

True

In [2]:
# Index field names used by the Azure Search vector store.
# NOTE(review): these are set *before* the AzureSearch import on purpose --
# presumably the langchain Azure Search module reads them at import time.
# Confirm before moving the import up into the main import cell.
os.environ["AZURESEARCH_FIELDS_ID"] = "id"
os.environ["AZURESEARCH_FIELDS_CONTENT"] = "chunk"
os.environ["AZURESEARCH_FIELDS_CONTENT_VECTOR"] = "embedding"

from langchain.vectorstores import AzureSearch

In [3]:
# --- Azure OpenAI settings (a missing variable raises KeyError immediately) ---
azure_endpoint = os.environ['GLOBAL_AZURE_ENDPOINT']
openai_api_key = os.environ['GLOBAL_OPENAI_API_KEY']

openai_deployment_name = os.environ['GLOBAL_GPT_DEPLOYMENT_NAME']
openai_api_version = os.environ['GLOBAL_OPENAI_API_VERSION']
embedding_model = os.environ['GLOBAL_EMBEDDING_MODEL']
embedding_deployment_name = os.environ['GLOBAL_EMBEDDING_DEPLOYMENT_NAME']

# --- Azure AI Search settings ---
search_endpoint = os.environ['SEARCH_ENDPOINT']
search_api_key = os.environ['SEARCH_API_KEY']
search_api_version = os.environ['SEARCH_API_VERSION']
search_service_name = os.environ['SEARCH_SERVICE_NAME']

# langsmith_api_key = os.environ['LANGSMITH_API_KEY']

search_url = f"https://{search_service_name}.search.windows.net/"
search_credential = AzureKeyCredential(search_api_key)

# Chat model used by every chain in this notebook; temperature 0 keeps the
# sampling as greedy as the service allows.
llm = AzureChatOpenAI(
    deployment_name=openai_deployment_name,
    openai_api_version=openai_api_version,
    openai_api_key=openai_api_key,
    azure_endpoint=azure_endpoint,
    temperature=0,
)

# Embedding client shared by both vector stores.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=embedding_deployment_name,
    api_version=openai_api_version,
    api_key=openai_api_key,
    azure_endpoint=azure_endpoint,
)

index_name: str = "crd-vector-store"
ontology_index_name: str = "crd-vector-store-ontologies"


def _build_vector_store(target_index: str) -> AzureSearch:
    """Return an AzureSearch store bound to ``target_index``.

    Uses the search endpoint/key and the embedding client configured above.
    """
    return AzureSearch(
        azure_search_endpoint=search_endpoint,
        azure_search_key=search_api_key,
        index_name=target_index,
        embedding_function=embeddings.embed_query,
    )


# Document chunks and ontology chunks live in separate indexes.
vector_store: AzureSearch = _build_vector_store(index_name)

ontology_vector_store: AzureSearch = _build_vector_store(ontology_index_name)

## Parameters

Graph database config

In [33]:
# Location of the ontology (schema) and of the instance data.
# NOTE(review): both paths currently point at the same Turtle file -- confirm
# that this is intended.
_ontology_dir = os.path.join("..", "ontology")
ontology_path = os.path.join(_ontology_dir, "CRA V15.3.ttl")
inst_ontology_path = os.path.join(_ontology_dir, "CRA V15.3.ttl")

# In-memory rdflib graph queried by the rest of the notebook.
g = Graph()
g.parse(inst_ontology_path)

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#dateTime, Converter=<function parse_datetime at 0x0000026ABB8198A0>
Traceback (most recent call last):
  File "c:\Users\ITLS104415\AppData\Local\Programs\Python\Python312\Lib\site-packages\isodate\isodatetime.py", line 51, in parse_datetime
    datestring, timestring = datetimestring.split('T')
    ^^^^^^^^^^^^^^^^^^^^^^
ValueError: not enough values to unpack (expected 2, got 1)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\ITLS104415\AppData\Local\Programs\Python\Python312\Lib\site-packages\rdflib\term.py", line 2119, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
           ^^^^^^^^^^^^^^^^^^
  File "c:\Users\ITLS104415\AppData\Local\Programs\Python\Python312\Lib\site-packages\isodate\isodatetime.py", line 53, in parse_datetime
    raise ISO8601Error("ISO 8601 time designator 'T' missin

<Graph identifier=Ncbc1113f11f24ae8ba5b685db96b115a (<class 'rdflib.graph.Graph'>)>

In [34]:
# Graph test: list every (predicate, object) pair attached to cro:DPO_Operation
# to confirm that the graph loaded and the namespaces resolve.

# Namespaces used by the ontology
CRO = Namespace("http://WSP.org/ontology/cro#")
XSD = Namespace("http://www.w3.org/2001/XMLSchema#")

# Register the prefixes on the graph (same order as declared above)
for prefix, namespace in (("cro", CRO), ("xsd", XSD)):
    g.bind(prefix, namespace)

# SPARQL query retrieving all predicates and objects for DPO_Operation
cro_query = """
PREFIX cro: <http://WSP.org/ontology/cro#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?predicate ?object
WHERE {
    cro:DPO_Operation ?predicate ?object .
}
"""

# Execute against the in-memory graph
results = g.query(cro_query)

# One line of output per result row
for row in results:
    print(f"Predicate: {row.predicate}, Object: {row.object}")

Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://www.w3.org/2002/07/owl#NamedIndividual
Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://WSP.org/ontology/cro#Operation
Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://WSP.org/ontology/cro#Port
Predicate: http://WSP.org/ontology/cro#hasProject, Object: http://WSP.org/ontology/cro#DPO_Dampier_Resilience_Project
Predicate: http://WSP.org/ontology/cro#hasRiskAsessmentReport, Object: http://WSP.org/ontology/cro#DPO_CriticalRiskAassesmentReport_2023
Predicate: http://WSP.org/ontology/cro#hasTerminal, Object: http://WSP.org/ontology/cro#DPO_East_Intercourse_Island
Predicate: http://WSP.org/ontology/cro#hasTerminal, Object: http://WSP.org/ontology/cro#DPO_Parker_Point
Predicate: http://WSP.org/ontology/cro#hasAddress, Object: PO Box 21, 6713 Dampier WA, AU
Predicate: http://WSP.org/ontology/cro#hasCRAType, Object: MFL
Predicate: http://WSP.org/ontology/cro#hasDateOfSu

In [35]:
# Keep the raw ontology text; it is passed to the prompts as the schema.
with open(ontology_path, mode='r', encoding='utf-8') as ontology_file:
    ontology = ontology_file.read()

In [36]:
# Keep only the schema part of the ontology: everything before the
# "Individuals" section header is retained, the rest is dropped.
individuals_start_string = "#    Individuals"
individuals_start_index = ontology.find(individuals_start_string)

# str.find returns -1 when the marker is absent (for example when this cell is
# re-run on already-truncated text). Slicing with -1 would silently drop the
# last character, so the text is only cut when the marker is actually found.
if individuals_start_index != -1:
    ontology = ontology[:individuals_start_index]

Query parameters

In [37]:
# Retry budgets for the query-generation pipeline.
max_sparql_retries = 5  # Max number of attempts at generating a correct SPARQL query
max_query_retries = 3   # Max number of attempts at generating a SPARQL query that returns at least one result

## Prompts

In [38]:
# Prompt that asks the model for a SPARQL SELECT query.
# Placeholders filled at invoke time: {schema}, {individuals}, {prompt}.
# The doubled braces ({{ }}) around the example WHERE block are escaped so they
# render as literal braces instead of being parsed as placeholders.
SPARQL_SELECT_TEMPLATE = """
Task: Generate a SPARQL SELECT statement for querying a graph database.

For instance, a question could be: "List all the Critical Risk Scenarios that impact 'Dampier Port'".

The following sparql query in backticks would be suitable:
```
PREFIX cro: <http://WSP.org/ontology/cro#>
SELECT ?criticalRiskScenario
WHERE {{
    ?operation cro:hasName "Dampier Port Operations" .
    ?criticalRiskScenario cro:impactsOperation ?operation .
}}
```
Instructions:
Use only the node types and properties provided in the schema.
Do not use any node types and properties that are not explicitly provided.
Do not wrap the query in backticks.
Include all necessary prefixes.
Schema:
{schema}
Note: Be as concise as possible.
Do not include any explanations or apologies in your responses.
Do not respond to any questions that ask for anything else than for you to construct a SPARQL query.
Do not include any text except the SPARQL query generated.

Useful individuals for classes and properties that can help formulating the query:
{individuals}

The question is:
{prompt}
"""
# All three variables must be supplied whenever this prompt is invoked.
SPARQL_GENERATION_SELECT_PROMPT = PromptTemplate(
    input_variables=["schema", "individuals", "prompt"], 
    template=SPARQL_SELECT_TEMPLATE
)

In [39]:
# Prompt used when a syntactically valid query returned no rows: it shows the
# model the previous query, the original question and the schema, and asks for
# a different query. Placeholders: {generated_sparql}, {prompt}, {schema}.
GRAPHDB_SPARQL_NO_RESULT_TEMPLATE = """
This following SPARQL query delimited by triple backticks
```
{generated_sparql}
```
is valid, but it didn't return any results from the graph.
Please try a slightly different SPARQL query.
This is the question in natural language that should be answered by the query, delimited by triple backticks:
```
{prompt}
```
Do not include any explanations or apologies in your responses.
Do not wrap the query in backticks.
Do not include any text except the SPARQL query generated.
The ontology schema delimited by triple backticks in Owl format is:
```
{schema}
```
"""

GRAPHDB_SPARQL_NO_RESULT_PROMPT = PromptTemplate(
    input_variables=["generated_sparql", "prompt", "schema"],
    template=GRAPHDB_SPARQL_NO_RESULT_TEMPLATE,
)

In [40]:
# Standard prompts
# One stage-specific alias per pipeline step, so a prompt can be swapped here
# without touching the chain definitions.

# Question -> SPARQL (custom template defined in this notebook)
sparql_generation_prompt: BasePromptTemplate = SPARQL_GENERATION_SELECT_PROMPT

# Invalid SPARQL + parse error -> repaired SPARQL (imported from langchain_community)
sparql_fix_prompt: BasePromptTemplate = GRAPHDB_SPARQL_FIX_PROMPT

# Question + query results -> natural-language answer (imported from langchain_community)
qa_prompt: BasePromptTemplate = GRAPHDB_QA_PROMPT

# Valid SPARQL with empty result -> alternative SPARQL (custom template defined in this notebook)
no_result_prompt: BasePromptTemplate = GRAPHDB_SPARQL_NO_RESULT_PROMPT

In [41]:
# Pipe each prompt into the chat model. Later cells call .invoke(...) on these
# chains and read the generated text from the result's .content attribute.
sparql_generation_chain = sparql_generation_prompt | llm

sparql_fix_chain = sparql_fix_prompt | llm

qa_chain = qa_prompt | llm

no_result_chain = no_result_prompt | llm

## Classes

In [45]:
# Callback manager handed to the SPARQL handler's logging helpers (they call
# _run_manager.on_text). NOTE(review): as a "noop" manager it presumably
# discards those messages -- confirm if the coloured log output is expected.
_run_manager = CallbackManagerForChainRun.get_noop_manager()
# Child callback manager; not referenced by the other visible cells.
callbacks = _run_manager.get_child()

In [46]:
class SPARQLQueryHandler:
    """Validate, repair and execute LLM-generated SPARQL queries.

    Args:
        max_sparql_retries: Maximum number of repair attempts made through the
            fix chain after the initial query failed to parse.
        fix_chain: Optional object exposing ``invoke(dict)`` whose result has a
            ``content`` attribute holding the repaired query. When omitted, the
            notebook-level ``sparql_fix_chain`` is looked up at call time.
    """

    def __init__(self, max_sparql_retries: int = 3, fix_chain: Any = None):
        self.max_sparql_retries = max_sparql_retries
        self.fix_chain = fix_chain

    def log_prepared_sparql_query(self, _run_manager, generated_query: str) -> None:
        """Report a query that parsed successfully through the run manager."""
        _run_manager.on_text("Generated SPARQL:", end="\n", verbose=True)
        _run_manager.on_text(generated_query, color="green", end="\n", verbose=True)

    def log_invalid_sparql_query(self, _run_manager, generated_query: str, error_message: str) -> None:
        """Report a rejected query together with the error that rejected it."""
        _run_manager.on_text("Invalid SPARQL query: ", end="\n", verbose=True)
        _run_manager.on_text(generated_query, color="red", end="\n", verbose=True)
        _run_manager.on_text("SPARQL Query Parse Error: ", end="\n", verbose=True)
        _run_manager.on_text(error_message, color="red", end="\n\n", verbose=True)

    def prepare_sparql_query(self, _run_manager, generated_sparql: str) -> str:
        """Parse ``generated_sparql`` and return it unchanged when it is valid.

        Raises:
            Exception: whatever ``prepareQuery`` raises for a malformed query.
        """
        # prepareQuery is used purely as a syntax check; its result is discarded.
        prepareQuery(generated_sparql)
        self.log_prepared_sparql_query(_run_manager, generated_sparql)
        return generated_sparql

    def get_prepared_sparql_query(self, _run_manager, generated_sparql: str, ontology_schema: str) -> Optional[str]:
        """Return a query that parses, repairing it through the fix chain if needed.

        Args:
            _run_manager: Callback manager used for logging.
            generated_sparql: Candidate query produced by the model.
            ontology_schema: Schema text passed to the fix prompt.

        Returns:
            The first query that parses, or ``None`` when the original query and
            all ``max_sparql_retries`` repair attempts fail.
        """
        try:
            return self.prepare_sparql_query(_run_manager, generated_sparql)
        except Exception as exc:
            error_message = str(exc)
            self.log_invalid_sparql_query(_run_manager, generated_sparql, error_message)
            print("Error message: ", error_message)

        # Resolve the chain lazily so the notebook-level default is only
        # required when no chain was injected.
        fix_chain = self.fix_chain if self.fix_chain is not None else sparql_fix_chain

        for _ in range(self.max_sparql_retries):
            try:
                candidate_sparql = fix_chain.invoke(
                    {
                        "error_message": error_message,
                        "generated_sparql": generated_sparql,
                        "schema": ontology_schema,
                    }
                ).content
            except Exception as exc:
                # The repair call itself failed: the query and its parse error
                # are unchanged, so only report the failure and try again.
                chain_failure = str(exc)
                print("Error message (parse_exception): ", chain_failure)
                self.log_invalid_sparql_query(_run_manager, generated_sparql, chain_failure)
                continue

            generated_sparql = candidate_sparql
            try:
                return self.prepare_sparql_query(_run_manager, generated_sparql)
            except Exception as exc:
                # Keep the error in sync with the query it belongs to, so the
                # next repair attempt sees the *current* parse error rather
                # than the one raised by the very first query.
                error_message = str(exc)
                print("Error message (parse_exception): ", error_message)
                self.log_invalid_sparql_query(_run_manager, generated_sparql, error_message)

        print("The generated SPARQL query is invalid.")
        return None

    def execute_query(self, g, query: Optional[str]) -> "List[rdflib.query.ResultRow]":
        """Run ``query`` against graph ``g`` and return the rows as a list.

        Returns an empty list when ``query`` is empty/``None`` (no valid query
        was produced) or when execution raises; the error is printed.
        """
        if not query:
            print("Failed to execute the generated SPARQL query.")
            print("Execution error: ", "no valid SPARQL query was provided")
            return []
        try:
            return list(g.query(query))
        except Exception as exc:
            print("Failed to execute the generated SPARQL query.")
            print("Execution error: ", exc)
            return []

In [47]:
class VectorStoreHandler:
    """Run one LLM-rewritten search against the document and ontology stores."""

    def __init__(self, llm, vector_store, ontology_vector_store, k: int = 5):
        # k is the number of hits requested from each store.
        self.llm, self.k = llm, k
        self.vector_store = vector_store
        self.ontology_vector_store = ontology_vector_store

    def perform_search(self, system_prompt: str, input_query: str):
        """Return ``(document_hits, ontology_hits)`` for ``input_query``.

        The system prompt and the question are concatenated and sent to the
        LLM; the ``content`` of its reply is used as the search text for a
        hybrid search on both stores.
        """
        llm_reply = self.llm.invoke(str(system_prompt + input_query))

        # Documents store first, ontology store second (same order as returned).
        stores = (self.vector_store, self.ontology_vector_store)
        hits_per_store = [
            store.hybrid_search(query=llm_reply.content, k=self.k)
            for store in stores
        ]
        return hits_per_store[0], hits_per_store[1]

## Walkthrough

### Query graph

In [48]:
# Natural-language question used by the walkthrough cells below.
input_query = "What are the common recommendations between CLO and DPO?"

In [49]:
# Instantiate classes
# The repair budget comes from the "Query parameters" cell instead of a
# literal, so changing max_sparql_retries there actually takes effect.
sparql_handler = SPARQLQueryHandler(max_sparql_retries=max_sparql_retries)
vectorstore_handler = VectorStoreHandler(llm, vector_store, ontology_vector_store, k=5)

In [168]:
# Example individual (Turtle syntax) given to the generation prompt through its
# {individuals} placeholder. This cell must run before the SPARQL generation
# cell, which reads useful_individuals.
useful_individuals = """
cro:CLO_Operation rdf:type owl:NamedIndividual ,
                           cro:Operation ;
                  cro:hasAddress "PO Box 21, 6720 Wickham WA, AU" ;
                  cro:hasCRAType "MFL" ;
                  cro:hasDateOfSurvey "2023-08-29T00:00:00" ,
                                      "2023-09-01T00:00:00" ;
                  cro:hasLatitude -20.6011 ;
                  cro:hasLongitude 117.1701 ;
                  cro:hasName "Cape Lambert Operations" ;
                  cro:hasPhone "+61 (0) 8 9183 7004" .
"""

In [50]:
# Hand-written reference query for the walkthrough question.
# Note: the generation cell that follows assigns raw_sparql again, so this
# value is only used when that cell is skipped.
raw_sparql = """PREFIX cro: <http://WSP.org/ontology/cro#>
SELECT ?recommendation
WHERE {
    ?recommendation cro:improvesOperation ?operation1, ?operation2 .
    ?operation1 cro:hasName "Cape Lambert Operations" .
    ?operation2 cro:hasName "Dampier Port Operations" .
}
"""

In [51]:
# SPARQL Query Generation
sparql_generation_chain_result = sparql_generation_chain.invoke(
    {"prompt": input_query, "individuals": useful_individuals, "schema": ontology}
)

raw_sparql = sparql_generation_chain_result.content

print(raw_sparql)

NameError: name 'useful_individuals' is not defined

In [30]:
# Syntax-check the query; on a parse error the handler asks the fix chain to
# repair it. The result is None when no valid query could be produced.
generated_sparql = sparql_handler.get_prepared_sparql_query(_run_manager, raw_sparql, ontology)

print(generated_sparql)

PREFIX cro: <http://WSP.org/ontology/cro#>
SELECT ?recommendation
WHERE {
    ?recommendation cro:improvesOperation ?operation1, ?operation2 .
    ?operation1 cro:hasName "Cape Lambert Operations" .
    ?operation2 cro:hasName "Dampier Port Operations" .
}



In [31]:
# Run the validated query against the in-memory graph; an empty list means the
# query matched nothing or failed to execute.
sparql_results = sparql_handler.execute_query(g, generated_sparql)

sparql_results

[]

### Adding a second iteration

In [None]:
# If no results, iterate up to a max number of attempts
query_retries = 0
while sparql_results == [] and query_retries < max_query_retries:
    print(f"## No results retrieved by the query, generating a new query. Attempt {query_retries + 1}")
    query_retries += 1

    # Use no_result_chain to generate a new query
    no_result_chain_result = no_result_chain.invoke(
        {
            "generated_sparql": generated_sparql,
            "prompt": input_query,
            "schema": ontology,
        }
    )

    # Validate the newly generated SPARQL query
    candidate_sparql = sparql_handler.get_prepared_sparql_query(
        _run_manager, no_result_chain_result.content, ontology
    )
    if candidate_sparql is None:
        # No valid query came back: keep the previous one so the next attempt
        # is still prompted with a real query instead of None.
        continue

    # execute_query takes the graph as its first argument; omitting it raised
    # a TypeError in the previous version of this cell.
    generated_sparql = candidate_sparql
    sparql_results = sparql_handler.execute_query(g, generated_sparql)

# Handle case when no results were retrieved after max iterations
if sparql_results == []:
    print(f"No results after {max_query_retries} attempts for query: {input_query}")

### Answer

In [19]:
# Answer generation
# The QA prompt receives the question as "prompt" and the rows returned by the
# SPARQL query as "context".
qa_chain_result = qa_chain.invoke(
    {"prompt": input_query, "context": sparql_results}
)

In [None]:
# Extract the answer text from the model's reply.
result = qa_chain_result.content

In [20]:
# Show the final natural-language answer.
print(result)

Here are the critical risk scenarios:

1. **DPO-01**: A 1 in 100-year Category 5 tropical cyclone directly impacts the Karratha and Dampier townships and the Dampier port operations, causing extensive damage in the region, storm surge, and flooding. Local buildings, houses, and infrastructure, including hospitals, schools, electricity, and water supplies, suffer a range of damage. Non-essential personnel without safe housing are evacuated. Older steel-clad buildings at Dampier port operations lose some cladding. Torrential rain causes flooding in rail dumper vaults and some conveyor tunnels, and water ingress occurs into substations. Damage occurs to the causeway and approach jetty decking at EII. The shipping channel requires surveying and some subsequent dredging. Loss of some navigation aids occurs, but no major complete loss of Dampier port operations structures or facilities happens.

2. **DPO-04**: One of the two Parker Point shiploaders is lost due to extreme weather conditions,

## Testing

In [25]:
# Workbook holding the evaluation questions, one directory above the notebook.
questions_xlsx = os.path.join("..", "Generated Questions.xlsx")

# Only the "Question" column is used; it becomes a plain Python list.
questions_df = pd.read_excel(questions_xlsx)
questions = questions_df["Question"].tolist()

In [26]:
# Empty results table; the test loop appends one row per evaluated question.
results_df = pd.DataFrame(columns=["input_query", "generated_sparql", "query_results", "result"])

In [None]:
# Parameters
# The repair budget comes from the "Query parameters" cell instead of a
# literal, so changing max_sparql_retries there actually takes effect.
sparql_handler = SPARQLQueryHandler(max_sparql_retries=max_sparql_retries)
vectorstore_handler = VectorStoreHandler(llm, vector_store, ontology_vector_store, k=5)

In [None]:
# Rows are collected in a list and appended to results_df once at the end;
# concatenating inside the loop re-copies the frame on every iteration.
test_records = []

try:
    for input_query in questions[0:5]:
        # SPARQL Query Generation
        # The generation prompt declares three variables; "individuals" must be
        # supplied or the prompt raises on the missing key.
        # NOTE(review): useful_individuals describes CLO only -- confirm it is
        # the right hint for every question in the workbook.
        sparql_generation_chain_result = sparql_generation_chain.invoke(
            {"prompt": input_query, "individuals": useful_individuals, "schema": ontology}
        )

        raw_sparql = sparql_generation_chain_result.content
        generated_sparql = sparql_handler.get_prepared_sparql_query(_run_manager, raw_sparql, ontology)
        # execute_query takes the graph as its first argument.
        sparql_results = sparql_handler.execute_query(g, generated_sparql)

        # If no results, iterate up to a max number of attempts
        query_retries = 0
        while sparql_results == [] and query_retries < max_query_retries:
            print(f"## No results retrieved by the query, generating a new query. Attempt {query_retries + 1}")
            query_retries += 1

            # Use no_result_chain to generate a new query
            no_result_chain_result = no_result_chain.invoke(
                {
                    "generated_sparql": generated_sparql,
                    "prompt": input_query,
                    "schema": ontology,
                }
            )

            # Validate the newly generated SPARQL query
            candidate_sparql = sparql_handler.get_prepared_sparql_query(
                _run_manager, no_result_chain_result.content, ontology
            )
            if candidate_sparql is None:
                # Keep the previous query so the next attempt is not prompted
                # with None.
                continue

            generated_sparql = candidate_sparql
            sparql_results = sparql_handler.execute_query(g, generated_sparql)

        # Handle case when no results were retrieved after max iterations
        if sparql_results == []:
            print(f"No results after {max_query_retries} attempts for query: {input_query}")

        # Answer generation
        qa_chain_result = qa_chain.invoke(
            {"prompt": input_query, "context": sparql_results}
        )

        result = qa_chain_result.content
        print(result)

        # Store results
        test_records.append({
            "input_query": str(input_query),
            "generated_sparql": str(generated_sparql),
            "query_results": str(sparql_results),
            "result": str(result),
        })
finally:
    # Runs even when a chain call fails mid-loop, so the questions answered so
    # far are still recorded.
    if test_records:
        results_df = pd.concat([results_df, pd.DataFrame(test_records)], ignore_index=True)

Store results

In [17]:
# Generate question codes (CRD-01, CRD-02, ...), one per result row.
code_sequence = [f"CRD-{str(i).zfill(2)}" for i in range(1, len(results_df) + 1)]

# DataFrame.insert raises when the column already exists (e.g. this cell is
# re-run). Handle that case explicitly instead of swallowing every exception:
# refresh the existing column so the codes always match the current rows.
if 'code' in results_df.columns:
    results_df['code'] = code_sequence
else:
    results_df.insert(0, 'code', code_sequence)

In [18]:
# Add 'Outcome' and 'Validation' columns
# outcome: 1 when the stored query result is anything other than the string
# "[]" (an empty result list), otherwise 0.
results_df['outcome'] = results_df['query_results'].apply(lambda x: 0 if x == "[]" else 1)
positive_outcome_count = results_df['outcome'].sum()
total_outcome_count = len(results_df)

# validation: overall success percentage, repeated on every row. Guarded so an
# empty results table does not raise a division-by-zero error.
if total_outcome_count:
    results_df['validation'] = round(float(positive_outcome_count) / total_outcome_count * 100, 2)
else:
    results_df['validation'] = 0.0

In [19]:
# Calculate summary statistics
# Percentage of questions whose query returned at least one row; 0.0 when no
# question has been evaluated yet (avoids dividing by zero).
if total_outcome_count:
    validation_percentage = (positive_outcome_count / total_outcome_count) * 100
else:
    validation_percentage = 0.0

# Questions whose query returned nothing, and their codes for the report.
failed_questions_df = results_df[results_df['outcome'] == 0]
failed_question_count = len(failed_questions_df)
failed_question_codes = failed_questions_df['code'].tolist()

In [20]:
# Write data to Excel
timestamp = pd.Timestamp.now().strftime("%Y_%m_%d_%H_%M_%S")

# Create the output folder on first use; ExcelWriter does not create missing
# directories.
output_dir = "validation"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f"Sparql Validation_{timestamp}.xlsx")

with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:

    # Save results data to Sheet1
    results_df.to_excel(writer, sheet_name='Sheet1', index=False)

    # Create summary data for Sheet2
    summary_data = {
        'Statistic': ['Accuracy', 'Error Count', 'Error Codes'],
        'Value': [f'{validation_percentage:.2f}%', failed_question_count, ', '.join(failed_question_codes)]
    }
    summary_df = pd.DataFrame(summary_data)

    summary_df.to_excel(writer, sheet_name='Sheet2', index=False)

    # Access the workbook and worksheet for formatting
    workbook = writer.book
    worksheet1 = writer.sheets['Sheet1']
    worksheet2 = writer.sheets['Sheet2']

    # Set column width
    worksheet1.set_column(0, len(results_df.columns) - 1, 30)
    worksheet2.set_column(0, 1, 20)

    # Add conditional formatting to the outcome column in Sheet1.
    # The column position is looked up instead of hard-coding "F", so the
    # colouring follows the column if the frame layout changes.
    outcome_col = results_df.columns.get_loc('outcome')
    last_data_row = len(results_df)  # row 0 is the header; data occupies rows 1..len
    format_pass = workbook.add_format({'bg_color': '#77DD77'})  # Green for pass (1)
    format_fail = workbook.add_format({'bg_color': '#FF0000'})  # Red for fail (0)

    # Skip formatting entirely when there are no data rows to colour.
    if last_data_row:
        for outcome_value, outcome_format in ((1, format_pass), (0, format_fail)):
            worksheet1.conditional_format(1, outcome_col, last_data_row, outcome_col, {
                'type': 'cell',
                'criteria': '==',
                'value': outcome_value,
                'format': outcome_format
            })