## Setup

In [31]:
import os
from rdflib import Graph    
import base64
import openai
import random
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import requests
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes.models import SynonymMap
import tqdm 
#from docx import Document
import re

from dotenv import load_dotenv
load_dotenv(os.path.join("..", "Azure OpenAI credentials.env"))

True

In [32]:
# Configuration
# Every secret and endpoint is read from the process environment; a missing
# variable raises KeyError right here rather than failing later in the run.

# --- Azure OpenAI ---
azure_endpoint = os.environ["GLOBAL_AZURE_ENDPOINT"]
openai_api_key = os.environ["GLOBAL_OPENAI_API_KEY"]
openai_deployment_name = os.environ["GLOBAL_GPT_DEPLOYMENT_NAME"]
openai_api_version = os.environ["GLOBAL_OPENAI_API_VERSION"]
embedding_model = os.environ["GLOBAL_EMBEDDING_MODEL"]
embedding_deployment_name = os.environ["GLOBAL_EMBEDDING_DEPLOYMENT_NAME"]

# --- Azure AI Search ---
search_api_key = os.environ["SEARCH_API_KEY"]
search_endpoint = os.environ["SEARCH_ENDPOINT"]
search_service_name = os.environ["SEARCH_SERVICE_NAME"]

# URL derived from the service name.
# NOTE(review): this client is built from SEARCH_ENDPOINT, not from search_url
# - confirm both point at the same service.
search_url = f"https://{search_service_name}.search.windows.net/"
search_credential = AzureKeyCredential(search_api_key)
index_name = "crd-ontologies-desc"
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=search_credential,
)

In [33]:
from azure.core.credentials import AzureKeyCredential
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings    
from langchain_core.prompts.prompt import PromptTemplate
from langchain.chains.base import Chain
from langchain.chains.llm import LLMChain
from langchain_core.callbacks.manager import CallbackManager, CallbackManagerForChainRun
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts.base import BasePromptTemplate
from langchain_core.prompts import ChatPromptTemplate

In [34]:
# Chat model behind desc_generation_chain, which writes the description of
# each individual. temperature=0 asks for the least random sampling.
llm = AzureChatOpenAI(
    deployment_name=openai_deployment_name, 
    openai_api_version=openai_api_version, 
    openai_api_key=openai_api_key, 
    azure_endpoint=azure_endpoint, 
    temperature=0
)

# LangChain embeddings client for the same Azure OpenAI resource.
# NOTE(review): the cells shown in this notebook call get_embeddings() and do
# not reference this object - confirm it is still needed.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=embedding_deployment_name,
    api_version=openai_api_version,
    api_key=openai_api_key,
    azure_endpoint=azure_endpoint,
)

In [35]:
# Azure OpenAI clients already created by get_embeddings, keyed by
# (endpoint, api key, api version), so repeated calls reuse one client.
_EMBEDDING_CLIENTS = {}


def get_embeddings(text, azure_endpoint, api_key, api_version, deployment_name, client=None):
    """Return the embedding vector for one piece of text.

    Args:
        text: The text to embed (sent as a single-element input list).
        azure_endpoint: Azure OpenAI endpoint URL.
        api_key: Azure OpenAI API key.
        api_version: Azure OpenAI API version string.
        deployment_name: Name of the embedding deployment, passed as ``model``.
        client: Optional ready-made client exposing ``embeddings.create``.
            When omitted, an ``openai.AzureOpenAI`` client is created once per
            (endpoint, key, version) combination and reused on later calls.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    if client is None:
        cache_key = (azure_endpoint, api_key, api_version)
        client = _EMBEDDING_CLIENTS.get(cache_key)
        if client is None:
            # Build the client only on first use; the upload loops call this
            # function once per document.
            client = openai.AzureOpenAI(
                azure_endpoint=azure_endpoint,
                api_key=api_key,
                api_version=api_version,
            )
            _EMBEDDING_CLIENTS[cache_key] = client

    response = client.embeddings.create(input=[text], model=deployment_name)
    return response.data[0].embedding

## Convert RDF ontology to list

In [29]:
# File name of the instantiated ontology to process. The same string is later
# stored as "doc_path" on every document uploaded to the search index.
ontology_path = "CRA V17.1 MFL & NLE Instantiated Ontology.ttl"

# Alternative file name kept for reference:
##"CRA V16.2 MFL Instantiated Ontology.ttl"

# Location of the file relative to the notebook: ../ontology/inst/<ontology_path>
file_path = os.path.join("..", "ontology/inst", ontology_path)

In [30]:
# Load the whole Turtle file into memory as one UTF-8 string.
with open(file_path, encoding='utf-8') as file:
    ontology = file.read()

In [36]:
# Print the raw Turtle text that was just read from file_path.
print(ontology)

@prefix : <http://WSP.org/ontology/cro#> .
@prefix ns1: <http://WSP.org/ontology/cro#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@base <http://WSP.org/ontology/cro> .

<http://WSP.org/ontology/cro> rdf:type owl:Ontology ;
                               owl:versionIRI <http://WSP.org/ontology/cro-1.16> .

#################################################################
#    Annotation properties
#################################################################

###  http://WSP.org/ontology/cro#consultingEntity
ns1:consultingEntity rdf:type owl:AnnotationProperty .


###  http://WSP.org/ontology/cro#dependsOn
ns1:dependsOn rdf:type owl:AnnotationProperty .


###  http://WSP.org/ontology/cro#hasCatastrophicRisk
ns1:hasCatastrophicRisk rdf:type owl:AnnotationProp

In [46]:
# Keep only the "Individuals" section of the ontology: everything from the
# section banner up to (not including) the OWL API footer comment.

individuals_start_string = "#    Individuals"
individuals_end_string = "###  Generated by the OWL API"

individuals_start = ontology.find(individuals_start_string)
if individuals_start == -1:
    # str.find returns -1 on a miss; slicing with it would silently keep only
    # the last character of the file, so fail loudly instead.
    raise ValueError(
        f"Section marker {individuals_start_string!r} not found in the ontology text"
    )
ont = ontology[individuals_start:]

individuals_end = ont.find(individuals_end_string)
if individuals_end != -1:
    ont = ont[:individuals_end]
# When the footer is absent the section simply runs to the end of the file
# (slicing with -1 would have dropped the final character).

# Drop the banner text itself.
ont = ont.replace(individuals_start_string, "")
ont



### Ontology for SPARQL generation

In [47]:
# Placeholder fragments, one per free-text property. Each has the form
# `ns1:<property> "<short label>";` and is used as the replacement text by the
# regex substitutions in the cell that follows.
placeholder_hardControls = 'ns1:hardControls "Hard Controls";'
placeholder_softControls = 'ns1:softControls "Soft Controls";'
placeholder_inc_desc = 'ns1:incidentDescription "Description of the incident";'
placeholder_inh_controls = 'ns1:inherentControls "Inherent Controls";' 
placeholder_inh_hazards = 'ns1:inherentHazards "Inherent Hazards";'
placeholder_repairperiod = 'ns1:repairPeriod "Repair Period";'
placeholder_utility_supply = 'ns1:utilityOrSupplyDescription "Description of Utility or Supply";'
placeholder_assumpt_desc = 'ns1:assumptionDescription "Description of the assumption";'
placeholder_dep_desc = 'ns1:otherDependenciesDescription "Description of the other dependencies";'
placeholder_prod_desc = 'ns1:finalProductDescription "Description of the final product";'
placeholder_workaround = 'ns1:workaround "Workaround";'
placeholder_rec_det = 'ns1:recommendationDetailText "Recommendation Detail Text";'
placeholder_siteresp = 'ns1:siteResponseOrComments "Site Response";'
placeholder_proj_desc = 'ns1:projectDescription "Project Description";'
placeholder_risk_manage = 'ns1:riskManagementSubElementInformation "Risk Management SubElement Information";'
placeholder_hazardsAndControls = 'ns1:hazardsAndControls "Hazards and Controls";'

In [48]:
# Swap the value of each listed property for its short placeholder.
# The substitutions run in the order given, each on the previous result.
placeholder_by_property = (
    ("hardControls", placeholder_hardControls),
    ("softControls", placeholder_softControls),
    ("incidentDescription", placeholder_inc_desc),
    ("inherentControls", placeholder_inh_controls),
    ("inherentHazards", placeholder_inh_hazards),
    ("repairPeriod", placeholder_repairperiod),
    ("utilityOrSupplyDescription", placeholder_utility_supply),
    ("assumptionDescription", placeholder_assumpt_desc),
    ("otherDependenciesDescription", placeholder_dep_desc),
    ("finalProductDescription", placeholder_prod_desc),
    ("workaround", placeholder_workaround),
    ("recommendationDetailText", placeholder_rec_det),
    ("siteResponseOrComments", placeholder_siteresp),
    ("projectDescription", placeholder_proj_desc),
    ("riskManagementSubElementInformation", placeholder_risk_manage),
    ("hazardsAndControls", placeholder_hazardsAndControls),
)

modified_text = ont
for property_name, placeholder in placeholder_by_property:
    # The lazy `.*?` stops at the first newline, so only the rest of the line
    # on which the property name appears is replaced; group 2 puts that
    # newline back after the placeholder.
    modified_text = re.sub(
        rf'ns1:{property_name}(.*?)(\n)',
        rf'{placeholder} \2',
        modified_text,
        flags=re.DOTALL,
    )

print(modified_text)


#################################################################

###  http://WSP.org/ontology/cro#CLO-02
ns1:CLO-02 rdf:type owl:NamedIndividual .


###  http://WSP.org/ontology/cro#CLO-02-MFL
ns1:CLO-02-MFL rdf:type owl:NamedIndividual ,
                        ns1:CriticalRiskScenario ;
               ns1:impactsOperation ns1:clo_operation ;
               ns1:riskImpactsEquipment ns1:clo_cla_wharf ,
                                        ns1:clo_clb_wharf ,
                                        ns1:clo_shiploader_11 ,
                                        ns1:clo_shiploader_12 ;
               ns1:BISummary "The reported annual business interruption value is USD17,761.59M. Therefore: CLA & One CLB Shiploader: (USD17,761.59M x 56.0 weeks/52 weeks x 20%) = USD3,825.57M. Initial Port Closure (CLA & CLB): (USD17,761.59M x 1.0 week/52 weeks x 95%) = USD324.49M. Only CLA Operational: (USD17,761.59M x 3.0 weeks/52 weeks x 50%) = USD512.35M." ;
               ns1:businessInterruptio

In [42]:
# Print every line of modified_text that is longer than 2000 characters
# (presumably to spot long literals that are still present - confirm intent).

max_line_length = 2000
overlong_lines = [
    text_line for text_line in modified_text.split("\n") if len(text_line) > max_line_length
]
for overlong_line in overlong_lines:
    print(overlong_line)

                                                                                     "Loss of containment effects and a Dam Break Study have been completed by ATC Williams (February 2020) and a TSF Emergency Response Plan has been developed for a series of escalating failure mode events. (updated in 2021). Whilst four breach locations were assessed for modelling purposes, the credibility of the event occurring was also considered. A breach towards the northern end of the facility is credible due to water ponding and foundation conditions within the area. Dam break studies used the following scenarios: - North east breach - seismic and piping induced failure. - South east breach - no credible failure mode. - North west breach - seismic induced failure. - South west breach - no credible failure. For north east breach, the impacted area is increased significantly with tailings entering additional drainage channels beyond 30 km of the site. In 24 hours, 14 herder winter shelters and 15 her

In [49]:
# Remove whole lines that start with ### (the per-individual IRI comments and
# the ##### banner lines). The pattern is anchored to the start of a line so
# that a "###" occurring inside a literal value is left alone.

modified_text = re.sub(r'^###.*\n', '', modified_text, flags=re.MULTILINE)

In [50]:
# Separate each individual: split the text on blank lines (two consecutive
# newlines), giving one list entry per blank-line-delimited chunk.

raw_list = modified_text.split("\n\n")

In [51]:
# Display the raw chunks produced by the split for a visual check.
raw_list

['',
 'ns1:CLO-02 rdf:type owl:NamedIndividual .',
 '\nns1:CLO-02-MFL rdf:type owl:NamedIndividual ,\n                        ns1:CriticalRiskScenario ;\n               ns1:impactsOperation ns1:clo_operation ;\n               ns1:riskImpactsEquipment ns1:clo_cla_wharf ,\n                                        ns1:clo_clb_wharf ,\n                                        ns1:clo_shiploader_11 ,\n                                        ns1:clo_shiploader_12 ;\n               ns1:BISummary "The reported annual business interruption value is USD17,761.59M. Therefore: CLA & One CLB Shiploader: (USD17,761.59M x 56.0 weeks/52 weeks x 20%) = USD3,825.57M. Initial Port Closure (CLA & CLB): (USD17,761.59M x 1.0 week/52 weeks x 95%) = USD324.49M. Only CLA Operational: (USD17,761.59M x 3.0 weeks/52 weeks x 50%) = USD512.35M." ;\n               ns1:businessInterruptionUSDM 4662.4 ;\n               ns1:hardControls "Hard Controls"; \n               ns1:incidentDescription "Description of the inciden

In [52]:
# Turn each raw Turtle chunk into a single readable line of text.

def _clean_individual(element):
    """Strip RDF syntax from one chunk and collapse it onto a single line."""
    # Applied in this exact order: spell out the type shorthand, drop any
    # leftover "###", and shorten the ns1: prefix to a bare colon.
    substitutions = (
        ("a owl: ,", " is a "),
        (" a ns1:", " is a "),
        (" a owl:", " is a "),
        ("rdf:type owl:", "is a "),
        ("###", ""),
        ("ns1:", ":"),
    )
    for old, new in substitutions:
        element = element.replace(old, new)

    # Discard everything up to and including the first newline; a chunk
    # without any newline is kept whole.
    _, newline, remainder = element.partition("\n")
    if newline:
        element = remainder

    # Collapse every run of whitespace (including newlines) to one space.
    return " ".join(element.split())


# Clean every chunk and drop the ones that end up empty.
clean_list = [cleaned for cleaned in map(_clean_individual, raw_list) if cleaned]

# Skip the first five entries.
# NOTE(review): the reason for the offset of 5 is not visible here - confirm.
clean_list = clean_list[5:]

clean_list

[':CLO-04-MFL is a NamedIndividual , :CriticalRiskScenario ; :impactsOperation :clo_operation ; :riskImpactsEquipment :clo_shipping_channel ; :BISummary "The reported annual business interruption value is USD17,761.59M. Therefore: Salvage period of grounded vessel: (USD17,761.59M x 11.6 weeks/52 weeks x 20%) = USD792.44M. Port closure (CLA & CLB): (USD17,761.59M x 0.4 weeks/52 weeks x 95%) = USD129.80M." ; :assetGroup "Port Facilities" ; :businessInterruptionUSDM 922.2 ; :hardControls "Hard Controls"; :incidentDescription "Description of the incident"; :incidentSummary "Shipping channel blocked by grounded vessel" ; :inherentControls "Inherent Controls"; :inherentHazards "Inherent Hazards"; :likelihood "Unlikely" ; :lossScenarioPeril "Blockage" ; :otherCostsUSDM 0.0 ; :preincident_and_workaroundARP "\uf0b7 An Asset Reinstatement Plan is prepared for this scenario. mobilised from a location such as Singapore with 4 weeks mobilisation." ; :propertyDamageUSDM 0.0 ; :recommendationDescript

In [16]:
# Number of cleaned individuals that will be described and indexed.
len(clean_list)

710

## Generate summary description

In [None]:
# Read the ontology file stored under ../ontology/non_inst as UTF-8 text.
# NOTE(review): non_inst_ontology is not referenced by the other cells shown
# in this notebook - confirm whether it is still needed.
non_inst_ont_path = os.path.join("..", "ontology/non_inst", "CRA V16 MFL.ttl")

with open(non_inst_ont_path, encoding='utf-8') as file:
    non_inst_ontology = file.read()

In [None]:
# Prompt used to turn one cleaned individual into a prose description.
# {individual} is the only template variable; the remainder is a worked
# example of the expected output layout.
# NOTE(review): the prompt says the ontology "is provided below", but no
# ontology text is inserted into this template - confirm that is intended.
desc_template = """
Your task is to provide a text description of the following individual. 
The individual is a part of a larger RDF ontology, that is provided below.
Your description should be detailed and provide information about the individual's class, relationships, and any other relevant details. 
The individual is described here:
{individual}

This is an example of the expected output:

'The individual :CLO-07 is a NamedIndividual and is classified as a :CriticalRiskScenario.\n
### Class and Relationships:
- Class: :CLO-07 is a part of the Critical Risk Scenario class.
- Impacts Operation: :CLO-07 impacts the operation identified as :clo_operation.
- Risk Impacts Facility: The risk scenario affects the facility known as :clo_clb_terminal.\n
### Detailed Attributes:
- Business Interruption Summary (BISummary): The reported annual business interruption value is USD 17,761.59 million
- Asset Group: The scenario pertains to the "Port Facilities" asset group.
- Business Interruption USD: The financial impact due to business interruption is quantified at USD 239.1 million.
- Hard Controls: The scenario includes "Hard Controls" as a mitigation measure.
- Incident Description: The incident is described as "Description of the incident."
- Incident Summary: The summary of the incident states that a ship suffers damage at the CLB berth.
- Inherent Controls: The scenario includes "Inherent Controls" as part of its risk management.
- Inherent Hazards: The scenario involves "Inherent Hazards."
- Likelihood: The likelihood of this scenario occurring is categorized as "Rare."
- Loss Scenario Peril: The peril associated with the loss scenario is described as "Blockage Asset Group Port Facilities."
- Pre-incident and Workaround ARP: An Asset Reinstatement Plan (ARP) is prepared for this scenario, with mobilization from a location such as Singapore, taking approximately 4 weeks.\n
- Property Damage USD: The property damage is estimated at USD 10.0 million.
- Recommendation Description: There are no applicable recommendations for this scenario.
- Recovery Repair: Depending on the circumstances, the ship salvage and any wharf repair costs could be entirely the ship owner\'s responsibility.
- Repair Period: The repair period is noted but not specified in detail.
- Soft Controls: The scenario includes "Soft Controls" as part of its risk management.
- Workaround: A workaround is noted but not specified in detail.
- Hazards and Controls: The scenario involves "Hazards and Controls."
- Total Loss USD: The total financial loss from this scenario is estimated at USD 249.1 million.\n'
"""

In [None]:
# Wrap the template in a PromptTemplate whose single input is "individual".
desc_prompt = PromptTemplate(template=desc_template, input_variables=["individual"])

# Same prompt object under the name used when building the chain.
desc_generation_prompt: BasePromptTemplate = desc_prompt

# Pipe the formatted prompt into the chat model.
desc_generation_chain = desc_generation_prompt | llm

In [None]:
# Example: describe one individual (the second entry of clean_list) and show
# the text of the model's reply.

individual_description = desc_generation_chain.invoke({"individual": clean_list[1]})

individual_description.content

'The individual :CLO-09 is a NamedIndividual and is classified as a :CriticalRiskScenario. This individual is associated with several key attributes and relationships that provide a comprehensive overview of the risk scenario it represents.\n\n### Class and Relationships:\n- **Class**: :CLO-09 is a part of the :CriticalRiskScenario class, indicating that it represents a significant risk scenario that could have critical impacts on operations and facilities.\n- **Impacts Operation**: :CLO-09 impacts the operation identified as :clo_operation.\n- **Risk Impacts Facility**: The risk scenario affects the facility known as :clo_clb_terminal.\n\n### Detailed Attributes:\n- **Business Interruption Summary (BISummary)**: The reported annual business interruption value is USD 17,761.59 million. The calculation for repairs to CLB Outloading conveyors is provided as follows: (USD 17,761.59M x 4.0 weeks/52 weeks x 50%) = USD 683.14 million.\n- **Business Interruption USD**: The financial impact du

In [74]:
# Build a dataframe pairing every cleaned individual with the description
# the chain writes for it (one chain call per individual).

import pandas as pd

individuals = clean_list
individual_descriptions = []

# The list is filled incrementally, so descriptions generated before an
# interruption stay available in individual_descriptions.
progress = tqdm.tqdm(individuals)
for individual in progress:
    individual_description = desc_generation_chain.invoke({"individual": individual})
    individual_descriptions.append(individual_description.content)

individuals_df = pd.DataFrame(
    {
        "Individual": individuals,
        "Description": individual_descriptions,
    }
)

100%|██████████| 710/710 [52:45<00:00,  4.46s/it]  


In [75]:
# Display the individual/description pairs.
individuals_df

Unnamed: 0,Individual,Description
0,":CLO-07 is a NamedIndividual , :CriticalRiskSc...",The individual :CLO-07 is a NamedIndividual an...
1,":CLO-09 is a NamedIndividual , :CriticalRiskSc...",The individual :CLO-09 is a NamedIndividual an...
2,":CLO-10 is a NamedIndividual , :CriticalRiskSc...",The individual :CLO-10 is a NamedIndividual an...
3,":CLO-11 is a NamedIndividual , :CriticalRiskSc...",The individual :CLO-11 is a NamedIndividual an...
4,":CLO-12 is a NamedIndividual , :CriticalRiskSc...",The individual :CLO-12 is a NamedIndividual an...
...,...,...
705,":dpo_ymps_turbine_5 is a NamedIndividual , :Tu...",The individual :dpo_ymps_turbine_5 is a NamedI...
706,":horizon_power is a NamedIndividual , :Company...",The individual :horizon_power is a NamedIndivi...
707,<http://WSP.org/ontology/cro#CLO_Plant_&_Equip...,The individual :CLO_Plant_&_Equipment is a Nam...
708,<http://WSP.org/ontology/cro#OTC_Plant_&_Equip...,The individual :OTC_Plant_&_Equipment is a Nam...


## Index elements from list

In [88]:
# Embed each description and upload it, together with the individual it
# describes, to the Azure AI Search index (one document per dataframe row).
chunkcount = 0
failed_keys = []

# One client for the whole run; it does not change between iterations.
search_client = SearchClient(search_url, index_name, search_credential)

for index, row in tqdm.tqdm(individuals_df.iterrows(), total=len(individuals_df), desc="Uploading to Azure Search"):
    chunkcount += 1

    ind = row["Individual"]
    desc = row["Description"]

    # Document key: "<counter>-<file name><random 10-digit number>", base64
    # URL-safe encoded.
    # NOTE(review): the random suffix yields a new id on every run, so
    # re-running this cell adds documents instead of overwriting earlier ones.
    raw_key = f"{chunkcount}-{ontology_path}{random.randint(1000000000, 9999999999)}"
    key = base64.urlsafe_b64encode(raw_key.encode('utf-8')).decode('utf-8')

    # The vector is computed from the description, not the raw individual.
    embeddings_data = get_embeddings(desc, azure_endpoint, openai_api_key, openai_api_version, embedding_deployment_name)

    DOCUMENT = {
        "id": key,
        "doc_path": ontology_path,
        "individual": ind,
        "summary": desc,
        "embedding": embeddings_data,
    }

    # upload_documents reports per-document outcomes; remember any rejection
    # instead of discarding the result.
    result = search_client.upload_documents(documents=[DOCUMENT])
    failed_keys.extend(item.key for item in result if not item.succeeded)

if failed_keys:
    raise RuntimeError(
        f"{len(failed_keys)} of {chunkcount} documents were rejected by Azure Search "
        f"(first keys: {failed_keys[:5]})"
    )

Uploading to Azure Search: 100%|██████████| 710/710 [27:28<00:00,  2.32s/it]


In [None]:
# Alternative upload: embed and index the cleaned individuals themselves
# (field "chunk") rather than their generated descriptions.
chunkcount = 0
failed_keys = []

# One client for the whole run; it does not change between iterations.
search_client = SearchClient(search_url, index_name, search_credential)

for para in clean_list:
    chunkcount += 1

    # Document key: "<counter>-<file name><random 10-digit number>", base64
    # URL-safe encoded.
    # NOTE(review): the random suffix yields a new id on every run, so
    # re-running this cell adds documents instead of overwriting earlier ones.
    raw_key = f"{chunkcount}-{ontology_path}{random.randint(1000000000, 9999999999)}"
    key = base64.urlsafe_b64encode(raw_key.encode('utf-8')).decode('utf-8')

    embeddings_data = get_embeddings(para, azure_endpoint, openai_api_key, openai_api_version, embedding_deployment_name)

    DOCUMENT = {
        "id": key,
        "doc_path": ontology_path,
        "chunk": para,
        "embedding": embeddings_data,
    }

    # Only report success when the service actually accepted the document.
    result = search_client.upload_documents(documents=[DOCUMENT])
    rejected = [item.key for item in result if not item.succeeded]
    if rejected:
        failed_keys.extend(rejected)
    else:
        print(f"Chunk n. {chunkcount} uploaded")

if failed_keys:
    raise RuntimeError(
        f"{len(failed_keys)} of {chunkcount} chunks were rejected by Azure Search "
        f"(first keys: {failed_keys[:5]})"
    )

## Delete elements

In [20]:
# "doc_path" values whose documents should be removed from the search index.
docs_to_be_deleted = ["PTO CRA V15-2.ttl"]

In [21]:
# Delete every indexed document whose doc_path matches one of the listed names.
for blob_path in docs_to_be_deleted:
    field_value = blob_path
    # OData string literals escape a single quote by doubling it; without
    # this a file name containing an apostrophe produces an invalid filter.
    escaped_value = field_value.replace("'", "''")
    filter_query = f"doc_path eq '{escaped_value}'"

    # Search and delete repeatedly until the filter returns nothing.
    while True:
        documents = search_client.search(search_text="*", include_total_count=False, filter=filter_query)
        document_list = list(documents)

        if not document_list:
            print(f"No more documents found for '{field_value}'. Moving to the next document.")
            break  # Exit the loop if no documents are found

        # Delete all found documents in the current search iteration
        for result in document_list:
            # Send only the stored fields back; "@search.*" entries are
            # result metadata, not part of the document.
            document_fields = {
                key: value for key, value in result.items() if not key.startswith("@search.")
            }
            delete_results = search_client.delete_documents(documents=[document_fields])

            # Stop on a rejected delete: otherwise the same document would be
            # found again on the next pass and this loop would never end.
            rejected = [item for item in delete_results if not item.succeeded]
            if rejected:
                raise RuntimeError(
                    f"Azure Search rejected a delete for doc_path '{field_value}': "
                    f"{rejected[0].error_message}"
                )
            print(f"Chunk with doc_path eq '{field_value}' deleted successfully!")
            

Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'PTO CRA V15-2.ttl' deleted successfully!
Chunk with doc_name eq 'P