## Setup

In [4]:
from __future__ import annotations
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.prompts.base import BasePromptTemplate
from pydantic import Field
from langchain_core.prompts.prompt import PromptTemplate
from azure.core.credentials import AzureKeyCredential
from langchain_community.graphs import OntotextGraphDBGraph
from rdflib.plugins.sparql import prepareQuery
from rdflib import Graph
import os
import pandas as pd

from dotenv import load_dotenv
# Load environment variables from the credentials file located one directory
# above the current working directory. The configuration cell below reads its
# settings from os.environ, so this has to run first.
load_dotenv(os.path.join("..", "Azure OpenAI credentials.env"))

True

In [2]:
# --- Azure OpenAI settings -------------------------------------------------
# os.environ[...] raises KeyError for a missing variable, so an incomplete
# environment fails in this cell rather than at the first model call.
azure_endpoint = os.environ['GLOBAL_AZURE_ENDPOINT']
openai_api_key = os.environ['GLOBAL_OPENAI_API_KEY']

openai_deployment_name = os.environ['GLOBAL_GPT_DEPLOYMENT_NAME']
openai_api_version = os.environ['GLOBAL_OPENAI_API_VERSION']
# NOTE(review): embedding_model is read but not referenced by any cell visible here.
embedding_model = os.environ['GLOBAL_EMBEDDING_MODEL']
embedding_deployment_name = os.environ['GLOBAL_EMBEDDING_DEPLOYMENT_NAME']

# --- Azure Search settings -------------------------------------------------
# NOTE(review): none of the search_* values are used by the cells visible here;
# they are still required to be present in the environment because of the
# direct os.environ[...] lookups.
search_endpoint = os.environ['SEARCH_ENDPOINT']
search_api_key = os.environ['SEARCH_API_KEY']
search_api_version = os.environ['SEARCH_API_VERSION']
search_service_name = os.environ['SEARCH_SERVICE_NAME']

# langsmith_api_key = os.environ['LANGSMITH_API_KEY']

# Search URL is built from the service name (search_endpoint is not used for it).
search_url = f"https://{search_service_name}.search.windows.net/"
search_credential = AzureKeyCredential(search_api_key)

# Chat model that generates the questions; temperature=0 is passed to keep
# the generation as repeatable as the service allows.
llm = AzureChatOpenAI(
    deployment_name=openai_deployment_name, 
    openai_api_version=openai_api_version, 
    openai_api_key=openai_api_key, 
    azure_endpoint=azure_endpoint, 
    temperature=0
)

# Embedding client configured with the same endpoint, key and API version as
# the chat model. Not referenced by the cells visible here.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=embedding_deployment_name,
    api_version=openai_api_version,
    api_key=openai_api_key,
    azure_endpoint=azure_endpoint,
)

## Parameters

In [3]:
# Prompt sent to the chat model. It asks for 30 natural-language questions that
# can be answered from the ontology, which is substituted for the {schema}
# placeholder. The only change from the previous wording is the spelling fix
# "ooutput" -> "output" in the instruction text.
QUESTION_GENERATOR_TEMPLATE = """
Your task is to produce a list of questions in natural language that can be answered using the given ontology schema.
The desired output is a list that is as diverse as possible, covering a wide range of topics and types of questions.
Sample output:
"What is the estimated financial loss for a fire in an electrical substation?", "How do the estimated losses compare between the fire on a moored fuel tanker and structural integrity issues on the wharf?", "When is the recommendation to replace reclaimers expected to be implemented?"
Do not include any explanations or apologies in your output.
Do not include any text except for the questions.
Do not include a number at the beginning of the question.
Generate 30 questions.
The ontology schema delimited by triple backticks in Turtle format is:
```
{schema}
```
"""

# Template object with `schema` as its single input variable.
QUESTION_GENERATOR_PROMPT = PromptTemplate(
    input_variables=["schema"],
    template=QUESTION_GENERATOR_TEMPLATE,
)

# Lower-case alias typed as the base prompt class; this is the name the chain
# cell composes with the model.
question_generator_prompt: BasePromptTemplate = QUESTION_GENERATOR_PROMPT

In [5]:
# Turtle file whose text is later passed to the prompt as the `schema` input.
file_path = os.path.join("..", "ontology", "V12_DPO_Individuals.ttl")

# Parse the file into an rdflib graph. `g` is not referenced by the other
# cells visible here.
# NOTE(review): presumably kept as a syntax check of the Turtle file — confirm.
g = Graph()
g.parse(file_path)

# Keep the raw file text as well: this string, not the parsed graph, is what
# the chain receives.
with open(file_path, 'r', encoding='utf-8') as file:
    ontology = file.read()

## Chain

In [6]:
# Compose the prompt and the chat model into a single runnable chain.
question_generator_chain = question_generator_prompt | llm

# Run the chain once, supplying the full ontology text for the prompt's
# `schema` variable. This is the cell that calls the model.
question_generator_chain_result = question_generator_chain.invoke(
    {"schema": ontology}
)

# Keep only the text of the model reply; later cells parse this string.
output = question_generator_chain_result.content

In [7]:
# Show the raw model reply so its layout can be checked before it is parsed.
print(output)

What is the name of the company that owns Hamersley Iron Pty Ltd?

Which division operates the Dampier Port?

What is the material handled at Parker Point?

How much iron ore does East Intercourse Island handle annually?

What type of equipment is used for inloading at Parker Point?

What is the description of the Perth Operations Centre?

When was the Remote Draft Survey System retired?

What is the annual budget of the SMART Program?

How many recommendations were completed since the last review in 2023?

What is the expected response time for Recommendation 1?

What is the incident description for the "Fixed Plant Incident"?

What is the planned increase in Dampier shipping for 2023?

What is the estimated cost for the Parker Point Reclaimer Replacement project?

What is the project description for the "Dampier Fuel Wharf Improvement"?

What is the material description of the SB 10 product?

What is the water supply source for Dampier and Wickham towns?

What is the audit and review

In [8]:
# Turn the raw model reply into a list of individual questions.
#
# The prompt does not state how questions should be separated, so the reply is
# split on every line break rather than only on blank lines. Each line is
# stripped, and empty or whitespace-only lines are dropped so that stray blank
# lines do not become empty questions further down. For a reply that separates
# single-line questions with blank lines, the result matches a plain
# split on "\n\n".
questions_list = [
    line.strip()
    for line in output.splitlines()
    if line.strip()
]
questions_list

['What is the name of the company that owns Hamersley Iron Pty Ltd?',
 'Which division operates the Dampier Port?',
 'What is the material handled at Parker Point?',
 'How much iron ore does East Intercourse Island handle annually?',
 'What type of equipment is used for inloading at Parker Point?',
 'What is the description of the Perth Operations Centre?',
 'When was the Remote Draft Survey System retired?',
 'What is the annual budget of the SMART Program?',
 'How many recommendations were completed since the last review in 2023?',
 'What is the expected response time for Recommendation 1?',
 'What is the incident description for the "Fixed Plant Incident"?',
 'What is the planned increase in Dampier shipping for 2023?',
 'What is the estimated cost for the Parker Point Reclaimer Replacement project?',
 'What is the project description for the "Dampier Fuel Wharf Improvement"?',
 'What is the material description of the SB 10 product?',
 'What is the water supply source for Dampier a

## Export

In [9]:
# Build the export table in one step: a sequential code ("CRD-01", "CRD-02", ...)
# followed by the question text.
#
# The frame is constructed once from complete columns rather than grown row by
# row with pd.concat. No exception is swallowed here, so a problem while
# building the table surfaces immediately rather than silently producing a
# sheet without the "Code" column.

# Generate question codes (two-digit, zero-padded, starting at 1)
code_sequence = [f"CRD-{i:02d}" for i in range(1, len(questions_list) + 1)]

df = pd.DataFrame({
    "Code": code_sequence,
    "Question": [str(question) for question in questions_list],
})

In [12]:
# Destination workbook for the generated questions.
output_file_path = os.path.join("..", "validation", "Generated Questions_V12.xlsx")

# Write the table to a single sheet without the DataFrame index. The `with`
# block closes (and thereby saves) the writer even if to_excel raises, so no
# file handle is left open on failure.
with pd.ExcelWriter(output_file_path, engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)