## Setup

In [1]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
from azure.core.credentials import AzureKeyCredential

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langgraph.graph import END, StateGraph

from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_community.utilities.alpha_vantage import AlphaVantageAPIWrapper
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.tools import DuckDuckGoSearchRun
from langgraph.graph import END, StateGraph, START
from langgraph.prebuilt import ToolNode
from langgraph.prebuilt import tools_condition
from langchain_community.retrievers import AzureAISearchRetriever
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.prompts.base import BasePromptTemplate

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

from langchain_core.messages import (
    BaseMessage,
    ToolMessage,
    HumanMessage,
)
from IPython.display import Image, display

from langsmith import Client
from langsmith import traceable
import os
import requests

import pprint
import textwrap
import xlsxwriter
import pandas as pd
from typing import List
import datetime
import time

#Azure Search
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.indexes.models import (
    ComplexField,
    CorsOptions,
    SearchIndex,
    ScoringProfile,
    SearchFieldDataType,
    SimpleField,
    SearchField,
    SearchableField,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
)

from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)
from llama_index.core.settings import Settings
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import (
    ServiceContext,
    PromptHelper,
)
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

import nltk
from nltk.tokenize import word_tokenize
import time
import openai
import base64

from dotenv import load_dotenv
# Load settings from the local env file; the configuration cell reads its
# endpoints, keys and deployment names from os.environ afterwards.
# NOTE(review): presumably this file holds the secrets — keep it out of version control.
load_dotenv("Azure OpenAI credentials.env")

True

In [2]:
# Map the index field names (document id, chunk text, embedding vector) for the
# LangChain AzureSearch integration.
# NOTE(review): the import is deliberately kept AFTER the environment is set;
# the integration appears to read these variables at import time — confirm
# before moving the import into the top import cell.
os.environ.update(
    {
        "AZURESEARCH_FIELDS_ID": "id",
        "AZURESEARCH_FIELDS_CONTENT": "chunk",
        "AZURESEARCH_FIELDS_CONTENT_VECTOR": "embedding",
    }
)

from langchain.vectorstores import AzureSearch

In [3]:
# --- Azure OpenAI settings (a missing variable raises KeyError here) ---
azure_endpoint = os.environ["GLOBAL_AZURE_ENDPOINT"]
openai_api_key = os.environ["GLOBAL_OPENAI_API_KEY"]

openai_deployment_name = os.environ["GLOBAL_GPT_DEPLOYMENT_NAME"]
openai_api_version = os.environ["GLOBAL_OPENAI_API_VERSION"]
embedding_model = os.environ["GLOBAL_EMBEDDING_MODEL"]
embedding_deployment_name = os.environ["GLOBAL_EMBEDDING_DEPLOYMENT_NAME"]

# --- Azure AI Search settings ---
search_endpoint = os.environ["SEARCH_ENDPOINT"]
search_api_key = os.environ["SEARCH_API_KEY"]
search_api_version = os.environ["SEARCH_API_VERSION"]
search_service_name = os.environ["SEARCH_SERVICE_NAME"]

# langsmith_api_key = os.environ['LANGSMITH_API_KEY']

# Service URL derived from the service name; the client below is built from
# `search_endpoint`, not from this value.
search_url = f"https://{search_service_name}.search.windows.net/"
search_credential = AzureKeyCredential(search_api_key)
index_name = "crd-vector-store"
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=search_credential,
)

# Chat model used for keyword extraction and answer generation (temperature 0).
llm = AzureChatOpenAI(
    deployment_name=openai_deployment_name,
    openai_api_version=openai_api_version,
    openai_api_key=openai_api_key,
    azure_endpoint=azure_endpoint,
    temperature=0,
)

# Embedding model; it shares the endpoint, key and API version with the chat model.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=embedding_deployment_name,
    api_version=openai_api_version,
    api_key=openai_api_key,
    azure_endpoint=azure_endpoint,
)

## Prompts and chain

In [4]:
# Index to query; same value as the `index_name` set in the configuration cell.
index_name: str = "crd-vector-store"

# LangChain vector-store wrapper over the search index. `embeddings.embed_query`
# is supplied as the embedding function (presumably used to embed each query
# text before the vector part of a search — confirm against the AzureSearch docs).
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=search_endpoint,
    azure_search_key=search_api_key,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
)

In [24]:
# Answer-generation prompt. It has two placeholders:
#   {index_results} - the documents returned by the vector store search
#   {question}      - the user's original question
# The fenced section inside the template is an example of the expected answer
# layout, not data to be quoted back.
COMBINED_QA_TEMPLATE = """
Generate a natural language response using the following source as context: 
    
1. The results of a search in a vector store containing document chunks (primary source)

Use the primary source to generate the response
The information provided is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
Don't use internal knowledge to answer the question, if no information is available juts say that there is no information available.

This is the template for the response:
```
These are the Critical Risk Scenarios along with their respective incident descriptions:

1. **DPO-01**: A 1 in 100-year Category 5 tropical cyclone directly impacts the Karratha and Dampier townships and the Dampier port operations causing extensive damage in the region and storm surge and flooding. 

2. **DPO-04**: One of the two Parker Point shiploaders is lost. The loss could be caused by extreme weather conditions (high wind and/or wave action), structural failure due to fatigue (of either the wharf or shiploader), exceeding design loads, high impact collision, or the loss of stability caused by a vessel rising on an incoming tide impacting a shiploader boom that is unable to move clear or some combination of these.

3. **DPO-05**: The dredged portion of the common departure channel for both EII and PPt (between "Middle Ground" and "Fairway" markers) becomes blocked due to a grounded ship. This impacts loaded vessels departing from both PPt and EII. The most likely causes could be due to a collision, on-board fire, or grounding. The sunken ship prevents the passage of fully loaded bulk carriers.
```
### Vector store results: {index_results}

### User's question: {question}
"""

# The declared variables must match the template placeholders: the template
# uses {question} (and callers pass "question"), so "prompt" was wrong here.
COMBINED_QA_PROMPT = PromptTemplate(
    input_variables=["index_results", "question"], template=COMBINED_QA_TEMPLATE
)

In [25]:
# Pipe the QA prompt into the chat model: invoking the chain with a dict of
# prompt variables formats the prompt and sends it to `llm`.
combined_prompt: BasePromptTemplate = COMBINED_QA_PROMPT
combined_chain = combined_prompt | llm

## Walkthrough

In [32]:
# Instruction text that is concatenated in front of the user's question to ask
# the LLM for search keywords only (note the trailing space before the question).
system_prompt = "Extract the keywords from this user's question. They will be used for a search in a vector store. The output should be only a text string. User's query: "

In [33]:
# Sample question used for the single-question walkthrough.
input_query = "Tell me about the 2021 Dredging campaign"

In [39]:
# Keyword-extraction prompt: the instruction text directly followed by the question.
raw_query = f"{system_prompt}{input_query}"
raw_query

"Extract the keywords from this user's question. They will be used for a search in a vector store. The output should be only a text string. User's query: Tell me about the 2021 Dredging campaign"

In [37]:
# Ask the LLM to extract the search keywords; the reply is a message object
# whose text is read from `.content`.
search_query = llm.invoke(raw_query)

In [28]:
# Show the extracted keywords (the text of the LLM reply).
search_query.content

'2021 Dredging campaign'

In [29]:
# Query the vector store with the extracted keywords (hybrid search, k=10).
index_results = vector_store.hybrid_search(query=search_query.content, k=10)

In [30]:
# Inspect the retrieved documents (metadata and page content).
# NOTE(review): this prints every hit in full; consider slicing before sharing the notebook.
index_results

[Document(metadata={'doc_path': 'Dampier_Port_CRA_MFL_2023_3.pdf', 'id': 'NTY0LURhbXBpZXJfUG9ydF9DUkFfTUZMXzIwMjNfMy5wZGY=', '@search.score': 0.03226646035909653, '@search.reranker_score': None, '@search.highlights': None, '@search.captions': None}, page_content='This remains  in progress.\nHawcroft  2021:  Capital  funds  are approved  for work  to commence  on site following  the construction  of purpose\nbuilt jig lifting  frame  for the lay by berth.  As works  have  not be completed  this remains  in progress.\nDPO  2021:  Vendor  engaged  to commence  works  in Q4 2021.\nHawcroft  2022:  This work  is underway.\nDP0 2022:  The estimated  loss for this scenario  is considerably  reduced  as we have  blast  and painted  all critical\ncorrosion  areas  on the 9 of the 12 walkways  at a cost of approximately  $4.5M.  During  this work,  actual\ncorrosion  below  rust was somewhat  less than expected  and no structural  patching  was required  for any areas\ngiven  corrosion  allowanc

In [31]:
# Generate the answer: fill the QA prompt with the original question and the
# retrieved documents, then print the text of the model's reply.
combined_chain_result = combined_chain.invoke(
    dict(question=input_query, index_results=index_results)
)
result = combined_chain_result.content

print(result)

The 2021 Dredging Campaign at Dampier Port involved the use of a trailer suction dredge, survey vessels, and a sweep vessel. The total expenditure for this campaign was approximately USD 15 million. Key highlights of the campaign include:

- A total of 107,000 cubic meters of material was dredged from channels, turning basins, and berth pockets.
- There were no environmental incidents or marine fauna fatalities during the campaign.
- Environmental monitoring was completed in accordance with Sea Dumping Permits (SDP), and SDP compliance reporting was successfully submitted.


## Validation

In [6]:
# Load the validation questions; the workbook must contain a "Question" column.
questions_df = pd.read_excel("validation/Generated Questions.xlsx")
# Plain Python list of the question values, in sheet order.
questions = questions_df["Question"].tolist()

In [7]:
# Empty results table: one row per validation question (question text, generated answer).
results_df = pd.DataFrame(columns=["input_query", "result"])

In [None]:
# Run the walkthrough pipeline for every validation question:
# keyword extraction -> hybrid search -> answer generation.
# Rows are collected in a list and the DataFrame is built once at the end, so
# re-running the cell replaces the results instead of appending duplicates and
# avoids the quadratic cost of concatenating inside the loop. If the loop fails
# midway, the rows gathered so far are still available in `validation_records`.
validation_records = []

for input_query in questions:
    # Keyword-extraction prompt: instruction text followed by the question.
    raw_query = system_prompt + input_query
    search_query = llm.invoke(raw_query)

    # Vector store search: pass the reply TEXT (`.content`), not the message
    # object itself, exactly as the single-question walkthrough does.
    index_results = vector_store.hybrid_search(query=search_query.content, k=10)

    # Answer generation
    combined_chain_result = combined_chain.invoke(
        {"question": input_query, "index_results": index_results}
    )

    validation_records.append(
        {
            "input_query": str(input_query),
            "result": str(combined_chain_result.content),
        }
    )

# Explicit columns keep the expected schema even when there are no questions.
results_df = pd.DataFrame(validation_records, columns=["input_query", "result"])

In [11]:
# Generate question codes (CRD-01, CRD-02, ...) in row order.
code_sequence = [f"CRD-{i:02d}" for i in range(1, len(results_df) + 1)]

# The cell may be re-run after the column was already added. Handle that case
# explicitly instead of hiding every possible error behind a bare
# `except: pass`; any unexpected failure now surfaces.
if "code" in results_df.columns:
    results_df["code"] = code_sequence
else:
    results_df.insert(0, "code", code_sequence)

# Write data to Excel; the timestamp in the file name keeps earlier runs intact.
timestamp = pd.Timestamp.now().strftime("%Y_%m_%d_%H_%M_%S")

with pd.ExcelWriter(f"validation/Vector Validation_{timestamp}.xlsx", engine='xlsxwriter') as writer:

    # Save results data to Sheet1
    results_df.to_excel(writer, sheet_name='Sheet1', index=False)

    # Access the worksheet for formatting
    worksheet1 = writer.sheets['Sheet1']

    # Set the same width (30) on every exported column
    worksheet1.set_column(0, len(results_df.columns) - 1, 30)