## Setup

In [23]:
import os
import streamlit as st
import openai
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
from azure.core.credentials import AzureKeyCredential
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core.schema import TextNode
from llama_index.core import VectorStoreIndex, PromptHelper, ServiceContext
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core.settings import Settings

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools import BaseTool
from langchain.pydantic_v1 import BaseModel, Field
from langchain.callbacks.manager import CallbackManagerForToolRun
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_openai import AzureChatOpenAI
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables import (
    RunnableLambda,
    ConfigurableFieldSpec,
    RunnablePassthrough
)
from langchain_community.chat_message_histories import ChatMessageHistory

from typing import Optional, Type
from typing import List, OrderedDict
import requests
import json
import time

In [24]:
azure_endpoint="https://esiapoc.openai.azure.com/"
openai_api_key="ade47e4a9dec4972b10542b3ab571790"

# GPT and emebedding model
openai_deployment_name="llm-gpt35"
openai_api_version="2024-02-01"
embedding_model = 'text-embedding-ada-002' 
embedding_deployment_name = "embedding-model"

# Azure AI Search
search_endpoint = "https://esg-ai-search.search.windows.net/"
search_api_key='qRCn1L9kDgvXvhX5D46rPKi5aZMe0AjGaT0TgtX6WAAzSeAY8Y03'
search_api_version = "2024-03-01-preview"
search_service_name = 'esg-ai-search'

index_name = "esia_ebrd_index"

## Testing queries

In [25]:
query = "What do you know about SESA Benban?"

In [26]:
headers = {
    "Content-Type": "application/json",
    "api-key": search_api_key  # Replace with your actual API key
}

params = {'api-version': search_api_version}

k = 5

agg_search_results = dict()

# Define the request payload
search_payload = {
    "search": query,
    "select": "id, doc_path, energy_sector, chunk",
    "vectorQueries": [{"kind": "text", "k": k, "fields": "embedding", "text": query}],
    "count": "true",
    "top": k
}

In [29]:
response = requests.post(search_endpoint + "indexes/" + index_name + "/docs/search",
                        data=json.dumps(search_payload), headers=headers, params=params)

search_results = response.json()
agg_search_results[index_name] = search_results

reranker_threshold = 0

In [30]:
search_results

{'@odata.context': "https://esg-ai-search.search.windows.net/indexes('esia_ebrd_index')/$metadata#docs(*)",
 '@odata.count': 1090,
 'value': [{'@search.score': 0.032258063554763794,
   'id': 'MTAyLUM6XFVzZXJzXElUTFMxMDQ0MTVcRGVza3RvcFxQcm9nZXR0aVxFU0lBIGRhdGFiYXNlXDIwMTZcUFZcRWd5cHQgUmVuZXdhYmxlIEZlZWQtSW4tVGFyaWZmIEZyYW1ld29yayAtIEVneXB0XDQ4MTIzc2VzYS5wZGY=',
   'chunk': 'Page 12 of 210 and meet all permitting requirements. If ESIAs were to be carried out for each of the 41 \nprojects, this would lead to duplication of effort, unnecessary cost, and loss of time. It would \nalso be confusing to local communities and other stakeholders because of the large number \nof public meetings required as part of the ESIA process.  \n \nIt is customary for the EEAA to require a Strategic/Regional Environmental and Social \nAssessment (SESA) in the case of large projects with a cluster of similar/adjacent sub-\nprojects, as is the case with the Benban Solar Park. NREA, supported by EBRD, initiated

In [11]:
content = dict()
ordered_content = OrderedDict()

for index, search_results in agg_search_results.items():
    for result in search_results['value']:
        # Show results that are at least N% of the max possible score=4
        if result['@search.score'] > reranker_threshold:
            content[result['id']] = {
                "title": result['energy_sector'],
                "name": result['energy_sector'],
                "chunk": result['chunk'],
                "location": result['doc_path'],
                # "caption": result['@search.captions'][0]['text'],
                "score": result['@search.score'],
                "index": index
            }

KeyError: 'value'

In [None]:
topk = k

count = 0  # To keep track of the number of results added
for id in sorted(content, key=lambda x: content[x]["score"], reverse=True):
    ordered_content[id] = content[id]
    count += 1
    if count >= topk:  # Stop after adding topK results
        break

## Process PDF

In [1]:
from docx import Document

In [6]:
docx_path = r"C:\Users\ITLS104415\Desktop\Progetti\ESIA database\2016\Oil and gas\Hussein Thermal Power Station Repowering - Jordan\AP Zarqa CCGT Updated ESIA Volume 3 ISSUE 3_FINAL.docx"

doc = Document(docx_path)

num = 0

for para in doc.paragraphs:
    style = para.style.name
    if 'Heading' in style:
        print(f"Heading: {para.text} (Style: {style})")
    else:
            num = num + 1
            print(f"Paragraph n. {num}: {para.text}")

Paragraph n. 1:  	 
Paragraph n. 2:  
Paragraph n. 3:  
Paragraph n. 4:  
Paragraph n. 5: ACWA Power Zarqa CCGT Project Zarqa, Jordan 
Paragraph n. 6:  
Paragraph n. 7: Updated Environmental and Social Impact Assessment  
Paragraph n. 8: Volume 3 – Outline Environmental & Social Management and Monitoring Plan 
Paragraph n. 9:   
Paragraph n. 10: Prepared for: 
Paragraph n. 11: ACWA Power 
Paragraph n. 12: July 2016 
Paragraph n. 13: 5 Capitals Environmental and Management Consulting 
Paragraph n. 14: PO Box 119899, Sheikh Zayed Road, Dubai, UAE 
Paragraph n. 15: Tel: +971 4 343 5955Fax: +971 4 343 9366 
Paragraph n. 16:  
Paragraph n. 17: 
Paragraph n. 18: Document Information 
Paragraph n. 19:  
Paragraph n. 20: Document Control 
Paragraph n. 21:  
Paragraph n. 22:  	 
Paragraph n. 23: Tables 
Paragraph n. 24: Table 3-1 Proposed Roles and Responsibilities of the EPC Contractor Team (including sub-
Paragraph n. 25: contractors and visitors) .............................................

In [38]:
def split_long_section(text, word_limit):
    words = text.split()
    chunks = []
    for i in range(0, len(words), word_limit):
        chunk = " ".join(words[i:i + word_limit])
        chunks.append(chunk)
    return chunks

In [80]:
sections = []
section = ""

for para in doc.paragraphs:
    style = para.style.name
    if 'Heading 1' in style or 'Heading 2' in style:
        # Before appending, check if the current section is too short
        if len(section.split()) < 50 and sections:
            sections[-1] += " " + section
        else:
            sections.append(section)
        section = para.text
    else:
        section = section + " " + para.text

# Handle the last section
if len(section.split()) < 50 and sections:
    sections[-1] += " " + section
else:
    sections.append(section)
    
'''
# Process sections to split long ones
processed_sections = []
for sec in sections:
    count = 0
    if len(sec.split()) > 700:
        processed_sections.extend(split_long_section(sec, 700))
        count += 1
        print(f"Splitted {count} sections")
    else:
        processed_sections.append(sec)
'''

# Print the processed sections
for i, section in enumerate(sections):
    print(f"Section {i+1}: {section}")

Section 1:   	        ACWA Power Zarqa CCGT Project Zarqa, Jordan    Updated Environmental and Social Impact Assessment   Volume 3 – Outline Environmental & Social Management and Monitoring Plan     Prepared for:  ACWA Power  July 2016  5 Capitals Environmental and Management Consulting  PO Box 119899, Sheikh Zayed Road, Dubai, UAE  Tel: +971 4 343 5955Fax: +971 4 343 9366     Document Information    Document Control     	  Tables  Table 3-1 Proposed Roles and Responsibilities of the EPC Contractor Team (including sub- contractors and visitors) ................................................................................................................. 12 Table 3-2 Complimentary Plans and Procedures .............................................................................. 18  Table 5-1 Air Quality Mitigation & Management Measures – Construction Phase ..................... 22  Table 5-2 Air Quality Mitigation & Management Measures – Operational Phase ...................... 25  

In [84]:
for i, section in enumerate(sections):
    print(f"Length of section {i+1}: {len(section)}")

print(f"Average length of sections: {sum([len(sec) for sec in sections])/len(sections)}")

Length of section 1: 4334
Length of section 2: 1552
Length of section 3: 1720
Length of section 4: 1015
Length of section 5: 6107
Length of section 6: 2681
Length of section 7: 314
Length of section 8: 1022
Length of section 9: 2991
Length of section 10: 1357
Length of section 11: 1272
Length of section 12: 1472
Length of section 13: 677
Length of section 14: 1288
Length of section 15: 3121
Length of section 16: 478
Length of section 17: 3918
Length of section 18: 447
Length of section 19: 5809
Length of section 20: 1018
Length of section 21: 457
Length of section 22: 458
Length of section 23: 800
Length of section 24: 433
Length of section 25: 800
Length of section 26: 2678
Length of section 27: 5479
Length of section 28: 770
Average length of sections: 1945.2857142857142


In [55]:
processed_sections[3]

'2.1 \tObjectives of Environmental & Social Management and Monitoring Plans  The main objective of ESMMPs is to ensure that the various adverse impacts associated with the project are properly mitigated and managed. The objective of the ESMMP at various stages of the project planning and implementation are as follows: '

In [29]:
# Print the longest section
longest_section = max(sections, key=lambda x: len(x.split()))
print(f"Longest section: {len(longest_section.split())} words")
print(f"{longest_section}")

Longest section: 741 words
Lender Requirements  EBRD  The following is referenced from EBRD Performance Requirement 1 in regard to Environmental and Social Management Plans:  ‘Taking into account the findings of the environmental and social assessment process and the outcomes of stakeholder engagement, the client will develop and implement a programme of actions to address the identified project’s environmental and social impacts and issues and other performance improvement measures to meet the PRs. Depending on the project, the programme may consist of a combination of documented operational policies, management systems, procedures, plans, practices and capital investments, collectively known as Environmental and Social Management Plans (ESMPs).   The ESMP will reflect the mitigation hierarchy and, where technically and financially feasible, favour the avoidance and prevention of impacts over minimisation, mitigation or compensation, and ensure that all relevant stages of the project 

In [37]:
longest_section
# Wrap the text of the longest section
import textwrap

wrapped_text = textwrap.fill(longest_section, width=170)
print(wrapped_text)

Lender Requirements  EBRD  The following is referenced from EBRD Performance Requirement 1 in regard to Environmental and Social Management Plans:  ‘Taking into account
the findings of the environmental and social assessment process and the outcomes of stakeholder engagement, the client will develop and implement a programme of actions
to address the identified project’s environmental and social impacts and issues and other performance improvement measures to meet the PRs. Depending on the project, the
programme may consist of a combination of documented operational policies, management systems, procedures, plans, practices and capital investments, collectively known as
Environmental and Social Management Plans (ESMPs).   The ESMP will reflect the mitigation hierarchy and, where technically and financially feasible, favour the avoidance
and prevention of impacts over minimisation, mitigation or compensation, and ensure that all relevant stages of the project are structured to meet appl

In [7]:
# Create a list to sztore the document seciotns. Each section starts with a heading
sections = []
section = ""

for para in doc.paragraphs:
    style = para.style.name
    if 'Heading' in style:
        sections.append(section)
        section = para.text
    else:
        section = section + " " + para.text
        
sections.append(section)

# Print the sections

for i, section in enumerate(sections):
    print(f"Section {i+1}: {section}")


Section 1:   	        ACWA Power Zarqa CCGT Project Zarqa, Jordan    Updated Environmental and Social Impact Assessment   Volume 3 – Outline Environmental & Social Management and Monitoring Plan     Prepared for:  ACWA Power  July 2016  5 Capitals Environmental and Management Consulting  PO Box 119899, Sheikh Zayed Road, Dubai, UAE  Tel: +971 4 343 5955Fax: +971 4 343 9366     Document Information    Document Control     	  Tables  Table 3-1 Proposed Roles and Responsibilities of the EPC Contractor Team (including sub- contractors and visitors) ................................................................................................................. 12 Table 3-2 Complimentary Plans and Procedures .............................................................................. 18  Table 5-1 Air Quality Mitigation & Management Measures – Construction Phase ..................... 22  Table 5-2 Air Quality Mitigation & Management Measures – Operational Phase ...................... 25  

In [10]:
# Count the number of words in each section
word_count = []

for section in sections:
    words = section.split()
    word_count.append(len(words))
    
# Print the word count for each section
for i, count in enumerate(word_count):
    print(f"Section {i+1} has {count} words")

Section 1 has 381 words
Section 2 has 239 words
Section 3 has 255 words
Section 4 has 49 words
Section 5 has 49 words
Section 6 has 48 words
Section 7 has 7 words
Section 8 has 123 words
Section 9 has 741 words
Section 10 has 396 words
Section 11 has 50 words
Section 12 has 148 words
Section 13 has 54 words
Section 14 has 114 words
Section 15 has 161 words
Section 16 has 15 words
Section 17 has 97 words
Section 18 has 7 words
Section 19 has 176 words
Section 20 has 170 words
Section 21 has 205 words
Section 22 has 101 words
Section 23 has 173 words
Section 24 has 40 words
Section 25 has 413 words
Section 26 has 71 words
Section 27 has 169 words
Section 28 has 372 words
Section 29 has 26 words
Section 30 has 6 words
Section 31 has 70 words
Section 32 has 623 words
Section 33 has 181 words
Section 34 has 146 words
Section 35 has 43 words
Section 36 has 11 words
Section 37 has 11 words
Section 38 has 43 words
Section 39 has 12 words
Section 40 has 11 words
Section 41 has 43 words
Section 

In [11]:
# If a section has less than 20 words, append it to the previous section
new_sections = []
section = ""

for i, section in enumerate(sections):
    if word_count[i] < 20:
        section = section + " " + sections[i+1]
    else:
        new_sections.append(section)
        section = sections[i+1]
        
new_sections.append(section)

# Print the new sections
for i, section in enumerate(new_sections):
    print(f"Section {i+1}: {section}")

IndexError: list index out of range