## Setup

In [12]:
import os
import requests

import pprint
import textwrap
import xlsxwriter
import pandas as pd
from typing import List
import datetime
import time

from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings  

#Azure Search
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.indexes.models import (
    ComplexField,
    CorsOptions,
    SearchIndex,
    ScoringProfile,
    SearchFieldDataType,
    SimpleField,
    SearchField,
    SearchableField,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch
)

from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)
from llama_index.core.settings import Settings
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import (
    ServiceContext,
    PromptHelper,
)
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

import nltk
from nltk.tokenize import word_tokenize
import time
import openai
import base64

from dotenv import load_dotenv
# Load environment variables from the credentials file one level above the
# current working directory. The path is relative, so it resolves against
# whatever the working directory is when this cell runs.
load_dotenv(os.path.join("..", "Azure OpenAI credentials.env"))

True

In [13]:
# --- Azure OpenAI settings (all read from environment variables) ---
# os.environ[...] raises KeyError if a variable is missing.
azure_endpoint = os.environ['GLOBAL_AZURE_ENDPOINT']
openai_api_key = os.environ['GLOBAL_OPENAI_API_KEY']

openai_deployment_name = os.environ['GLOBAL_GPT_DEPLOYMENT_NAME']
openai_api_version = os.environ['GLOBAL_OPENAI_API_VERSION']
embedding_model = os.environ['GLOBAL_EMBEDDING_MODEL']
embedding_deployment_name = os.environ['GLOBAL_EMBEDDING_DEPLOYMENT_NAME']

# --- Azure AI Search settings ---
search_endpoint = os.environ['SEARCH_ENDPOINT']
search_api_key = os.environ['SEARCH_API_KEY']
# NOTE(review): search_api_version is read but not used anywhere in this cell.
search_api_version = os.environ['SEARCH_API_VERSION']
search_service_name = os.environ['SEARCH_SERVICE_NAME']

# langsmith_api_key = os.environ['LANGSMITH_API_KEY']

# NOTE(review): search_url is built from the service name, but the SearchClient
# below is created with search_endpoint instead -- confirm both point at the
# same service.
search_url = f"https://{search_service_name}.search.windows.net/"
search_credential = AzureKeyCredential(search_api_key)
index_name = "crd-ontologies-desc"
search_client = SearchClient(search_endpoint, index_name, search_credential)

# Chat model client; temperature=0 is passed explicitly.
llm = AzureChatOpenAI(
    deployment_name=openai_deployment_name, 
    openai_api_version=openai_api_version, 
    openai_api_key=openai_api_key, 
    azure_endpoint=azure_endpoint, 
    temperature=0
)

# Embedding client; reuses the same endpoint, key and API version as the chat model.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=embedding_deployment_name,
    api_version=openai_api_version,
    api_key=openai_api_key,
    azure_endpoint=azure_endpoint,
)

# Number of dimensions used for the index's vector field.
# NOTE(review): presumably matches the output size of the embedding deployment -- confirm.
dimensionality = 1536

## Define index structure

In [14]:
# search_url, index_name and dimensionality are assigned the same values here
# as in the setup cell; only `client` is new.
search_url = f"https://{search_service_name}.search.windows.net/"
index_name = "crd-ontologies-desc"
# Index-management client (creates/updates index definitions), as opposed to
# the document-level SearchClient.
client = SearchIndexClient(search_url, search_credential)
dimensionality = 1536

In [None]:
# Field schema of the index: a string key, three searchable text fields and
# one vector field.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="summary", type=SearchFieldDataType.String, searchable=True),
    SearchableField(name="individual", type=SearchFieldDataType.String, searchable=True),
    SearchableField(name="doc_path", type=SearchFieldDataType.String, searchable=True, sortable=True, filterable=True),
    # Vector field; its profile name must match a VectorSearchProfile declared below.
    SearchField(name="embedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=dimensionality, searchable=True, vector_search_profile_name="myExhaustiveKnnProfile"),
]

# CORS: any origin is allowed; max_age_in_seconds is set to 60.
cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)

# Vector search configuration. Despite its name, "myExhaustiveKnnProfile" is
# bound to an HnswAlgorithmConfiguration ("my-algorithms-config"). The profile
# also references the Azure OpenAI vectorizer "myVectorizer" defined here.
vector_search = VectorSearch(
    profiles=[VectorSearchProfile(name="myExhaustiveKnnProfile", algorithm_configuration_name="my-algorithms-config", vectorizer_name="myVectorizer")],
    algorithms=[HnswAlgorithmConfiguration(name="my-algorithms-config")],
    vectorizers=[
        AzureOpenAIVectorizer(
            vectorizer_name="myVectorizer",
            parameters=AzureOpenAIVectorizerParameters(
                deployment_name=embedding_deployment_name,
                # `embedding_model` must still be the model-name string read
                # from the environment when this cell runs.
                model_name = embedding_model,
                # The Azure OpenAI key is passed into the vectorizer definition.
                api_key=openai_api_key,
                resource_url=azure_endpoint,
            ),
        )
    ]
)

# No scoring profiles are defined; an empty list is passed explicitly.
scoring_profiles: List[ScoringProfile] = []
index = SearchIndex(name=index_name, fields=fields, scoring_profiles=scoring_profiles, cors_options=cors_options, vector_search=vector_search)

In [18]:
# Create the index on the service. Any exception is caught and only printed,
# so the notebook keeps running even if creation fails -- check the output
# before running the cells that depend on the index.
try:
    result = client.create_index(index)
    print(f"Index '{index_name}' created successfully.")
except Exception as ex:
    print(f"Failed to create index '{index_name}'.")
    print(f"Exception: {ex}")

Index 'crd-ontologies-desc' created successfully.


## Create indexer class

In [6]:
def get_embeddings(text, azure_endpoint, api_key, api_version, deployment_name):
    """Embed a single piece of text with an Azure OpenAI embedding deployment.

    A new ``openai.AzureOpenAI`` client is built on every call.

    Args:
        text: The string to embed; it is sent as a one-element input list.
        azure_endpoint: Endpoint passed to the Azure OpenAI client.
        api_key: API key passed to the client.
        api_version: API version passed to the client.
        deployment_name: Deployment name, passed as ``model`` to the embeddings call.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
    """
    azure_client = openai.AzureOpenAI(
        api_version=api_version,
        api_key=api_key,
        azure_endpoint=azure_endpoint,
    )
    response = azure_client.embeddings.create(model=deployment_name, input=[text])
    first_item = response.data[0]
    return first_item.embedding

In [7]:
def count_tokens(text):
    """Return the number of tokens ``word_tokenize`` produces for ``text``.

    NOTE(review): this counts whatever ``word_tokenize`` emits, which is
    presumably not the same tokenisation the embedding model uses -- treat
    the result as an approximation and confirm before relying on it as a
    hard model limit.
    """
    return len(word_tokenize(text))

In [8]:
# Both values are captured once, when this cell runs; they do not change
# between later calls to generate_custom_id.
date_time = datetime.datetime.now()
timestamp = datetime.datetime.fromtimestamp(time.time()).strftime(
    '%Y-%m-%d %H:%M:%S'
)


def generate_custom_id(payload):
    '''
    Build an id by URL-safe base64-encoding the module-level ``timestamp``
    followed by ``payload``.

    The timestamp is fixed when the cell is executed, so the same payload
    yields the same id until the cell is run again.
    '''
    raw_key = str(timestamp) + payload
    encoded = base64.urlsafe_b64encode(raw_key.encode('utf-8'))
    return encoded.decode('utf-8')

In [9]:
class IndexManager:
    '''
    Helper for creating an Azure AI Search index and deleting documents from it.

    Attributes:
        index_name: Name of the index managed by default; ``create_index``
            accepts an override.
        doc_type: Label stored on the instance; none of the methods in this
            class read it.
        search_url: ``https://<search_service_name>.search.windows.net/``,
            built from the service name passed to the constructor.
        credential: ``AzureKeyCredential`` wrapping the search service API key.
        dimensionality: Number of dimensions of the ``embedding`` vector field.
    '''

    # Maps the label accepted by delete_index_documents to the index field
    # used in the OData filter. "Document Path" targets the `doc_path` field,
    # the only filterable field in the schema built by create_index.
    _FILTER_FIELDS = {
        "Region": "region",
        "Document Name": "doc_name",
        "Document Path": "doc_path",
    }

    def __init__(self, index_name, search_service_name, search_service_api_key, dimensionality, doc_type="doc"):
        self.index_name = index_name
        self.doc_type = doc_type
        self.search_url = f"https://{search_service_name}.search.windows.net/"
        self.credential = AzureKeyCredential(search_service_api_key)
        self.dimensionality = dimensionality

    @classmethod
    def _build_filter(cls, field, field_value):
        """Return the OData filter ``<index_field> eq '<value>'`` for a supported label.

        Raises:
            ValueError: If ``field`` is not one of the supported labels.
        """
        index_field = cls._FILTER_FIELDS.get(field)
        if index_field is None:
            supported = ", ".join(repr(label) for label in cls._FILTER_FIELDS)
            raise ValueError(f"Unsupported field {field!r}; expected one of: {supported}")
        # OData string literals escape a single quote by doubling it.
        escaped_value = str(field_value).replace("'", "''")
        return f"{index_field} eq '{escaped_value}'"

    def create_index(self, index_name=None):
        """
        Create a search index with the chunk schema (id, chunk, doc_path, embedding).

        Args:
            index_name: Name of the index to create; defaults to ``self.index_name``.

        Any exception is caught and printed; nothing is raised or returned.
        """
        if index_name is None:
            index_name = self.index_name
        try:
            client = SearchIndexClient(self.search_url, self.credential)

            fields = [
                SimpleField(name="id", type=SearchFieldDataType.String, key=True),
                SearchableField(name="chunk", type=SearchFieldDataType.String, searchable=True),
                SearchableField(name="doc_path", type=SearchFieldDataType.String, searchable=True, sortable=True, filterable=True),
                # Use the instance's dimensionality, not a notebook-level global.
                SearchField(
                    name="embedding",
                    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    vector_search_dimensions=self.dimensionality,
                    searchable=True,
                    vector_search_profile_name="myExhaustiveKnnProfile",
                ),
            ]

            cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)

            # Despite its name, the profile is backed by an HNSW algorithm configuration.
            vector_search = VectorSearch(
                profiles=[VectorSearchProfile(name="myExhaustiveKnnProfile", algorithm_configuration_name="my-algorithms-config")],
                algorithms=[HnswAlgorithmConfiguration(name="my-algorithms-config")],
            )

            scoring_profiles: List[ScoringProfile] = []
            index = SearchIndex(name=index_name, fields=fields, scoring_profiles=scoring_profiles, cors_options=cors_options, vector_search=vector_search)

            client.create_index(index)
            print(f"Index '{index_name}' created successfully.")

        except Exception as ex:
            print(f"An error occurred: {ex}")

    def delete_index_documents(self, field, field_value):
        """
        Delete the documents in ``self.index_name`` whose field equals ``field_value``.

        Args:
            field: One of "Region", "Document Name" or "Document Path"; selects
                the index field (``region``, ``doc_name``, ``doc_path``) to filter on.
            field_value: Value the selected field must equal.

        Any exception, including an unsupported ``field``, is caught and
        printed; nothing is raised or returned.

        NOTE(review): only the documents returned by a single search call are
        deleted. Presumably that is one page of results, so large document sets
        may need repeated calls -- confirm against the service's paging behaviour.
        """
        try:
            # Validate the field first so an unsupported label produces a clear
            # message instead of an unbound-variable error later on.
            filter_query = self._build_filter(field, field_value)

            search_client = SearchClient(self.search_url, self.index_name, self.credential)

            # Materialise the matches before deleting so the result set is not
            # modified while it is being iterated.
            documents = list(
                search_client.search(search_text="*", include_total_count=False, filter=filter_query)
            )

            for result in documents:
                document_fields = {key: value for key, value in result.items() if key != "@search.documentId"}
                search_client.delete_documents(documents=[document_fields])

            print(f"Documents with {field} '{field_value}' deleted successfully!")

        except Exception as ex:
            print(f"An error occurred: {ex}")

In [10]:
def load_index(indx, chunk_content, chunkcount, file_path):
    """Embed one chunk and upload it as a document to the index described by ``indx``.

    Args:
        indx: Object exposing ``search_url``, ``index_name`` and ``credential``
            (e.g. an ``IndexManager``).
        chunk_content: Text of the chunk; stored in ``chunk`` and embedded.
        chunkcount: Running chunk number; combined with ``file_path`` to form the key.
        file_path: Path of the source document; stored in ``doc_path``.

    Returns:
        Whatever ``SearchClient.upload_documents`` returns, so callers can
        check the outcome (previously the result was discarded).
    """
    # Document key: URL-safe base64 of "<chunkcount>-<file_path>", so uploading
    # the same chunk number for the same file reuses the same key.
    raw_key = f"{chunkcount}-{file_path}"
    key = base64.urlsafe_b64encode(raw_key.encode('utf-8')).decode('utf-8')

    # Endpoint, key, API version and deployment come from notebook-level settings.
    embeddings_data = get_embeddings(
        chunk_content,
        azure_endpoint,
        openai_api_key,
        openai_api_version,
        embedding_deployment_name,
    )

    document = {
        "id": key,
        "doc_path": file_path,
        "chunk": chunk_content,
        "embedding": embeddings_data,
    }

    search_client = SearchClient(indx.search_url, indx.index_name, indx.credential)
    return search_client.upload_documents(documents=[document])

## Index documents

In [11]:
directory = "reports"

# Names of the PDF files found directly inside `directory`, in the order
# os.listdir returns them.
all_docs = [file for file in os.listdir(directory) if file.endswith(".pdf")]

all_docs

['Cape_Lambert_Operations_CRA_MFL_2023_3 1.pdf',
 'Dampier_Port_CRA_MFL_2023.pdf',
 'Oyu_Tolgoi_Copper_Mine_CRA_MFL_2023_3 1.pdf']

In [12]:
# Change cwd to the directory where the pdfs are stored
os.chdir(directory)
os.getcwd()

'c:\\Users\\ITLS104415\\Desktop\\GitHub Repositories\\Critical-Risk-Digitization\\Retrieval\\indexing\\reports'

In [None]:
from tqdm import tqdm

indx = IndexManager(index_name, search_service_name, search_api_key, dimensionality)

# Upper bound used both for the initial sentence split and for deciding
# whether a semantic chunk must be split again before upload.
max_tokens = 7500

# Fallback splitter for semantic chunks that are still too large.
sentence_text_splitter = SentenceSplitter(chunk_size=max_tokens, chunk_overlap=50)

# Embedding model that drives the semantic splitter. It gets its own name so
# it does not overwrite the `embedding_model` string read from the environment,
# which the index definition passes as the vectorizer's model_name.
semantic_embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name=embedding_deployment_name,
    api_key=openai_api_key,
    azure_endpoint=azure_endpoint,
    api_version=openai_api_version,
)

splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, embed_model=semantic_embed_model
)

# NOTE(review): the first entry of all_docs is skipped -- presumably it was
# indexed in an earlier run; confirm before re-running on a fresh index.
for file_path in all_docs[1:]:

    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
    print(f"Document {file_path} loaded")

    # Stage 1: coarse sentence-based split, with a fresh pipeline per file.
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(chunk_size=max_tokens, chunk_overlap=100),
        ]
    )
    split_documents = pipeline.run(documents=documents)

    # Stage 2: semantic split of every coarse piece.
    nodes = []
    for doc in tqdm(split_documents, desc="Processing documents"):
        nodes.extend(splitter.get_nodes_from_documents([doc]))

    print(f"Total number of nodes: {len(nodes)}")

    # Stage 3: upload each chunk; chunk numbers restart at 1 for every file.
    chunkcount = 0

    for section_info in nodes:
        chunk_content = section_info.get_content()

        # NOTE(review): count_tokens and SentenceSplitter presumably count
        # tokens differently, so this threshold is approximate -- confirm.
        if count_tokens(chunk_content) >= max_tokens:
            # Too large: divide into sub-chunks of at most max_tokens.
            subchunks = sentence_text_splitter.split_text(chunk_content)
            for subchunk in subchunks:
                subchunk_content = str(subchunk)
                # Skip empty sub-chunks.
                if subchunk_content != '':
                    chunkcount = chunkcount + 1
                    # Upload the sub-chunk itself, not the oversized parent chunk.
                    load_index(indx, subchunk_content, chunkcount, file_path)
                    print(f"Chunk {chunkcount} has been indexed (subchunk)")

        else:
            if chunk_content != '':
                chunkcount = chunkcount + 1
                load_index(indx, chunk_content, chunkcount, file_path)
                print(f" Chunk {chunkcount} has been indexed")

## Delete elements from index

In [3]:
# Values to match against when deleting chunks from the index.
docs_to_be_deleted = ["CRA V16 MFL Instantiated Ontology.ttl"]

# NOTE: this rebinds the notebook-level `index_name` and `search_client` to a
# different index than the one set in the setup cell. Any cell run after this
# one that reads those names will target "crd-vector-store-ontologies".
index_name = "crd-vector-store-ontologies"
search_client = SearchClient(search_endpoint, index_name, search_credential)

In [4]:
for blob_path in docs_to_be_deleted:
    field_value = blob_path
    # OData string literals escape a single quote by doubling it.
    escaped_value = field_value.replace("'", "''")
    filter_query = f"doc_path eq '{escaped_value}'"

    # Re-query until the filter matches nothing.
    # NOTE(review): presumably needed because one search call returns only a
    # page of the matching documents -- confirm.
    while True:
        documents = search_client.search(search_text="*", include_total_count=False, filter=filter_query)
        document_list = list(documents)

        if not document_list:
            print(f"No more documents found for '{field_value}'. Moving to the next document.")
            break  # Nothing left to delete for this path

        # Delete every document found in this pass and count the successes.
        deleted_in_pass = 0
        for result in document_list:
            document_fields = {key: value for key, value in result.items() if key != "@search.documentId"}
            outcomes = search_client.delete_documents(documents=[document_fields])
            if all(outcome.succeeded for outcome in outcomes):
                deleted_in_pass += 1
                print(f"Chunk with doc_path eq '{field_value}' deleted successfully!")
            else:
                print(f"Failed to delete a chunk with doc_path eq '{field_value}'.")

        # Without this guard, a pass in which every delete fails would repeat
        # forever because the same documents keep matching the filter.
        if deleted_in_pass == 0:
            print(f"No chunk could be deleted for '{field_value}'; stopping to avoid an endless loop.")
            break

Chunk with doc_name eq 'CRA V16 MFL Instantiated Ontology.ttl' deleted successfully!
Chunk with doc_name eq 'CRA V16 MFL Instantiated Ontology.ttl' deleted successfully!
Chunk with doc_name eq 'CRA V16 MFL Instantiated Ontology.ttl' deleted successfully!
Chunk with doc_name eq 'CRA V16 MFL Instantiated Ontology.ttl' deleted successfully!
Chunk with doc_name eq 'CRA V16 MFL Instantiated Ontology.ttl' deleted successfully!
Chunk with doc_name eq 'CRA V16 MFL Instantiated Ontology.ttl' deleted successfully!
Chunk with doc_name eq 'CRA V16 MFL Instantiated Ontology.ttl' deleted successfully!
Chunk with doc_name eq 'CRA V16 MFL Instantiated Ontology.ttl' deleted successfully!
Chunk with doc_name eq 'CRA V16 MFL Instantiated Ontology.ttl' deleted successfully!
Chunk with doc_name eq 'CRA V16 MFL Instantiated Ontology.ttl' deleted successfully!
Chunk with doc_name eq 'CRA V16 MFL Instantiated Ontology.ttl' deleted successfully!
Chunk with doc_name eq 'CRA V16 MFL Instantiated Ontology.ttl' de