In [1]:
# Load variables from a .env file into the process environment before anything
# else runs. The retriever defined later in this notebook falls back to the
# AZURE_COGNITIVE_SEARCH_* environment variables when its fields are not passed.
import dotenv
dotenv.load_dotenv()

True

In [2]:
import base64

# Python port of .NET's HttpServerUtility.UrlTokenDecode
# for decoding the storage path from the metadata (which is encoded in base64url)
def url_token_decode(encoded_string: str) -> str:
    """Decode a .NET ``UrlTokenEncode``-style token into a UTF-8 string.

    The token is URL-safe base64 ("-" for "+", "_" for "/") with the "="
    padding stripped and replaced by a single trailing digit that records how
    many padding characters were removed.

    Args:
        encoded_string: The token to decode. An empty string decodes to "".

    Returns:
        The decoded payload interpreted as UTF-8 text.

    Raises:
        ValueError: If the trailing padding indicator is not a digit, or the
            remaining text is not valid base64 (``binascii.Error`` is a
            ``ValueError`` subclass).
        UnicodeDecodeError: If the decoded bytes are not valid UTF-8.
    """
    # An empty token has no padding digit to read; treat it as an empty payload
    # instead of failing with IndexError on encoded_string[-1].
    if not encoded_string:
        return ""

    # The last character is the padding count, written as an ASCII digit
    padding_char = encoded_string[-1]
    if padding_char not in "0123456789":
        raise ValueError(
            f"Invalid URL token: expected a trailing padding digit, got {padding_char!r}"
        )
    padding_count = ord(padding_char) - ord("0")

    # Drop the padding digit, then revert the URL-safe alphabet to standard base64
    payload = encoded_string[:-1].replace("-", "+").replace("_", "/")

    # Add back the removed padding
    payload += "=" * padding_count

    # Base64 decode, then interpret the bytes as UTF-8 text
    return base64.b64decode(payload).decode("utf-8")



In [13]:
# This is a custom version of the AzureCognitiveSearchRetriever which adds the `highlight`
# query parameter needed to get '@search.highlights' back from the Azure Cognitive Search API

# Based on AzureCognitiveSearchRetriever
# Source: https://github.com/langchain-ai/langchain/blob/b574507c51c0a2183ed4cde41efa5b2e8c0d98f7/libs/langchain/langchain/retrievers/azure_cognitive_search.py#L20
# LICENSE: MIT

"""Retriever for the Azure Cognitive Search service."""

from __future__ import annotations

import json
from typing import Dict, List, Optional

import aiohttp
import requests
from pydantic import Extra, root_validator

from langchain.callbacks.manager import (
    AsyncCallbackManagerForRetrieverRun,
    CallbackManagerForRetrieverRun,
)
from langchain.schema import BaseRetriever, Document
from langchain.utils import get_from_dict_or_env


class ExtendedAzureCognitiveSearchRetrieverWithHighlights(BaseRetriever):
    """Retriever for the Azure Cognitive Search service with hit highlighting.

    Sends a GET request to the index's ``docs`` endpoint and turns every entry
    of the response's ``value`` list into a ``Document``. When highlights are
    requested, the ``highlight`` query parameter is added for ``content_key``
    so that results carry an ``@search.highlights`` entry.
    """

    service_name: str = ""
    """Name of Azure Cognitive Search service"""
    index_name: str = ""
    """Name of Index inside Azure Cognitive Search service"""
    api_key: str = ""
    """API Key. Both Admin and Query keys work, but for reading data it's
    recommended to use a Query key."""
    # api_version: str = "2020-06-30"
    api_version: str = "2021-04-30-Preview"  # <--- UPDATED DEFAULT VERSION
    """API version"""
    aiosession: Optional[aiohttp.ClientSession] = None
    """ClientSession, in case we want to reuse connection for better performance."""
    content_key: str = "content"
    """Key in a retrieved result to set as the Document page_content."""
    top_k: Optional[int] = None
    """Number of results to retrieve. Set to None to retrieve all results."""

    # Added below
    return_highlights: bool = True
    """Whether to return highlights in metadata"""
    use_highlights_as_documents: bool = False
    """Whether to use the joined highlights as page_content of the returned
    documents (the full result stays available in metadata). If this is set,
    return_highlights is ignored."""

    class Config:
        extra = Extra.forbid
        arbitrary_types_allowed = True

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that service name, index name and api key exists in environment."""
        values["service_name"] = get_from_dict_or_env(
            values, "service_name", "AZURE_COGNITIVE_SEARCH_SERVICE_NAME"
        )
        values["index_name"] = get_from_dict_or_env(
            values, "index_name", "AZURE_COGNITIVE_SEARCH_INDEX_NAME"
        )
        values["api_key"] = get_from_dict_or_env(
            values, "api_key", "AZURE_COGNITIVE_SEARCH_API_KEY"
        )
        return values

    def _build_search_url(self, query: str) -> str:
        """Build the full GET URL for ``query``.

        The query text is percent-encoded so that characters with a meaning in
        URLs ('&', '#', '+', '=', ...) are sent as part of the search text
        instead of truncating or altering the query string.
        """
        # Function-scope import keeps the class usable from its notebook cell
        # without touching the cell's import block.
        from urllib.parse import quote

        base_url = f"https://{self.service_name}.search.windows.net/"
        endpoint_path = f"indexes/{self.index_name}/docs?api-version={self.api_version}"
        search_param = f"&search={quote(query, safe='')}"
        top_param = f"&$top={self.top_k}" if self.top_k else ""
        # use_highlights_as_documents needs highlights regardless of return_highlights
        wants_highlights = self.return_highlights or self.use_highlights_as_documents
        highlight_param = f"&highlight={self.content_key}" if wants_highlights else ""
        return base_url + endpoint_path + search_param + top_param + highlight_param

    @property
    def _headers(self) -> Dict[str, str]:
        """Request headers sent with every search call (content type and API key)."""
        return {
            "Content-Type": "application/json",
            "api-key": self.api_key,
        }

    def _search(self, query: str) -> List[dict]:
        """Run the search synchronously and return the response's ``value`` list.

        Raises:
            Exception: If the service answers with a status other than 200.
        """
        search_url = self._build_search_url(query)
        response = requests.get(search_url, headers=self._headers)
        if response.status_code != 200:
            raise Exception(f"Error in search request: {response}")

        return json.loads(response.text)["value"]

    async def _fetch_results(
        self, session: aiohttp.ClientSession, search_url: str
    ) -> List[dict]:
        """GET ``search_url`` on ``session`` and return the response's ``value`` list.

        Raises:
            Exception: If the service answers with a status other than 200,
                mirroring the check done by the synchronous ``_search``.
        """
        async with session.get(search_url, headers=self._headers) as response:
            if response.status != 200:
                raise Exception(f"Error in search request: {response}")
            response_json = await response.json()
        return response_json["value"]

    async def _asearch(self, query: str) -> List[dict]:
        """Run the search asynchronously and return the response's ``value`` list.

        Reuses ``aiosession`` when one was provided; otherwise a temporary
        session is opened and closed around the single request.
        """
        search_url = self._build_search_url(query)
        if self.aiosession:
            return await self._fetch_results(self.aiosession, search_url)

        async with aiohttp.ClientSession() as session:
            return await self._fetch_results(session, search_url)

    def _results_to_documents(self, search_results: List[dict]) -> List[Document]:
        """Convert raw search results into Documents (shared by sync and async paths).

        With ``use_highlights_as_documents`` set, only results that carry
        ``@search.highlights`` are returned; their page_content is the
        newline-joined highlights for ``content_key`` and the whole result is
        kept as metadata. Otherwise ``content_key`` is popped out of each
        result to become page_content and the remainder becomes metadata.
        """
        if self.use_highlights_as_documents:
            documents = []
            for result in search_results:
                if "@search.highlights" in result:
                    highlights = "\n".join(result["@search.highlights"][self.content_key])
                    documents.append(Document(page_content=highlights, metadata=result))
            return documents

        return [
            Document(page_content=result.pop(self.content_key), metadata=result)
            for result in search_results
        ]

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Synchronously search for ``query`` and return the matching Documents."""
        return self._results_to_documents(self._search(query))

    async def _aget_relevant_documents(
        self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Asynchronously search for ``query`` and return the matching Documents."""
        return self._results_to_documents(await self._asearch(query))


In [18]:
# Prompt interactively for the search text; `query` is passed to the retriever
# in the next cell.
query = input("Enter a query for Cognitive Search")

In [20]:
# Service name, index name and API key are not passed here, so the retriever's
# validator resolves them from the AZURE_COGNITIVE_SEARCH_* environment variables.
retriever = ExtendedAzureCognitiveSearchRetrieverWithHighlights(
    top_k=10,
    use_highlights_as_documents=True,
)

# Each returned document carries the joined highlights as its page_content
documents = retriever.get_relevant_documents(query)

In [21]:
print(f"Found {len(documents)} results")

# Summarize every hit: its score, the decoded source location, and how much
# text the highlights cover compared with the full document content.
for document in documents:
    # The storage path in the metadata is URL-token encoded; decode it to readable text
    storage_path = url_token_decode(document.metadata["metadata_storage_path"]).strip()
    search_score = document.metadata["@search.score"]
    full_highlights_content_length = len(document.page_content)
    full_content_length = len(document.metadata["content"])
    # Fixed: the original line had a stray "]" here, which made the cell a SyntaxError
    highlight_count = len(document.metadata["@search.highlights"]["content"])
    print(f"Search score: {search_score} Document: {storage_path}) (full document length: {full_content_length})")
    print(f"{highlight_count} highlights found totaling {full_highlights_content_length} characters")



Found 10 results
Search score: 13.246855 Document: https://mrowrpurr.blob.core.windows.net/mrowrpurr-searchdocs-1/CreationKitScripts/Actor.html) (full document length: 32768)
1 highlights found totaling 1877 characters
Search score: 11.928217 Document: https://mrowrpurr.blob.core.windows.net/mrowrpurr-searchdocs-1/CreationKitScripts/ObjectReference/ObjectReference_OnHit.html) (full document length: 26190)
1 highlights found totaling 1054 characters
Search score: 11.631902 Document: https://mrowrpurr.blob.core.windows.net/mrowrpurr-searchdocs-1/CreationKitScripts/Form.html) (full document length: 32768)
1 highlights found totaling 2243 characters
Search score: 11.424819 Document: https://mrowrpurr.blob.core.windows.net/mrowrpurr-searchdocs-1/CreationKitScripts/ActiveMagicEffect.html) (full document length: 32768)
1 highlights found totaling 2955 characters
Search score: 11.1389475 Document: https://mrowrpurr.blob.core.windows.net/mrowrpurr-searchdocs-1/CreationKitScripts/Game.html) (ful