# Grounding Generative AI using Enterprise Search Results

_This colab demonstrates examples of Retrieval Augmented Generation (RAG)
and Reasoning + Acting (ReAct) workflows that make use of Google Enterprise Search (ES) and Large Language Models (LLMs). Langchain is also used for many Quality of Life features. Please make a separate copy of this notebook before making any changes._


* _**Credit to and  inspiration** `rthallam@` for their great work on langchain integration [generative-ai Github repo](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/language/examples/langchain-intro)_

* _**Author**: `saurabhmangal@`_
* _**Last Update**: June 31th 2023_
* _**Context**: [RAG Arxiv Paper](https://arxiv.org/pdf/2005.11401.pdf) & [ReACT Arxiv Paper](https://arxiv.org/abs/2210.03629.pdf)_


In [1]:
# from google.colab import auth
# from google.auth import default

# auth.authenticate_user()
# creds, _ = default()

# Installation
**NOTE: This requires shutting down the runtime.**

In [2]:
# # Install langchain
# ! pip install langchain --upgrade

# # Install Enterprise Search SDK and Vertex PaLM endpoint
# ! pip install google_cloud_discoveryengine

# # Install Google Cloud Platform
# ! pip install google-cloud-aiplatform --upgrade

# import IPython
# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

# Imports and classes

In [3]:
from google.cloud import discoveryengine_v1beta
from google.protobuf.json_format import MessageToDict


from langchain.llms import VertexAI
from langchain.embeddings import VertexAIEmbeddings

from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.base import Chain
from langchain.chains.question_answering import load_qa_chain
from langchain.agents import AgentType, initialize_agent
from langchain.tools import Tool
from langchain.callbacks.manager import Callbacks, CallbackManagerForChainRun
from langchain.schema import AgentAction, AgentFinish, Document

from typing import Any, Mapping, List, Dict, Optional, Tuple, Sequence, Union
import json, re

In [4]:
#@title Additional Enterprise Search Classes and helper functions

class EnterpriseSearchRetriever():
  """Retriever class to fetch documents or snippets from a search engine."""
  def __init__(self,
               project,
               search_engine,
               location='global',
               serving_config_id='default_config'):
    self.search_client = discoveryengine_v1beta.SearchServiceClient()
    self.serving_config: str = self.search_client.serving_config_path(
            project=project,
            location=location,
            data_store=search_engine,
            serving_config=serving_config_id,
            )

  def _search(self, query:str):
    """Helper function to run a search"""
    request = discoveryengine_v1beta.SearchRequest(serving_config=self.serving_config, query=query)
    return self.search_client.search(request)

  def get_relevant_documents(self, query: str) -> List[Document]:
    """Retrieve langchain Documents from a search response"""
    res = self._search(query)
    documents = []
    for result in res.results:
        data = MessageToDict(result.document._pb)
        metadata = data.copy()
        if 'derivedStructData' in metadata:
            del metadata['derivedStructData']
        if 'structData' in metadata:
            del metadata['structData']
        if data.get('derivedStructData') is None:
            content = json.dumps(data.get('structData', {}))
        else:
            content = json.dumps([d.get('snippet') for d in data.get('derivedStructData', {}).get('snippets', []) if d.get('snippet') is not None])
        documents.append(Document(page_content=content, metadata=metadata))
    return documents

  def get_relevant_snippets(self, query: str) -> List[str]:
    """Retrieve snippets from a search query"""
    res = self._search(query)
    snippets = []
    for result in res.results:
        data = MessageToDict(result.document._pb)
        if data.get('derivedStructData', {}) == {}:
            snippets.append(json.dumps(data.get('structData', {})))
        else:
            snippets.extend([d.get('snippet') for d in data.get('derivedStructData', {}).get('snippets', []) if d.get('snippet') is not None])
    return snippets


class EnterpriseSearchChain(Chain):
    """Chain that queries an Enterprise Search Engine and summarizes the responses."""

    chain: Optional[LLMChain]
    search_client: Optional[EnterpriseSearchRetriever]

    def __init__(self,
                 project,
                 search_engine,
                 chain,
                 location='global',
                 serving_config_id='default_config'):
        super().__init__()
        self.chain = chain
        self.search_client = EnterpriseSearchRetriever(project, search_engine, location, serving_config_id)

    @property
    def input_keys(self) -> List[str]:
        return ['query']

    @property
    def output_keys(self) -> List[str]:
        return ['summary']

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        _run_manager = CallbackManagerForChainRun.get_noop_manager()
        query = inputs['query']
        _run_manager.on_text(query, color="green", end="\n", verbose=self.verbose)
        snippets = self.search_client.get_relevant_snippets(query)
        documents = self.search_client.get_relevant_documents(query) ## new line
        # get_relevant_documents
        _run_manager.on_text(snippets, color="white", end="\n", verbose=self.verbose)
        summary = self.chain.run(snippets)
        return {'summary': summary, 'documents' : documents} ## modified line


    @property
    def _chain_type(self) -> str:
        return "google_enterprise_search_chain"

## Setup and Configurations 

In [5]:
#@title ### You will need to update these values

VERTEX_API_PROJECT = PROJECT_ID = "my-project-0004-346516" #'your-project' #@param {"type": "string"}
VERTEX_API_LOCATION =REGION= 'us-central1' #@param {"type": "string"}

import vertexai
vertexai.init(project=VERTEX_API_PROJECT, location=VERTEX_API_LOCATION)


In [6]:
#@title Initialise the LLM
GCP_PROJECT = VERTEX_API_PROJECT
SEARCH_ENGINE = 'customer-store_1695195086499'

# "customer-app_1695194973554" #@param {type: "string"} test002_1686801972135/servingConfigs/default_search:search
# LLM_MODEL = "text-bison@001" #@param {type: "string"}
LLM_MODEL = "text-bison@latest" #@param {type: "string"}


MAX_OUTPUT_TOKENS = 1024 #@param {type: "integer"}
TEMPERATURE = 0.4 #@param {type: "number"}
TOP_P = 0.8 #@param {type: "number"}
TOP_K = 40 #@param {type: "number"}
VERBOSE = True #@param {type: "boolean"}
llm_params = dict(
    model_name=LLM_MODEL,
    max_output_tokens=MAX_OUTPUT_TOKENS,
    temperature=TEMPERATURE,
    top_p=TOP_P,
    top_k=TOP_K,
    verbose=VERBOSE,
)

llm = VertexAI(**llm_params)

In [7]:
#@title Example - summarize financial results
SEARCH_QUERY = 'Does the policy cover hereditary or genetic conditions?' #@param {type: "string"}

SEARCH_QUERY = 'what type of animals can be insured' #@param {type: "string"}

PROMPT_STRING = "Please parse these search results for the question: {results} , in case of no results say nothing found" #@param {type: "string"}


# '''
# Strictly say that "information not found in the policy document. For specific
# details on this aspect, it is best to refer to the full policy
# documentation or directly consult with the insurance provide" if it is not found in the content above. First search the information in the context above and based on on the exact match try to answer the question. Also add reference location to where this information is found (in '[]' brackets)

# Does the policy cover hereditary or genetic conditions?
# '''

# Combine the LLM with a prompt to make a simple chain
prompt = PromptTemplate(input_variables=['results'],
                        template=PROMPT_STRING)

chain = LLMChain(llm=llm, prompt=prompt, verbose=True)

# Combine this chain with Enterprise Search in a new chain
es_chain = EnterpriseSearchChain(project=GCP_PROJECT,
                                 search_engine=SEARCH_ENGINE,
                                 chain=chain)
print(SEARCH_QUERY)
result = es_chain.run(SEARCH_QUERY)

result.split('\n')

what type of animals can be insured


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mPlease parse these search results for the question: [] , in case of no results say nothing found[0m

[1m> Finished chain.[0m


[' Nothing found']

In [8]:
#@title Example - summarize financial results
SEARCH_QUERY = 'what type of animals can be insured' #@param {type: "string"}
PROMPT_STRING = "Please parse these search results of : {results} " #@param {type: "string"}

# Combine the LLM with a prompt to make a simple chain
prompt = PromptTemplate(input_variables=['results'],
                        template=PROMPT_STRING)

chain = LLMChain(llm=llm, prompt=prompt, verbose=True)

# Combine this chain with Enterprise Search in a new chain
es_chain = EnterpriseSearchChain(project=GCP_PROJECT,
                                 search_engine=SEARCH_ENGINE,
                                 chain=chain)

result = es_chain.run(SEARCH_QUERY)

result.split('\n')



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mPlease parse these search results of : [] [0m

[1m> Finished chain.[0m


[' ```',
 '{',
 '  "took": 1,',
 '  "timed_out": false,',
 '  "_shards": {',
 '    "total": 1,',
 '    "successful": 1,',
 '    "skipped": 0,',
 '    "failed": 0',
 '  },',
 '  "hits": {',
 '    "total": 0,',
 '    "max_score": null,',
 '    "hits": []',
 '  }',
 '}',
 '```']

In [9]:
#@title Example - Answer a question about a search query
SEARCH_QUERY = 'dogs' #@param {type: "string"}
YEAR = ' cats' #@param {type:"string"}

PROMPT_STRING = """
Please parse these search results and summarize them to answer
the following question.
Results:
{results}

Question:'How many types of insurance available in """ + YEAR + """
Answer:
"""

# Combine the LLM with a prompt to make a simple chain
prompt = PromptTemplate(input_variables=['results'],
                        template=PROMPT_STRING)

chain = LLMChain(llm=llm, prompt=prompt, verbose=True)

# Combine this chain with Enterprise Search in a new chain
# This chain simply combines a Langchain LLMChain with our EnterpriseSearchRetriever
es_chain = EnterpriseSearchChain(project=GCP_PROJECT,
                                 search_engine=SEARCH_ENGINE,
                                 chain=chain)

result = es_chain.run(SEARCH_QUERY)

result.split('\n')



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Please parse these search results and summarize them to answer
the following question.
Results:
[]

Question:'How many types of insurance available in  cats
Answer:
[0m

[1m> Finished chain.[0m


[' 0']

In [10]:
#@markdown ## Choose a search engine and define a complex query
COMPLEX_QUERY = 'policy terms, what are they?' #@param {"type": "string"}

# Initialise an Enterprise Search Retriever
retriever = EnterpriseSearchRetriever(GCP_PROJECT, SEARCH_ENGINE)

prompt = PromptTemplate(input_variables=["complex_query"], template="""Extract the most specific search terms from the following query:

Query:
'{complex_query}'

Search Terms:
* """)

#@markdown ## Fetch results from the LLM
chain = LLMChain(llm=llm, prompt=prompt, verbose=True)
terms = chain.run(COMPLEX_QUERY)

#@markdown ```
#@markdown * "revenue"
#@markdown * "budget"
#@markdown * "travel"
#@markdown ```

terms



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mExtract the most specific search terms from the following query:

Query:
'policy terms, what are they?'

Search Terms:
* [0m

[1m> Finished chain.[0m


' policy \n* terms'

In [11]:
#@title Clean up the response
#@markdown ## Select suitable terms and clean up unneccessary characters
lines_to_ignore = 0 #@param {"type": "integer"}
max_terms_to_search = 4 #@param {"type": "integer"}

clean_terms = [re.sub('[^\d\w\s]', '', q).strip() for q in terms.split('\n')[lines_to_ignore:lines_to_ignore + max_terms_to_search]]

#@markdown `['disclosure', 'appraisal', 'consultation', 'draft', 'sep']`
clean_terms

['policy', 'terms']

In [12]:
#@markdown ## Search each term and keep the top `n` results
num_results = 5 #@param {"type": "integer"}

results = []
for q in clean_terms:
  snippets = retriever.get_relevant_snippets(q)
  results.extend([s for s in snippets[:num_results]])

results = list(set(results)) # Deduplicate to keep prompt length down
results

[]

In [13]:
#@markdown ## Combine the search results into an answer using an LLM
# Combine the LLM with a prompt to make a simple chain
prompt = PromptTemplate(input_variables=['query', 'results'],
                        template="""Please summarize the following contextual data to answer the following question. Provide references to the context in your answer:
Question: {query}
Context:
{results}
Answer with citations:""")
chain = LLMChain(llm=llm, prompt=prompt, verbose=True)

summary = chain.run({"query": COMPLEX_QUERY, "results": results})

summary.split('\n')



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mPlease summarize the following contextual data to answer the following question. Provide references to the context in your answer:
Question: policy terms, what are they?
Context:
[]
Answer with citations:[0m

[1m> Finished chain.[0m


[' A policy term is a word or phrase used in an insurance policy that defines the coverage provided. ',
 '']

In [14]:
#@title We are using the same search engine, terms and question as the previous example


## Here is the function being used to convert search results to documents
 ## (already encoded in the `EnterpriseSearchRetriever` class)

# def search_response_to_documents(res) -> List[Document]:
#     """Retrieve langchain Documents from a search response"""
#     documents = []
#     for result in res.results:
#         data = MessageToDict(result.document._pb)
#         metadata = data.copy()
#         del metadata['derivedStructData']
#         del metadata['structData']
#         if data.get('derivedStructData') is None:
#             content = json.dumps(data.get('structData', {}))
#         else:
#             content = json.dumps([d.get('snippet') for d in data.get('derivedStructData', {}).get('snippets', []) if d.get('snippet') is not None])
#         documents.append(Document(page_content=content, metadata=metadata))
#     return documents

#@markdown ## Search for each search term and extract into a langchain `Document` format
#@markdown * This format just contains the snippets as `page_content` and the document title and link as `metadata`

document_responses = []
for t in clean_terms:
    document_responses.append(retriever.get_relevant_documents(t))

# This chain will run one LLM call for every document, so we likely do not want to keep all of the context if the document count is very large
for idx, d in enumerate(document_responses):
  print(f"Search {idx + 1}: {len(d)} documents")

Search 1: 1 documents
Search 2: 1 documents


In [15]:
# There are 65 documents total, so we will just keep the top 3 from each search
final_documents = [d for r in document_responses for d in r[:3]]

len(final_documents)

2

In [16]:
#@title Load and run the chain using the documents
#@markdown **Reminder**: the question is 'What are the voice and data packages available for roaming in Europe?'

#@markdown Here we are printing the intermediate steps so you can see the end to end process

chain = load_qa_chain(llm, chain_type="refine", return_refine_steps=True)

chain({"input_documents": final_documents, "question": COMPLEX_QUERY}, return_only_outputs=True)

{'intermediate_steps': [' policy terms are the conditions and rules that govern an insurance policy',
  ' Policy terms are the conditions and rules that govern an insurance policy. \n\nInsurance companies use policy terms to outline the coverage, exclusions, and limitations of their policies. Policy terms can vary significantly from one insurance company to another, so it is important to read and understand the policy terms before purchasing an insurance policy.'],
 'output_text': ' Policy terms are the conditions and rules that govern an insurance policy. \n\nInsurance companies use policy terms to outline the coverage, exclusions, and limitations of their policies. Policy terms can vary significantly from one insurance company to another, so it is important to read and understand the policy terms before purchasing an insurance policy.'}

### Step 1 - Create tools
In the **Imports and classes** section at the beginning of this notebook we defined a custom `EnterpriseSearchRetriever` class. This class exposes methods to retrieve search snippets and Documents. We pass these methods as 'tools' for the ReAct agent to use. Currently we have:

* `get_relevant_snippets` and `get_relevant_documents`:
  * Return a `List[str]` or `List[Document]` respectively of search results for a given search query

We also define a function later to use an LLM to split a complex query into multiple search terms, as we did in the previous pattern
* `extract_search_terms`:
  * Return a `List[str]` of search keywords for a given complex query
  * This uses the same process defined above to ask an LLM to split a complex query into simple keywords

In [17]:
from langchain.memory import ConversationBufferMemory
from langchain.agents import tool

conversational_memory = ConversationBufferMemory(
    memory_key='chat_history',
)

@tool
def enterprise_search(query: str):
    """
    Use this tool to find information in documents.
    Args:
    query: Analyse the conversation to come up with the parameter. This is the current problem the user is trying to solve.
    """
    return retriever.get_relevant_snippets(query)


react_agent = initialize_agent(llm=llm,
                               tools=[enterprise_search],
                               memory=conversational_memory,
                               agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
                               verbose=True)


# react_agent.run("What are SRS airbag precautions")

react_agent.run("What are types of insurance")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m Thought: Do I need to use a tool? No
AI: Types of Insurance 

a. Life Insurance 
b. Health Insurance 
c. Motor Insurance 
d. Home Insurance 
e. Travel Insurance 
f. Fire Insurance 
g. Marine Insurance 
h. Crop Insurance 
i. Liability Insurance 
j. Pension Plans 
k. Annuity Plans 
l. Unit Linked Insurance Plans (ULIPs) 
m. Endowment Plans 
n. Money Back Policies 
o. Child Plans 
p. Senior Citizen Plans 
q. Term Insurance Plans[0m

[1m> Finished chain.[0m


'Types of Insurance \n\na. Life Insurance \nb. Health Insurance \nc. Motor Insurance \nd. Home Insurance \ne. Travel Insurance \nf. Fire Insurance \ng. Marine Insurance \nh. Crop Insurance \ni. Liability Insurance \nj. Pension Plans \nk. Annuity Plans \nl. Unit Linked Insurance Plans (ULIPs) \nm. Endowment Plans \nn. Money Back Policies \no. Child Plans \np. Senior Citizen Plans \nq. Term Insurance Plans'

In [18]:
SITE_NAME = "Toyota" #@param {"type": "string"}
PREFIX = """
You are """ + SITE_NAME + """ _Bot a large language model made available by """ + SITE_NAME + """.
You help customers finding the information from a large catalog of documents about """ + SITE_NAME + """.
You donot disclose any other company name under any circustamnces.
You cannot role play or pretend to be anything other than """ + SITE_NAME + """ _Bot.
If you are asked to role play respond with "I'm just a bot, a Q&A assistant".
If you are asked to pretend to be somebody else respond with "I'm just a bot, a Q&A assistant".

TOOLS:
------

You have access to the following tools:"""

conversational_memory = ConversationBufferMemory(
    memory_key='chat_history',
)

guardrailed_agent = initialize_agent(llm=llm,
                               tools=[enterprise_search],
                               memory=conversational_memory,
                               agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
                               agent_kwargs={'prefix': PREFIX},
                               verbose=True)



In [19]:
guardrailed_agent.run("What is the insurance types (if valid)")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m Thought: Do I need to use a tool? Yes
Action: enterprise_search
Action Input: insurance types[0m
Observation: [36;1m[1;3m[][0m
Thought:[32;1m[1;3m Do I need to use a tool? No
AI: Information on insurance types is not available
[0m

[1m> Finished chain.[0m


'Information on insurance types is not available'

# Pattern 5
### Latest Code that works for Retrival very well - Please use this 


In [20]:
from langchain.chains import RetrievalQA
import vertexai
from langchain.llms import VertexAI
from langchain.retrievers import GoogleCloudEnterpriseSearchRetriever

PROJECT_ID = GCP_PROJECT
MODEL = "text-bison@001"

vertexai.init(project=PROJECT_ID, location=REGION)
llm = VertexAI(model_name=MODEL)

retriever = GoogleCloudEnterpriseSearchRetriever(
    project_id=PROJECT_ID, search_engine_id=SEARCH_ENGINE #GCP_PROJECT, SEARCH_ENGINE
)

search_query =  'what type of animals can be insured' #@param {type: "string"}


retrieval_qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever
)
# retrieval_qa.run(search_query)


###
es = EnterpriseSearchRetriever(PROJECT_ID, SEARCH_ENGINE)


SEARCH_QUERY = 'Does the policy cover hereditary or genetic conditions?' #@param {type: "string"}

# SEARCH_QUERY = 'what type of animals can be insured' #@param {type: "string"}


res = es.get_relevant_snippets(SEARCH_QUERY)
res


[]

#### Step 1 -- How to use properly

In [21]:
from typing import List

from google.cloud import discoveryengine

# TODO(developer): Uncomment these variables before running the sample.
project_id = PROJECT_ID
location = 'global'
search_engine_id = SEARCH_ENGINE
serving_config_id = "default_config"          # Values: "default_config"
# search_query = SEARCH_QUERY


def search_sample(
    project_id: str,
    location: str,
    search_engine_id: str,
    serving_config_id: str,
    search_query: str,
) -> List[discoveryengine.SearchResponse.SearchResult]:
    # Create a client
    client = discoveryengine.SearchServiceClient()

    # The full resource name of the search engine serving config
    # e.g. projects/{project_id}/locations/{location}
    serving_config = client.serving_config_path(
        project=project_id,
        location=location,
        data_store=search_engine_id,
        serving_config=serving_config_id,
        # content_search_spec = {"extractive_content_spec": {"max_extractive_answer_count": 1}}
        #{"query":"what is google cloud","content_search_spec": {"extractive_content_spec": {"max_extractive_answer_count": 1}}}

    )

    request = discoveryengine.SearchRequest(
        serving_config=serving_config, query=search_query, page_size=10
    )
    response = client.search(request)
    for result in response.results:
        print(result)

    return response.results



In [22]:
# search_query = "What is the Combination meter layout"

# search_query = "how to change  temperature display in prius"

search_query = 'what type of animals can be insured' #@param {type: "string"}

# projects/255766800726/locations/us-central1/collections/default_collection/dataStores/grab02-sm_1690440104277
# projects/255766800726/locations/global/collections/default_collection/dataStores/grab02-sm_1690440104277/branches/0/documents/581eedaab781dc7ead70f28af4491766

In [23]:
search_output = search_sample(PROJECT_ID,location,SEARCH_ENGINE,serving_config_id,search_query)

for document in search_output:
    # print(document)
    data = MessageToDict(document._pb)
    # print(data)
    data1 = data.get('document', {})
    # print(data1)
    data2 = data1.get('derivedStructData', {})
    data3 = data2.get('extractive_answers', {})
    data4 = data3[0]
    query_result = data4.get('content', {})
    
    print("Final Answer", query_result )

id: "6808bc60247fbff372dc2d9745e5944a"
document {
  name: "projects/255766800726/locations/global/collections/default_collection/dataStores/igloo-store_1695195086499/branches/0/documents/6808bc60247fbff372dc2d9745e5944a"
  id: "6808bc60247fbff372dc2d9745e5944a"
  derived_struct_data {
    fields {
      key: "link"
      value {
        string_value: "gs://my-project-0004-bucket/matching_engine/Igloo_policy.txt"
      }
    }
    fields {
      key: "extractive_answers"
      value {
        list_value {
          values {
            struct_value {
              fields {
                key: "content"
                value {
                  string_value: "3. The object of insurance is a pet that is kept by the insured as a daily companion that must be cared for and fulfilled for their needs and have a proper place. 4. Pets in this policy are: 4.1. Dogs that are registered and have a Stambum 4.2. Cats that are registered and have a certificate from ICA 4.3."
                }
       

In [24]:
# search_output = search_sample(PROJECT_ID,location,SEARCH_ENGINE,serving_config_id,search_query)

for document in search_output:
    # print(document)
    data = MessageToDict(document._pb)
    # print(data)
    data1 = data.get('document', {})
    # print(data1)
    data2 = data1.get('derivedStructData', {})
    data3 = data2.get('extractive_answers', {})
    data4 = data3[0]
    pagenumber_result = data4.get('pageNumber', {})
    
    print("Final Answer pagenumber", pagenumber_result )

Final Answer pagenumber {}


In [25]:
print("Final Answer", query_result )

Final Answer 3. The object of insurance is a pet that is kept by the insured as a daily companion that must be cared for and fulfilled for their needs and have a proper place. 4. Pets in this policy are: 4.1. Dogs that are registered and have a Stambum 4.2. Cats that are registered and have a certificate from ICA 4.3.


# Pattern 6

### Less Instructive Answerrs and simple human readable summary

In [26]:
overall_prompt = "here is some information from database : " + query_result + " and here is the question user asked -- : " + search_query + " give a summary "
print(overall_prompt)

react_agent.run(overall_prompt)

here is some information from database : 3. The object of insurance is a pet that is kept by the insured as a daily companion that must be cared for and fulfilled for their needs and have a proper place. 4. Pets in this policy are: 4.1. Dogs that are registered and have a Stambum 4.2. Cats that are registered and have a certificate from ICA 4.3. and here is the question user asked -- : what type of animals can be insured give a summary 


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m Thought: Do I need to use a tool? No
AI: Only dogs and cats can be insured. [0m

[1m> Finished chain.[0m


'Only dogs and cats can be insured.'

### More Instructive Answers


In [27]:
# overall_prompt = "here is some information from database : " + query_result + " and here is the question user asked -- : " + search_query + " give a summary based on the inforamtion from database but keep as detailed as possible "
# print(overall_prompt)

# react_agent.run(overall_prompt)

### Very high level summary

In [28]:
query_result = query_result #@param {"type": "string"}
PREFIX = """
You are helpful agent able to answer any question only
based on the information from information here """ + query_result 

conversational_memory = ConversationBufferMemory(
    memory_key='chat_history',
)

guardrailed_agent = initialize_agent(llm=llm,
                               tools=[enterprise_search],
                               memory=conversational_memory,
                               agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
                               agent_kwargs={'prefix': PREFIX},
                               verbose=True)

guardrailed_agent.run(search_query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? No
AI: The object of insurance is a pet that is kept by the insured as a daily companion that must be cared for and fulfilled for their needs and have a proper place. Pets in this policy are: 4.1. Dogs that are registered and have a Stambum 4.2. Cats that are registered and have a certificate from ICA 4.3.[0m

[1m> Finished chain.[0m


'The object of insurance is a pet that is kept by the insured as a daily companion that must be cared for and fulfilled for their needs and have a proper place. Pets in this policy are: 4.1. Dogs that are registered and have a Stambum 4.2. Cats that are registered and have a certificate from ICA 4.3.'

In [29]:
# query_result = "What is the Combination meter , maintenance teams says it needs to be replaced "

overall_prompt = "here is some information from database : " + query_result + " and here is the question user asked -- : " + search_query + " give a summary based on the inforamtion from database but keep as detailed as possible "
print(overall_prompt)

react_agent.run(overall_prompt)

here is some information from database : 3. The object of insurance is a pet that is kept by the insured as a daily companion that must be cared for and fulfilled for their needs and have a proper place. 4. Pets in this policy are: 4.1. Dogs that are registered and have a Stambum 4.2. Cats that are registered and have a certificate from ICA 4.3. and here is the question user asked -- : what type of animals can be insured give a summary based on the inforamtion from database but keep as detailed as possible 


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m Thought: Do I need to use a tool? No
AI: Only dogs and cats can be insured, and they must be registered and have the appropriate documentation. [0m

[1m> Finished chain.[0m


'Only dogs and cats can be insured, and they must be registered and have the appropriate documentation.'

# Extra Function-- house keeping
## Data Sore of ES - Querying direct 
!pip install google-cloud-discoveryengine --quiet

In [30]:
# !pip install google-cloud-discoveryengine --quiet

## Check Data Store Index Status
<!-- Using the list_documents method, we can do a check to see if the data store has finished indexing. -->

In [31]:
import time
from google.cloud import discoveryengine_v1
from google.api_core import operations_v1, grpc_helpers
from google.longrunning import operations_pb2
from typing import List, Optional

def list_documents(
    project_id: str, location: str, datastore_id: str, rate_limit: int = 1):
  """Gets a list of docs in a datastore."""
  client = discoveryengine_v1.DocumentServiceClient()

  request = discoveryengine_v1.ListDocumentsRequest(
      parent=f'projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{datastore_id}/branches/0',
      page_size=1000
  )

  res = client.list_documents(request=request)

  # setup the list with the first batch of docs
  docs = res.documents

  while res.next_page_token:
    # implement a rate_limit to prevent quota exhaustion
    time.sleep(rate_limit)

    request = discoveryengine_v1.ListDocumentsRequest(
      parent=f'projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{datastore_id}/branches/0',
      page_size=1000,
      page_token=res.next_page_token
    )

    res = client.list_documents(request=request)
    docs.extend(res.documents)

  return docs

def list_indexed_urls(
    project_id: str,
    location: str,
    datastore_id: str,
    docs: Optional[List[discoveryengine_v1.Document]] = None):
  """Get the list of docs in datastore, then parse to only urls."""
  if not docs:
    docs = list_documents(project_id, location, datastore_id)
  urls = [doc.content.uri for doc in docs]

  return urls

def search_url(urls: List[str], url: str):
  """Searches a url in a list of urls."""
  for item in urls:
    if url in item:
      print(item)

def search_doc_id(
    doc_id: str, docs: Optional[List[discoveryengine_v1.Document]] = None):
  """Searches a doc_id in a list of docs."""
  if not docs:
    docs = list_documents(project_id, location, datastore_id)

  doc_found = False
  for doc in docs:
    if doc.parent_document_id == document_id:
      doc_found = True
      print(doc)

  if not doc_found:
    print(f"Document not found for provided Doc ID: `{doc_id}`")


def get_operations_status(operation_id: str):
  """Get the status of an import operation for Discovery Engine."""
  host = "discoveryengine.googleapis.com"
  channel = grpc_helpers.create_channel(host)
  client = operations_v1.OperationsClient(channel)

  response = client.get_operation(operation_id)

  return response

PENDING_MESSAGE = """
No docs found.\n\nIt\'s likely one of two issues: \n  [1] Your data store is not finished indexing. \n  [2] Your data store failed indexing.\n
If you just added your data store, it can take up to 4 hours before it will become available.
"""

In [32]:
datastore_id = SEARCH_ENGINE

docs = list_documents(project_id, location, datastore_id)

if len(docs) == 0:
  print(PENDING_MESSAGE)
else:
  SUCCESS_MESSAGE = f"""
  Success! 🎉\n
  Your indexing is complete.\n
  Your index contains {len(docs)} documents.
  """
  print(SUCCESS_MESSAGE)


  Success! 🎉

  Your indexing is complete.

  Your index contains 1 documents.
  


In [33]:
from google.cloud import discoveryengine_v1
from typing import List

def list_documents(project_id: str, location: str, datastore_id: str):
  """Gets a list of docs in a datastore."""
  client = discoveryengine_v1.DocumentServiceClient()

  request = discoveryengine_v1.ListDocumentsRequest(
      parent=f'projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{datastore_id}/branches/0'
  )

  res = client.list_documents(request=request)

  return list(res)

def list_indexed_urls(project_id: str, location: str, datastore_id: str):
  """Get the list of docs in datastore, then parse to only urls."""
  docs = list_documents(project_id, location, datastore_id)
  urls = [doc.content.uri for doc in docs]

  return urls

def search_url(urls: List[str], url: str):
  """Searches a url in a list of urls."""
  for item in urls:
    if url in item:
      print(item)

In [34]:
# project_id = 'pmarlow-ccai-dev'
# location = 'global'
# datastore_id = 'grab02-sm_1690440104277'

# datastore_id = "ntu-vide-transcript-combo-_1692590014861"

project_id = project="my-project-0004-346516" 
location=  'global' #"us-central1"


## List Documents (in Data Store)

In [35]:
docs = list_documents(project_id, location, datastore_id)
docs[0]

struct_data {
}
name: "projects/255766800726/locations/global/collections/default_collection/dataStores/igloo-store_1695195086499/branches/0/documents/6808bc60247fbff372dc2d9745e5944a"
id: "6808bc60247fbff372dc2d9745e5944a"
schema_id: "default_schema"
content {
  uri: "gs://my-project-0004-bucket/matching_engine/Igloo_policy.txt"
  mime_type: "text/plain"
}
parent_document_id: "6808bc60247fbff372dc2d9745e5944a"

## List Indexed URLs

In [36]:
urls = list_indexed_urls(project_id, location, datastore_id)
urls[0]

'gs://my-project-0004-bucket/matching_engine/Igloo_policy.txt'

## Search Indexed URLs

In [37]:
search_url(urls, 'https://www.yeti.com/bags/luggage')

In [38]:
search_url(urls, 'toyota')