In [1]:
# Install the Discovery Engine client library used below (`from google.cloud import discoveryengine`).
%pip install google.cloud.discoveryengine



In [2]:
import vertexai
from vertexai.generative_models import GenerativeModel, Tool
from vertexai.preview import generative_models as preview_generative_models
from google.cloud import discoveryengine
import time
import pandas as pd
from io import StringIO

In [3]:
# Single source of truth for the Google Cloud project; defined first so every
# call below uses the same value instead of repeating the literal.
PROJECT_ID = "clear-ranger-448523-t4"

# Initialize Vertex AI
vertexai.init(project=PROJECT_ID, location="us-central1")

# Initialize Discovery Engine client
client_discoveryengine = discoveryengine.DataStoreServiceClient()

# Data store IDs (not full resource paths; the full paths are derived from these IDs)
safety_datastore = "safety-manuals-vector-store_1737741618139"
gdocs_datastore = "gdocs_1738027566165"

In [4]:
# Build the full resource path of each data store: the collection path
# followed by "/dataStores/<data store id>".
path_safety, path_gdocs = (
    client_discoveryengine.collection_path(
        project=PROJECT_ID,
        location="us",
        collection="default_collection",
    ) + f"/dataStores/{store_id}"
    for store_id in (safety_datastore, gdocs_datastore)
)

In [5]:
# One Vertex AI Search retrieval tool per data store (safety manuals, gdocs).
safety_search_tool, gdocs_search_tool = (
    Tool.from_retrieval(
        retrieval=preview_generative_models.grounding.Retrieval(
            source=preview_generative_models.grounding.VertexAISearch(
                datastore=store_path
            )
        )
    )
    for store_path in (path_safety, path_gdocs)
)

In [6]:
# Two Gemini models with identical settings; each one is grounded on exactly
# one data store through its own search tool.

# Grounded on the safety-manual data store only.
safety_model = GenerativeModel(
    model_name="gemini-1.5-pro",
    tools=[safety_search_tool],
    generation_config={"temperature": 0.2},
)

# Grounded on the gdocs data store only.
gdocs_model = GenerativeModel(
    model_name="gemini-1.5-pro",
    tools=[gdocs_search_tool],
    generation_config={"temperature": 0.2},
)

In [7]:
# Safety topics to search for.
safety_categories = ["fall", "electric"]

# Few-shot example sentences per topic, keyed "text_1", "text_2", ... in the
# order listed below.
multi_shot_examples = {
    topic: {
        f"text_{position}": sentence
        for position, sentence in enumerate(sentences, start=1)
    }
    for topic, sentences in (
        (
            "electric",
            (
                "All work involving 600 V power sources or greater will be completed by an **electrical** contractor",
                "All cord sets and cord-plug **electrical** equipment, tools or appliances containing 120 volts or greater will be connected to a ground fault circuit interrupter (GFCI)",
                "Personnel working near live **electrical** circuits above 480 V must wear insulated gloves and use protective barriers.",
            ),
        ),
        (
            "fall",
            (
                "Section 4.1.2 details fall protection: Workers over 6 feet must use harnesses",
                "Fall protection shall be provided whenever employees can fall four (4) feet or more.",
                "All Authority employees must follow the fall protection standard for work 4’ or more above a lower level.",
            ),
        ),
    )
}

In [None]:
def retrieve_documents_by_category_safety_manual():
    """Ask the safety-manual grounded model for a markdown table per category.

    For each entry in ``safety_categories`` the function:

    1. opens a chat on ``safety_model`` and sends a few-shot prompt built from
       ``multi_shot_examples``;
    2. asks a fresh chat on the same model to verify and correct that answer;
    3. sends the verified findings back to the first chat together with the
       required table layout.

    Returns:
        dict[str, str]: category name -> text of the model's table reply.

    Raises:
        IndexError: if a reply has no candidates or no content parts.

    NOTE: this mirrors ``retrieve_documents_by_category_google``; only the
    model differs.
    """

    def _first_text(response):
        # Public accessor for the text of the first part of the first candidate.
        return response.candidates[0].content.parts[0].text

    category_documents = {}

    for category in safety_categories:
        chat = safety_model.start_chat()

        # Initial prompt: role/instruction, a blank line, then every example
        # available for this category.
        example_block = "".join(
            f"Example {key}:\n{example}\n\n"
            for key, example in multi_shot_examples.get(category, {}).items()
        )
        initial_prompt = (
            f"""You are a **{category} safety regulations expert**. Use the following examples as a guide to retrieve document names which mention **{category}** and the sentence where it is mentioned in a table format."""
            + "\n\n"
            + example_block
        )

        response_text_step1 = _first_text(chat.send_message(initial_prompt))

        # Verify findings with a new chat and request modifications
        verification_chat = safety_model.start_chat()
        verification_prompt = f"""Verify and modify the extracted information for **{category}** to ensure accuracy. Instead of an inference, provide the corrected list of document names and exact sentences where **{category}** is mentioned."""

        response_text_verification = _first_text(
            verification_chat.send_message(verification_prompt + "\n\n" + response_text_step1)
        )

        # Follow-up prompt for table format
        follow_up = f""" **STRICT RULES:**\n
                      Give the response in the below table format \n
                      ** Table Format:**\n
                      | Document Title | {category} Metric | Reference Sentence |\n
                      |---------------|------------------|--------------------|\n
                      | document1     | X (feet/Voltage) | Sentence here     |\n
                      | document2     | X (feet/Voltage) | Sentence here     |\n
                    """
        # Feed the verified findings into the formatting request; previously the
        # verification reply was computed and then discarded.
        response_text_step2 = _first_text(
            chat.send_message(
                follow_up
                + "\n\nUse these verified findings as the table contents:\n\n"
                + response_text_verification
            )
        )

        # Store the results in dictionary
        category_documents[category] = response_text_step2

    return category_documents

In [None]:
def retrieve_documents_by_category_google():
    """Ask the gdocs grounded model for a markdown table per category.

    For each entry in ``safety_categories`` the function:

    1. opens a chat on ``gdocs_model`` and sends a few-shot prompt built from
       ``multi_shot_examples``;
    2. asks a fresh chat on the same model to verify and correct that answer;
    3. sends the verified findings back to the first chat together with the
       required table layout.

    Returns:
        dict[str, str]: category name -> text of the model's table reply.

    Raises:
        IndexError: if a reply has no candidates or no content parts.

    NOTE: this mirrors ``retrieve_documents_by_category_safety_manual``; only
    the model differs.
    """

    def _first_text(response):
        # Public accessor for the text of the first part of the first candidate.
        return response.candidates[0].content.parts[0].text

    category_documents = {}

    for category in safety_categories:
        chat = gdocs_model.start_chat()

        # Initial prompt: role/instruction, a blank line, then every example
        # available for this category.
        example_block = "".join(
            f"Example {key}:\n{example}\n\n"
            for key, example in multi_shot_examples.get(category, {}).items()
        )
        initial_prompt = (
            f"""You are a **{category} safety regulations expert**. Use the following examples as a guide to retrieve document names which mention **{category}** and the sentence where it is mentioned in a table format."""
            + "\n\n"
            + example_block
        )

        response_text_step1 = _first_text(chat.send_message(initial_prompt))

        # Verify findings with a new chat and request modifications
        verification_chat = gdocs_model.start_chat()
        verification_prompt = f"""Verify and modify the extracted information for **{category}** to ensure accuracy. Instead of an inference, provide the corrected list of document names and exact sentences where **{category}** is mentioned."""

        response_text_verification = _first_text(
            verification_chat.send_message(verification_prompt + "\n\n" + response_text_step1)
        )

        # Follow-up prompt for table format
        follow_up = f""" **STRICT RULES:**\n
                      Give the response in the below table format \n
                      ** Table Format:**\n
                      | Document Title | {category} Metric | Reference Sentence |\n
                      |---------------|------------------|--------------------|\n
                      | document1     | X (feet/Voltage) | Sentence here     |\n
                      | document2     | X (feet/Voltage) | Sentence here     |\n
                    """
        # Feed the verified findings into the formatting request; previously the
        # verification reply was computed and then discarded.
        response_text_step2 = _first_text(
            chat.send_message(
                follow_up
                + "\n\nUse these verified findings as the table contents:\n\n"
                + response_text_verification
            )
        )

        # Store the results in dictionary
        category_documents[category] = response_text_step2

    return category_documents

In [15]:
def process_markdown_tables(documents, dict_name):
    """Parse per-category markdown tables into tagged pandas DataFrames.

    Args:
        documents: Mapping of category name -> text containing a pipe-delimited
            markdown table. Lines that are not table rows (preamble, code
            fences, closing remarks) are ignored, as is the dashed delimiter
            row wherever it appears.
        dict_name: Label of the result set being processed.
            ``'category_documents_google'`` tags rows with ``'gdoc'``,
            ``'category_documents_safety'`` with ``'safety_manual'``; anything
            else is tagged ``'unknown'``.

    Returns:
        dict[str, pandas.DataFrame]: one DataFrame per category with stripped
        column names and cell text, the category's metric column renamed to
        ``"<Category> Metric"`` (e.g. ``"Fall Metric"``), and a
        ``vector_store`` column holding the source tag. A category whose text
        contains no table rows yields an empty DataFrame that still has the
        ``vector_store`` column.
    """
    source_labels = {
        'category_documents_google': 'gdoc',
        'category_documents_safety': 'safety_manual',
    }
    vector_value = source_labels.get(dict_name, 'unknown')

    def _is_separator_row(line):
        # A markdown delimiter row holds only dashes, colons, pipes and spaces.
        return '-' in line and set(line) <= set('-:| ')

    def _strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    dfs = {}

    for category, markdown in documents.items():
        # Keep only real table rows: the reply may wrap the table in extra text,
        # so neither the header nor the delimiter row has a fixed position.
        table_lines = [
            stripped
            for stripped in (line.strip() for line in (markdown or "").splitlines())
            if stripped.startswith('|') and not _is_separator_row(stripped)
        ]

        if table_lines:
            # Use StringIO to simulate a file for pandas.
            df = pd.read_csv(StringIO("\n".join(table_lines)), sep='|', engine='python')

            # The leading/trailing pipes produce empty "Unnamed" columns; drop
            # them, then tidy the remaining column names and cell values.
            df = df.loc[:, ~df.columns.str.contains('Unnamed')]
            df.columns = [col.strip() for col in df.columns]
            df = df.apply(lambda col: col.map(_strip_if_str))

            # Standardize the metric column name, e.g. "fall Metric" -> "Fall Metric".
            category_key = category.lower()
            metric_name = f"{category_key.capitalize()} Metric"
            df = df.rename(columns={
                col: metric_name
                for col in df.columns
                if category_key in col.lower() and "metric" in col.lower()
            })
        else:
            # Nothing to parse (empty reply or no table in it).
            df = pd.DataFrame()

        # Add a new column 'vector_store' with the appropriate value.
        df["vector_store"] = vector_value

        # Save the dataframe to our dictionary using the category as the key.
        dfs[category] = df

    return dfs

def combine_category_dataframes(list_of_dfs):
    """Merge several ``{category: DataFrame}`` mappings into one.

    Frames that share a category are stacked in the order their mappings
    appear in ``list_of_dfs``, and the result gets a fresh 0..n-1 index.

    Args:
        list_of_dfs: Iterable of dicts mapping category name -> DataFrame.

    Returns:
        dict: category name -> concatenated DataFrame, with categories in
        first-seen order.
    """
    frames_by_category = {}

    # Group the frames per category.
    for per_source in list_of_dfs:
        for name, frame in per_source.items():
            frames_by_category.setdefault(name, []).append(frame)

    # Stack each group into a single frame.
    return {
        name: pd.concat(frames, ignore_index=True)
        for name, frames in frames_by_category.items()
    }

In [16]:
# Query each grounded model; each call returns a {category: reply text} dict.
category_documents_safety = retrieve_documents_by_category_safety_manual()
category_documents_google = retrieve_documents_by_category_google()

# Parse the replies into per-category DataFrames tagged with their source.
dfs_google = process_markdown_tables(
    documents=category_documents_google,
    dict_name='category_documents_google',
)
dfs_safety = process_markdown_tables(
    documents=category_documents_safety,
    dict_name='category_documents_safety',
)

# Stack both sources per category (gdocs rows first, then safety-manual rows).
combined_dfs = combine_category_dataframes(list_of_dfs=[dfs_google, dfs_safety])

In [19]:
# Combined fall findings from both data stores.
fall = combined_dfs["fall"]
fall

Unnamed: 0,Document Title,Fall Metric,Reference Sentence,vector_store
0,"DOE-0346, Rev. 2-6 Hanford Site Fall Protectio...",4 feet,"For General Industry activities, fall protecti...",gdoc
1,"DOE-0346, Rev. 2-6 Hanford Site Fall Protectio...",6 feet,"For Construction activities, including demolit...",gdoc
2,"DOE-0346, Rev. 2-6 Hanford Site Fall Protectio...",24 feet,Fall Protection is required when using fixed o...,gdoc
3,"DOE-0346, Rev. 2-6 Hanford Site Fall Protectio...",6 feet,Any fall exposure of 6 feet or greater shall b...,gdoc
4,"DOE-0346, Rev. 2-6 Hanford Site Fall Protectio...",2 feet,Positioning devices: Shall be rigged so an emp...,gdoc
5,"DOE-0346, Rev. 2-6 Hanford Site Fall Protectio...",6 feet,"While using a positioning device, when the emp...",gdoc
6,"DOE-0346, Rev. 2-6 Hanford Site Fall Protectio...",6 feet,Leading Edge Work Employees who are constructi...,gdoc
7,"DOE-0360, Rev. 1 Hanford Site Confined Space P...",,Provisions for fall protection and retrieval s...,gdoc
8,"DOE-0344, Rev. 4-7 Hanford Site Excavating, Tr...",,Ensure fall protection is provided as required...,gdoc
9,Health_and_Safety_Manual,4 feet,All Authority employees must follow the fall p...,safety_manual


In [21]:
# Combined electric findings from both data stores.
electric = combined_dfs["electric"]
electric

Unnamed: 0,Document Title,Electric Metric,Reference Sentence,vector_store
0,"DOE-0336, Rev. 2B Hanford Site Lockout/Tagout ...",50 volts,Live parts operating at 50 volts or more to wh...,gdoc
1,"DOE-0359, Rev. 5 Hanford Site Electrical Safet...",50 volts,"NOTE: Class 2 power supplies, listed low volta...",gdoc
2,Messiah_university_safety_manual.pdf,600 V,All work involving **600 V** power sources or ...,safety_manual
3,HITT-Health-and-Safety-Manual_External.pdf,120 volts,All cord sets and cord-plug electrical equipme...,safety_manual
4,EV_construction_Safety-Manual_All_Sections_202...,50 volts,All live parts of operating electrical equipme...,safety_manual
5,Idaho Employee Safety Manual.pdf,240 volts,If live parts are not placed in an electricall...,safety_manual
