# Search product location in a store using structured data
Author: lessismore@ <br>
Date: 04/04/2024

In [1]:
#import libraries
import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import discoveryengine_v1beta as discoveryengine

In [2]:
#set project parameters
# `!cmd` captures the command's output as a list of lines; the first line is
# taken as the project ID.
PROJECT = !gcloud config get-value project
PROJECT_ID = PROJECT[0]
# Location passed to search_data_store; "global" makes it use the default API endpoint.
LOCATION = "global" 
# ID of the Vertex AI Search data store that the search cells query.
DATA_STORE_ID = "bq-product-table_1712144412828"  

## Method-1: Search with Vertex AI Search

* The data is imported into a Vertex AI Search data store via the UI: <br>
https://cloud.google.com/generative-ai-app-builder/docs/prepare-data#bigquery-structured
* See the GitHub repo for other examples with Vertex AI Search: <br>
https://github.com/GoogleCloudPlatform/generative-ai/tree/main/search

Function to search the Vertex AI Search data store

In [38]:
def search_data_store(
    project_id: str,
    location: str,
    data_store_id: str,
    search_query: str,
) -> discoveryengine.SearchResponse:
    """Send one search request to a Vertex AI Search data store.

    The request targets the data store's ``default_config`` serving config,
    asks for a page of 5 results, and enables snippets, extractive content
    and a summary with citations. Query expansion and spell correction are
    both set to ``AUTO``.

    Args:
        project_id: Project used to build the serving config resource name.
        location: Data store location. For any value other than ``"global"``
            the client is pointed at ``{location}-discoveryengine.googleapis.com``.
        data_store_id: ID of the data store to search.
        search_query: Free-text query to run.

    Returns:
        The object returned by ``SearchServiceClient.search`` for this request.
    """
    # Short aliases for the deeply nested request types.
    SearchRequest = discoveryengine.SearchRequest
    ContentSearchSpec = SearchRequest.ContentSearchSpec

    # Only non-global locations need an explicit endpoint. See:
    # https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store
    if location == "global":
        client_options = None
    else:
        client_options = ClientOptions(
            api_endpoint=f"{location}-discoveryengine.googleapis.com"
        )
    search_client = discoveryengine.SearchServiceClient(client_options=client_options)

    # Full resource name of the serving config, e.g.
    # projects/{project_id}/locations/{location}/dataStores/{data_store_id}/servingConfigs/{serving_config_id}
    serving_config = search_client.serving_config_path(
        project=project_id,
        location=location,
        data_store=data_store_id,
        serving_config="default_config",
    )

    # Snippets: https://cloud.google.com/generative-ai-app-builder/docs/snippets
    snippet_spec = ContentSearchSpec.SnippetSpec(return_snippet=True)

    extractive_content_spec = ContentSearchSpec.ExtractiveContentSpec(
        max_extractive_answer_count=5,
        max_extractive_segment_count=1,
    )

    # Search summaries: https://cloud.google.com/generative-ai-app-builder/docs/get-search-summaries
    summary_spec = ContentSearchSpec.SummarySpec(
        summary_result_count=5,
        include_citations=True,
        ignore_adversarial_query=False,
        ignore_non_summary_seeking_query=False,
    )

    # All supported `ContentSearchSpec` fields:
    # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest.ContentSearchSpec
    content_search_spec = ContentSearchSpec(
        snippet_spec=snippet_spec,
        extractive_content_spec=extractive_content_spec,
        summary_spec=summary_spec,
    )

    query_expansion_spec = SearchRequest.QueryExpansionSpec(
        condition=SearchRequest.QueryExpansionSpec.Condition.AUTO,
    )
    spell_correction_spec = SearchRequest.SpellCorrectionSpec(
        mode=SearchRequest.SpellCorrectionSpec.Mode.AUTO
    )

    # All supported `SearchRequest` fields:
    # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest
    search_request = SearchRequest(
        serving_config=serving_config,
        query=search_query,
        page_size=5,
        content_search_spec=content_search_spec,
        query_expansion_spec=query_expansion_spec,
        spell_correction_spec=spell_correction_spec,
    )

    return search_client.search(search_request)

In [39]:
# English question about a single product.
response = search_data_store(
    project_id=PROJECT_ID,
    location=LOCATION,
    data_store_id=DATA_STORE_ID,
    search_query="Where are the bananas?",
)

response.summary.summary_text

'You can find bananas at A09-S12. [1] They are fresh and organic, and are a good source of potassium. [1]'

In [40]:
# Question that matches several products.
response = search_data_store(
    project_id=PROJECT_ID,
    location=LOCATION,
    data_store_id=DATA_STORE_ID,
    search_query="Where can I find peanut butter?",
)

response.summary.summary_text

'You can find Peter Pan Creamy Peanut Butter on A09-S2. [1] You can also find Skippy Creamy Peanut Butter on A09-S3. [2] Jif Creamy Peanut Butter is on A09-S4. [3]'

In [41]:
# The same banana question asked in Spanish.
response = search_data_store(
    project_id=PROJECT_ID,
    location=LOCATION,
    data_store_id=DATA_STORE_ID,
    search_query="Donde puedo encontrar las bananas?",
)

response.summary.summary_text

'Puedes encontrar bananas en la sección A09-S12. [1] Las bananas Dole son frescas y orgánicas, y son una buena fuente de potasio. [1]'

In [48]:
# Question about a product category rather than a specific product.
response = search_data_store(
    project_id=PROJECT_ID,
    location=LOCATION,
    data_store_id=DATA_STORE_ID,
    search_query="where can i find juice?",
)

response.summary.summary_text

'You can find juice at A03-S3. [1, 3]'

In [53]:
# Query for something that is not a grocery product.
response = search_data_store(
    project_id=PROJECT_ID,
    location=LOCATION,
    data_store_id=DATA_STORE_ID,
    search_query="sofas?",
)

response.summary.summary_text

'There is not enough information to answer the query.'

## Method-2: Similarity Search in BigQuery 

* With ML.GENERATE_EMBEDDING in BigQuery, embeddings for the product names/descriptions are created and stored in a BigQuery table. <br>
https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-generate-embedding
* With ML.DISTANCE in BigQuery, the distance between the embedding of the customer query and each product embedding is computed, which gives a similarity search inside BigQuery. <br>
https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-distance

In [10]:
%%bigquery

# Print product table.
# Source table for the embeddings built in the next step; it has the columns
# id, product, location and store.
SELECT *
FROM `tadelle-372416.demo.product_table`

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,product,location,store
0,1,Kellogg's Frosted Flakes Cereal - Sweet and Cr...,A01-S2,Walworth Rd
1,11,Minute Maid Original Orange Juice - No Pulp (5...,A03-S1,Walworth Rd
2,17,Dave's Killer Bread Organic 21 Whole Grains an...,A03-S2,Walworth Rd
3,23,Tropicana Pure Premium Orange Juice - No Pulp ...,A03-S2,Butterfly Walk
4,5,"Silk Unsweetened Almond Milk - Dairy-free, Pla...",A03-S2,Walworth Rd
5,18,Nature's Own Honey Wheat Bread - Soft & Delicious,A03-S3,Walworth Rd
6,24,Florida's Natural Orange Juice - 100% Original...,A03-S3,Walworth Rd
7,7,"Chobani Greek Yogurt - Plain, Non-Fat, High Pr...",A03-S7,Walworth Rd
8,9,Hidden Valley Original Ranch Salad Dressing - ...,A05-S5,Walworth Rd
9,10,"Ritz Crackers - Original, Buttery Flavor",A07-S2,Walworth Rd


In [13]:
%%bigquery

# Create embeddings in BQ.
# The product text is exposed as `content` for ML.GENERATE_EMBEDDING; id,
# location and store are carried through so results can be filtered and
# reported later.
CREATE OR REPLACE TABLE `tadelle-372416.demo.product_table_emb` AS
SELECT *
FROM ML.GENERATE_EMBEDDING(
  MODEL `tadelle-372416.demo.embedding-gecko`,
  (
    SELECT id, product AS content, location, store
    FROM `tadelle-372416.demo.product_table`
  ),
  STRUCT(TRUE AS flatten_json_output)
);

Query is running:   0%|          |

In [51]:
%%bigquery

# Find similarity in embeddings.
# The customer query is embedded with the same model and compared against
# every stored product embedding using cosine distance.
SELECT *
FROM (
  SELECT
    content,
    ML.DISTANCE(
      (
        SELECT ml_generate_embedding_result
        FROM ML.GENERATE_EMBEDDING(
          MODEL `tadelle-372416.demo.embedding-gecko`,
          (SELECT "Where can I find Greek yogurt?" AS content)
        )
      ),
      ml_generate_embedding_result,
      'COSINE'
    ) AS distance_to_query
  FROM `tadelle-372416.demo.product_table_emb`
  # Filter for the current store; other attributes can be filtered the same
  # way (for example store = "Butterfly Walk").
  WHERE store = "Walworth Rd"
)
# Keep only close matches.
WHERE distance_to_query < 0.1
# Sort in the outermost query: an ORDER BY inside the subquery does not
# guarantee the order of the final, filtered result.
ORDER BY distance_to_query


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,content,distance_to_query
0,"Chobani Greek Yogurt - Plain, Non-Fat, High Pr...",0.061903


## Method-3: Vector Search in BigQuery

* Identify semantically similar entities using Vector Search in BigQuery. To perform a vector search, you use the VECTOR_SEARCH function and, optionally, a vector index (a minimum of 5k rows is required to create a vector index). When a vector index is used, VECTOR_SEARCH uses the Approximate Nearest Neighbor search technique to improve vector search performance. Brute force is used to return exact results when a vector index isn't available, and you can choose to use brute force to get exact results even when a vector index is available. <br>
https://cloud.google.com/bigquery/docs/vector-search-intro

In [4]:
%%bigquery

# Embed the customer query, fetch its 3 nearest products by cosine distance
# from the embeddings table, then keep only the close matches.
SELECT
  query.query,
  base.content AS product,
  base.location,
  base.store,
  ROUND(distance, 5) AS distance
FROM VECTOR_SEARCH(
  TABLE `tadelle-372416.demo.product_table_emb`,
  'ml_generate_embedding_result',
  (
    SELECT ml_generate_embedding_result, content AS query
    FROM ML.GENERATE_EMBEDDING(
      MODEL `tadelle-372416.demo.embedding-gecko`,
      (SELECT "Where can I find pasta sauce?" AS content)
    )
  ),
  top_k => 3,
  distance_type => 'COSINE',
  options => '{"fraction_lists_to_search": 1.0}'
)
WHERE distance < 0.1;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,query,product,location,store,distance
0,Where can I find pasta sauce?,Barilla Classic Marinara Pasta Sauce - Traditi...,A09-S3,Walworth Rd,0.09515


In [33]:
# Run the vector search from the Python client for BigQuery, binding the
# prompt and the distance threshold as query parameters.
from google.cloud import bigquery

# Values bound to @prompt and @distance in the SQL below.
PROMPT = "Where can I find Greek yogurt?"
DISTANCE = 0.1

client = bigquery.Client()

query = """
    SELECT query.query, base.content as product, base.location, base.store, round(distance,5) as distance
    FROM VECTOR_SEARCH(
      TABLE `tadelle-372416.demo.product_table_emb`, 'ml_generate_embedding_result',
      (SELECT ml_generate_embedding_result, content as query
      FROM ML.GENERATE_EMBEDDING(MODEL `tadelle-372416.demo.embedding-gecko`,
      (select @prompt as content))),
      top_k => 5, distance_type => 'COSINE', options => '{"fraction_lists_to_search": 1.0}')
      where distance <@distance;

"""

# One scalar parameter per placeholder in the query text.
prompt_param = bigquery.ScalarQueryParameter("prompt", "STRING", PROMPT)
distance_param = bigquery.ScalarQueryParameter("distance", "FLOAT64", DISTANCE)
job_config = bigquery.QueryJobConfig(
    query_parameters=[prompt_param, distance_param]
)

query_job = client.query(query, job_config=job_config)

# Print each matching product with its distance to the prompt.
for row in query_job:
    print(f"{row.product}: \t{row.distance}")

Chobani Greek Yogurt - Plain, Non-Fat, High Protein: 	0.05797


## Method-4: Vertex AI Vector Search

* Vector Search is based on vector search technology developed by Google Research. With Vector Search you can leverage the same infrastructure that provides a foundation for Google products such as Google Search, YouTube, and Play. <br>
https://cloud.google.com/vertex-ai/docs/vector-search/overview <br>
https://github.com/GoogleCloudPlatform/generative-ai/blob/main/embeddings/vector-search-quickstart.ipynb


In [6]:
# Cloud Storage destination for the embeddings file; the same URI is passed
# as contents_delta_uri when the index is created.
BUCKET_URI = "gs://tadelle-bucket/genai/projects/morrisons" 
# Copy the sample product embeddings from gs://github-repo to BUCKET_URI.
# NOTE(review): BUCKET_URI has no trailing slash; confirm gsutil treats it as
# a folder rather than as the destination object name.
! gsutil cp "gs://github-repo/data/vs-quickstart/product-embs.json" "$BUCKET_URI"

Copying gs://github-repo/data/vs-quickstart/product-embs.json [Content-Type=application/json]...
- [1 files][ 79.3 MiB/ 79.3 MiB]                                                
Operation completed over 1 objects/79.3 MiB.                                     


In [8]:
# Also copy the embeddings file into the current working directory; it is
# opened locally later to look up an embedding to query with.
! gsutil cp "gs://github-repo/data/vs-quickstart/product-embs.json" . # for query tests
# Show the working directory the file was copied into.
! pwd

Copying gs://github-repo/data/vs-quickstart/product-embs.json...
- [1 files][ 79.3 MiB/ 79.3 MiB]                                                
Operation completed over 1 objects/79.3 MiB.                                     
/home/jupyter


#### Build and Deploy a Vector Search Index

##### Create Index

Now you're ready to load the embeddings into Vector Search. Its APIs are available under the [aiplatform](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform) package of the SDK.

In [10]:
# init the aiplatform package
from google.cloud import aiplatform

# Region for the Vector Search index and endpoint. It gets its own constant so
# that LOCATION ("global", used by the Vertex AI Search cells) is not
# overwritten: re-running those cells after this one would otherwise send the
# search requests to the wrong location.
VECTOR_SEARCH_LOCATION = "us-central1"

aiplatform.init(project=PROJECT_ID, location=VECTOR_SEARCH_LOCATION)

In [17]:
# create Index
# Builds a tree-AH index from the embeddings under BUCKET_URI.
# NOTE(review): this presumably creates a new index on every run; to reuse an
# existing one, load it by resource name instead of re-running this cell.
product_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name="retail_product_index",
    contents_delta_uri=BUCKET_URI,
    # NOTE(review): presumably has to match the length of the vectors in
    # product-embs.json; confirm.
    dimensions=768,
    approximate_neighbors_count=5,
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/81492352225/locations/us-central1/indexes/4293430178744369152/operations/29580195681796096
MatchingEngineIndex created. Resource name: projects/81492352225/locations/us-central1/indexes/4293430178744369152
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/81492352225/locations/us-central1/indexes/4293430178744369152')


#### Create Index Endpoint and deploy the Index

To use the Index, you need to create an [Index Endpoint](https://cloud.google.com/vertex-ai/docs/vector-search/deploy-index-public). It works as a server instance accepting query requests for your Index.

In [18]:
# create IndexEndpoint
# The endpoint is created with public access enabled; the index is deployed
# to it in the next step.
product_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name="retail_product_index_endpoint", public_endpoint_enabled=True
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/81492352225/locations/us-central1/indexEndpoints/4932941325830979584/operations/1445962278489817088
MatchingEngineIndexEndpoint created. Resource name: projects/81492352225/locations/us-central1/indexEndpoints/4932941325830979584
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/81492352225/locations/us-central1/indexEndpoints/4932941325830979584')


In [19]:
# ID under which the index is deployed on the endpoint; the same ID is passed
# to find_neighbors when querying.
DEPLOYED_INDEX_ID = "retail_product_index_endpoint"
# deploy the Index to the Index Endpoint
product_index_endpoint.deploy_index(index=product_index, deployed_index_id=DEPLOYED_INDEX_ID)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/81492352225/locations/us-central1/indexEndpoints/4932941325830979584
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/81492352225/locations/us-central1/indexEndpoints/4932941325830979584/operations/7011566992989618176
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/81492352225/locations/us-central1/indexEndpoints/4932941325830979584


<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7f3b4c423940> 
resource name: projects/81492352225/locations/us-central1/indexEndpoints/4932941325830979584

In [20]:
# Get an embedding to run a query: first, load the embedding JSON file to
# build dicts of product names and embeddings, both keyed by product ID.
import json

# build dicts for product names and embs
# Each non-empty line of product-embs.json is one JSON object with the keys
# "id", "name" and "embedding".
product_names = {}
product_embs = {}
with open("product-embs.json", encoding="utf-8") as emb_file:
    # Iterate the file object directly instead of readlines(), so the whole
    # ~79 MiB file is not held in memory as a list of lines.
    for raw_line in emb_file:
        if not raw_line.strip():
            # Tolerate blank lines, e.g. a trailing newline at the end of the file.
            continue
        record = json.loads(raw_line)
        # Named product_id (not id) so the builtin id() is not shadowed for
        # the rest of the notebook session.
        product_id = record["id"]
        product_names[product_id] = record["name"]
        product_embs[product_id] = record["embedding"]

In [21]:
# get the embedding for ID 6523 "cloudveil women's excursion short"
# you can also try with other IDs such as 12711, 18090, 19536 and 11863
# Note that product_embs is looked up with the ID as a string.

query_emb = product_embs["6523"]





In [22]:
# run query: ask the deployed index for the 5 nearest neighbours of query_emb
response = product_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID, queries=[query_emb], num_neighbors=5
)

# show the results
# Only one query was sent, so its neighbours are read from response[0]. Each
# neighbour is printed as its distance followed by the product name looked up
# by ID. The position in the list is not needed, so there is no enumerate().
for neighbor in response[0]:
    print(f"{neighbor.distance:.2f} {product_names[neighbor.id]}")

1.00 cloudveil women's excursion short
0.82 quiksilver womens cruiser short
0.80 xcvi women's alisal short
0.80 cloudveil men's kahuna short
0.78 ibex women's gozo short


The end.