In [None]:
!pip install --quiet --upgrade google_cloud_firestore google_cloud_aiplatform langchain langchain-google-vertexai langchain_community langchain_experimental pymupdf

In [2]:
import vertexai
from vertexai.language_models import TextEmbeddingModel
from vertexai.generative_models import GenerativeModel

import pickle
from IPython.display import display, Markdown

from langchain_google_vertexai import VertexAIEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_experimental.text_splitter import SemanticChunker

from google.cloud import firestore
from google.cloud.firestore_v1.vector import Vector
from google.cloud.firestore_v1.base_vector_query import DistanceMeasure

In [3]:
# Initialise the Vertex AI SDK with the project and region used in this notebook.
vertexai.init(
    project="qwiklabs-gcp-02-a5948b2e0850",
    location="us-central1",
)

In [42]:
# Shared embedding client: used below by the SemanticChunker, by
# embed_documents() for the chunks, and by embed_query() at search time.
embedding_model = VertexAIEmbeddings(model_name="text-embedding-004")

In [6]:
# Copy the food safety manual PDF from Cloud Storage into the working directory.
!gcloud storage cp gs://partner-genai-bucket/genai069/nyc_food_safety_manual.pdf .

Copying gs://partner-genai-bucket/genai069/nyc_food_safety_manual.pdf to file://./nyc_food_safety_manual.pdf

Average throughput: 77.6MiB/s


In [15]:
# Parse the downloaded PDF. `data` holds whatever loader.load() returns; later
# cells iterate over it and read `.page_content` from each item.
loader = PyMuPDFLoader("./nyc_food_safety_manual.pdf")
data = loader.load()

In [17]:
# Display the first loaded item to inspect the raw extraction before cleaning.
data[0]



In [18]:
def clean_page(page):
  """Return the text of ``page`` with extraction artifacts stripped out.

  ``page`` must expose a ``page_content`` string. The substitutions below are
  applied strictly in the listed order, because later ones operate on the
  output of earlier ones:

  1. a hyphen directly followed by a line break is removed,
  2. every remaining line break becomes a single space,
  3. the control characters 0x02 and 0x03 are removed,
  4. two fixed letter-spaced strings (presumably the manual's running page
     headers) are removed.
  """
  substitutions = (
      ("-\n", ""),
      ("\n", " "),
      ("\x02", ""),
      ("\x03", ""),
      ("fo d P R O T E C T I O N  T R A I N I N G  M A N U A L", ""),
      ("N E W  Y O R K  C I T Y  D E P A R T M E N T  O F  H E A L T H  &  M E N T A L  H Y G I E N E", ""),
  )

  text = page.page_content
  for old, new in substitutions:
    text = text.replace(old, new)
  return text

In [19]:
# Clean every loaded page; the result is a list of plain strings, one per page.
cleaned_pages = list(map(clean_page, data))

In [44]:
# SemanticChunker is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.

# Text splitter built on the shared embedding model.
text_splitter = SemanticChunker(embedding_model)

# Chunk only the first five cleaned pages and collect all chunks into one
# flat list, preserving page order and chunk order within each page.
first_five_pages = cleaned_pages[:5]
chunked_content = [
    chunk
    for page in first_five_pages
    for chunk in text_splitter.split_text(page)
]

In [45]:
# Generate embeddings for the chunks
# Generate embeddings for the chunks: one embed_documents() call over the
# whole chunk list.
chunked_embeddings = embedding_model.embed_documents(chunked_content)

In [46]:
!gsutil cp gs://partner-genai-bucket/genai069/chunked_content.pkl .
!gsutil cp gs://partner-genai-bucket/genai069/chunked_embeddings.pkl .

chunked_content = pickle.load(open("chunked_content.pkl", "rb"))
chunked_embeddings = pickle.load(open("chunked_embeddings.pkl", "rb"))

Copying gs://partner-genai-bucket/genai069/chunked_content.pkl...
/ [1 files][280.7 KiB/280.7 KiB]                                                
Operation completed over 1 objects/280.7 KiB.                                    
Copying gs://partner-genai-bucket/genai069/chunked_embeddings.pkl...
/ [1 files][  1.8 MiB/  1.8 MiB]                                                
Operation completed over 1 objects/1.8 MiB.                                      


In [57]:
# Firestore client bound to the same project that vertexai.init() was given.
db = firestore.Client(project="qwiklabs-gcp-02-a5948b2e0850")

In [64]:
# Reference to the 'food-safety' collection that the chunks are written to.
collection = db.collection('food-safety')

In [78]:
# Write one document per (chunk, embedding) pair, with IDs doc_0, doc_1, ...
# zip() stops at the shorter of the two lists, so unpaired items are skipped.
for index, (text, vector) in enumerate(zip(chunked_content, chunked_embeddings)):
    payload = {"content": text, "embedding": Vector(vector)}
    collection.document(f"doc_{index}").set(payload)

In [None]:
# Create a composite index on the `embedding` field of the food-safety
# collection group, configured as a flat vector index with dimension 768.
!gcloud firestore indexes composite create \
--collection-group=food-safety \
--query-scope=COLLECTION \
--field-config field-path=embedding,vector-config='{"dimension":"768", "flat": "{}"}' \
--project="qwiklabs-gcp-02-a5948b2e0850"

In [80]:
from google.cloud.firestore_v1.base_vector_query import DistanceMeasure
from google.cloud.firestore_v1.vector import Vector

# Same expression as the earlier `collection` assignment; the search function
# defined below reads this module-level name.
collection = db.collection('food-safety')

def search_vector_database(query: str, limit: int = 5) -> list:
  """Return the stored text chunks nearest to ``query``.

  The query is embedded with the module-level ``embedding_model`` and compared
  against the ``embedding`` field of the module-level ``collection`` using
  Euclidean distance.

  Args:
    query: Text to search for.
    limit: Maximum number of documents to retrieve. Defaults to 5, the value
      that was previously hard-coded.

  Returns:
    A list holding the ``content`` field of each retrieved document, in the
    order the query streams them.
  """
  query_embedding = embedding_model.embed_query(query)

  vector_query = collection.find_nearest(
    vector_field='embedding',
    query_vector=Vector(query_embedding),
    distance_measure=DistanceMeasure.EUCLIDEAN,
    limit=limit,
  )

  # Keep only the text of each match; the stored embedding is not returned.
  return [result.to_dict()['content'] for result in vector_query.stream()]

In [81]:
# Example query; the cell output is the list of chunk texts that were retrieved.
search_vector_database("How should I store food?")

[' Store foods away from dripping condensate , at least six inches above the floor and with enough space between items to encourage air circulation. Freezer Storage Freezing is an excellent method for prolonging the shelf life of foods. By keeping foods frozen solid, the bacterial growth is minimal at best. However, if frozen foods are thawed and then refrozen, then harmful bacteria can reproduce to dangerous levels when thawed for the second time. In addition to that, the quality of the food is also affected. Never refreeze thawed foods, instead use them immediately. Keep the following rules in mind for freezer storage:  Use First In First Out method of stock rotation. All frozen foods should be frozen solid with temperature at 0°F or lower. Always use clean containers that are clearly labeled and marked, and have proper and secure lids. Allow adequate spacing between food containers to allow for proper air circulation. Never use the freezer for cooling hot foods. * * Tip: When receiv