Build a Generative AI solution using a RAG Framework: Challenge Lab L400

https://partner.cloudskillsboost.google/course_templates/982/labs/463255

In [None]:
!pip install --upgrade --user google-cloud-aiplatform google-cloud-storage firebase-admin
!pip install langchain_community
!pip install google-cloud-aiplatform
!pip install google-cloud-storage
!pip install langchain_community
!pip install langchain
!pip install pypdf

In [None]:
import IPython
from IPython.display import Markdown, display
import time

# Restart the kernel (do_shutdown with restart=True); the surrounding notebook
# runs this right after the pip installs.
IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
# get project ID
PROJECT_ID = ! gcloud config get project
PROJECT_ID = PROJECT_ID[0]
LOCATION = "us-central1"
SOURCE_FILE_NAME = "fpc-manual.pdf"
JSONL_FILE_NAME = "embeddings.json"
BUCKET_NAME = "qwiklabs-gcp-00-a99bae2a79d3"
JSONL_FILE_PATH = f"gs://{BUCKET_NAME}"
EMBEDDING_MODEL_NAME = "textembedding-gecko@002"
INDEX_DIMENSION = 768
INDEX_APPROXIMATE_NEIGHBORS_COUNT = 10
INDEX_NAME = "assessment-index"
DEPLOYED_INDEX_ID = "assessment_index_deployed"
INDEX_ENDPOINT_NAME = "assessment-index-endpoint"
INDEX_MACHINE_TYPE = "n1-standard-2"
# generate an unique id for this session
from datetime import datetime
UID = datetime.now().strftime("%m%d%H%M")

In [None]:
import pypdf
from google.cloud import aiplatform
import vertexai
from vertexai.language_models import TextEmbeddingModel
import firebase_admin
from firebase_admin import firestore
from google.cloud import storage
# import pandas as pd
import json

# Initialize the Vertex AI SDK and load the embedding model named in the
# configuration cell (instead of repeating the model name as a literal).
vertexai.init(project=PROJECT_ID, location=LOCATION)
model = TextEmbeddingModel.from_pretrained(EMBEDDING_MODEL_NAME)

# Initialize Firestore. No explicit credentials are passed, so the default
# credentials of the environment are used.
# initialize_app() raises ValueError if the default app already exists, which
# happens whenever this cell is re-run in the same kernel; reuse the existing
# app in that case.
try:
  app = firebase_admin.get_app()
except ValueError:
  app = firebase_admin.initialize_app()
db = firestore.client()
print("Firestore database created successfully.")

# Walk the source PDF page by page. For every page with text:
#   1. store the page text in the Firestore collection "page_content",
#      keyed by the 1-based page number;
#   2. compute its embedding and append {"id", "embedding"} as one JSON line
#      to the local embeddings file.
# Both files are managed by `with` blocks so they are closed even when a
# Firestore or embedding call raises part-way through the loop.
with open(SOURCE_FILE_NAME, 'rb') as pdf_file:
  pdf_reader = pypdf.PdfReader(pdf_file)

  num_pages = len(pdf_reader.pages)
  print(f"Read PDF with number of pages: {num_pages}")

  with open(JSONL_FILE_NAME, 'w', encoding='utf-8') as f:
    for page_num in range(num_pages):
      # Page IDs are 1-based and shared by Firestore and the index so a
      # nearest-neighbour hit can be mapped back to its page text.
      page_id = str(page_num + 1)

      text = pdf_reader.pages[page_num].extract_text()

      # Pages without extractable text (e.g. blank or image-only pages)
      # carry nothing to embed; skip them instead of sending empty content
      # to the embedding model.
      if not text or not text.strip():
        print(f"Page: {page_id} has no extractable text; skipped.")
        continue

      # Write the page text to Firestore
      doc_ref = db.collection("page_content").document(page_id)
      doc_ref.set({'content': text})
      print(f"Document: {page_id} created in Firestore database successfully.")

      # One input text -> one embedding; keep just its vector of floats
      embeddings = model.get_embeddings([text])[0].values

      # One JSON object per line
      data = {
        "id": page_id,
        "embedding": embeddings
      }
      f.write(json.dumps(data) + "\n")

      print(f"Embedding: {page_id} saved in JSONL file successfully.")

# Make sure the destination bucket exists, then upload the embeddings file.
client = storage.Client(project=PROJECT_ID)

# lookup_bucket() returns None when the bucket does not exist, so the
# "already exists" case is handled explicitly instead of being reported as an
# error by a blanket `except Exception`. Genuine failures (permissions,
# network, ...) now surface here rather than later during the upload.
bucket = client.lookup_bucket(BUCKET_NAME)
if bucket is None:
  bucket = client.create_bucket(BUCKET_NAME, location=LOCATION)
  print(f"Bucket {BUCKET_NAME} created successfully.")
else:
  print(f"Bucket {BUCKET_NAME} already exists; reusing it.")

# Upload the file to the bucket under its local file name
blob = bucket.blob(JSONL_FILE_NAME)
blob.upload_from_filename(JSONL_FILE_NAME)
print(f"File {JSONL_FILE_NAME} uploaded to bucket {BUCKET_NAME}.")

# Build a Tree-AH vector index from the embeddings found under JSONL_FILE_PATH.
index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=INDEX_NAME,
    contents_delta_uri=JSONL_FILE_PATH,
    dimensions=INDEX_DIMENSION,
    approximate_neighbors_count=INDEX_APPROXIMATE_NEIGHBORS_COUNT,
)

# Block until the index resource is ready before reporting its name.
index.wait()
print(f"Vector index created: {index.name}")

# Create an index endpoint with a public endpoint enabled ...
endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=INDEX_ENDPOINT_NAME,
    description="Assessment Index Endpoint",
    public_endpoint_enabled=True,
)

# ... and deploy the index created above onto it under DEPLOYED_INDEX_ID.
endpoint = endpoint.deploy_index(index=index, deployed_index_id=DEPLOYED_INDEX_ID)

print(f"Endpoint deployed: {endpoint.resource_name}")