In [None]:
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_community import BigQueryVectorStore
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings
from google.cloud import bigquery
import vertexai
import os
import config

In [None]:
# Point the Vertex AI SDK at the project and region declared in config.py
vertexai.init(
    project=config.PROJECT_ID,
    location=config.REGION,
)

# Retrieve Document from Cloud Storage

Copy PDF from Cloud Storage to local environment for manipulation

In [None]:
if not(os.path.exists("data") and os.path.isdir("data")):
    os.makedirs("data")
    
!gsutil cp "gs://$config.GCS_BUCKET/file.pdf" ./data/file.pdf

Ingest PDF file

In [None]:
# Load the staged PDF into LangChain documents.
loader = PyPDFLoader("data/file.pdf")
documents = loader.load()

# The loader only knows the local path ("data/file.pdf"), so the Cloud Storage
# origin is rebuilt from the bucket the file was copied from. It is the same
# for every document, hence computed once outside the loop.
source = f"gs://{config.GCS_BUCKET}"

# Replace the loader-provided metadata with the document name and its source.
for document in documents:
    document_name = os.path.basename(document.metadata["source"])
    document.metadata = {"source": source, "document_name": document_name}

print(f"# of documents loaded (pre-chunking) = {len(documents)}")

# Chunk documents

In [None]:
# Break the loaded documents into chunks of up to 1000 characters with a
# 50-character overlap, trying the separators in the order listed.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    chunk_overlap=50,
    chunk_size=1000,
)
doc_splits = text_splitter.split_documents(documents)

# Record each chunk's position in the overall sequence.
for chunk_number, chunk in enumerate(doc_splits):
    chunk.metadata["chunk"] = chunk_number

print(f"# of documents = {len(doc_splits)}")

In [None]:
# Sanity check: display the metadata attached to the first chunk.
first_chunk = doc_splits[0]
first_chunk.metadata

# BigQuery Vector Store

Make sure there is no previous Vector Store

In [None]:
# Start from a clean slate: drop the dataset and its contents if it already
# exists (not_found_ok=True means a missing dataset is not an error).
client = bigquery.Client(project=config.PROJECT_ID, location=config.REGION)
dataset = f"{config.PROJECT_ID}.{config.DATASET}"
dataset_object = bigquery.Dataset(dataset)
client.delete_dataset(
    dataset_object,
    delete_contents=True,
    not_found_ok=True,
)

Select Embedding Model

In [None]:
# Embedding model that is handed to the vector store.
embedding_model = VertexAIEmbeddings(
    project=config.PROJECT_ID,
    model_name="textembedding-gecko@latest",
)

In [None]:
# Gather the store settings in one place so they are easy to inspect or tweak,
# then build the BigQuery-backed vector store from them.
bq_store_settings = {
    "project_id": config.PROJECT_ID,
    "location": config.REGION,
    "dataset_name": config.DATASET,
    "table_name": config.TABLE,
    "embedding": embedding_model,
}
bq_store = BigQueryVectorStore(**bq_store_settings)

Add documents to the vector store

In [None]:
# Add the chunks to the vector store and keep the value returned by add_documents.
doc_ids = bq_store.add_documents(doc_splits)

# Chatbot interaction

Get the LangChain retriever

In [None]:
# Expose the vector store as a retriever (default settings) for use in the QA chain.
langchain_retriever = bq_store.as_retriever()

Select LLM for the interaction

In [None]:
# LLM passed to the QA chain to generate the answer.
llm = VertexAI(model_name="gemini-1.5-flash")
# Alternative: llm = VertexAI(model_name="gemini-pro")  # swap in to try a different Gemini model

Input question

In [None]:
# Question to ask about the ingested PDF; replace the empty string before running the next cell.
search_query = ""

Compute answer

In [None]:
# Wire the LLM and the retriever into a "stuff"-type RetrievalQA chain.
retrieval_qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=langchain_retriever,
)

# Run the question through the chain and show only the generated answer.
response = retrieval_qa.invoke(search_query)
answer = response["result"]
print(answer)

# Cleaning up

Delete the Vector Store

In [None]:
# Delete the dataset backing the vector store, including its contents.
# Uses config.DATASET, the same attribute the store was created with, so the
# dataset removed here is the one this notebook populated. Reuses the BigQuery
# client created earlier in the notebook.
dataset = f"{config.PROJECT_ID}.{config.DATASET}"
dataset_object = bigquery.Dataset(dataset)
client.delete_dataset(dataset_object, delete_contents=True, not_found_ok=True)