In [1]:
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_community import BigQueryVectorStore
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings
from google.cloud import bigquery
import vertexai

import config

In [4]:
# Point the Vertex AI SDK at the project and region defined in config
vertexai.init(
    project=config.PROJECT_ID,
    location=config.REGION,
)

## Add documents to `BigQueryVectorStore`

This step ingests and parses PDF documents, splits them into chunks, generates embeddings, and adds the embeddings to the vector store. The document corpus used as the dataset is a collection of car owner's manuals.

### Create the VertexAI Embedding model

In [5]:
# Embedding model later handed to the vector store as its `embedding`
embedding_model = VertexAIEmbeddings(
    project=config.PROJECT_ID,
    model_name="textembedding-gecko@latest",
)

In [14]:
# Download the sample PDF from the documents bucket into ./data/file.pdf.
# IPython substitutes $config.GCS_BUCKET_DOCS with the Python value before
# the shell command is executed.
!gsutil cp "gs://$config.GCS_BUCKET_DOCS/file.pdf" ./data/file.pdf

Copying gs://rag-llm-documents/file.pdf...
/ [1 files][328.9 KiB/328.9 KiB]                                                
Operation completed over 1 objects/328.9 KiB.                                    


In [16]:
# Load the locally downloaded PDF with PyPDFLoader
loader = PyPDFLoader("data/file.pdf")
documents = loader.load()

# Replace each document's metadata with only a source and a document name
for document in documents:
    path_parts = document.metadata["source"].split("/")
    bucket_parts = config.GCS_BUCKET_DOCS.split("/")
    # NOTE(review): for a loader path such as "data/file.pdf" the [4:-1] slice
    # is empty, so `source` ends up as the bucket prefix followed by "/".
    # Confirm whether a full object URI was intended here.
    source = "/".join(bucket_parts[:3]) + "/" + "/".join(path_parts[4:-1])
    document.metadata = {"source": source, "document_name": path_parts[-1]}

print(f"# of documents loaded (pre-chunking) = {len(documents)}")

# of documents loaded (pre-chunking) = 22


Verify document metadata

## Chunk documents - `TextSplitter`

Split the documents into smaller chunks. When splitting the documents, ensure that a few chunks can fit within the context length of the LLM.

In [17]:
# Chunk the loaded documents; separators are listed from paragraph breaks
# down to single characters
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    chunk_size=1000,
    chunk_overlap=50,
)
doc_splits = text_splitter.split_documents(documents)

# Record each chunk's position in the overall list under the "chunk" key
for chunk_number, chunk in enumerate(doc_splits):
    chunk.metadata["chunk"] = chunk_number

print(f"# of documents = {len(doc_splits)}")

# of documents = 60


In [18]:
# Display the first chunk's metadata to check the source, document_name and
# chunk fields set in the previous cells
doc_splits[0].metadata

{'source': 'rag-llm-documents/', 'document_name': 'file.pdf', 'chunk': 0}

In [22]:
# Start from a clean slate: drop any dataset left over from a previous run.
# WARNING: this removes the dataset together with everything stored in it.
client = bigquery.Client(project=config.PROJECT_ID, location=config.REGION)
dataset = f"{config.PROJECT_ID}.{config.DATASET}"
dataset_object = bigquery.Dataset(dataset)
client.delete_dataset(
    dataset_object,
    delete_contents=True,
    not_found_ok=True,
)

In [23]:
# Set up the BigQuery-backed vector store; the table named in config is
# initialized/validated when the store is constructed
bq_store = BigQueryVectorStore(
    embedding=embedding_model,
    project_id=config.PROJECT_ID,
    dataset_name=config.DATASET,
    table_name=config.TABLE,
    location=config.REGION,
)

BigQuery table rag-llm-428210.sample_app.fixmycar initialized/validated as persistent storage. Access via BigQuery console:
 https://console.cloud.google.com/bigquery?project=rag-llm-428210&ws=!1m5!1m4!4m3!1srag-llm-428210!2ssample_app!3sfixmycar


### Add documents to the store

Note: If you have precomputed embeddings, you can add the texts, their embeddings, and optional metadata using the `add_texts_with_embeddings` method.

In [24]:
# Add the chunks to the vector store; the return value is kept in doc_ids
# (presumably the IDs assigned to the stored documents - verify against the
# BigQueryVectorStore docs)
doc_ids = bq_store.add_documents(doc_splits)

### Get a langchain retriever
The retriever will be used in a LangChain Chain to find the most similar documents for a given query.

In [27]:
# Wrap the vector store in a retriever (no arguments, so default settings)
langchain_retriever = bq_store.as_retriever()

### Compose a LangChain Chain

We are going to use the [`RetrievalQA` chain](https://python.langchain.com/docs/modules/chains/popular/vector_db_qa).
There are several different chain types available, listed [here](https://docs.langchain.com/docs/components/chains/index_related_chains).

In [28]:
# Question to answer from the ingested documents
search_query = "What should I do when calling the emergency roadside assistance?"  # @param {type:"string"}

# LLM that produces the final answer
llm = VertexAI(model_name="gemini-1.5-flash")

# Build the QA chain on top of the vector-store retriever and run the query
retrieval_qa = RetrievalQA.from_chain_type(
    retriever=langchain_retriever,
    chain_type="stuff",
    llm=llm,
)
response = retrieval_qa.invoke(search_query)

print("\n################ Final Answer ################\n")
print(response["result"])


################ Final Answer ################

When calling the emergency roadside assistance, you should be prepared to provide the following information: 
* Your name and contact information
* Your vehicle's make, model, and year
* Your vehicle's location
* The nature of the emergency 



## Cleaning up

In [None]:
# Delete the BigQuery dataset this notebook created (config.DATASET, the same
# name used when the vector store was set up), including the tables inside it.
# `bigquery` and `client` come from earlier cells. not_found_ok=True keeps the
# cell from failing when the dataset has already been removed.
dataset = f"{config.PROJECT_ID}.{config.DATASET}"
dataset_object = bigquery.Dataset(dataset)
client.delete_dataset(dataset_object, delete_contents=True, not_found_ok=True)