[How to use RAG in BigQuery to bolster LLMs](https://cloud.google.com/blog/products/data-analytics/how-to-use-rag-in-bigquery-to-bolster-llms)

[Common-Themes-RAG notebook (data-beans)](https://github.com/GoogleCloudPlatform/data-beans/blob/main/colab-enterprise/gen-ai-demo/Common-Themes-RAG.ipynb)

### RAG BQ

[Getting started with retrieval augmented generation on BigQuery with LangChain](https://cloud.google.com/blog/products/ai-machine-learning/rag-with-bigquery-and-langchain-in-cloud) <br>
[Notebook](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_qna_with_bq_and_featurestore.ipynb)<br>

### To Do
- [Langflow](https://www.langflow.org/pt/)
- [Evaluate a complex agent](https://docs.smith.langchain.com/evaluation/tutorials/agents)

In [4]:
from dotenv import load_dotenv
import os
import vertexai

from langchain.chains import RetrievalQA
from langchain.globals import set_debug
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_community import BigQueryVectorStore, VertexFSVectorStore
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings

In [5]:
# Load variables from a .env file (if one is found) into the process environment;
# later cells read GOOGLE_CLOUD_PROJECT and GCP_REGION from os.environ.
load_dotenv()

True

In [6]:
# Fail fast with a readable message when a required setting is missing from the
# environment, before any Vertex AI or BigQuery client is created.
assert "GOOGLE_CLOUD_PROJECT" in os.environ, "Please set the GOOGLE_CLOUD_PROJECT environment variable."
assert "GCP_REGION" in os.environ, "Please set the GCP_REGION environment variable."

In [7]:
# Target GCP project and region, taken from the environment (None when unset).
project = os.getenv("GOOGLE_CLOUD_PROJECT")
location = os.getenv("GCP_REGION")

In [8]:
# Initialise the Vertex AI SDK with the project and region read from the environment.
vertexai.init(project=project, location=location)

In [9]:
# Embedding model that is later handed to the BigQuery vector store.
# NOTE(review): "@latest" is a floating alias — consider pinning a concrete
# version so stored vectors and query vectors come from the same model; confirm.
embedding_model = VertexAIEmbeddings(
    model_name="textembedding-gecko@latest", project=project
)

In [10]:
# Bucket name plus object prefix (without the "gs://" scheme) holding the sample PDFs.
GCS_BUCKET_DOCS = "blog-files-2024/sample"

# Copy every PDF under that prefix into the current working directory.
# $GCS_BUCKET_DOCS is expanded by IPython from the Python variable above.
!gsutil cp "gs://$GCS_BUCKET_DOCS/*.pdf" .

Copying gs://blog-files-2024/sample/all_pdf2_2015-03-12_Ideologia_e_Educação.pdf...
/ [0 files][    0.0 B/104.6 KiB]                                                
/ [0 files][104.6 KiB/104.6 KiB]                                                
-
- [1 files][104.6 KiB/104.6 KiB]                                                

Operation completed over 1 objects/104.6 KiB.                                    


In [11]:
# Load the downloaded PDF into a list of documents.
loader = PyPDFLoader("all_pdf2_2015-03-12_Ideologia_e_Educação.pdf")
documents = loader.load()

# Replace each document's metadata with just its origin and its file name.
# The prefix depends only on GCS_BUCKET_DOCS, so it is built once before the loop.
source_prefix = "/".join(GCS_BUCKET_DOCS.split("/")[:3])
for doc in documents:
    path_parts = doc.metadata["source"].split("/")
    # NOTE(review): path_parts[4:-1] is empty when the loader reports a bare
    # local file name, so the stored "source" ends with a trailing "/" —
    # confirm that this is the intended value.
    source_suffix = "/".join(path_parts[4:-1])
    doc.metadata = {
        "source": f"{source_prefix}/{source_suffix}",
        "document_name": path_parts[-1],
    }

print(f"# of documents loaded (pre-chunking) = {len(documents)}")

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


# of documents loaded (pre-chunking) = 2


In [12]:
# Inspect the rewritten metadata of the first loaded document.
documents[0].metadata

{'source': 'blog-files-2024/sample/',
 'document_name': 'all_pdf2_2015-03-12_Ideologia_e_Educação.pdf'}

In [13]:
# Candidate split points handed to the splitter, listed from coarse (blank line)
# to fine (empty string).
CHUNK_SEPARATORS = ["\n\n", "\n", ".", "!", "?", ",", " ", ""]

# Break the loaded documents into chunks of up to 1000 characters with a
# 50-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=50, separators=CHUNK_SEPARATORS
)
doc_splits = text_splitter.split_documents(documents)

# Record each chunk's position in the overall sequence under the "chunk" key.
for chunk_no in range(len(doc_splits)):
    doc_splits[chunk_no].metadata["chunk"] = chunk_no

print(f"# of documents = {len(doc_splits)}")

# of documents = 4


In [14]:
# Inspect the first chunk's metadata, which now also carries its "chunk" index.
doc_splits[0].metadata

{'source': 'blog-files-2024/sample/',
 'document_name': 'all_pdf2_2015-03-12_Ideologia_e_Educação.pdf',
 'chunk': 0}

In [15]:
# BigQuery dataset and table used as persistent storage for the vector store.
DATASET = "blog"
TABLE = "posts_dez_2024_emb"

In [16]:
# Vector store backed by the BigQuery table <project>.<DATASET>.<TABLE>;
# embedding_model is passed in as its embedding function.
bq_store_settings = {
    "project_id": project,
    "location": location,
    "dataset_name": DATASET,
    "table_name": TABLE,
    "embedding": embedding_model,
}
bq_store = BigQueryVectorStore(**bq_store_settings)

BigQuery table llm-studies.blog.posts_dez_2024_emb initialized/validated as persistent storage. Access via BigQuery console:
 https://console.cloud.google.com/bigquery?project=llm-studies&ws=!1m5!1m4!4m3!1sllm-studies!2sblog!3sposts_dez_2024_emb


In [23]:
# Install/upgrade the LangChain and Vertex AI packages used by this notebook.
# NOTE(review): versions are unpinned and this cell sits after the imports —
# consider pinning versions and moving it to the top of the notebook.
%pip install --upgrade --quiet  langchain langchain-google-vertexai "langchain-google-community[featurestore]"

Note: you may need to restart the kernel to use updated packages.


In [24]:
import IPython

# Restart the kernel so the freshly installed packages are picked up.
# All variables from earlier cells are lost; those cells must be re-run afterwards.
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [17]:
# Add the chunks to the BigQuery-backed vector store and keep the returned ids.
# NOTE(review): the last recorded run failed with
# "404 ... Not found: Dataset llm-studies:blog" — check that the dataset exists
# and that GCP_REGION matches the dataset's location before re-running.
doc_ids = bq_store.add_documents(doc_splits)

NotFound: 404 POST https://bigquery.googleapis.com/upload/bigquery/v2/projects/llm-studies/jobs?uploadType=multipart: Not found: Dataset llm-studies:blog

In [None]:
# NOTE(review): this question is about roadside assistance, while the only
# ingested PDF is "Ideologia e Educação" — looks like a tutorial leftover;
# confirm and replace with a question the document can answer.
query = "What should I do when I call the emergency roadside assistance?"
bq_store.similarity_search(query)