In [1]:
# Langchain dependencies
from langchain.document_loaders.pdf import (
    PyPDFDirectoryLoader,
)  # Importing PDF loader from Langchain
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
)  # Importing text splitter from Langchain
from langchain.embeddings import (
    OpenAIEmbeddings,
)  # Importing OpenAI embeddings from Langchain
from langchain.schema import Document  # Importing Document schema from Langchain
from langchain.vectorstores.chroma import (
    Chroma,
)  # Importing Chroma vector store from Langchain
from dotenv import load_dotenv  # Importing dotenv to get API key from .env file
from langchain.chat_models import ChatOpenAI


import os  # Importing os module for operating system functionalities
import shutil  # Importing shutil module for high-level file operations

In [4]:
# Folder that holds the source PDF files:
DATA_PATH = r"../data"


def load_documents():
    """
    Read every PDF found under DATA_PATH with PyPDFDirectoryLoader.

    Returns:
        List of Document objects: whatever the loader's ``load()`` call produces
        for the PDFs in that directory.
    """
    # Build the directory loader and run it in a single expression
    return PyPDFDirectoryLoader(DATA_PATH).load()

In [6]:
documents = load_documents()
# Show a count and one short sample instead of dumping the full text of every
# loaded Document into the cell output.
print(f"Loaded {len(documents)} documents from {DATA_PATH}")
if documents:
    print(documents[0].metadata)
    print(documents[0].page_content[:300])



In [7]:
type(documents)

list

In [8]:
def split_text(
    documents: list[Document],
    chunk_size: int = 400,
    chunk_overlap: int = 100,
):
    """
    Split the text content of the given list of Document objects into smaller chunks.

    Args:
        documents (list[Document]): List of Document objects containing text content to split.
        chunk_size (int): Maximum size of each chunk, measured with ``len`` (characters).
            Defaults to 400.
        chunk_overlap (int): Number of characters shared between consecutive chunks.
            Defaults to 100.

    Returns:
        list[Document]: List of Document objects representing the split text chunks.
    """
    # Initialize text splitter with specified parameters
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,  # Size of each chunk in characters
        chunk_overlap=chunk_overlap,  # Overlap between consecutive chunks
        length_function=len,  # Function to compute the length of the text
        add_start_index=True,  # Flag to add start index to each chunk's metadata
    )
    # Split documents into smaller chunks using text splitter
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print one example chunk (content and metadata). Index 10 is used when it
    # exists; for smaller inputs fall back to the last chunk, and skip the
    # preview entirely when nothing was produced, instead of raising IndexError.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)

    return chunks  # Return the list of split text chunks

In [9]:
chunks = split_text(documents)

Split 154 documents into 1462 chunks.
even performs at a similar level to 1.0 Ultra on several benchmarks.
The ability to model data of increasingly longer contexts has tracked the development of more
general and capable language models, from the now toy 2-gram language model proposed by Shannon
1Please send correspondence to gemini-1_5-report@google.com.
© 2024 Google. All rights reserved
{'source': '../data/gemini_v1_5_report.pdf', 'page': 0, 'page_label': '1', 'start_index': 3104}


In [10]:
# Preview only the first few chunks; printing every chunk floods the notebook output.
for chunk in chunks[:3]:
    print(chunk)
    print("\n")

page_content='Gemini 1.5: Unlocking multimodal
understanding across millions of tokens of
context
Gemini Team, Google1
In this report, we introduce the Gemini 1.5 family of models, representing the next generation of highly
compute-efficient multimodal models capable of recalling and reasoning over fine-grained information' metadata={'source': '../data/gemini_v1_5_report.pdf', 'page': 0, 'page_label': '1', 'start_index': 0}


page_content='from millions of tokens of context, including multiple long documents and hours of video and audio. The
family includes two new models: (1) an updated Gemini 1.5 Pro, which exceeds the February version on
the great majority of capabilities and benchmarks; (2) Gemini 1.5 Flash, a more lightweight variant' metadata={'source': '../data/gemini_v1_5_report.pdf', 'page': 0, 'page_label': '1', 'start_index': 311}


page_content='the great majority of capabilities and benchmarks; (2) Gemini 1.5 Flash, a more lightweight variant
designed for efficiency with m

In [19]:
# Directory where the Chroma database is persisted; save_to_chroma removes and
# recreates it, and the query code opens the store from here.
CHROMA_PATH = "../chroma"

In [20]:
def save_to_chroma(chunks: list[Document]):
    """
    Save the given list of Document objects to a Chroma database.

    Any existing database directory at CHROMA_PATH is removed first, then a new
    store is built from ``chunks`` using OpenAI embeddings.

    Args:
        chunks (list[Document]): List of Document objects representing text chunks to save.

    Returns:
        None

    Raises:
        ValueError: If ``chunks`` is empty. This is checked before anything on
            disk is touched, so a failed load or split cannot wipe an existing
            database and leave nothing in its place.
    """
    # Validate before deleting anything on disk
    if not chunks:
        raise ValueError(
            "No chunks to save; refusing to clear the existing Chroma database."
        )

    # Clear out the existing database directory if it exists
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new Chroma database from the documents using OpenAI embeddings
    db = Chroma.from_documents(
        chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    )

    # Persist the database to disk
    # NOTE(review): recent LangChain/Chroma releases flag persist() as deprecated
    # because stores created with persist_directory are written automatically —
    # confirm against the installed versions before removing this call.
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [21]:
def generate_data_store():
    """
    Build the Chroma vector database end to end: load the documents, split
    them into chunks, and save the chunks.
    """
    # Each stage feeds the next: loaded documents -> text chunks -> saved store
    save_to_chroma(split_text(load_documents()))

In [23]:
# NOTE(review): presumably this resets chromadb's cached client/system state so
# the store at CHROMA_PATH can be deleted and rebuilt within the same kernel
# session — confirm against the chromadb documentation.
import chromadb

chromadb.api.client.SharedSystemClient.clear_system_cache()

In [24]:
# Load environment variables from a .env file
# (presumably this is where the OpenAI API key comes from — verify against your .env)
load_dotenv()
# Generate the data store: load the PDFs, split them into chunks, save them to Chroma
generate_data_store()

Split 154 documents into 1462 chunks.
even performs at a similar level to 1.0 Ultra on several benchmarks.
The ability to model data of increasingly longer contexts has tracked the development of more
general and capable language models, from the now toy 2-gram language model proposed by Shannon
1Please send correspondence to gemini-1_5-report@google.com.
© 2024 Google. All rights reserved
{'source': '../data/gemini_v1_5_report.pdf', 'page': 0, 'page_label': '1', 'start_index': 3104}
Saved 1462 chunks to ../chroma.


  db.persist()


In [25]:
# Example question used for the similarity search and as the {question} in the prompt.
query_text = "Explain how the gemini model works"

In [26]:
# Prompt template with two placeholders: {context} receives the joined text of
# the retrieved chunks and {question} receives the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [27]:
# Reuse the same kind of embedding function that was used to build the store
embedding_function = OpenAIEmbeddings()

# Open the persisted Chroma database
db = Chroma(embedding_function=embedding_function, persist_directory=CHROMA_PATH)

# Fetch the three closest chunks together with their relevance scores
results = db.similarity_search_with_relevance_scores(query_text, k=3)

# Report when nothing came back or the first hit scores below 0.7
if not results or results[0][1] < 0.7:
    print("Unable to find matching results.")

  db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)


In [28]:
from langchain.prompts import ChatPromptTemplate

In [29]:
# Stitch the retrieved chunk texts together, separated by a "---" divider
context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)

# Fill the template's placeholders with the retrieved context and the question
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(question=query_text, context=context_text)
print(prompt)

Human: 
Answer the question based only on the following context:

The same experts then rated and ranked model responses to their respective questions. Models
were evaluated according to their ability to answer such questions with a high degree of accuracy,
but also, secondarily, completeness and informativeness. Figure 18 summarizes the results where we
see that the Gemini 1.5 models significantly and strongly outperform 1.0 Pro on the this task (see

---

also see Gemini 1.5 Pro outperforming all competing models across all modalities even when these
models are augmented with external retrieval methods. We showcase the in-context learning abilities
of both Gemini 1.5 Pro and Gemini 1.5 Flash enabled by very long context: for example, learning
to translate a new language from a single set of linguistic documentation. With only instructional

---

Generative Experiences (see Gemini 1.0 model card; Gemini-
Team et al., 2023). The Gemini 1.5 models provide particular
uses for application

In [30]:
# Ask the chat model to answer the filled-in prompt
model = ChatOpenAI()
response_text = model.predict(prompt)

# Collect the "source" entry from each retrieved chunk's metadata (None if absent)
sources = [doc.metadata.get("source") for doc, _score in results]

# Show the answer together with the sources it was drawn from
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

  model = ChatOpenAI()
  response_text = model.predict(prompt)


Response: The Gemini model works by utilizing the Gemini 1.5 models, which have shown to significantly outperform previous versions like the 1.0 Pro. These models are able to answer questions with high accuracy, completeness, and informativeness. They excel in learning from large amounts of new information and can generate more relevant responses. The in-context learning abilities of Gemini 1.5 Pro and Gemini 1.5 Flash are highlighted, showcasing their ability to translate new languages from a single set of linguistic documentation and analyze, classify, and summarize large amounts of content accurately within a given prompt.
Sources: ['../data/gemini_v1_5_report.pdf', '../data/gemini_v1_5_report.pdf', '../data/gemini_v1_5_report.pdf']


In [31]:
response_text

'The Gemini model works by utilizing the Gemini 1.5 models, which have shown to significantly outperform previous versions like the 1.0 Pro. These models are able to answer questions with high accuracy, completeness, and informativeness. They excel in learning from large amounts of new information and can generate more relevant responses. The in-context learning abilities of Gemini 1.5 Pro and Gemini 1.5 Flash are highlighted, showcasing their ability to translate new languages from a single set of linguistic documentation and analyze, classify, and summarize large amounts of content accurately within a given prompt.'

In [32]:
def query_rag(query_text, k=3, relevance_threshold=0.7):
    """
    Query a Retrieval-Augmented Generation (RAG) system using Chroma database and OpenAI.

    Args:
    - query_text (str): The text to query the RAG system with.
    - k (int): Number of chunks to retrieve from the database. Defaults to 3.
    - relevance_threshold (float): Minimum relevance score the first result must
      reach for the query to be answered. Defaults to 0.7.

    Returns:
    - formatted_response (str): Formatted response including the generated text and sources.
    - response_text (str): The generated response text.

    When there are no results, or the first result scores below
    ``relevance_threshold``, the chat model is not called: both returned values
    carry the "Unable to find matching results." message and the sources list
    is empty.
    """
    # Use same embedding function as before
    embedding_function = OpenAIEmbeddings()

    # Prepare the database
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB.
    results = db.similarity_search_with_relevance_scores(query_text, k=k)

    # Stop here if there are no matching results or the relevance score is too
    # low. Previously the function only printed the message and still sent an
    # empty or irrelevant context to the model. The same two-item tuple shape
    # is returned so callers that unpack the result keep working.
    if len(results) == 0 or results[0][1] < relevance_threshold:
        no_match_message = "Unable to find matching results."
        print(no_match_message)
        return f"Response: {no_match_message}\nSources: []", no_match_message

    # Combine context from matching documents
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])

    # Create prompt template using context and query text
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    # Initialize OpenAI chat model
    model = ChatOpenAI()

    # Generate response text based on the prompt
    # NOTE(review): newer LangChain releases flag predict() as deprecated in
    # favour of invoke() — confirm against the installed version before switching.
    response_text = model.predict(prompt)

    # Get sources of the matching documents
    sources = [doc.metadata.get("source", None) for doc, _score in results]

    # Format and return response including generated text and sources
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    return formatted_response, response_text

In [33]:
formatted_response, response_text = query_rag(query_text)

In [34]:
response_text

'The Gemini model works by utilizing advanced machine learning techniques to analyze, classify, and summarize large amounts of information within a given prompt. It is able to generate more relevant responses by learning from a single set of linguistic documentation and by having very long context capabilities. The Gemini 1.5 models outperform previous versions and competing models across all modalities, even when augmented with external retrieval methods. This showcases their in-context learning abilities and their effectiveness in tasks that require learning from new information and generating accurate and informative responses.'