In [1]:
# Import modules and setup
import os
import time
from datetime import datetime

import firebase_admin
from dotenv import load_dotenv
from firebase_admin import credentials
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from pydantic import BaseModel, Field


# Load variables from a local .env file into the process environment.
load_dotenv()

# Both variables point at the same service-account key file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../firebase-service-account.json"
os.environ["FIREBASE_SERVICE_ACCOUNT_FILE"] = "../firebase-service-account.json"
# Bare bucket name: the Firebase Admin SDK's `storageBucket` option expects the
# bucket name without the "gs://" scheme prefix.
os.environ["FIREBASE_STORAGE_BUCKET_NAME"] = "sample-firebase-ai-app-7d7d8.appspot.com"

In [2]:
# Initialize Firebase connection
FIREBASE_APP_NAME = "sample-firebase-ai-app-7d7d8"

cred = credentials.Certificate(os.getenv("FIREBASE_SERVICE_ACCOUNT_FILE"))

try:
    # Re-running this cell in a live kernel must not fail: reuse the app if it
    # has already been registered under this name.
    firebase_app = firebase_admin.get_app(FIREBASE_APP_NAME)
except ValueError:
    # get_app raises ValueError when no app with this name exists yet.
    firebase_app = firebase_admin.initialize_app(
        cred,
        {"storageBucket": os.getenv("FIREBASE_STORAGE_BUCKET_NAME")},
        FIREBASE_APP_NAME,
    )

# Last expression: display the app handle as the cell output.
firebase_app

<firebase_admin.App at 0x1fa7bbdb430>

In [3]:
# Create an LLM
# Gemini 1.5 Flash chat model, configured with temperature 0 and up to 2 retries.
flash_llm_options = {
    "model": "gemini-1.5-flash",
    "temperature": 0,
    "max_tokens": None,
    "timeout": None,
    "max_retries": 2,
}
llm = ChatGoogleGenerativeAI(**flash_llm_options)

In [4]:
# Create an Embedding
# BGE embedding model run on CPU, with embeddings normalized at encode time.
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)
hf_embedding = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [5]:
# Create a Pinecone connection
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))


# Create a Pinecone index (only when it does not exist yet)
index_name = "rag-index"
existing_indexes = [info["name"] for info in pc.list_indexes()]

index_is_missing = index_name not in existing_indexes
if index_is_missing:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=1024,  # The dimension of the embedding model in above cell
        metric="cosine",
        spec=serverless_spec,
    )
    # Poll once per second until the new index reports itself as ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

# Handle used for all subsequent data operations on the index.
index = pc.Index(index_name)

In [6]:
# Create a vector store
# Wraps the Pinecone index handle together with the embedding model above.
vector_store = PineconeVectorStore(
    embedding=hf_embedding,
    index=index,
)

In [7]:
# Load documents
# The contract PDF is read through PyPDFLoader; `documents` holds what it returns.
pdf_path = "./documents/webapp_contract.pdf"
loader = PyPDFLoader(pdf_path)

documents = loader.load()

In [8]:
# Create text splitter
# Chunks are sized at 800 with an overlap of 200 between neighbouring chunks.
splitter_settings = {"chunk_size": 800, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

splits = text_splitter.split_documents(documents)

In [9]:
# Delete all vectors from the index to start fresh.
# The delete is skipped when the index holds no vectors: on a freshly created
# serverless index there is no namespace to delete from yet, and Pinecone
# rejects `delete_all` in that case, which would break a clean Run All.
if index.describe_index_stats()["total_vector_count"] > 0:
    index.delete(delete_all=True)

# Add documents to the vector store (the returned ids are the cell output)
vector_store.add_documents(splits)

['0a9c9f8a-c48e-414c-b1af-44aaaf92c8f6',
 '60369d49-3738-48c4-86d6-4158f409db67',
 '2c26066f-8480-4bc1-b495-1dcb93231253',
 '948bfb7c-75c5-432b-b4b8-39e76827d1d4',
 '067ebedd-3533-44e5-acce-190e6d8c8948',
 'a249b8f8-3f59-401d-9012-a9ce1e0b5694',
 '0f510549-14be-45a3-863f-8deca8abfc68',
 '0489d0fe-a14f-4d41-a322-0cd2bfedd762']

In [26]:
# Sanity check: query the vector store directly, bypassing the retriever.
vector_store.similarity_search(query="effective date")

[Document(id='2c26066f-8480-4bc1-b495-1dcb93231253', metadata={'page': 0.0, 'source': './documents/webapp_contract.pdf'}, page_content='maintenance, and training sessions for Company B’s team on how to use the developed \nsolutions.  \nCompany B agrees to collaborate and provide all necessary data and access required for \nCompany A to complete the above services.  \n2. Term of Agreement  \nThis Agreement shall commence on June  9, 2020 ("Effective Date") and shall remain in effect \nuntil June 9 , 2023, unless terminated earlier by either Party in accordance with Section 7 \n(Termination) of this Agreement.  \n3. Payment Terms  \n• Total Contract Amount:  The total payment for the services outlined in this Agreement \nshall not exceed $250,000. \n• Payment Schedule:'),
 Document(id='0a9c9f8a-c48e-414c-b1af-44aaaf92c8f6', metadata={'page': 0.0, 'source': './documents/webapp_contract.pdf'}, page_content='AGREEMENT CONTRACT  \n \nThis Agreement ("Agreement") is entered into as of June  9

In [10]:
# Create a retriever
# Score-threshold search configured with k=1 and a threshold of 0.5.
retriever_search_kwargs = {"k": 1, "score_threshold": 0.5}
retriever = vector_store.as_retriever(
    search_kwargs=retriever_search_kwargs,
    search_type="similarity_score_threshold",
)

In [25]:
# Sanity check: run the retriever on its own with a full question.
retriever.invoke(input="What is the effective date of the contract?")

[Document(id='2c26066f-8480-4bc1-b495-1dcb93231253', metadata={'page': 0.0, 'source': './documents/webapp_contract.pdf'}, page_content='maintenance, and training sessions for Company B’s team on how to use the developed \nsolutions.  \nCompany B agrees to collaborate and provide all necessary data and access required for \nCompany A to complete the above services.  \n2. Term of Agreement  \nThis Agreement shall commence on June  9, 2020 ("Effective Date") and shall remain in effect \nuntil June 9 , 2023, unless terminated earlier by either Party in accordance with Section 7 \n(Termination) of this Agreement.  \n3. Payment Terms  \n• Total Contract Amount:  The total payment for the services outlined in this Agreement \nshall not exceed $250,000. \n• Payment Schedule:')]

In [11]:
# Prompt text for the question-answering chains. It exposes two placeholders,
# {context} and {question}, which are filled in when the prompt is formatted.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use three sentences maximum and keep the answer concise.

Context: {context}
Question: {question}
Answer:
"""

In [12]:
# Build the prompt object from the template string defined above.
prompt = PromptTemplate.from_template(template=PROMPT_TEMPLATE)

In [13]:
def format_docs(docs):
    """Join the page content of the retrieved documents into one context string.

    Args:
        docs: Iterable of retrieved documents exposing a `page_content` attribute.

    Returns:
        The page contents separated by blank lines ("" when `docs` is empty).
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Question-answering chain: retrieve -> fill prompt -> LLM -> plain string.
# The retrieved documents are flattened to text first; otherwise the prompt
# would receive the Python repr of the document list (metadata, escaped
# newlines) as its context.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [27]:
# Inspect the type of the composed chain object.
type(chain)

langchain_core.runnables.base.RunnableSequence

In [14]:
# Ask the question-answering chain for the contract's effective date.
chain.invoke(input="What is the effective date of the contract?")

"The contract's effective date is June 9, 2020. This is stated in Section 2 of the agreement, which outlines the term of the agreement. The agreement will remain in effect until June 9, 2023, unless terminated earlier. \n"

In [15]:
# Ask the question-answering chain how long the contract runs.
chain.invoke(input="How long is the contract effective for?")

'The contract is effective for three years. It commences on June 9, 2020, and remains in effect until June 9, 2023. The contract can be terminated earlier by either party according to Section 7 of the agreement. \n'

In [16]:
# Define structured output class of time
# NOTE: the class docstring and the field description below are forwarded to
# the model as part of the output schema (they appear in the schema printed by
# the structured-output cells), so treat them as prompt text, not as comments.
class EffectiveDate(BaseModel):
    """The effective date in the given context"""
    # Parsed into a `datetime` value.
    date: datetime = Field(..., description="The effective date in the format of YYYY-MM-DD")

In [17]:
# Create a Gemini Pro LLM
# NOTE(review): this rebinds the name `llm`. Any cell that reads `llm` and is
# re-run after this one will pick up the Pro model instead of the earlier one.
pro_llm_options = {
    "model": "gemini-1.5-pro",
    "temperature": 0,
    "max_tokens": None,
    "timeout": None,
    "max_retries": 2,
}
llm = ChatGoogleGenerativeAI(**pro_llm_options)

# Model wrapper that returns an EffectiveDate instance instead of free text.
structured_output_llm = llm.with_structured_output(EffectiveDate)

# Same retrieval + prompt front end as the text chain, with structured output.
structured_chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
structured_output_chain = structured_chain_inputs | prompt | structured_output_llm

In [18]:
# Extract the effective date as a typed object rather than prose.
structured_result: EffectiveDate = structured_output_chain.invoke(
    input="What is the effective date of the contract?"
)

{'name': 'EffectiveDate', 'description': 'The effective date in the given context', 'parameters': {'type_': 6, 'properties': {'date': {'type_': 1, 'description': 'The effective date in the format of YYYY-MM-DD', 'format_': '', 'nullable': False, 'enum': [], 'max_items': '0', 'min_items': '0', 'properties': {}, 'required': []}}, 'required': ['date'], 'format_': '', 'description': '', 'nullable': False, 'enum': [], 'max_items': '0', 'min_items': '0'}}


In [19]:
# Display the parsed structured result.
structured_result

EffectiveDate(date=datetime.datetime(2020, 6, 9, 0, 0))

In [20]:
# Render the parsed date as e.g. "Tue, 09 Jun 2020".
f"{structured_result.date:%a, %d %b %Y}"

'Tue, 09 Jun 2020'

In [21]:
# Reuse the same structured-output chain to extract the contract's end date.
structured_end_date: EffectiveDate = structured_output_chain.invoke(
    input="What is the end date of the contract?"
)

{'name': 'EffectiveDate', 'description': 'The effective date in the given context', 'parameters': {'type_': 6, 'properties': {'date': {'type_': 1, 'description': 'The effective date in the format of YYYY-MM-DD', 'format_': '', 'nullable': False, 'enum': [], 'max_items': '0', 'min_items': '0', 'properties': {}, 'required': []}}, 'required': ['date'], 'format_': '', 'description': '', 'nullable': False, 'enum': [], 'max_items': '0', 'min_items': '0'}}


In [22]:
# Display the parsed end-date result.
structured_end_date

EffectiveDate(date=datetime.datetime(2023, 6, 9, 0, 0))

In [23]:
# Render the parsed end date as e.g. "Fri, 09 Jun 2023".
f"{structured_end_date.date:%a, %d %b %Y}"

'Fri, 09 Jun 2023'