In [41]:
# Import modules and setup
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.indexing import index, InMemoryRecordManager
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

from firebase_admin import credentials
from pprint import pp
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import firebase_admin
import os
import time


# Load variables from a local .env file (e.g. PINECONE_API_KEY, read later via os.getenv).
load_dotenv()

# Point both the Google client libraries and this notebook at the
# repository-local service-account file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../firebase-service-account.json"
os.environ["FIREBASE_SERVICE_ACCOUNT_FILE"] = "../firebase-service-account.json"
# Bare bucket name only: the Firebase Admin SDK documents that the
# `storageBucket` option must not carry a "gs://" (or any protocol) prefix.
os.environ["FIREBASE_STORAGE_BUCKET_NAME"] = "sample-firebase-ai-app-7d7d8.appspot.com"

In [2]:
# Initialize Firebase connection
FIREBASE_APP_NAME = "sample-firebase-ai-app-7d7d8"

cred = credentials.Certificate(os.getenv("FIREBASE_SERVICE_ACCOUNT_FILE"))

try:
    # Make the cell safe to re-run: initialize_app raises ValueError when an
    # app with this name is already registered in the running kernel, so
    # reuse the existing app instead of initializing it a second time.
    firebase_app = firebase_admin.get_app(FIREBASE_APP_NAME)
except ValueError:
    firebase_app = firebase_admin.initialize_app(
        cred,
        {"storageBucket": os.getenv("FIREBASE_STORAGE_BUCKET_NAME")},
        FIREBASE_APP_NAME,
    )

# Last expression: display the App object, as the original cell did.
firebase_app

<firebase_admin.App at 0x1f1f3a9e2c0>

In [3]:
# Chat model (Gemini 1.5 Flash) that the plain-text question-answering chain calls.
# All settings are collected in one mapping so they are easy to scan and tweak.
llm_settings = dict(
    model="gemini-1.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)
llm = ChatGoogleGenerativeAI(**llm_settings)

In [4]:
# Embedding model used to vectorize both the document chunks and the queries.
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = dict(device="cpu")  # run the encoder on CPU
encode_kwargs = dict(normalize_embeddings=True)

hf_embedding = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [19]:
# Create a Pinecone connection
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Index settings
pinecone_index_name = "rag-index"
EMBEDDING_DIMENSION = 1024  # must match the vector size produced by the embedding model
INDEX_READY_TIMEOUT_S = 120  # upper bound on how long to wait for a new index

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create the index only when it is missing, so re-running the cell is harmless.
if pinecone_index_name not in existing_indexes:
    pc.create_index(
        name=pinecone_index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll until the index reports ready, but give up after a deadline rather
    # than blocking the kernel forever if provisioning never completes.
    ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
    while not pc.describe_index(pinecone_index_name).status["ready"]:
        if time.monotonic() >= ready_deadline:
            raise TimeoutError(
                f"Pinecone index '{pinecone_index_name}' was not ready "
                f"after {INDEX_READY_TIMEOUT_S} seconds"
            )
        time.sleep(1)

pinecone_index = pc.Index(pinecone_index_name)

In [14]:
# Vector store backed by the Pinecone index, using the embedding model defined earlier.
vector_store = PineconeVectorStore(
    index=pinecone_index,
    embedding=hf_embedding,
)

In [15]:
# Read the contract PDF into LangChain documents.
loader = PyPDFLoader("./documents/webapp_contract.pdf")
documents = loader.load()

In [16]:
# Split the loaded documents into overlapping chunks before embedding.
splitter_settings = {"chunk_size": 800, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

splits = text_splitter.split_documents(documents)

In [20]:
# Write the chunks to the vector store through LangChain's indexing API rather
# than calling vector_store.add_documents(splits) directly.

# The record manager keeps the indexing bookkeeping in memory, under a
# namespace derived from the Pinecone index name.
record_namespace = f"pinecone/{pinecone_index_name}"
record_manager = InMemoryRecordManager(namespace=record_namespace)
record_manager.create_schema()

# Run the indexing; the returned summary (added / updated / skipped / deleted
# counts) is the cell output.
index(
    splits,
    record_manager,
    vector_store,
    cleanup="full",
    source_id_key="source",
)

{'num_added': 8, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [23]:
# Sanity check: query the vector store directly and show the matching chunks.
retrieved_docs = vector_store.similarity_search(query="effective date")
retrieved_docs

[Document(id='1b8c395b-310d-53fb-a2e4-0a8424df4f00', metadata={'page': 0.0, 'source': './documents/webapp_contract.pdf'}, page_content='maintenance, and training sessions for Company B’s team on how to use the developed \nsolutions.  \nCompany B agrees to collaborate and provide all necessary data and access required for \nCompany A to complete the above services.  \n2. Term of Agreement  \nThis Agreement shall commence on June  9, 2020 ("Effective Date") and shall remain in effect \nuntil June 9 , 2023, unless terminated earlier by either Party in accordance with Section 7 \n(Termination) of this Agreement.  \n3. Payment Terms  \n• Total Contract Amount:  The total payment for the services outlined in this Agreement \nshall not exceed $250,000. \n• Payment Schedule:'),
 Document(id='89b55050-bf8a-59da-b67d-eb97620e0bc7', metadata={'page': 0.0, 'source': './documents/webapp_contract.pdf'}, page_content='AGREEMENT CONTRACT  \n \nThis Agreement ("Agreement") is entered into as of June  9

In [28]:
# Print every retrieved chunk with its position and a separator line.
# A plain loop is used instead of a list comprehension: the comprehension was
# only run for its side effects and left a useless `[None, ...]` cell output.
for i, doc in enumerate(retrieved_docs):
    print(f"\nDocument: {i}\n\n{doc.page_content}\n\n" + "-" * 100)


Document: 0

maintenance, and training sessions for Company B’s team on how to use the developed 
solutions.  
Company B agrees to collaborate and provide all necessary data and access required for 
Company A to complete the above services.  
2. Term of Agreement  
This Agreement shall commence on June  9, 2020 ("Effective Date") and shall remain in effect 
until June 9 , 2023, unless terminated earlier by either Party in accordance with Section 7 
(Termination) of this Agreement.  
3. Payment Terms  
• Total Contract Amount:  The total payment for the services outlined in this Agreement 
shall not exceed $250,000. 
• Payment Schedule:

----------------------------------------------------------------------------------------------------

Document: 1

AGREEMENT CONTRACT  
 
This Agreement ("Agreement") is entered into as of June  9, 2020 ("Effective Date"), by and 
between:  
1. Tech Innovators Inc. , a corporation organized and existing under the laws of California, 
USA, with its prin

[None, None, None, None]

In [37]:
# Create a retriever
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 5, "score_threshold": 0.5},
)

In [38]:
# Run the same kind of check through the retriever, with a full question as input.
retrieved_docs = retriever.invoke(input="What is the effective date of the contract?")
retrieved_docs

[Document(id='1b8c395b-310d-53fb-a2e4-0a8424df4f00', metadata={'page': 0.0, 'source': './documents/webapp_contract.pdf'}, page_content='maintenance, and training sessions for Company B’s team on how to use the developed \nsolutions.  \nCompany B agrees to collaborate and provide all necessary data and access required for \nCompany A to complete the above services.  \n2. Term of Agreement  \nThis Agreement shall commence on June  9, 2020 ("Effective Date") and shall remain in effect \nuntil June 9 , 2023, unless terminated earlier by either Party in accordance with Section 7 \n(Termination) of this Agreement.  \n3. Payment Terms  \n• Total Contract Amount:  The total payment for the services outlined in this Agreement \nshall not exceed $250,000. \n• Payment Schedule:'),
 Document(id='89b55050-bf8a-59da-b67d-eb97620e0bc7', metadata={'page': 0.0, 'source': './documents/webapp_contract.pdf'}, page_content='AGREEMENT CONTRACT  \n \nThis Agreement ("Agreement") is entered into as of June  9

In [43]:
# Print every chunk returned by the retriever with its position and a separator line.
# A plain loop is used instead of a list comprehension: the comprehension was
# only run for its side effects and left a useless `[None, ...]` cell output.
for i, doc in enumerate(retrieved_docs):
    print(f"\nDocument: {i}\n\n{doc.page_content}\n\n" + "-" * 100)


Document: 0

maintenance, and training sessions for Company B’s team on how to use the developed 
solutions.  
Company B agrees to collaborate and provide all necessary data and access required for 
Company A to complete the above services.  
2. Term of Agreement  
This Agreement shall commence on June  9, 2020 ("Effective Date") and shall remain in effect 
until June 9 , 2023, unless terminated earlier by either Party in accordance with Section 7 
(Termination) of this Agreement.  
3. Payment Terms  
• Total Contract Amount:  The total payment for the services outlined in this Agreement 
shall not exceed $250,000. 
• Payment Schedule:

----------------------------------------------------------------------------------------------------

Document: 1

AGREEMENT CONTRACT  
 
This Agreement ("Agreement") is entered into as of June  9, 2020 ("Effective Date"), by and 
between:  
1. Tech Innovators Inc. , a corporation organized and existing under the laws of California, 
USA, with its prin

[None, None, None, None, None]

In [44]:
# Prompt for the question-answering chains. It has two placeholders,
# {context} and {question}. The empty first and last entries reproduce the
# leading and trailing newline of the template text.
PROMPT_TEMPLATE = "\n".join(
    [
        "",
        "You are an assistant for question-answering tasks.",
        "Use the following pieces of retrieved context to answer the question.",
        "If you don't know the answer, just say that you don't know.",
        "Use three sentences maximum and keep the answer concise.",
        "",
        "Context: {context}",
        "Question: {question}",
        "Answer:",
        "",
    ]
)

In [46]:
# Turn the raw template text into a LangChain prompt object.
prompt = PromptTemplate.from_template(template=PROMPT_TEMPLATE)

In [47]:
def format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Args:
        docs: Iterable of retrieved documents; each must expose `page_content`.

    Returns:
        The documents' `page_content` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Question-answering chain: retrieve chunks, fill the prompt, call the LLM and
# return the reply as a plain string.
# The retriever output goes through format_docs so that the {context} slot
# receives only the chunk text; piping the retriever in directly would hand the
# prompt a list of Document objects (text plus ids and metadata).
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [48]:
# Inspect the kind of object the `|` composition produced.
type(chain)

langchain_core.runnables.base.RunnableSequence

In [49]:
# Ask the chain for the contract's effective date; the answer string is the cell output.
chain.invoke(input="What is the effective date of the contract?")

'The effective date of the contract is June 9, 2020. This information is stated in the first paragraph of the agreement. \n'

In [50]:
# Ask the chain about the contract duration; the answer string is the cell output.
chain.invoke(input="How long is the contract effective for?")

'The contract is effective for three years, starting on June 9, 2020, and ending on June 9, 2023. It can be terminated earlier by either party according to Section 7 of the agreement. \n'

In [51]:
# Define structured output class of time
from datetime import datetime


class EffectiveDate(BaseModel):
    """The effective date in the given context"""
    # Schema handed to `llm.with_structured_output(...)` so the model's answer
    # comes back as a typed object instead of free text.
    # NOTE(review): the docstring above and the field description below show
    # up verbatim in the schema dict printed when the structured chain runs, so
    # they appear to be model-facing text — rewording them may change results.
    # Single required field, typed as a datetime.
    date: datetime = Field(..., description="The effective date in the format of YYYY-MM-DD")

In [58]:
# Create a Gemini Pro LLM
# NOTE(review): this rebinds the notebook-level name `llm`, which was
# previously bound to a different model; cells re-run after this one will pick
# up the Pro model.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Same model, but constrained to answer with an EffectiveDate object.
structured_output_llm = llm.with_structured_output(EffectiveDate)


def docs_to_context(docs):
    """Return the `page_content` of the retrieved documents joined by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)


# Retrieval chain whose result is an EffectiveDate instance rather than text.
# The retriever output is flattened to plain text first, so the {context} slot
# receives only the chunk text instead of a list of Document objects.
structured_output_chain = (
    {"context": retriever | docs_to_context, "question": RunnablePassthrough()}
    | prompt
    | structured_output_llm
)

In [59]:
# Ask for the effective date and keep the typed result for the following cells.
structured_result: EffectiveDate = structured_output_chain.invoke(
    input="What is the effective date of the contract?"
)

{'name': 'EffectiveDate', 'description': 'The effective date in the given context', 'parameters': {'type_': 6, 'properties': {'date': {'type_': 1, 'description': 'The effective date in the format of YYYY-MM-DD', 'format_': '', 'nullable': False, 'enum': [], 'max_items': '0', 'min_items': '0', 'properties': {}, 'required': []}}, 'required': ['date'], 'format_': '', 'description': '', 'nullable': False, 'enum': [], 'max_items': '0', 'min_items': '0'}}


In [60]:
# Display the object returned by the structured-output chain.
structured_result

EffectiveDate(date=datetime.datetime(2020, 6, 9, 0, 0))

In [61]:
# Render the parsed date as "weekday, day month year".
f"{structured_result.date:%a, %d %b %Y}"

'Tue, 09 Jun 2020'

In [63]:
# Reuse the same structured chain to extract the contract's end date.
structured_end_date: EffectiveDate = structured_output_chain.invoke(
    input="What is the end date of the contract?"
)

{'name': 'EffectiveDate', 'description': 'The effective date in the given context', 'parameters': {'type_': 6, 'properties': {'date': {'type_': 1, 'description': 'The effective date in the format of YYYY-MM-DD', 'format_': '', 'nullable': False, 'enum': [], 'max_items': '0', 'min_items': '0', 'properties': {}, 'required': []}}, 'required': ['date'], 'format_': '', 'description': '', 'nullable': False, 'enum': [], 'max_items': '0', 'min_items': '0'}}


In [65]:
# Display the object returned for the end-date question.
structured_end_date

EffectiveDate(date=datetime.datetime(2023, 6, 9, 0, 0))

In [66]:
# Render the parsed end date as "weekday, day month year".
f"{structured_end_date.date:%a, %d %b %Y}"

'Fri, 09 Jun 2023'