In [1]:
from langchain.retrievers import EnsembleRetriever,BM25Retriever
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is what supplies OPENAI_API_KEY to the OpenAI
# embedding/chat clients used further down — confirm against your .env.
load_dotenv()

True

In [3]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [4]:
# Toy corpus shared by every retriever in this notebook. Order is significant:
# a sentence's position in this list is used as its "chunk_<i>" source tag
# when the Document objects are built.
chunks = [
    # Positions 0-9: companies, products and deals. Some sentences repeat a
    # keyword ("Tesla Tesla Tesla ..."), others describe the same event
    # without naming it ("The tech giant acquired the code repository ...").
    "Microsoft acquired GitHub for 7.5 billion dollars in 2018.",
    "Tesla Cybertruck production ramp begins in 2024.",
    "Google is a large technology company with global operations.",
    "Tesla reported strong quarterly results. Tesla continues to lead in electric vehicles.",
    "SpaceX develops Starship rockets for Mars missions.",
    "The tech giant acquired the code repository platform for software development.",
    "NVIDIA designs Starship architecture for their new GPUs.",
    "Tesla Tesla Tesla financial quarterly results improved significantly.",
    "Cybertruck reservations exceeded company expectations.",
    "Microsoft is a large technology company with global operations",
] + [
    # Positions 10-17: pairs of sentences that share a word used in two
    # different senses (Apple/apple, Python/python, Java/Java, Orange/Orange).
    "Apple announced new iPhone features for developers.",
    "The apple orchard harvest was excellent this year.",
    "Python programming language is widely used in AI.",
    "The python snake can grow up to 20 feet long.",
    "Java coffee beans are imported from Indonesia.",
    "Java programming requires understanding of object-oriented concepts.",
    "Orange juice sales increased during winter months.",
    "Orange County reported new housing developments.",
]

In [9]:
# Wrap every raw sentence in a Document and tag it with a "chunk_<position>"
# source so retrieved hits can be traced back to their place in `chunks`.
documents = []
for position, text in enumerate(chunks):
    documents.append(
        Document(page_content=text, metadata={"source": f"chunk_{position}"})
    )

# Blank line followed by a 60-character rule, printed before and after the list.
separator = '\n' + '=' * 60

print("Simple Data: ")
print(separator)
for position, entry in enumerate(documents):
    print(f"{position}. {entry.page_content}")
print(separator)

Simple Data: 

0. Microsoft acquired GitHub for 7.5 billion dollars in 2018.
1. Tesla Cybertruck production ramp begins in 2024.
2. Google is a large technology company with global operations.
3. Tesla reported strong quarterly results. Tesla continues to lead in electric vehicles.
4. SpaceX develops Starship rockets for Mars missions.
5. The tech giant acquired the code repository platform for software development.
6. NVIDIA designs Starship architecture for their new GPUs.
7. Tesla Tesla Tesla financial quarterly results improved significantly.
8. Cybertruck reservations exceeded company expectations.
9. Microsoft is a large technology company with global operations
10. Apple announced new iPhone features for developers.
11. The apple orchard harvest was excellent this year.
12. Python programming language is widely used in AI.
13. The python snake can grow up to 20 feet long.
14. Java coffee beans are imported from Indonesia.
15. Java programming requires understanding of object-ori

In [13]:
print("Setting Up vector Retriever...")
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Give every document a stable id taken from its "source" tag. Without explicit
# ids each call generates fresh ones, so re-running this cell in the same
# kernel can add the whole corpus to the collection again and make the
# retriever return the same sentence more than once. With stable ids a re-run
# writes to the same records instead.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    ids=[doc.metadata["source"] for doc in documents],
    # Ask the HNSW index to compare embeddings by cosine distance.
    collection_metadata={"hnsw:space": "cosine"},
)

Setting Up vector Retriever...


### 1. Vector Retriever

In [22]:
# Retriever view over the Chroma store that returns k=2 results per query.
vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

# Query that shares no company name with the corpus.
test_query = "space exploration company"
print(f"Testing: '{test_query}'")

test_docs = vector_retriever.invoke(test_query)
for rank, hit in enumerate(test_docs):
    preview = hit.page_content[:250]  # show at most the first 250 characters
    print(f"Found {rank}. {preview}...")

Testing: 'space exploration company'
Found 0. SpaceX develops Starship rockets for Mars missions....
Found 1. Microsoft is a large technology company with global operations...


### 2. BM25 Retriever

In [23]:
print('Setting BM25 Retriever...')

# BM25 retriever built from the same Document list; configured to return
# three results per query.
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = 3

# Single-keyword query; several corpus sentences contain this exact word.
test_query = "Tesla"
print(f"Testing: '{test_query}'")

test_docs = bm25_retriever.invoke(test_query)
for rank, hit in enumerate(test_docs):
    preview = hit.page_content[:250]  # show at most the first 250 characters
    print(f"Found {rank}. {preview}...")


Setting BM25 Retriever...
Testing: 'Tesla'
Found 0. Tesla Tesla Tesla financial quarterly results improved significantly....
Found 1. Tesla reported strong quarterly results. Tesla continues to lead in electric vehicles....
Found 2. Tesla Cybertruck production ramp begins in 2024....


### 3. Hybrid Retriever

In [30]:
print("Setting up Hydribd Retriever")

# Build a dedicated BM25 retriever with k=2 for the ensemble instead of
# changing `bm25_retriever.k` in place: mutating an object created in an
# earlier cell makes that cell's displayed configuration stale and makes the
# notebook's behavior depend on which cells were run last.
bm25_for_hybrid = BM25Retriever.from_documents(documents, k=2)

# Combine both retrievers; the vector retriever's results are weighted 0.7 and
# the BM25 results 0.3.
hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_for_hybrid],
    weights=[0.7, 0.3],
)

# NOTE(review): 'Purching' looks like a typo for 'Purchasing'. It is kept
# unchanged because it is the query the recorded output below was produced with.
test_query = 'Purching cost 7.5 billion'
print(f"Testing: '{test_query}'")
test_docs = hybrid_retriever.invoke(test_query)

for rank, hit in enumerate(test_docs):
    preview = hit.page_content[:250]  # show at most the first 250 characters
    print(f"Found {rank}. {preview}...")

Setting up Hydribd Retriever
Testing: 'Purching cost 7.5 billion'
Found 0. Microsoft acquired GitHub for 7.5 billion dollars in 2018....
Found 1. The tech giant acquired the code repository platform for software development....
Found 2. Orange juice sales increased during winter months....


In [31]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage,SystemMessage




# `test_query` and `test_docs` are whatever the most recently executed
# retriever cell left behind; run the hybrid-retriever cell immediately before
# this one so the answer is generated from the hybrid results.

# Render the retrieved chunks as a bulleted list. This is built outside the
# f-string so a plain "\n" can be used instead of a chr(10) workaround.
context = "\n".join(f"- {doc.page_content}" for doc in test_docs)

# The closing instruction tells the model to stay within the supplied documents
# and gives it one unambiguous fallback sentence for unanswerable questions.
combined_input = f"""Based on the following documents, please answer the question: {test_query}

Documents:
{context}

Please provide a clear, helpful answer using only the information from these documents. If you cannot find the answer in the documents, say "I don't have enough information to answer this question."
"""

# temperature=0 keeps the generated answer as repeatable as the API allows, so
# re-running the notebook gives comparable output.
model = ChatOpenAI(model='gpt-4o-mini', temperature=0)

messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content=combined_input),
]

result = model.invoke(messages)

print("\n--- Generated Response ---")
print("Content only:\n")
print(result.content)


--- Generated Response ---
Content only:

Microsoft acquired GitHub for 7.5 billion dollars in 2018.
