In [2]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Report only whether the key is configured. Printing the value itself would
# store the secret in the notebook's saved output, where it leaks on commit/share.
print(f"GOOGLE_API_KEY set: {bool(os.getenv('GOOGLE_API_KEY'))}")

# Embedding model shared by every later cell (embed_documents and the Chroma store).
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


  from .autonotebook import tqdm as notebook_tqdm


[REDACTED: an API key was printed here. Treat it as compromised and rotate it.]


In [6]:
# Three short, unrelated sentences used to smoke-test the embedding model.
sample_texts = [
    "The cat sat on the mat",
    "The dog played in the park",
    "Machine learning is fascinating",
]

# embed_documents returns one vector per input text, in order.
embedded = embeddings.embed_documents(sample_texts)

# Report how many texts were embedded and the length of the first vector.
print(
    f"Number of texts: {len(sample_texts)}",
    f"Embedding dimensions: {len(embedded[0])}",
    sep="\n",
)

Number of texts: 3
Embedding dimensions: 384


In [7]:
# Read the PDF into LangChain documents.
loader = PyPDFLoader("test.pdf")
documents = loader.load()

# Split into chunks of at most 1000 characters, with 200 characters of overlap
# so text cut at a boundary still appears whole in one of the chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(documents)

print(f"📄 Prepared {len(chunks)} chunks")

# Deterministic ids make this cell safe to re-run: ./chroma_db persists between
# runs, and without explicit ids each run would insert a fresh copy of every
# chunk, so searches would start returning duplicates.
# NOTE(review): copies already written by earlier runs under auto-generated ids
# are not removed by this; delete ./chroma_db once to start clean.
chunk_ids = [
    f"{chunk.metadata.get('source')}:{chunk.metadata.get('page')}:{index}"
    for index, chunk in enumerate(chunks)
]

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./chroma_db",
    collection_name="test_collection",
)

print("Vector store created")

📄 Prepared 13 chunks
vectore store created


In [8]:
query = "What is this document about?"

# Retrieve the three stored chunks most similar to the query.
results = vectorstore.similarity_search(query, k=3)

print(f"🔍 Found {len(results)} relevant chunks:\n")

# Show a short preview of each hit together with its metadata.
for i, doc in enumerate(results, 1):
    print(f"Result {i}:")
    print(doc.page_content[:200])  # preview only: first 200 characters
    print(f"Source: {doc.metadata}")
    print("-" * 50)

🔍 Found 3 relevant chunks:

Result 1:
The Cost of Connections
Understanding the cost of connections
husseinnasser
Source: {'creator': 'Google', 'creationdate': '', 'total_pages': 13, 'source': 'test.pdf', 'page_label': '10', 'producer': 'PyPDF', 'page': 9, 'title': "Nagle's algorithm"}
--------------------------------------------------
Result 2:
Delayed Acknowledgment algorithm
● Waste to acknowledge segments right away
● We can wait little more to receive more segment and ack once
husseinnasser
B
A Delay
12
34
5
ACK 5 ACK all 
at once
Source: {'creator': 'Google', 'source': 'test.pdf', 'page_label': '7', 'creationdate': '', 'page': 6, 'title': "Nagle's algorithm", 'total_pages': 13, 'producer': 'PyPDF'}
--------------------------------------------------
Result 3:
Nagle's algorithm
Delay in the client side
husseinnasser
Source: {'title': "Nagle's algorithm", 'page_label': '1', 'total_pages': 13, 'creator': 'Google', 'producer': 'PyPDF', 'source': 'test.pdf', 'page': 0, 'creation