In [39]:
# Install the notebook's dependencies into the active kernel's environment
# (-q: quiet output, -U: upgrade already-installed packages).
# NOTE(review): versions are unpinned, so a re-run may pull different releases — consider pinning.
%pip install -qU langchain langchain-openai python-dotenv rich langchain_postgres langchain_community beautifulsoup4 langchain_experimental "psycopg[binary]"

Note: you may need to restart the kernel to use updated packages.


In [51]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

True

In [57]:
import os
from urllib.parse import urlsplit, urlunsplit


def redact_database_url(url: str) -> str:
    """Return ``url`` with credentials and query parameters removed.

    Connection strings usually embed ``user:password@`` right after the
    scheme, so printing a fixed-length prefix can leak part of the secret
    into the saved notebook output. This keeps only scheme, host/port and
    path, replacing any userinfo with ``***``.

    Args:
        url: A database connection URL.

    Returns:
        A display-safe version of the URL. If the URL cannot be parsed, or
        contains an ``@`` that did not parse as userinfo, everything after
        the scheme is masked.
    """
    try:
        parts = urlsplit(url)
    except ValueError:
        return "<unparseable URL>"

    # rpartition on the LAST "@" so an unescaped "@" inside a password stays hidden.
    _userinfo, at_sign, host_part = parts.netloc.rpartition("@")
    if at_sign:
        netloc = f"***@{host_part}"
    elif "@" in url:
        # Credentials did not land in netloc (e.g. unescaped "/" in the
        # password); mask everything rather than risk leaking a fragment.
        return f"{parts.scheme}://***"
    else:
        netloc = parts.netloc

    # Query strings can carry secrets too (e.g. ?password=...), so drop them.
    return urlunsplit((parts.scheme, netloc, parts.path, "", ""))


# Confirm the connection string is configured without echoing credentials.
database_url = os.getenv("DATABASE_URL")
if not database_url:
    print("⚠️  DATABASE_URL not found in environment variables.")
else:
    print(f"✅ DATABASE_URL found: {redact_database_url(database_url)}")


In [85]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Embedding model; later cells pass it to the semantic chunker and the PGVector store.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Chat model; a later cell pipes the HyDE prompt into it to generate the hypothetical document.
chat_model = ChatOpenAI(model="o4-mini")

In [54]:
import bs4
from rich import print
from langchain_community.document_loaders import WebBaseLoader
from langchain_experimental.text_splitter import SemanticChunker

# Fetch the LangChain introduction page, parsing only its <article> element.
article_only = bs4.SoupStrainer(name="article")
loader = WebBaseLoader(
    web_paths=("https://python.langchain.com/docs/introduction/",),
    bs_kwargs={"parse_only": article_only},
)
document = loader.load()

# Preview the first 500 characters of the first loaded document.
first_page_text = document[0].page_content
print(first_page_text[:500])

In [55]:
# Semantic chunking: split the loaded page with SemanticChunker, reusing the
# `embeddings` model and the "interquartile" breakpoint threshold type.
text_splitter = SemanticChunker(embeddings=embeddings, breakpoint_threshold_type="interquartile")
chunks = text_splitter.split_documents(document)
# Show the first chunk as a sanity check (this indexing fails if no chunks were produced).
print(chunks[0])

In [56]:
import os
import uuid

from langchain_postgres import PGVector

# Fail early with a clear message instead of handing None to PGVector.
connection_url = os.getenv("DATABASE_URL")
if not connection_url:
    raise ValueError("DATABASE_URL is not set; cannot connect to the PGVector store.")

# Vector store backed by Postgres; create_extension=True asks PGVector to
# create the pgvector extension if needed.
vector_store = PGVector(
    embeddings=embeddings,
    collection_name="my_docs",
    connection=connection_url,
    create_extension=True,
)

# Deterministic ids make this cell safe to re-run: the same chunk position of
# the same source maps to the same id, so add_documents overwrites the row
# instead of inserting a duplicate. The index keeps ids unique within one
# batch even if two chunks share identical text.
# NOTE(review): if a re-run yields fewer chunks, rows from the earlier, longer
# run remain in the collection.
chunk_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"{chunk.metadata.get('source', '')}#chunk-{index}"))
    for index, chunk in enumerate(chunks)
]

vector_store.add_documents(documents=chunks, ids=chunk_ids)
print(f"Added {len(chunks)} documents to the vector store")

In [89]:
from langchain_core.prompts import ChatPromptTemplate

# HyDE prompt: asks the model to write a hypothetical document that answers
# `question`, limited to `chunk_size` words.
# NOTE(review): "has be within" in the template looks like a typo for
# "has to be within" — confirm before changing the prompt text.
hyde_prompt = ChatPromptTemplate.from_template(
    """
Given the question '{question}', generate a hypothetical document that directly answers this question. The document should be detailed and in-depth. The document size has be within {chunk_size} words.
"""
)
# Render the prompt with sample values to preview the final text.
hyde_prompt.format(question="what is langchain?", chunk_size=500)

"Human: \nGiven the question 'what is langchain?', generate a hypothetical document that directly answers this question. The document should be detailed and in-depth. The document size has be within 500 words.\n"

In [91]:
from rich import print
# Pipe the HyDE prompt into the chat model to form the generation chain.
llm_chain = hyde_prompt | chat_model

# Ask for a hypothetical answer document of roughly 100 words.
question = "what is Langgraph?"
hyde_inputs = {"question": question, "chunk_size": 100}
hyde_response = llm_chain.invoke(hyde_inputs)

# Keep only the text content of the model response for retrieval.
generated_docs = hyde_response.content
print(generated_docs)

In [92]:
# HyDE retrieval: search with the question plus the generated hypothetical
# document. A blank line separates the two so the last word of the question
# is not fused with the first word of the generated text before embedding.
hyde_query = f"{question}\n\n{generated_docs}"

# Fetch the two closest chunks together with their scores.
retrieved_docs = vector_store.similarity_search_with_score(hyde_query, k=2)
print(retrieved_docs)