In [1]:
# Locate a .env file with find_dotenv() and load its variables into the
# process environment. The call is the cell's last expression, so its
# return value is what the cell displays.
from dotenv import find_dotenv, load_dotenv

load_dotenv(dotenv_path=find_dotenv())

True

In [None]:
# Why chunk/split documents?

# Most LLMs perform best with short, focused "chunks" of text.
# Splitting documents ensures that the retrieval step pulls relevant, concise pieces rather than entire documents.

# Key reasons:

# - LLMs have limited context windows (e.g., 4k–128k tokens).
# - Retrieval is more accurate with smaller, focused pieces than with giant documents.
# - Chunking avoids feeding irrelevant or noisy context to the LLM.

# Goal:
# Convert loaded documents into manageable, relevant text "chunks" for embedding and retrieval.

# Load a text file, split it into overlapping chunks, and preview the first few.
# Module-level results: `docs` (loaded documents) and `chunks` (split documents).
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Tunable parameters, named so they are easy to find and adjust.
DOC_PATH = "demo_document.txt"
CHUNK_SIZE = 500      # upper bound on chunk length (characters with the default length function)
CHUNK_OVERLAP = 50    # amount of text shared between neighbouring chunks
PREVIEW_COUNT = 3     # how many chunks to print below

# 1. Load data
# Pass the encoding explicitly: without it TextLoader falls back to the
# platform's default encoding, so the same file can decode differently
# (or fail to decode) on another machine.
loader = TextLoader(DOC_PATH, encoding="utf-8")
docs = loader.load()

# 2. Chunk data
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_documents(docs)

# 3. Visualize
separator = "-" * 30
for i, chunk in enumerate(chunks[:PREVIEW_COUNT], start=1):
    print(f"Chunk {i}:\n{chunk.page_content}\n{separator}")


Chunk 1:
abcde
------------------------------
Chunk 2:
defgh
------------------------------
Chunk 3:
ghijk
------------------------------


In [7]:
# Show the full list of chunk Documents as this cell's output.
chunks

# NOTE(review): the two lines below appear to be sample text from the loaded
# file (the same words show up in the chunk output) — confirm against the file.
# This is an example of using text file in langchain
# langchain is a powerful framework

[Document(metadata={'source': 'demo.txt'}, page_content='abcde'),
 Document(metadata={'source': 'demo.txt'}, page_content='defgh'),
 Document(metadata={'source': 'demo.txt'}, page_content='ghijk'),
 Document(metadata={'source': 'demo.txt'}, page_content='jkl'),
 Document(metadata={'source': 'demo.txt'}, page_content='This'),
 Document(metadata={'source': 'demo.txt'}, page_content='is'),
 Document(metadata={'source': 'demo.txt'}, page_content='an'),
 Document(metadata={'source': 'demo.txt'}, page_content='exam'),
 Document(metadata={'source': 'demo.txt'}, page_content='ample'),
 Document(metadata={'source': 'demo.txt'}, page_content='of'),
 Document(metadata={'source': 'demo.txt'}, page_content='usin'),
 Document(metadata={'source': 'demo.txt'}, page_content='ing'),
 Document(metadata={'source': 'demo.txt'}, page_content='text'),
 Document(metadata={'source': 'demo.txt'}, page_content='file'),
 Document(metadata={'source': 'demo.txt'}, page_content='in'),
 Document(metadata={'source': '