In [40]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import statistics

#  Define a long simulated text
sample_text = ("My name is Apekshya. I study in islington college. " * 50) * 10

#Set chunk size and overlap combinations\
chunk_configs = [
    (300, 50),
    (500, 100),
    (1000, 200),
    (2000, 400)
]


In [41]:
print(" Experimenting with sample text:")
for chunk_size, overlap in chunk_configs:
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    chunks = splitter.split_text(sample_text)
    lengths = [len(chunk) for chunk in chunks]

    print(f"\n--- Chunk Size: {chunk_size}, Overlap: {overlap} ---")
    print(f" Total Chunks: {len(chunks)}")
    print(f" Avg Length: {statistics.mean(lengths):.2f} characters")
    print(f" First Chunk Preview:\n{chunks[0][:300]}...\n")


 Experimenting with sample text:

--- Chunk Size: 300, Overlap: 50 ---
 Total Chunks: 100
 Avg Length: 295.58 characters
 First Chunk Preview:
My name is Apekshya. I study in islington college. My name is Apekshya. I study in islington college. My name is Apekshya. I study in islington college. My name is Apekshya. I study in islington college. My name is Apekshya. I study in islington college. My name is Apekshya. I study in islington...


--- Chunk Size: 500, Overlap: 100 ---
 Total Chunks: 63
 Avg Length: 494.32 characters
 First Chunk Preview:
My name is Apekshya. I study in islington college. My name is Apekshya. I study in islington college. My name is Apekshya. I study in islington college. My name is Apekshya. I study in islington college. My name is Apekshya. I study in islington college. My name is Apekshya. I study in islington col...


--- Chunk Size: 1000, Overlap: 200 ---
 Total Chunks: 32
 Avg Length: 984.00 characters
 First Chunk Preview:
My name is Apekshya. I study i

In [42]:
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter,
    NLTKTextSplitter
)
import statistics

# Sample text
sample_text = ("Hi how are you. I hope you are fine. i am good as well. i have started living in Kathmandu. " * 50) * 10


chunk_size = 500
chunk_overlap = 100


In [45]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [46]:
# Initialize different splitters
splitters = {
    "RecursiveCharacterTextSplitter": RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    ),
    "CharacterTextSplitter": CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    ),
    "TokenTextSplitter": TokenTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    ),
    "NLTKTextSplitter": NLTKTextSplitter()
}

In [47]:
print("Comparing Different Text Splitters:\n")

for name, splitter in splitters.items():
    try:
        chunks = splitter.split_text(sample_text)
        lengths = [len(chunk) for chunk in chunks]

        print(f"\n=== {name} ===")
        print(f"Total Chunks: {len(chunks)}")
        print(f" Avg Length: {statistics.mean(lengths):.2f} characters")
        print(f" First Chunk Preview:\n{chunks[0][:300]}...\n")
    except Exception as e:
        print(f"\n {name} failed with error: {e}")

Comparing Different Text Splitters:


=== RecursiveCharacterTextSplitter ===
Total Chunks: 116
 Avg Length: 493.71 characters
 First Chunk Preview:
Hi how are you. I hope you are fine. i am good as well. i have started living in Kathmandu. Hi how are you. I hope you are fine. i am good as well. i have started living in Kathmandu. Hi how are you. I hope you are fine. i am good as well. i have started living in Kathmandu. Hi how are you. I hope y...


=== CharacterTextSplitter ===
Total Chunks: 1
 Avg Length: 45999.00 characters
 First Chunk Preview:
Hi how are you. I hope you are fine. i am good as well. i have started living in Kathmandu. Hi how are you. I hope you are fine. i am good as well. i have started living in Kathmandu. Hi how are you. I hope you are fine. i am good as well. i have started living in Kathmandu. Hi how are you. I hope y...


=== TokenTextSplitter ===
Total Chunks: 33
 Avg Length: 1736.88 characters
 First Chunk Preview:
Hi how are you. I hope you are fine. i am 

In [52]:
!pip install unstructured

from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    CSVLoader,
    UnstructuredFileLoader,

)

Collecting unstructured
  Downloading unstructured-0.17.2-py3-none-any.whl.metadata (24 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2025.2.18-py3-none-any.whl.metadata (14 kB)
Collecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rapidfuzz (from unstructured)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting backoff (from unstructured)
  Downloadi

In [53]:
loaders = {
    "TextLoader": ("example.txt", TextLoader),
    "PyPDFLoader": ("drylab.pdf", PyPDFLoader),
    "CSVLoader": ("text.csv", CSVLoader),
    "UnstructuredFileLoader": ("about.html", UnstructuredFileLoader),  # works for .docx, .pdf, etc. too
}

In [54]:
for name, (path, LoaderClass) in loaders.items():
    try:
        loader = LoaderClass(path)
        docs = loader.load()
        print(f"\n {name} loaded {len(docs)} document(s) from {path}")
        print(f"First 300 chars:\n{docs[0].page_content[:300]}...\n")
    except Exception as e:
        print(f"\n {name} failed to load {path}: {e}")


 TextLoader failed to load example.txt: Error loading example.txt

 PyPDFLoader loaded 3 document(s) from drylab.pdf
First 300 chars:
DrylabNewsfor investors & friends · Ma y 2017
Welcome to our first newsletter of 2017! It's
been a while since the last one, and a lot has
happened. We promise to keep them coming
every two months hereafter, and permit
ourselves to make this one rather long. The
big news is the beginnings of our lau...


 CSVLoader loaded 416809 document(s) from text.csv
First 300 chars:
tweet_id: 0
content: i just feel really helpless and heavy hearted
sentiment: 4...


 UnstructuredFileLoader loaded 1 document(s) from about.html
First 300 chars:
Meet The Team



Rushav



Sulav



Hari...

