# Text Splitters in LangChain

In [None]:
# !pip install -U langchain-text-splitters
!pip install langchain-community pypdf

# Import Libraries

In [None]:
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter, TokenTextSplitter, Language

In [None]:
from langchain_community.document_loaders import TextLoader, PyPDFLoader



# PDF Load

In [None]:
# Load the lecture slides with PyPDFLoader and report how many Document
# objects came back (the recorded run shows 35 for this 35-page file).
PDF_PATH = '/content/data/Lecture 1.pdf'  # Colab-style absolute path; adjust when running elsewhere

pdf_loader = PyPDFLoader(PDF_PATH)
pdf_doc = pdf_loader.load()
len(pdf_doc)

35

In [None]:
# Rebind pdf_doc to its first three entries so the splitter demos further down
# operate on a small sample. The full list is no longer reachable through this
# name afterwards; re-run the loading cell to get it back.
pdf_doc = pdf_doc[:3]
# Display the retained Document objects (metadata + page_content).
pdf_doc

[Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2025-10-16T09:16:51-07:00', 'moddate': '2025-10-16T09:16:51-07:00', 'source': '/content/data/Lecture 1.pdf', 'total_pages': 35, 'page': 0, 'page_label': '1'}, page_content='Chapter 1 Copyright ¬© 2017 Pearson Education, Ltd. 1-1\nChapter 1 \nThe Systems Development \nEnvironment\nModern Systems Analysis\nand Design\nEighth Edition, Global Edition \nJoseph S. Valacich\nJoey F. George'),
 Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2025-10-16T09:16:51-07:00', 'moddate': '2025-10-16T09:16:51-07:00', 'source': '/content/data/Lecture 1.pdf', 'total_pages': 35, 'page': 1, 'page_label': '2'}, page_content='Chapter 1 Copyright ¬© 2017 Pearson Education, Ltd. 1-2\nLearning Objectives\n‚úî Define information systems analysis and design.\n‚úî Describe the information systems development life cycle \n(SDLC).\n‚úî Explain computer-aided software engineering (CASE) \ntools.\n‚úî Describe Ag

In [None]:
text = """
Agriculture is the science and practice of growing crops, raising livestock, and cultivating soil to produce food, fiber, fuel, and other products, forming the foundation of human civilization and evolving from ancient subsistence farming to modern, high-tech systems.
It encompasses diverse activities like crop production (food and cash crops), animal husbandry (livestock, poultry, fisheries), forestry, and incorporates innovations for sustainability and efficiency, facing challenges like climate change, resource management, and feeding a growing global population.

In essence, agriculture is a vital, dynamic sector constantly adapting to feed the world, manage resources, and balance economic needs with environmental concerns, as highlighted by initiatives like the Vision for Agriculture and Food and projects empowering farmer organizations in Bangladesh.
"""

# 1. Length-Based

In [None]:
# Character-count splitting: an empty separator with chunk_size=20 and no
# overlap. In the recorded output every chunk is at most 20 characters long and
# words are cut wherever the limit falls.
char_splitter = CharacterTextSplitter(separator='', chunk_size=20, chunk_overlap=0)

# Single Text
# NOTE: despite its name, single_splitter holds the list of chunk strings.
single_splitter = char_splitter.split_text(text)
single_splitter

['Agriculture is the',
 'science and practice',
 'of growing crops, r',
 'aising livestock, an',
 'd cultivating soil t',
 'o produce food, fibe',
 'r, fuel, and other p',
 'roducts, forming the',
 'foundation of human',
 'civilization and ev',
 'olving from ancient',
 'subsistence farming',
 'to modern, high-tech',
 'systems.\nIt encompa',
 'sses diverse activit',
 'ies like crop produc',
 'tion (food and cash',
 'crops), animal husba',
 'ndry (livestock, pou',
 'ltry, fisheries), fo',
 'restry, and incorpor',
 'ates innovations for',
 'sustainability and',
 'efficiency, facing c',
 'hallenges like clima',
 'te change, resource',
 'management, and feed',
 'ing a growing global',
 'population. \n\nIn es',
 'sence, agriculture i',
 's a vital, dynamic s',
 'ector constantly ada',
 'pting to feed the wo',
 'rld, manage resource',
 's, and balance econo',
 'mic needs with envir',
 'onmental concerns, a',
 's highlighted by ini',
 'tiatives like the Vi',
 'sion for Agriculture',
 'and Food

In [None]:
# documents splitter
# Same character-count strategy, applied to Document objects with a 200-character
# limit. In the recorded output each resulting chunk carries the metadata of the
# page it was cut from.
char_doc_splitter = CharacterTextSplitter(separator='', chunk_size=200, chunk_overlap=0)

doc_split = char_doc_splitter.split_documents(pdf_doc)
doc_split

[Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2025-10-16T09:16:51-07:00', 'moddate': '2025-10-16T09:16:51-07:00', 'source': '/content/data/Lecture 1.pdf', 'total_pages': 35, 'page': 0, 'page_label': '1'}, page_content='Chapter 1 Copyright ¬© 2017 Pearson Education, Ltd. 1-1\nChapter 1 \nThe Systems Development \nEnvironment\nModern Systems Analysis\nand Design\nEighth Edition, Global Edition \nJoseph S. Valacich\nJoey F. Geo'),
 Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2025-10-16T09:16:51-07:00', 'moddate': '2025-10-16T09:16:51-07:00', 'source': '/content/data/Lecture 1.pdf', 'total_pages': 35, 'page': 0, 'page_label': '1'}, page_content='rge'),
 Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2025-10-16T09:16:51-07:00', 'moddate': '2025-10-16T09:16:51-07:00', 'source': '/content/data/Lecture 1.pdf', 'total_pages': 35, 'page': 1, 'page_label': '2'}, page_content='Chapter 1 Copyright ¬© 2017

In [None]:
# Number of chunks produced by char_doc_splitter (6 in the recorded run).
len(doc_split)

6

# ✅ 1. Using LangChain’s built-in TokenTextSplitter

In [None]:
# Token-based splitting: chunk_size and chunk_overlap are counted in tokens of
# the chosen encoding, not in characters.
splitter = TokenTextSplitter(
    chunk_size=20,       # max number of tokens per chunk
    chunk_overlap=5,     # tokens repeated at the start of the next chunk
    encoding_name="cl100k_base"  # tiktoken encoding of GPT-4 / GPT-3.5-turbo (GPT-4o uses "o200k_base")
)

chunks = splitter.split_text(text)

# Print index, length and content of every chunk. len(c) is the length in
# CHARACTERS, which is why it varies even though each chunk holds at most
# 20 tokens.
for i, c in enumerate(chunks):
    print(i, len(c), c)


0 101 
Agriculture is the science and practice of growing crops, raising livestock, and cultivating soil to
1 98 , and cultivating soil to produce food, fiber, fuel, and other products, forming the foundation of
2 121 , forming the foundation of human civilization and evolving from ancient subsistence farming to modern, high-tech systems
3 112  modern, high-tech systems.
It encompasses diverse activities like crop production (food and cash crops), animal
4 94  and cash crops), animal husbandry (livestock, poultry, fisheries), forestry, and incorporates
5 136 ), forestry, and incorporates innovations for sustainability and efficiency, facing challenges like climate change, resource management,
6 107  change, resource management, and feeding a growing global population. 

In essence, agriculture is a vital
7 109 , agriculture is a vital, dynamic sector constantly adapting to feed the world, manage resources, and balance
8 136  manage resources, and balance economic needs with environment

# 2. Text-Structure Based

In [None]:
# Structure-aware splitting of plain text with a 50-character limit and no
# overlap. In the recorded output every chunk ends on a word boundary instead of
# cutting through words.
recursive_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=0)

rec_doc = recursive_splitter.split_text(text)
rec_doc

['Agriculture is the science and practice of',
 'growing crops, raising livestock, and cultivating',
 'soil to produce food, fiber, fuel, and other',
 'products, forming the foundation of human',
 'civilization and evolving from ancient',
 'subsistence farming to modern, high-tech systems.',
 'It encompasses diverse activities like crop',
 'production (food and cash crops), animal',
 'husbandry (livestock, poultry, fisheries),',
 'forestry, and incorporates innovations for',
 'sustainability and efficiency, facing challenges',
 'like climate change, resource management, and',
 'feeding a growing global population.',
 'In essence, agriculture is a vital, dynamic',
 'sector constantly adapting to feed the world,',
 'manage resources, and balance economic needs with',
 'environmental concerns, as highlighted by',
 'initiatives like the Vision for Agriculture and',
 'Food and projects empowering farmer organizations',
 'in Bangladesh.']

# 3. Document-Structure Based

## Python Code

In [None]:
python_code = """
# A simple Python program
name = "Python" # A variable storing text
print(f"Hello, {name}!") # Prints "Hello, Python!"

# Using a for loop to count from 1 to 5
for i in range(1, 6):
    print(i)
"""

# Build a recursive splitter configured for Python source (60-character limit,
# no overlap) and apply it to the sample program. In the recorded output the
# `for` loop stays together with its indented body in one chunk.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=60, chunk_overlap=0
)

py_split = python_splitter.split_text(python_code)
py_split

['# A simple Python program',
 'name = "Python" # A variable storing text',
 'print(f"Hello, {name}!") # Prints "Hello, Python!"',
 '# Using a for loop to count from 1 to 5',
 'for i in range(1, 6):\n    print(i)']

In [None]:
# Number of chunks the Python-aware splitter produced (5 in the recorded run).
len(py_split)

5

## Markdown Code

In [None]:
md_code = """
# üèÜ What Technique Is Best for Now and Why?

For the vast majority of RAG applications today, the recommended starting point is:

## **Best Technique: `RecursiveCharacterTextSplitter`**

### Why It's the Default Winner:

1.  **Pragmatic Balance:** It offers the best trade-off between **context preservation** (by respecting paragraphs and sentences) and **chunk size control** (by recursively splitting when necessary).
2.  **General-Purpose:** It performs reasonably well across many types of documents (articles, reports, transcripts) that lack strict, predictable structural tagging (unlike Markdown or code).
3.  **Simplicity:** It requires minimal configuration beyond `chunk_size` and `chunk_overlap`.
"""

# Build a recursive splitter configured for Markdown (60-character limit, no
# overlap) and apply it to the sample document. In the recorded output each
# heading line ends up as its own chunk and long list items are wrapped at
# word boundaries.
md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, chunk_size=60, chunk_overlap=0
)

md_split = md_splitter.split_text(md_code)
md_split

['# üèÜ What Technique Is Best for Now and Why?',
 'For the vast majority of RAG applications today, the',
 'recommended starting point is:',
 '## **Best Technique: `RecursiveCharacterTextSplitter`**',
 "### Why It's the Default Winner:",
 '1.  **Pragmatic Balance:** It offers the best trade-off',
 'between **context preservation** (by respecting paragraphs',
 'and sentences) and **chunk size control** (by recursively',
 'splitting when necessary).',
 '2.  **General-Purpose:** It performs reasonably well across',
 'many types of documents (articles, reports, transcripts)',
 'that lack strict, predictable structural tagging (unlike',
 'Markdown or code).',
 '3.  **Simplicity:** It requires minimal configuration',
 'beyond `chunk_size` and `chunk_overlap`.']