In [1]:
"""
This script demonstrates three different text splitting techniques using LangChain:
1. RecursiveCharacterTextSplitter
2. TokenTextSplitter
3. MarkdownHeaderTextSplitter

Each section prints the number of resulting chunks and provides examples of how the
text is split for clarity.
"""

import textwrap

from langchain_text_splitters import (MarkdownHeaderTextSplitter,
                                      RecursiveCharacterTextSplitter,
                                      TokenTextSplitter)

In [2]:
# Plain-prose sample passed to the recursive-character and token splitter demos:
# three paragraphs separated by blank lines, the middle one holding a numbered list.
# The literal starts and ends with a newline because of the triple-quote layout.
sample_text = """
Natural language processing (NLP) is a field of artificial intelligence that focuses
on the interaction between computers and human language. It enables computers to
understand, interpret, and generate human language in a valuable way.

The field of NLP involves many different techniques and approaches:
1. Tokenization
2. Part-of-speech tagging
3. Named entity recognition
4. Sentiment analysis

Machine learning, especially deep learning, has revolutionized NLP in recent years.
Transformers and large language models have achieved remarkable results in various
NLP tasks.
"""

# Markdown sample passed to the header splitter demo: one header at each of the
# levels "#", "##" and "###", each followed by a single sentence of body text.
markdown_text = """
# Introduction to NLP
Natural language processing is fascinating.

## Basic Concepts
Tokenization is the first step.

### Advanced Topics
Deep learning has changed everything.
"""

In [3]:
def demonstrate_recursive_text_splitter(text: str, chunk_size: int = 120, chunk_overlap: int = 50) -> None:
    """
    Split ``text`` with RecursiveCharacterTextSplitter and print a short report.

    The report is a section banner, the total number of chunks produced, and a
    preview of at most the first two chunks.

    :param text: The text to be split.
    :param chunk_size: Forwarded to the splitter as its ``chunk_size`` limit.
    :param chunk_overlap: Forwarded to the splitter as its ``chunk_overlap``.
    :return: None; the report is written to standard output.
    """
    print("\n1. Recursive Character Text Splitter Example")

    # Only "." and " " are offered to the splitter as separators.
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=[".", " "],
    ).split_text(text)
    print(f"Number of chunks: {len(chunks)}")

    # Display only: every previewed chunk is re-flowed to 70 columns, so the
    # printed line breaks come from the wrapper, not from the chunk itself.
    wrapper = textwrap.TextWrapper(width=70)
    for number, piece in enumerate(chunks[:2], start=1):
        print(f"\nChunk {number}:")
        print(wrapper.fill(piece))

# Run the recursive-character demo on the prose sample, relying on the
# function's default chunk_size and chunk_overlap.
demonstrate_recursive_text_splitter(sample_text)


1. Recursive Character Text Splitter Example
Number of chunks: 7

Chunk 1:
Natural language processing (NLP) is a field of artificial
intelligence that focuses on the interaction between

Chunk 2:
that focuses on the interaction between computers and human language


In [4]:
def demonstrate_token_text_splitter(text: str, chunk_size: int = 100, chunk_overlap: int = 50) -> None:
    """
    Split ``text`` with TokenTextSplitter and print a short report.

    The report is a section banner, the total number of chunks produced, and a
    preview of at most the first two chunks.

    :param text: The text to be split.
    :param chunk_size: Forwarded to the splitter as its ``chunk_size`` limit.
    :param chunk_overlap: Forwarded to the splitter as its ``chunk_overlap``.
    :return: None; the report is written to standard output.
    """
    print("\n2. Token Text Splitter Example")

    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    pieces = TokenTextSplitter(**splitter_options).split_text(text)
    print(f"Number of chunks: {len(pieces)}")

    # Display only: previewed chunks are re-wrapped to 70 columns, so the line
    # breaks shown are produced by textwrap rather than by the splitter.
    for position, piece in enumerate(pieces[:2], start=1):
        print(f"\nChunk {position}:")
        wrapped_lines = textwrap.wrap(piece, width=70)
        print("\n".join(wrapped_lines))

# Run the token-based demo on the same prose sample, relying on the
# function's default chunk_size and chunk_overlap.
demonstrate_token_text_splitter(sample_text)


2. Token Text Splitter Example
Number of chunks: 2

Chunk 1:
 Natural language processing (NLP) is a field of artificial
intelligence that focuses on the interaction between computers and
human language. It enables computers to understand, interpret, and
generate human language in a valuable way.  The field of NLP involves
many different techniques and approaches: 1. Tokenization 2. Part-of-
speech tagging 3. Named entity recognition 4. Sentiment analysis
Machine learning, especially deep learning, has revolutionized NLP in

Chunk 2:
 NLP involves many different techniques and approaches: 1.
Tokenization 2. Part-of-speech tagging 3. Named entity recognition 4.
Sentiment analysis  Machine learning, especially deep learning, has
revolutionized NLP in recent years. Transformers and large language
models have achieved remarkable results in various NLP tasks.


In [5]:
def demonstrate_markdown_header_text_splitter(text: str) -> None:
    """
    Split Markdown ``text`` with MarkdownHeaderTextSplitter and print a short report.

    The report is a section banner, the total number of chunks produced, and,
    when at least one chunk exists, the first chunk as returned by the splitter.

    :param text: The Markdown text to split.
    :return: None; the report is written to standard output.
    """
    print("\n3. Markdown Header Text Splitter Example")

    # Pair each header marker with the label handed to the splitter for it.
    markers = ("#", "##", "###")
    labels = ("Header 1", "Header 2", "Header 3")
    sections = MarkdownHeaderTextSplitter(
        headers_to_split_on=list(zip(markers, labels))
    ).split_text(text)

    print(f"Number of chunks: {len(sections)}")

    # Nothing to preview when the splitter produced no chunks.
    if not sections:
        return

    print("\nFirst chunk with metadata:")
    print(sections[0])

# Run the header-based demo on the Markdown sample.
demonstrate_markdown_header_text_splitter(markdown_text)


3. Markdown Header Text Splitter Example
Number of chunks: 3

First chunk with metadata:
page_content='Natural language processing is fascinating.' metadata={'Header 1': 'Introduction to NLP'}
