In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load it; the return value is discarded via `_`.
_ = load_dotenv(find_dotenv())
# Indexing os.environ (rather than .get) raises KeyError if OPENAI_API_KEY is not set.
openai_api_key = os.environ["OPENAI_API_KEY"]

## Character Text Splitter

In [2]:
from langchain.text_splitter import CharacterTextSplitter

In [3]:
# Size limit and overlap passed to both the character splitter and the
# recursive character splitter in this notebook.
chunk_size, chunk_overlap = 26, 4

In [4]:
# Plain character splitter configured with the shared chunk_size / chunk_overlap.
character_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [5]:
# The 26-letter alphabet followed by 7 more letters: 33 characters with no
# whitespace, i.e. longer than the configured chunk_size of 26.
text1 = "abcdefghijklmnopqrstuvwxyz" "abcdefg"

In [6]:
# Split the 33-character string; the recorded output is a single unsplit chunk,
# even though it is longer than chunk_size.
# NOTE(review): presumably because CharacterTextSplitter only cuts on its separator
# (default "\n\n" per the LangChain docs), which text1 does not contain — confirm.
character_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyzabcdefg']

In [7]:
# Sample text: two paragraphs separated by one blank line. Several lines end
# with a trailing space that is part of the string (visible in the recorded outputs).
text2 = """
Data that Speak
LLM Applications are revolutionizing industries such as 
banking, healthcare, insurance, education, legal, tourism, 
construction, logistics, marketing, sales, customer service, 
and even public administration.

The aim of our programs is for students to learn how to 
create LLM Applications in the context of a business, 
which presents a set of challenges that are important 
to consider in advance.
"""

In [8]:
# Split the two-paragraph text; the recorded output has one chunk per paragraph
# plus a warning that a chunk of size 227 is longer than the specified 26.
character_splitter.split_text(text2)

Created a chunk of size 227, which is longer than the specified 26


['Data that Speak\nLLM Applications are revolutionizing industries such as \nbanking, healthcare, insurance, education, legal, tourism, \nconstruction, logistics, marketing, sales, customer service, \nand even public administration.',
 'The aim of our programs is for students to learn how to \ncreate LLM Applications in the context of a business, \nwhich presents a set of challenges that are important \nto consider in advance.']

## Recursive Character Text Splitter

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [10]:
# Recursive splitter built with the same chunk_size / chunk_overlap as the
# character splitter, so the two outputs can be compared directly.
recursive_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [11]:
# Same 33-character string, recursive splitter: the recorded output is two chunks,
# the second one starting with the 4 overlapping characters 'wxyz'.
recursive_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [12]:
# Paragraph text, recursive splitter: the recorded output is a list of short
# phrase-sized chunks, some of which repeat a word from the previous chunk (overlap).
recursive_splitter.split_text(text2)

['Data that Speak',
 'LLM Applications are',
 'are revolutionizing',
 'industries such as',
 'banking, healthcare,',
 'insurance, education,',
 'legal, tourism,',
 'construction, logistics,',
 'marketing, sales,',
 'customer service,',
 'and even public',
 'administration.',
 'The aim of our programs',
 'is for students to learn',
 'how to',
 'create LLM Applications',
 'in the context of a',
 'a business,',
 'which presents a set of',
 'of challenges that are',
 'are important',
 'to consider in advance.']

In [13]:
# Recursive splitter with a custom separator hierarchy, tried in order:
# paragraph break, line break, end of sentence, space, single character.
#
# The end-of-sentence separator is a regex lookbehind ("split right after '. '"),
# so it is written as a raw string (a bare "\." is an invalid escape sequence in a
# normal string literal) and is_separator_regex=True is passed so the separators are
# interpreted as regular expressions rather than escaped and matched literally.
# The other separators contain no regex metacharacters, so they match the same
# text either way.
second_recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    is_separator_regex=True,
)

In [14]:
# Display the chunks obtained with the 150-character limit (four chunks in the recorded output).
second_recursive_splitter.split_text(text2)

['Data that Speak\nLLM Applications are revolutionizing industries such as \nbanking, healthcare, insurance, education, legal, tourism,',
 'construction, logistics, marketing, sales, customer service, \nand even public administration.',
 'The aim of our programs is for students to learn how to \ncreate LLM Applications in the context of a business,',
 'which presents a set of challenges that are important \nto consider in advance.']

In [15]:
# Run the same split again, this time keeping the result in a variable.
chunks = second_recursive_splitter.split_text(text2)

In [16]:
# Number of chunks produced (4 in the recorded run).
len(chunks)

4

## Markdown Header Text Splitter

### Adding helpful metadata to text chunks

In [17]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [18]:
# Sample markdown document with three header levels (#, ##, ###) and a line of
# body text under each chapter/section.
# NOTE(review): the literal mixes explicit \n escapes with backslash line
# continuations inside a triple-quoted string, which looks like a single-line string
# pasted into a multi-line literal. The recorded outputs show the splitter copes
# with it, but plain markdown lines would be easier to read — consider cleaning up.
document_with_markdown = """
# Title: My book\n\n \

## Chapter 1: The day I was born\n\n \
I was born in a very sunny day of summer...\n\n \

### Section 1.1: My family \n\n \
My father had a big white car... \n\n 

## Chapter 2: My school\n\n \
My first day at the school was...\n\n \

"""

In [19]:
# (markdown header prefix, metadata key) pairs: text found under a header with the
# given prefix is tagged with that header's title under the given metadata key.
headers_to_split_on = [("#", "Book title"), ("##", "Chapter"), ("###", "Section")]

In [20]:
# Header-aware splitter driven by the (prefix, metadata key) pairs defined above.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [21]:
# Split the markdown text on the configured headers. The recorded outputs show the
# result holds Document items whose metadata carries the enclosing header titles.
md_header_splits = markdown_splitter.split_text(document_with_markdown)

In [22]:
# First chunk: the Chapter 1 body text, tagged with 'Book title' and 'Chapter' metadata.
md_header_splits[0]

Document(page_content='I was born in a very sunny day of summer...', metadata={'Book title': 'Title: My book', 'Chapter': 'Chapter 1: The day I was born'})

In [23]:
# Second chunk: the Section 1.1 body text; its metadata additionally carries the 'Section' key.
md_header_splits[1]

Document(page_content='My father had a big white car...', metadata={'Book title': 'Title: My book', 'Chapter': 'Chapter 1: The day I was born', 'Section': 'Section 1.1: My family'})

In [24]:
# Third chunk: the Chapter 2 body text; no 'Section' key, since Chapter 2 has no subsection.
md_header_splits[2]

Document(page_content='My first day at the school was...', metadata={'Book title': 'Title: My book', 'Chapter': 'Chapter 2: My school'})