In [None]:
# 1. This cell imports 'langchain', 'os' and a few 'typing' helpers for general use.
# 2. It also imports 'pandas' - a popular tool for viewing data in organized tables.
# 3. 'Document' (the specific format LangChain uses to store text) and the 'TextSplitters'
#    (tools used to chop long text into smaller pieces) are imported in the next cell.

import langchain
import os   
from typing import List,Dict,Any
import pandas as pd

In [None]:


# Document is the LangChain object that pairs a piece of text with its metadata.
from langchain_core.documents import Document
# The three text splitters that are compared later in this notebook.
from langchain_text_splitters import (RecursiveCharacterTextSplitter, CharacterTextSplitter, TokenTextSplitter)
# Reached only if the imports above succeeded.
print("Setup Complete")

  from .autonotebook import tqdm as notebook_tqdm


Setup Complete


In [None]:
# 1. A 'Document' is like a package: it contains the 'page_content' (the text) and 'metadata' (the tags).
# 2. Metadata is crucial for "Filtering" (e.g., searching only for documents written by a specific author).
# 3. Metadata is also used for "Auditing" (e.g., checking exactly when a piece of information was added).
# 4. In LangChain, we always convert our data into this format before doing anything else.

## create a simple document
doc = Document(
    page_content="This is the main text content that will be embedded and searched.",
    metadata={
        "source": "example.txt",
        "page": 1,
        "author": "Mathew",
        "date_created": "2024-01-01",
        # Metadata is a plain dict, so extra keys of your own can sit next to the
        # common ones. Spell them carefully: a filter on a misspelled key finds nothing.
        "custom_field": "any_value",
    },
)
print("Document Structure")

# Show the two halves of the Document separately.
print(f"Content :{doc.page_content}")
print(f"Metadata :{doc.metadata}")

# Why metadata matters:
print("\n Metadata is crucial for:")
print("- Filtering search results")
print("- Tracking document sources")
print("- Providing context in responses")
print("- Debugging and auditing")

Document Structure
Content :This is the main text content that will be embedded and searched.
Metadata :{'source': 'example.txt', 'page': 1, 'author': 'Mathew', 'date_created': '2024-01-01', 'cutom_field': 'any_value'}

📝 Metadata is crucial for:
- Filtering search results
- Tracking document sources
- Providing context in responses
- Debugging and auditing


### Text Files (.txt) - The Simplest Case {#2-text-files}


In [None]:
# 1. 'os.makedirs' creates a folder on your computer to hold your data.
#    'exist_ok=True' means re-running this cell is harmless if the folder already exists.
# 2. 'sample_texts' (defined in the next cell) is a dictionary containing the text we want to save into files.
# 3. The 'for' loop in the next cell goes through that dictionary and physically writes the text into .txt files.
# 4. These steps prepare our "raw data" so we have something to practice loading later.

os.makedirs("data/text_files",exist_ok=True)

In [7]:
# Maps each output file path to the text that will be written to it.
sample_texts={
    "data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

# Write every sample to disk. The parent folder is created here too, so this cell
# also works on a fresh kernel where the separate makedirs cell has not been run.
for filepath,content in sample_texts.items():
    parent_dir=os.path.dirname(filepath)
    if parent_dir:  # a bare file name has no folder to create
        os.makedirs(parent_dir,exist_ok=True)
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("Sample text files created!")

Sample text files created!


In [None]:
# 1. 'TextLoader' reads one standard text (.txt) file.
# 2. '.load()' does the actual reading and hands back a list of LangChain Document objects.
# 3. Each Document exposes 'page_content' (the text inside the file) ...
# 4. ... and 'metadata' ("extra info" such as the file location, which helps the AI cite its sources).

from langchain_community.document_loaders import TextLoader

## Loading a single text file
loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")
documents = loader.load()

# Inspect the first (and here only) Document that came back.
first_document = documents[0]
print(f"Loaded {len(documents)} document")
print(f"Content preview: {first_document.page_content[:100]}...")
print(f"Metadata: {first_document.metadata}")

Loaded 1 document
Content preview: Python Programming Introduction

Python is a high-level, interpreted programming language known for ...
Metadata: {'source': 'data/text_files/python_intro.txt'}


In [None]:
# 1. 'DirectoryLoader' is used when you have many files and don't want to load them one by one.
# 2. 'glob="**/*.txt"' tells the loader to find every .txt file in the folder and its sub-folders.
# 3. 'loader_cls=TextLoader' tells this tool to use our TextLoader to open every file it finds.
# 4. 'loader_kwargs' is passed on to each TextLoader, so every file is read as UTF-8.
# 5. 'show_progress=True' adds a visual bar so you can see how many files are finished loading.

from langchain_community.document_loaders import DirectoryLoader

## load all the text files from the directory
dir_loader=DirectoryLoader(
    "data/text_files",
    glob="**/*.txt", ## Pattern to match files  
    loader_cls= TextLoader, ##loader class to use
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=True

)

documents=dir_loader.load()

print(f"📁 Loaded {len(documents)} documents")
# A dedicated loop name keeps the 'doc' example built earlier in the notebook intact.
for position, loaded_doc in enumerate(documents, start=1):
    print(f"\nDocument {position}:")
    print(f"  Source: {loaded_doc.metadata['source']}")
    print(f"  Length: {len(loaded_doc.page_content)} characters")


# 📊 Analysis
print("\n📊 DirectoryLoader Characteristics:")
print("✅ Advantages:")
print("  - Loads multiple files at once")
print("  - Supports glob patterns")
print("  - Progress tracking")
print("  - Recursive directory scanning")

print("\n❌ Disadvantages:")
print("  - All files must be same type")
print("  - Limited error handling per file")
print("  - Can be memory intensive for large directories")

100%|██████████| 2/2 [00:00<00:00, 48.58it/s]

📁 Loaded 2 documents

Document 1:
  Source: data\text_files\machine_learning.txt
  Length: 575 characters

Document 2:
  Source: data\text_files\python_intro.txt
  Length: 489 characters

📊 DirectoryLoader Characteristics:
✅ Advantages:
  - Loads multiple files at once
  - Supports glob patterns
  - Progress tracking
  - Recursive directory scanning

❌ Disadvantages:
  - All files must be same type
  - Limited error handling per file
  - Can be memory intensive for large directories





In [None]:
# 1. This cell introduces 'Text Splitting', which is the process of breaking long documents 
#    into smaller, bite-sized pieces (chunks) so the AI can process them more easily.
# 2. 'CharacterTextSplitter': Splits text on ONE separator string and then packs the pieces
#    into chunks up to a character limit (very simple).
# 3. 'RecursiveCharacterTextSplitter': The most recommended splitter. It tries to split 
#    at natural points like paragraphs and sentences so the meaning isn't lost.
# 4. 'TokenTextSplitter': Splits text based on "tokens" (how AI models actually count words), 
#    which helps you stay within the AI's memory limits.
# 5. 'print(documents)': This displays your currently loaded data to confirm it's ready for splitting.
### Different text splitting strategies
# NOTE: the same three classes are already imported in the setup cell; importing them
# again is harmless and only keeps this section readable on its own.
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter
)
# Prints the full list of Document objects (metadata and complete text).
print(documents)

[Document(metadata={'source': 'data\\text_files\\machine_learning.txt'}, page_content='Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    '), Document(metadata={'source': 'data\\text_files\\python_intro.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprog

In [None]:
# 1. 'documents[0]' picks the first Document in the list loaded earlier.
#    NOTE(review): which file comes first depends on the order the loader returned them;
#    in the recorded run it is machine_learning.txt - confirm on your machine.
# 2. '.page_content' leaves the metadata (like the filename) behind and keeps only
#    the raw text inside that document.
# 3. That raw text is stored in 'text' so the splitter cells below can use it.
# 4. Ending the cell with a bare 'text' makes the notebook display the content, so you
#    can see exactly what is about to be split.

### Method 1 - Character Text Splitter
first_document = documents[0]
text = first_document.page_content
text

'Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n\n    '

In [29]:
# --- COMPARISON OF CHUNKING STRATEGIES ---

# 1. CHARACTER SPLITTER (The "Rigid" Method)
#    - How it works: It splits the text on ONE specific separator (like a space) and then
#      packs the pieces together until adding another piece would exceed the limit.
#    - Pros: Very simple and predictable.
#    - Cons: It's "blind" to the structure of your writing. It might cut a 
#      sentence right in the middle just because it reached the character limit.

# 2. RECURSIVE CHARACTER SPLITTER (The "Smart" Method)
#    - How it works: It has a hierarchy of separators (Paragraphs -> Sentences -> Words).
#      If a paragraph is too long, it tries to split at a sentence. If the sentence 
#      is too long, it tries to split at a word.
#    - Pros: It tries its best to keep related text together so the AI can 
#      understand the full context of a thought.
#    - Cons: Slightly more complex, but usually the best choice for general text.

# 3. TOKEN SPLITTER (The "Technical" Method)
#    - How it works: It doesn't count letters; it counts "Tokens" (the chunks 
#      of characters that AI models use to process language).
#    - Pros: Most accurate for AI memory management. Since AI models have a 
#      "token limit," this ensures you never send too much data at once.
#    - Cons: Hard for humans to visualize because tokens don't always 
#      align perfectly with word or character counts.

In [20]:
# 1. 'separator=" "': This tells the splitter to only cut the text at a space. 
#    It prevents the code from cutting a word right in the middle.
# 2. 'chunk_size=200': This is our target maximum. We want each piece of text 
#    to be at most roughly 200 characters long.
# 3. 'chunk_overlap=20': This is very important! Up to 20 characters from the end 
#    of Chunk 1 are repeated at the start of Chunk 2 (in whole words, because we split on
#    spaces). This helps the AI keep the context between chunks so it doesn't "forget"
#    the beginning of a sentence.
# 4. 'length_function=len': This tells the tool to use standard Python counting 
#    (1 character = 1 unit) to measure the size.
# 5. '.split_text(text)': This is the action command that takes your long string 
#    and returns a list of smaller strings (chunks).
# 6. The print statements help you verify how many pieces were created and 
#    show you a preview of the first one.
# Method 1: Character-based splitting
print("1️⃣ CHARACTER TEXT SPLITTER")
char_splitter = CharacterTextSplitter(
    separator=" ",  # Split on spaces (newlines are NOT treated as break points here)
    chunk_size=200,  # Max chunk size in characters
    chunk_overlap=20,  # Overlap between chunks
    length_function=len  # How to measure chunk size
)

char_chunks=char_splitter.split_text(text)
print(f"Created {len(char_chunks)} chunks")
print(f"First chunk: {char_chunks[0][:100]}...")

1️⃣ CHARACTER TEXT SPLITTER
Created 3 chunks
First chunk: Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables system...


In [21]:
# Show the first two character-split chunks with a divider line between them.
print(char_chunks[0], "------------------", char_chunks[1], sep="\n")

Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing
------------------
on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning:


In [23]:
# 1. 'Recursive' means the splitter is "smart." It tries to split text using a list of 
#    separators in order (like paragraphs, then sentences, then words) to keep ideas together.
# 2. 'separators=[" "]': In this specific code, it is told to look for spaces ONLY, so it
#    has nothing to fall back on. By default (when 'separators' is omitted) this splitter
#    looks for double newlines, then single newlines, then spaces.
# 3. 'chunk_size=200' & 'chunk_overlap=20': Just like the previous method, we want 
#    chunks of up to 200 characters with up to 20 characters of "shared memory" between them.
# 4. Why it is RECOMMENDED: With its default separators, this one tries its best 
#    NOT to break a paragraph or a sentence in the middle unless it absolutely has to.
# 5. '.split_text(text)': It processes the raw text and turns it into a list of chunks.

# Method 2: Recursive character splitting (RECOMMENDED)
print("\n2️⃣ RECURSIVE CHARACTER TEXT SPLITTER")
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=[" "],  # Only spaces here; omit this argument to try "\n\n", "\n", " ", "" in order
    chunk_size=200,
    chunk_overlap=20,
    length_function=len
)

recursive_chunks = recursive_splitter.split_text(text)
print(f"Created {len(recursive_chunks)} chunks")
print(f"First chunk: {recursive_chunks[0][:100]}...")


2️⃣ RECURSIVE CHARACTER TEXT SPLITTER
Created 4 chunks
First chunk: Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables system...


In [24]:
# Show the first three recursive chunks, with a divider line after each of the first two.
print(
    recursive_chunks[0],
    "-----------------",
    recursive_chunks[1],
    "------------------",
    recursive_chunks[2],
    sep="\n",
)

Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing
-----------------
on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning:
------------------
Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation


In [None]:
# 1. 'simple_text' is one long line with NO newlines, so spaces are the only place
#    the splitter can break the text.
# 2. 'separators=[" "]' overrides the defaults: "ONLY split where there is a space."
# 3. 'chunk_size=80' asks for small pieces of up to about 80 characters each.
# 4. 'chunk_overlap=20' lets up to 20 characters from the end of one chunk reappear
#    at the beginning of the next.
# 5. The loop walks over neighbouring pairs of chunks and prints each pair together,
#    so you can check by eye that the end of one matches the start of the next.
# 6. The takeaway: even without paragraphs, words stay whole and the overlap
#    carries "memory" from one chunk into the next.
# Create text without natural break points
simple_text = "This is sentence one and it is quite long. This is sentence two and it is also quite long. This is sentence three which is even longer than the others. This is sentence four. This is sentence five. This is sentence six."

splitter = RecursiveCharacterTextSplitter(
    separators=[" "],  # Only split on spaces
    chunk_size=80,
    chunk_overlap=20,
    length_function=len,
)

chunks = splitter.split_text(simple_text)

print(f"\nSimple text example - {len(chunks)} chunks:\n")

# Pair every chunk with its successor; numbering starts at 1 for readability.
for position, (current_chunk, next_chunk) in enumerate(zip(chunks, chunks[1:]), start=1):
    print(f"Chunk {position}: '{current_chunk}'")
    print(f"Chunk {position + 1}: '{next_chunk}'")
    print()


Simple text example - 4 chunks:

Chunk 1: 'This is sentence one and it is quite long. This is sentence two and it is also'
Chunk 2: 'two and it is also quite long. This is sentence three which is even longer than'

Chunk 2: 'two and it is also quite long. This is sentence three which is even longer than'
Chunk 3: 'is even longer than the others. This is sentence four. This is sentence five.'

Chunk 3: 'is even longer than the others. This is sentence four. This is sentence five.'
Chunk 4: 'is sentence five. This is sentence six.'



In [None]:
# 1. 'Tokens' are not the same as characters. While characters are individual letters, 
#    tokens are chunks of characters (roughly 4 characters or 0.75 words) that AI models 
#    use to "read" and process language.
# 2. 'chunk_size=50': Unlike the previous cells that used characters, this tells the 
#    tool to create chunks of at most 50 "tokens" (the last chunk is usually shorter).
# 3. 'chunk_overlap=10': This keeps 10 tokens of shared context between the chunks.
# 4. Why use this? AI models (like GPT) have a "context limit" measured in tokens. 
#    Using a TokenSplitter helps your data fit into the AI's "memory" 
#    without accidentally going over the limit.
#    NOTE(review): the count is only exact if the splitter's tokenizer matches the
#    tokenizer of the model you send the chunks to - confirm for your model.
# 5. This method is often preferred when you are trying to be very precise about 
#    how much information you are sending to the AI model at once.

# Method 3: Token-based splitting
print("\n3️⃣ TOKEN TEXT SPLITTER")
token_splitter = TokenTextSplitter(
    chunk_size=50,  # Size in tokens (not characters)
    chunk_overlap=10
)

token_chunks = token_splitter.split_text(text)
print(f"Created {len(token_chunks)} chunks")
print(f"First chunk: {token_chunks[0][:100]}...")


3️⃣ TOKEN TEXT SPLITTER
Created 3 chunks
First chunk: Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables system...


In [27]:
# 📊 Comparison
# Every line of the summary, in print order. A leading "\n" puts a blank line before
# a heading. Keeping the text in one list makes it easy to edit and to check.
comparison_lines = [
    "\n📊 Text Splitting Methods Comparison:",
    "\nCharacterTextSplitter:",
    "  ✅ Simple and predictable",
    "  ✅ Good for structured text",
    "  ❌ May break mid-sentence",
    "  Use when: Text has clear delimiters",
    "\nRecursiveCharacterTextSplitter:",
    "  ✅ Respects text structure",
    "  ✅ Tries multiple separators",
    "  ✅ Best general-purpose splitter",
    "  ❌ Slightly more complex",
    "  Use when: Default choice for most texts",
    "\nTokenTextSplitter:",
    "  ✅ Respects model token limits",
    "  ✅ More accurate for embeddings",
    "  ❌ Slower than character-based",
    "  Use when: Working with token-limited models",
]

# Joining with "\n" produces the same output as printing the lines one by one.
print("\n".join(comparison_lines))


📊 Text Splitting Methods Comparison:

CharacterTextSplitter:
  ✅ Simple and predictable
  ✅ Good for structured text
  ❌ May break mid-sentence
  Use when: Text has clear delimiters

RecursiveCharacterTextSplitter:
  ✅ Respects text structure
  ✅ Tries multiple separators
  ✅ Best general-purpose splitter
  ❌ Slightly more complex
  Use when: Default choice for most texts

TokenTextSplitter:
  ✅ Respects model token limits
  ✅ More accurate for embeddings
  ❌ Slower than character-based
  Use when: Working with token-limited models
