In [1]:
import os
from typing import List, Dict, Any
import pandas as pd

In [2]:
# from langchain_core.documents import Document
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, TokenTextSplitter
from langchain.document_loaders import TextLoader, DirectoryLoader

  from .autonotebook import tqdm as notebook_tqdm


## Understanding Document Structure in LangChain

In [3]:
# Build a Document by hand: `page_content` carries the text itself and
# `metadata` carries descriptive fields about it.
doc = Document(
    metadata={
        "source": "sample_source.txt",
        "author": "John Doe",
        "date": "2024-06-01",
        "page": 1,
    },
    page_content="This is a sample document. " * 100,
)

# Show only the first 100 characters so the repeated text does not flood the output.
content_preview = doc.page_content[:100] + "..."
print("Document Content:", content_preview)
print("Document Metadata:", doc.metadata)


Document Content: This is a sample document. This is a sample document. This is a sample document. This is a sample do...
Document Metadata: {'source': 'sample_source.txt', 'author': 'John Doe', 'date': '2024-06-01', 'page': 1}


### Text Files (.txt) - The Simplest Case

In [4]:
# Folder that holds the sample text files; creating it is a no-op if it already exists.
texts_dir = "data/texts"
os.makedirs(texts_dir, exist_ok=True)

In [5]:
# Sample text files to create on disk: destination path -> file content.
sample_texts = {
    "data/texts/doc1.txt": """
Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation."""
}

for filepath, content in sample_texts.items():
    # Create the destination folder if needed so this cell also runs on a fresh kernel.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Write as UTF-8 explicitly: the loaders in this notebook read these files with
    # encoding="utf-8", and the platform default encoding is not UTF-8 everywhere.
    with open(filepath, "w", encoding="utf-8") as f:
        # strip() drops the leading newline that follows the opening triple quote.
        f.write(content.strip())

### TextLoader - Read Single File

In [6]:

# Read a single text file, decoded as UTF-8, into a list of Document objects.
loader = TextLoader(file_path="data/texts/doc1.txt", encoding="utf-8")
documents = loader.load()

doc_count = len(documents)
print(f"Loaded {doc_count} document(s).")
print(documents)

Loaded 1 document(s).
[Document(metadata={'source': 'data/texts/doc1.txt'}, page_content='Python Programming Introduction\n\nPython is a high-level, interpreted programming language known for its simplicity and readability.\nCreated by Guido van Rossum and first released in 1991, Python has become one of the most popular\nprogramming languages in the world.\n\nKey Features:\n- Easy to learn and use\n- Extensive standard library\n- Cross-platform compatibility\n- Strong community support\n\nPython is widely used in web development, data science, artificial intelligence, and automation.')]


### DirectoryLoader - Multiple Text Files

In [7]:
# Load every *.txt file found in data/texts, each one through TextLoader with UTF-8 decoding.
loader = DirectoryLoader(
    "data/texts",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
documents = loader.load()

print(f"Loaded {len(documents)} document(s) from directory.")

# Peek at the first loaded document only: a short content preview plus its metadata.
first_document = documents[0]
print("First document content:", first_document.page_content[:100] + "...")
print("First document metadata:", first_document.metadata)

Loaded 2 document(s) from directory.
First document content: Python Programming Introduction

Python is a high-level, interpreted programming language known for ...
First document metadata: {'source': 'data/texts/doc1.txt'}


## Text Splitting Strategies

In [8]:
# character-based splitting

# Sample essay used as input for the splitting examples: a title line followed by
# five paragraphs, with a blank line between each of them.
text = """The Rise of Large Language Models: A New Era in Artificial Intelligence

The field of artificial intelligence (AI) has witnessed remarkable advancements over the past decade, with one of the most significant breakthroughs being the development of Large Language Models (LLMs). These models, such as OpenAI's GPT (Generative Pre-trained Transformer) series, Google's LaMDA and PaLM, and Meta's LLaMA, have demonstrated an extraordinary ability to understand, generate, and interact with human language in a way that was previously the realm of science fiction. Their impact is being felt across countless industries, from software development and content creation to customer service and scientific research.

The journey to modern LLMs began with foundational concepts in natural language processing (NLP) and machine learning. Early statistical models, like n-grams, laid the groundwork by predicting the next word in a sequence based on the previous ones. However, these models lacked a deep understanding of context. The introduction of neural networks, particularly Recurrent Neural Networks (RNNs) and Long Short-Term Memory (LSTM) networks, allowed models to maintain a 'memory' of previous inputs, improving their ability to handle longer dependencies in text.

A pivotal moment came with the introduction of the Transformer architecture in the 2017 paper "Attention Is All You Need" by researchers at Google. The Transformer's key innovation was the self-attention mechanism, which enabled the model to weigh the importance of different words in the input text, regardless of their position. This parallelizable architecture allowed for the training of much larger models on vast amounts of data, a crucial ingredient for the success of LLMs. The "pre-training" and "fine-tuning" paradigm emerged, where a model is first trained on a massive, general-purpose text corpus (like the entire internet) and then fine-tuned on a smaller, task-specific dataset.

The scaling hypothesis—the idea that performance improves predictably as model size, dataset size, and computational budget increase—has been a driving force. GPT-3, with its 175 billion parameters, was a landmark model that showcased "in-context learning," the ability to perform tasks it wasn't explicitly trained on, simply by being given a few examples in the prompt. Subsequent models have grown even larger, incorporating multi-modal capabilities (processing images, audio, and video alongside text) and more sophisticated training techniques like Reinforcement Learning from Human Feedback (RLHF) to better align their outputs with human values and intentions.

The applications of LLMs are vast and continue to expand. They power chatbots and virtual assistants that are more conversational and helpful than ever before. In software engineering, they assist with code generation, debugging, and documentation (e.g., GitHub Copilot). Content creators use them to draft articles, marketing copy, and scripts. In the scientific community, they are being used to analyze research papers, generate hypotheses, and even help in drug discovery. However, the rise of LLMs also brings significant challenges and ethical considerations. Issues such as bias in training data, the potential for misuse in generating misinformation, environmental concerns due to the massive computational power required for training, and the impact on the job market are subjects of ongoing debate and research. As we move forward, harnessing the power of LLMs responsibly will be one of the most critical challenges of our time."""

# Split on single newlines only, targeting 200-character chunks (measured with len)
# and a 20-character overlap between neighbouring chunks.
# NOTE: with "\n" as the only separator, a line longer than chunk_size is kept whole,
# so some chunks come out larger than 200 characters (the splitter logs this).
char_splitter = CharacterTextSplitter(
    separator="\n", chunk_size=200, chunk_overlap=20, length_function=len
)

char_chunks = char_splitter.split_text(text)
print(f"Character-based splitting produced {len(char_chunks)} chunks.")

# Print each chunk under a 1-based heading.
for chunk_number, piece in enumerate(char_chunks, start=1):
    print(f"Chunk {chunk_number}:\n{piece}\n")

Created a chunk of size 634, which is longer than the specified 200
Created a chunk of size 558, which is longer than the specified 200
Created a chunk of size 693, which is longer than the specified 200
Created a chunk of size 667, which is longer than the specified 200


Character-based splitting produced 6 chunks.
Chunk 1:
The Rise of Large Language Models: A New Era in Artificial Intelligence

Chunk 2:
The field of artificial intelligence (AI) has witnessed remarkable advancements over the past decade, with one of the most significant breakthroughs being the development of Large Language Models (LLMs). These models, such as OpenAI's GPT (Generative Pre-trained Transformer) series, Google's LaMDA and PaLM, and Meta's LLaMA, have demonstrated an extraordinary ability to understand, generate, and interact with human language in a way that was previously the realm of science fiction. Their impact is being felt across countless industries, from software development and content creation to customer service and scientific research.

Chunk 3:
The journey to modern LLMs began with foundational concepts in natural language processing (NLP) and machine learning. Early statistical models, like n-grams, laid the groundwork by predicting the next word in a sequenc

In [9]:
# Recursive splitting: separators are listed from coarsest to finest
# (blank line, newline, space, empty string), with the same 200-character
# target and 20-character overlap as the character-based example.
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)

recursive_chunks = recursive_splitter.split_text(text)
print(f"Recursive character-based splitting produced {len(recursive_chunks)} chunks.")

# Print each chunk under a 1-based heading.
for chunk_number, piece in enumerate(recursive_chunks, start=1):
    print(f"Chunk {chunk_number}:\n{piece}\n")

Recursive character-based splitting produced 22 chunks.
Chunk 1:
The Rise of Large Language Models: A New Era in Artificial Intelligence

Chunk 2:
The field of artificial intelligence (AI) has witnessed remarkable advancements over the past decade, with one of the most significant breakthroughs being the development of Large Language Models

Chunk 3:
Language Models (LLMs). These models, such as OpenAI's GPT (Generative Pre-trained Transformer) series, Google's LaMDA and PaLM, and Meta's LLaMA, have demonstrated an extraordinary ability to

Chunk 4:
ability to understand, generate, and interact with human language in a way that was previously the realm of science fiction. Their impact is being felt across countless industries, from software

Chunk 5:
from software development and content creation to customer service and scientific research.

Chunk 6:
The journey to modern LLMs began with foundational concepts in natural language processing (NLP) and machine learning. Early statistical 

In [10]:
# Token-based splitting: same chunk_size/chunk_overlap values as the examples above,
# but handled by TokenTextSplitter (presumably counted in tokens rather than
# characters, which would explain the far smaller number of chunks).
token_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)

token_chunks = token_splitter.split_text(text)
print(f"Token-based splitting produced {len(token_chunks)} chunks.")

# Print each chunk under a 1-based heading.
for chunk_number, piece in enumerate(token_chunks, start=1):
    print(f"Chunk {chunk_number}:\n{piece}\n")

Token-based splitting produced 4 chunks.
Chunk 1:
The Rise of Large Language Models: A New Era in Artificial Intelligence

The field of artificial intelligence (AI) has witnessed remarkable advancements over the past decade, with one of the most significant breakthroughs being the development of Large Language Models (LLMs). These models, such as OpenAI's GPT (Generative Pre-trained Transformer) series, Google's LaMDA and PaLM, and Meta's LLaMA, have demonstrated an extraordinary ability to understand, generate, and interact with human language in a way that was previously the realm of science fiction. Their impact is being felt across countless industries, from software development and content creation to customer service and scientific research.

The journey to modern LLMs began with foundational concepts in natural language processing (NLP) and machine learning. Early statistical models, like n-grams, laid the groundwork by predicting the next word in a sequence based on the previou