In [141]:
import os
from typing import List, Dict, Any
import pandas as pd

In [142]:
from langchain_core.documents import Document
from langchain_text_splitters import (RecursiveCharacterTextSplitter, CharacterTextSplitter,TokenTextSplitter)

# Reaching this line means the imports earlier in the cell succeeded.
setup_message = "Setup is completed!"
print(setup_message)


Setup is completed!


# Understanding the Document structure in LangChain


In [143]:
# Descriptive fields that travel with the text inside the Document.
doc_metadata = {
    "name": "Max",
    "age": "31",
    "author": "Max Shahdoost",
    "job": "Software Engineer",
}

# A Document pairs the raw text (page_content) with its metadata dict.
doc = Document(
    metadata=doc_metadata,
    page_content="This is the main content that will be parsed into a document.",
)

# Show both halves of the Document.
print('Document strucutre:')
print(f"Doc Content: {doc.page_content}")
print(f"Metadata :{doc.metadata}")

# Kept as the last expression so the notebook displays the object's type.
type(doc)

Document strucutre:
Doc Content: This is the main content that will be parsed into a document.
Metadata :{'name': 'Max', 'age': '31', 'author': 'Max Shahdoost', 'job': 'Software Engineer'}


langchain_core.documents.base.Document

In [144]:
import os

# Folder that receives the sample .txt files; exist_ok=True keeps the cell
# safe to re-run when the folder is already there.
text_files_dir = 'data/text_files'
os.makedirs(text_files_dir, exist_ok=True)

In [145]:
# Sample corpus: maps each destination file path to the raw text written into it.
sample_text={
    "data/text_files/python_intro.txt":"""
🐍 Introduction to Python Programming Language
1. What is Python?

Python is a high-level, interpreted, general-purpose programming language created by Guido van Rossum and first released in 1991.
It’s designed to be easy to read, write, and learn, making it one of the most popular languages in the world today.

2. Why is Python so popular?

✅ Simple syntax (looks like plain English).

✅ Versatile (can be used for web apps, data science, AI, automation, games, etc.).

✅ Huge community & libraries (lots of free tools and support).

✅ Cross-platform (works on Windows, macOS, Linux, and even mobile/embedded systems).

3. Key Features

Interpreted: You don’t need to compile it, just run directly.

Dynamic typing: No need to declare variable types (x = 10 works without saying int).

Object-oriented: Supports classes and objects.

Extensible: Can integrate with C, C++, Java, etc.

Portable: Write once, run anywhere.

Large Standard Library: Comes with modules for math, files, networking, and more.""",
    "data/text_files/javascript_intro.txt":"""
JavaScript is a high-level, interpreted programming language primarily used to make web pages interactive. While HTML structures content and CSS styles it, JavaScript adds life by handling dynamic behavior such as animations, form validation, interactive maps, and real-time updates without reloading a page. It runs directly in web browsers, making it the backbone of modern web development alongside HTML and CSS.

Beyond the browser, JavaScript is also widely used on servers with Node.js, enabling developers to build full-stack applications using a single language. It has a massive ecosystem of libraries and frameworks like React, Vue, and Angular for front-end, and Express.js for back-end. Its versatility, ease of use, and community support make JavaScript one of the most important and widely used languages in the world today.
"""
}

for filepath, content in sample_text.items():
    # Create the destination folder on demand so this cell works on its own,
    # whatever folder the keys above point at. A bare filename has no parent
    # directory, and os.makedirs('') would raise, hence the guard.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Explicit UTF-8: the text contains emoji and typographic apostrophes.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)

print('My Sample file has been created!')

My Sample file has been created!


In [146]:
# Import from langchain_community: `langchain.document_loaders` is a deprecated
# alias for it, and the DirectoryLoader import in this notebook already uses
# langchain_community.
from langchain_community.document_loaders import TextLoader

# Explicit UTF-8 because the sample file contains emoji and typographic apostrophes.
loader = TextLoader('data/text_files/python_intro.txt', encoding='utf-8')

# load() returns a list of Document objects.
documents =loader.load()

print(f'Loader {loader}')
print(documents)
print(type(documents))

print(f"📃 loaded {len(documents)} document")
# Only the first 100 characters are previewed to keep the output short.
print(f'Content preview {documents[0].page_content[:100]}...')
print(f'Metadata {documents[0].metadata}')

Loader <langchain_community.document_loaders.text.TextLoader object at 0x000001F565178E10>
[Document(metadata={'source': 'data/text_files/python_intro.txt'}, page_content='\n🐍 Introduction to Python Programming Language\n1. What is Python?\n\nPython is a high-level, interpreted, general-purpose programming language created by Guido van Rossum and first released in 1991.\nIt’s designed to be easy to read, write, and learn, making it one of the most popular languages in the world today.\n\n2. Why is Python so popular?\n\n✅ Simple syntax (looks like plain English).\n\n✅ Versatile (can be used for web apps, data science, AI, automation, games, etc.).\n\n✅ Huge community & libraries (lots of free tools and support).\n\n✅ Cross-platform (works on Windows, macOS, Linux, and even mobile/embedded systems).\n\n3. Key Features\n\nInterpreted: You don’t need to compile it, just run directly.\n\nDynamic typing: No need to declare variable types (x = 10 works without saying int).\n\nObject-oriented:

In [147]:
from langchain_community.document_loaders import DirectoryLoader

# Load every file matching **/*.txt under data/text_files, using TextLoader
# (with UTF-8 decoding) for each match.
dir_loader = DirectoryLoader(
    'data/text_files',
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={'encoding':'utf-8'},
    show_progress=True
)

# Directory listing order depends on the filesystem, so sort by source path to
# keep the positional lookup at the end of this cell stable across machines.
documents = sorted(dir_loader.load(), key=lambda loaded: loaded.metadata['source'])

print(f"📃 loaded {len(documents)} document")

# A dedicated loop name avoids overwriting the `doc` example created earlier.
for position, loaded_doc in enumerate(documents, start=1):
    print(f"\n Document {position}:")
    # Double quotes inside the single-quoted f-string: reusing the same quote
    # character is a SyntaxError on Python versions before 3.12.
    print(f'Source {loaded_doc.metadata["source"]}')
    print(f'Length {len(loaded_doc.page_content)} characters')

# After sorting, index 1 is python_intro.txt (javascript_intro.txt sorts first).
text = documents[1].page_content


100%|██████████| 2/2 [00:00<00:00, 2295.10it/s]

📃 loaded 2 document

 Document 1:
Source data\text_files\javascript_intro.txt
Length 840 characters

 Document 2:
Source data\text_files\python_intro.txt
Length 1006 characters





In [148]:
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter, TokenTextSplitter

print(documents)

# Split on single newlines; chunk_size and chunk_overlap are measured with len,
# i.e. in characters.
char_spliter = CharacterTextSplitter(
    length_function=len,
    chunk_overlap=30,
    chunk_size=200,
    separator='\n',
)

char_chunks = char_spliter.split_text(text)

# Report how many chunks came back, then preview the first 100 characters.
chunk_total = len(char_chunks)
print(f" {chunk_total} Chunkes has been created!")
first_chunk_preview = char_chunks[0][:100]
print(f"First chunk: {first_chunk_preview}")

[Document(metadata={'source': 'data\\text_files\\javascript_intro.txt'}, page_content='\nJavaScript is a high-level, interpreted programming language primarily used to make web pages interactive. While HTML structures content and CSS styles it, JavaScript adds life by handling dynamic behavior such as animations, form validation, interactive maps, and real-time updates without reloading a page. It runs directly in web browsers, making it the backbone of modern web development alongside HTML and CSS.\n\nBeyond the browser, JavaScript is also widely used on servers with Node.js, enabling developers to build full-stack applications using a single language. It has a massive ecosystem of libraries and frameworks like React, Vue, and Angular for front-end, and Express.js for back-end. Its versatility, ease of use, and community support make JavaScript one of the most important and widely used languages in the world today.\n'), Document(metadata={'source': 'data\\text_files\\python_intro.txt'

In [149]:
# Print the first three character-based chunks, with a divider line between
# consecutive chunks.
divider = '============================='
for chunk_index in range(3):
    if chunk_index > 0:
        print(divider)
    print(char_chunks[chunk_index])

🐍 Introduction to Python Programming Language
1. What is Python?
Python is a high-level, interpreted, general-purpose programming language created by Guido van Rossum and first released in 1991.
It’s designed to be easy to read, write, and learn, making it one of the most popular languages in the world today.
2. Why is Python so popular?
✅ Simple syntax (looks like plain English).
✅ Versatile (can be used for web apps, data science, AI, automation, games, etc.).
✅ Huge community & libraries (lots of free tools and support).


In [150]:
# Separators are listed from coarsest (blank line) to finest (empty string);
# sizes are measured with len, i.e. in characters.
recursive_options = {
    "separators": ['\n\n', "\n", " ", ""],
    "chunk_size": 200,
    "chunk_overlap": 20,
    "length_function": len,
}
recursive_spliter = RecursiveCharacterTextSplitter(**recursive_options)

recursive_chunks = recursive_spliter.split_text(text)

# Report how many chunks came back, then preview the first 100 characters.
recursive_total = len(recursive_chunks)
print(f" {recursive_total} Chunkes has been created!")
recursive_preview = recursive_chunks[0][:100]
print(f"First chunk: {recursive_preview}")



 8 Chunkes has been created!
First chunk: 🐍 Introduction to Python Programming Language
1. What is Python?


In [152]:
# Print the first three recursive chunks; the divider follows every chunk
# except the last one shown.
divider = '============================='
last_shown = 2
for chunk_index in range(last_shown + 1):
    print(recursive_chunks[chunk_index])
    if chunk_index != last_shown:
        print(divider)

🐍 Introduction to Python Programming Language
1. What is Python?
Python is a high-level, interpreted, general-purpose programming language created by Guido van Rossum and first released in 1991.
It’s designed to be easy to read, write, and learn, making it one of the most popular languages in the world today.


In [155]:
# Token-based splitter. NOTE(review): chunk_size / chunk_overlap are presumably
# counted in tokens rather than characters here; confirm against the
# TokenTextSplitter documentation.
token_spliter = TokenTextSplitter(chunk_overlap=20, chunk_size=200)

token_chunks = token_spliter.split_text(text)

# Report how many chunks came back, then preview the first 100 characters.
token_total = len(token_chunks)
print(f" {token_total} Chunkes has been created!")
token_preview = token_chunks[0][:100]
print(f"First chunk: {token_preview}")

 2 Chunkes has been created!
First chunk: 
🐍 Introduction to Python Programming Language
1. What is Python?

Python is a high-level, interpret


In [156]:
# Print the first two token-based chunks, each followed by a divider line.
divider = '============================='
for chunk_index in (0, 1):
    print(token_chunks[chunk_index])
    print(divider)


🐍 Introduction to Python Programming Language
1. What is Python?

Python is a high-level, interpreted, general-purpose programming language created by Guido van Rossum and first released in 1991.
It’s designed to be easy to read, write, and learn, making it one of the most popular languages in the world today.

2. Why is Python so popular?

✅ Simple syntax (looks like plain English).

✅ Versatile (can be used for web apps, data science, AI, automation, games, etc.).

✅ Huge community & libraries (lots of free tools and support).

✅ Cross-platform (works on Windows, macOS, Linux, and even mobile/embedded systems).

3. Key Features

Interpreted: You don’t need to compile it, just run directly.

Dynamic typing: No need to declare variable
t need to compile it, just run directly.

Dynamic typing: No need to declare variable types (x = 10 works without saying int).

Object-oriented: Supports classes and objects.

Extensible: Can integrate with C, C++, Java, etc.

Portable: Write once, run 