# **🔷🔷Improving the RAG Architecture🔷🔷**

## **⭐01: Loading Markdown Files**

In [None]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader

# Load README.md through UnstructuredMarkdownLoader.
loader = UnstructuredMarkdownLoader("README.md")
markdown_content = loader.load()

# Show the first item the loader returned.
first_markdown_doc = markdown_content[0]
print(first_markdown_doc)

## **⭐02: Loading Python Files**

In [None]:
from langchain_community.document_loaders import PythonLoader

# Load the chatbot.py source file through PythonLoader.
loader = PythonLoader("chatbot.py")
python_data = loader.load()

# Show the first item the loader returned.
first_python_doc = python_data[0]
print(first_python_doc)

## **⭐03: Splitting Code Files**

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive character splitter configured with chunk_size=150 and
# chunk_overlap=10.
python_splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=10)
chunks = python_splitter.split_documents(python_data)

# Preview the text of the first three chunks, numbered from 1.
for number, piece in enumerate(chunks[:3], start=1):
    print(f"Chunk {number}:\n{piece.page_content}\n")

## **⭐04: Language-Specific Splitting**

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language

# Build the splitter via from_language with Language.PYTHON, using the same
# chunk_size/chunk_overlap values as the plain character splitter.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=150, chunk_overlap=10
)
chunks = python_splitter.split_documents(python_data)

# Preview the text of the first three chunks, numbered from 1.
for number, piece in enumerate(chunks[:3], start=1):
    print(f"Chunk {number}:\n{piece.page_content}\n")

## **⭐05: Token-Based Splitting**

In [None]:
import tiktoken
from langchain_text_splitters import TokenTextSplitter

# Sample sentence to be split by token count.
example_string = "Mary had a little lamb, it's fleece was white as snow."

# Look up the tiktoken encoding for gpt-4o-mini and hand its name to the
# splitter so both use the same encoding.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")
splitter = TokenTextSplitter(encoding_name=encoding.name, chunk_size=10, chunk_overlap=2)
chunks = splitter.split_text(example_string)

# Print every chunk, numbered from 1.
for number, piece in enumerate(chunks, start=1):
    print(f"Chunk {number}:\n{piece}\n")

In [None]:
# Report the number of tokens `encoding` produces for each chunk,
# followed by the chunk text itself.
for number, piece in enumerate(chunks, start=1):
    token_count = len(encoding.encode(piece))
    print(f"Chunk {number}:\nNo. tokens: {token_count}\n{piece}\n")

## **⭐06: Semantic Splitting**

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker

# Do not hard-code the API key in the notebook: with no api_key argument,
# OpenAIEmbeddings picks the key up from the OPENAI_API_KEY environment
# variable, so export that variable before running this cell.
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

# NOTE(review): for the "gradient" threshold type the amount appears to be
# interpreted as a percentile on a 0-100 scale, so 0.8 would place a
# breakpoint at nearly every sentence boundary -- confirm whether 80 was
# intended. The value is left unchanged here.
semantic_splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type="gradient",
    breakpoint_threshold_amount=0.8
)

# Split the previously loaded python_data and show the first resulting chunk.
chunks = semantic_splitter.split_documents(python_data)
print(chunks[0])

# 🧩 ***Full code***