# Document Splitting

In [18]:
import os
import sys
import langchain
from langchain.document_loaders import PyPDFLoader
from dotenv import load_dotenv
import json
from pylatexenc.latexencode import UnicodeToLatexEncoder
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, TokenTextSplitter
import tiktoken

In [2]:
# Populate the process environment (load_dotenv is expected to read a local
# .env file), then read the directories this notebook works with.
load_dotenv()
TEXTBOOK_DIRECTORY = os.getenv('TEXTBOOK_DIRECTORY')
DATA_DIRECTORY = os.getenv('DATA_DIRECTORY')

# Fail fast: os.getenv returns None for a missing variable, and the cells below
# interpolate DATA_DIRECTORY into file paths, which would otherwise silently
# produce paths such as "None/<book>/<chapter>.txt".
if not DATA_DIRECTORY:
    raise EnvironmentError(
        "DATA_DIRECTORY is not set; define it in the environment or in a .env file."
    )

## Define Splitter

In [29]:
# Chunking parameters shared by every splitter defined below.
# NOTE(review): 521 may be a transposition of 512 — confirm the intended value.
chunk_size = 521
chunk_overlap = 0

# Build one splitter of each kind from the same settings, in the order
# recursive-character, character, token.
# NOTE(review): the same chunk_size is handed to the character-based and the
# token-based splitters; the unit it is measured in presumably differs between
# them — confirm against the langchain documentation.
r_splitter, c_splitter, t_splitter = (
    splitter_cls(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    for splitter_cls in (
        RecursiveCharacterTextSplitter,
        CharacterTextSplitter,
        TokenTextSplitter,
    )
)

## Load textbook configuration

In [51]:
# Load the textbook configuration: a JSON object whose top-level keys are the
# textbook titles. The encoding is pinned to UTF-8 (matching the other open()
# calls in this notebook) so parsing does not depend on the platform's default
# text encoding.
with open('./textbook_config.json', encoding='utf-8') as f:
    textbook_config = json.load(f)

# Show which textbooks were found in the configuration.
print("Textbooks: ", textbook_config.keys())

Textbooks:  dict_keys(['Deep Learning', 'Stanford CS229', 'Math for Machine Learning', 'The Elements of Statistical Learning', 'An Introduction to Statistical Learning'])


## Split data into chunks

In [60]:
# Split every chapter listed in the textbook configuration into chunks with the
# token splitter, collect them per book, and write the result to chunks.json.
textbook_chunks = {}
total_chunks = 0

for book_name, book_info in textbook_config.items():
    # Book-level metadata is copied from the config; absent keys fall back to
    # an empty list / empty string.
    book_entry = {
        "authors": book_info.get("authors", []),
        "year": book_info.get("year", ""),
        "chapters": [],
    }
    textbook_chunks[book_name] = book_entry

    for chapter in book_info['chapters']:
        # Only the key of each chapter mapping is needed to locate the chapter's
        # text file; the associated value is not used here.
        for chapter_number in chapter.keys():
            chapter_path = f'{DATA_DIRECTORY}/{book_name}/{chapter_number}.txt'
            with open(chapter_path, 'r', encoding='utf-8') as chapter_file:
                chapter_text = chapter_file.read()

            chapter_chunks = t_splitter.split_text(chapter_text)
            book_entry["chapters"].append({chapter_number: chapter_chunks})
            total_chunks += len(chapter_chunks)

print(f"Split the knowledge base into {total_chunks} chunks.")

# Persist the chunks; ensure_ascii=False keeps non-ASCII characters readable in
# the UTF-8 output file.
with open(f'{DATA_DIRECTORY}/chunks.json', 'w', encoding='utf-8') as out_file:
    json.dump(textbook_chunks, out_file, ensure_ascii=False, indent=4)

Split the knowledge base into 522 chunks.
