In [2]:
import os
import json

In [3]:
def load_and_chunk_data(chunk_size=800, data_path='./raw/', output_path='./processed/'):
    """Chunk every ``.txt`` file under ``data_path`` and save the chunks as JSON.

    The directory tree rooted at ``data_path`` is walked recursively. Each
    ``.txt`` file is read as UTF-8 and cut into consecutive slices of at most
    ``chunk_size`` characters. All chunks are written to
    ``<output_path>/processed_data.json`` and also returned.

    Directories and files are visited in sorted order so the resulting chunk
    order is reproducible across runs and machines.

    Args:
        chunk_size: Maximum number of characters per chunk. Must be positive.
        data_path: Root directory that is searched for ``.txt`` files.
        output_path: Directory that receives ``processed_data.json``; it is
            created if it does not exist.

    Returns:
        A list of dicts, one per chunk, with the keys ``"title"`` (file name
        without extension), ``"chunk_index"`` (1-based position of the chunk
        within its file) and ``"content"`` (the chunk text).

    Raises:
        ValueError: If ``chunk_size`` is not greater than zero.
    """
    # Fail fast: a zero step makes range() raise for every file, and a negative
    # step silently yields no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Create the output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Define the path for the output JSON file
    output_file_path = os.path.join(output_path, 'processed_data.json')

    docs_chunks = []

    # Recursively walk through subdirectories of data_path
    for root, dirs, files in os.walk(data_path):
        # Sorting in place steers os.walk's top-down traversal, making the
        # directory visiting order deterministic.
        dirs.sort()
        for filename in sorted(files):
            if not filename.endswith(".txt"):
                continue

            filepath = os.path.join(root, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as file:
                    text = file.read()
            except (OSError, UnicodeError) as e:
                # Best effort: report unreadable or badly encoded files and
                # continue with the remaining ones.
                print(f"Error processing file {filepath}: {e}")
                continue

            # Extract title from the filename (without extension)
            title = os.path.splitext(filename)[0]

            # Split the text into chunks of 'chunk_size' characters
            docs_chunks.extend(
                {
                    "title": title,
                    "chunk_index": chunk_number,
                    "content": text[start:start + chunk_size],
                }
                for chunk_number, start in enumerate(range(0, len(text), chunk_size), start=1)
            )

    # Write all chunks to a single JSON file
    with open(output_file_path, 'w', encoding='utf-8') as out_file:
        json.dump(docs_chunks, out_file, ensure_ascii=False, indent=4)

    return docs_chunks

# Run the pipeline once with its default settings and report the chunk count.
chunks = load_and_chunk_data(
    chunk_size=800,
    data_path='./raw/',
    output_path='./processed/',
)
print(f"Processed {len(chunks)} chunks and saved them in './processed/processed_data.json'.")

Processed 321254 chunks and saved them in './processed/processed_data.json'.
