# full textbook parsing

In [2]:
import re
import json


# ------------------ CLEAN HEADING FUNCTION ------------------
def clean_heading(text):
    """
    Clean broken/duplicated headings and fix common OCR errors.
    """
    corrections = {
        "CHEMIC": "CHEMICAL",
        "CHEMIC1.1": "CHEMICAL",  # Extra broken case
        "EQUA": "EQUATION",
        "TIONSTIONS": "TIONS",
        "EQUAAL": "EQUATION",
        "AL": "",  # Remove unnecessary 'AL' if broken
    }

    # Step 1: Remove repeated number pattern
    text = re.sub(r'^(\d+(?:\.\d+)+)\s+(.*?)\1.*', r'\1 \2', text)

    # Step 2: Split words and remove exact duplicates
    words = text.split()
    cleaned_words = []
    seen_words = set()

    for word in words:
        word = corrections.get(word, word)  # Apply corrections
        if word not in seen_words:
            cleaned_words.append(word)
            seen_words.add(word)

    return ' '.join(cleaned_words)


# ------------------ PARSE FUNCTION ------------------
def parse_chapters_and_sections(pages_text):
    """
    Parses the extracted text into structured chapters and headings.
    """
    structured_content = {}
    current_chapter = None
    current_heading = None
    buffer = ""

    # Regex patterns
    chapter_pattern = re.compile(r'\b(\d+)\s*CHAPTER\b', re.IGNORECASE)
    heading_pattern = re.compile(r'^(\d+(?:\.\d+)+)\s+([A-Z][^\n]*)', re.IGNORECASE)

    chapters_seen = set()  # Avoid TOC duplicates

    for page in pages_text:
        lines = page.split('\n')
        for line in lines:
            line_clean = line.strip()
            if not line_clean:
                continue  # Skip empty lines

            # --------- Skip TOC/Index pages ----------
            if 'contents' in line_clean.lower() or 'index' in line_clean.lower():
                continue

            # ---------- Detect Chapter ----------
            chapter_match = chapter_pattern.search(line_clean)
            if chapter_match:
                chapter_num = chapter_match.group(1) or chapter_match.group(2)
                chapter_key = f"{chapter_num} CHAPTER"

                # Avoid duplicates (TOC chapters)
                if chapter_key in chapters_seen:
                    continue
                chapters_seen.add(chapter_key)

                # Save previous chapter/heading content
                if current_chapter:
                    if current_heading and buffer.strip():
                        structured_content[current_chapter][current_heading] = buffer.strip()
                    elif buffer.strip():
                        structured_content[current_chapter]["content"] = buffer.strip()

                # Initialize new chapter
                current_chapter = chapter_key
                structured_content[current_chapter] = {}
                current_heading = None
                buffer = ""
                print(f"🟢 Detected Chapter: {current_chapter}")
                continue  # Next line

            # ---------- Detect Section/Heading ----------
            heading_match = heading_pattern.match(line_clean)
            if heading_match:
                # Save previous heading content
                if current_heading and buffer.strip():
                    structured_content[current_chapter][current_heading] = buffer.strip()
                elif not current_heading and buffer.strip():
                    structured_content[current_chapter]["content"] = buffer.strip()

                # Clean and set heading
                heading_number = heading_match.group(1)
                heading_title = clean_heading(heading_match.group(2).strip())
                current_heading = f"{heading_number} {heading_title}"
                buffer = ""  # Reset for new section
                print(f"🔵 Detected Heading: {current_heading}")
                continue  # Next line

            # ---------- Accumulate Content ----------
            buffer += " " + line_clean  # Add content

    # ---------- Final buffer save ----------
    if current_chapter:
        if current_heading and buffer.strip():
            structured_content[current_chapter][current_heading] = buffer.strip()
        elif buffer.strip():
            structured_content[current_chapter]["content"] = buffer.strip()

    return structured_content


# ------------------ PRINT FUNCTION ------------------
def print_structured_content(structured_content):
    for chapter, sections in structured_content.items():
        print(f"\n=== {chapter} ===\n")
        for heading, content in sections.items():
            print(f"--- {heading} ---\n")
            print(content[:500])  # Print first 500 chars for preview
            print("\n")


# ------------------ SAVE TO JSON FUNCTION ------------------
def save_to_json(data, file_path):
    with open(file_path, "w", encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"\n✅ Data successfully saved to {file_path}")


# ------------------ PDF TEXT EXTRACTION FUNCTION ------------------
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path):
    """
    Extract text from PDF using PyPDF2.
    """
    extracted_pages = []
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        for page_number, page in enumerate(reader.pages):
            page_text = page.extract_text()
            extracted_pages.append(page_text if page_text else "")
    return extracted_pages


# ------------------ MAIN EXECUTION ------------------
if __name__ == "__main__":
    pdf_file_path = "ex.pdf"  # ✅ Path to your PDF file
    output_json_path = "structured_science_text.json"  # ✅ Output path for JSON

    # Step 1: Extract text from PDF
    pages_text = extract_text_from_pdf(pdf_file_path)

    # Step 2: Parse chapters and sections (with cleaning)
    structured_content = parse_chapters_and_sections(pages_text)

    # Step 3: Print preview in console
    print_structured_content(structured_content)

    # Step 4: Save structured output as JSON
    save_to_json(structured_content, output_json_path)

KeyError: None

# processing chapters ✅

In [16]:
import os
import json
import fitz  # PyMuPDF

# ------------------ EXTRACT TEXT WITH HEADINGS & SUBHEADINGS ------------------

def extract_text_with_markers(pdf_path):
    """
    Extracts text from a PDF and tags headings/subheadings dynamically based on font size.
    - Uses the largest font size for main headings.
    - Uses the second-largest font size for subheadings.
    - Skips unwanted labels like "Activity X.Y" and "Figure X.Y".
    """
    doc = fitz.open(pdf_path)
    extracted_text = []
    font_sizes = []

    # First Pass: Collect font sizes
    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        font_sizes.append(span["size"])

    # Find unique font sizes and determine thresholds
    unique_sizes = sorted(set(font_sizes), reverse=True)  # Sort from largest to smallest
    main_heading_size = unique_sizes[0] if len(unique_sizes) > 0 else 15
    subheading_size = unique_sizes[1] if len(unique_sizes) > 1 else 12  # Second-largest font

    # Second Pass: Extract text with markers
    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if "lines" in block:
                for line in block["lines"]:
                    text = " ".join(span["text"] for span in line["spans"]).strip()
                    if not text:
                        continue  # Skip empty lines
                    
                    # Extract font size and style
                    font_size = line["spans"][0]["size"]
                    font_name = line["spans"][0]["font"]

                    # ✅ Ignore "Activity X.Y" or "Figure X.Y" as headings
                    if re.match(r"^(Activity|Figure)\s*\d+(\.\d+)?", text, re.IGNORECASE):
                        continue  

                    # ✅ Detect **Main Headings** (Largest Font Size)
                    if font_size >= main_heading_size:
                        extracted_text.append(f"\n#HEADING# {text}\n")
                    
                    # ✅ Detect **Subheadings** (Second-Largest Font Size, Bold)
                    elif font_size >= subheading_size and "Bold" in font_name:
                        extracted_text.append(f"\n@SUBHEADING@ {text}\n")

                    # ✅ Normal text (Content)
                    else:
                        extracted_text.append(text)

    return "\n".join(extracted_text)  # Convert list to single text block


# ------------------ SAVE TO JSON FUNCTION ------------------

def save_to_json(data, file_path):
    with open(file_path, "w", encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"✅ Data saved to {file_path}")


# ------------------ PROCESS MULTIPLE PDF FILES ------------------

def process_textbook_pdfs(input_folder, output_folder):
    """
    Loops through all PDFs, extracts text, cleans it, and saves output as JSON.
    """
    os.makedirs(output_folder, exist_ok=True)  # Ensure output folder exists

    files_found = [f for f in os.listdir(input_folder) if f.endswith(".pdf")]

    if not files_found:
        print("❌ No PDF files found in the 'textbooks/' folder!")
        return

    print(f"📂 Found {len(files_found)} PDFs: {files_found}")

    for filename in files_found:
        pdf_path = os.path.join(input_folder, filename)
        chapter_number = filename.replace(".pdf", "")  # Extract 'chap1', 'chap2', etc.
        chapter_name = f"Chapter {chapter_number}"  # Format as 'Chapter 1'

        print(f"\n🚀 Extracting text from {filename}...")

        # Step 1: Extract structured text from PDF
        structured_text = extract_text_with_markers(pdf_path)

        if not structured_text.strip():
            print(f"⚠️ Warning: {filename} appears to be empty!")

        # Step 2: Save as JSON
        output_file = os.path.join(output_folder, f"{chapter_number}_output.json")
        save_to_json({chapter_name: structured_text}, output_file)

        print(f"✅ Processed: {filename} → Saved as {output_file}")


# ------------------ MAIN EXECUTION ------------------

if __name__ == "__main__":
    input_folder = "textbooks"  # Folder containing chap1.pdf, chap2.pdf, etc.
    output_folder = "processed_textbook"  # Folder to save output JSONs

    process_textbook_pdfs(input_folder, output_folder)


📂 Found 10 PDFs: ['chap1.pdf', 'chap12.pdf', 'chap13.pdf', 'chap15.pdf', 'chap2.pdf', 'chap3.pdf', 'chap6.pdf', 'chap7.pdf', 'ex.pdf', '~$ex.pdf']

🚀 Extracting text from chap1.pdf...
✅ Data saved to processed_textbook\chap1_output.json
✅ Processed: chap1.pdf → Saved as processed_textbook\chap1_output.json

🚀 Extracting text from chap12.pdf...
✅ Data saved to processed_textbook\chap12_output.json
✅ Processed: chap12.pdf → Saved as processed_textbook\chap12_output.json

🚀 Extracting text from chap13.pdf...
✅ Data saved to processed_textbook\chap13_output.json
✅ Processed: chap13.pdf → Saved as processed_textbook\chap13_output.json

🚀 Extracting text from chap15.pdf...
✅ Data saved to processed_textbook\chap15_output.json
✅ Processed: chap15.pdf → Saved as processed_textbook\chap15_output.json

🚀 Extracting text from chap2.pdf...
✅ Data saved to processed_textbook\chap2_output.json
✅ Processed: chap2.pdf → Saved as processed_textbook\chap2_output.json

🚀 Extracting text from chap3.pdf...

FileDataError: Failed to open file 'textbooks\\~$ex.pdf'.

In [21]:
import os
import json
import re
import fitz  # PyMuPDF

# ------------------ CLEAN TEXT FUNCTION ------------------
def clean_text(text):
    """
    Cleans extracted text by removing duplicate words, phrases, and fixing common OCR errors.
    """
    text = re.sub(r'\b(\w+\s*\d*\.?\d*)\s*(\1)+\b', r'\1', text, flags=re.IGNORECASE)  # Remove repeated words
    text = re.sub(r'(\b[\w\s,\'"-]+[.!?])\s*(\1)+', r'\1', text, flags=re.IGNORECASE)  # Remove repeated sentences
    text = re.sub(r'\n+', '\n', text)  # Replace multiple newlines with one
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# ------------------ EXTRACT TEXT FROM PDF (WITH MARKERS) ------------------
def extract_text_with_markers(pdf_path):
    """
    Extracts text from PDF and marks headings and subheadings with special symbols.
    - Uses font size to detect headings/subheadings.
    - Uses bold styling to differentiate subheadings.
    - Preserves content structure for better chunking later.
    """
    doc = fitz.open(pdf_path)
    extracted_text = []
    font_sizes = []

    for page in doc:
        blocks = page.get_text("dict")["blocks"]

        for block in blocks:
            if "lines" in block:
                for line in block["lines"]:
                    text = " ".join(span["text"] for span in line["spans"]).strip()

                    if not text:
                        continue  # Skip empty lines

                    font_size = line["spans"][0]["size"]
                    font_name = line["spans"][0]["font"]
                    font_sizes.append(font_size)

                    # ✅ Detecting **Main Headings** (Pink, Largest Font)
                    if font_size > max(font_sizes) * 0.9:  
                        extracted_text.append(f"\n#HEADING# {text}\n")

                    # ✅ Detecting **Subheadings** (Light Gray/Bold)
                    elif font_size > max(font_sizes) * 0.7 and "Bold" in font_name:
                        extracted_text.append(f"\n@SUBHEADING@ {text}\n")

                    # ✅ Normal text (Content)
                    else:
                        extracted_text.append(text)

    return "\n".join(extracted_text)

# ------------------ SAVE TO JSON FUNCTION ------------------
def save_to_json(data, file_path):
    with open(file_path, "w", encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"✅ Data saved to {file_path}")

# ------------------ PROCESS MULTIPLE PDF FILES ------------------
def process_textbook_pdfs(input_folder, output_folder):
    """
    Loops through all PDFs, extracts text, cleans it, and saves output as JSON.
    """
    os.makedirs(output_folder, exist_ok=True)  # Ensure output folder exists

    files_found = [f for f in os.listdir(input_folder) if f.endswith(".pdf") and not f.startswith("~$")]


    if not files_found:
        print("❌ No PDF files found in the 'textbooks/' folder!")
        return

    print(f"📂 Found {len(files_found)} PDFs: {files_found}")

    for filename in files_found:
        pdf_path = os.path.join(input_folder, filename)
        chapter_number = filename.split(".pdf")[0]  # Extract 'chap1', 'chap2', etc.
        chapter_name = f"Chapter {chapter_number}"  # Format as 'Chapter 1'

        print(f"\n🚀 Extracting text from {filename}...")

        # Step 1: Extract structured text from PDF
        structured_text = extract_text_with_markers(pdf_path)

        if not structured_text.strip():
            print(f"⚠️ Warning: {filename} appears to be empty!")

        # Step 2: Structure the content
        structured_data = {chapter_name: structured_text}

        # Step 3: Save as JSON
        output_file = os.path.join(output_folder, f"{chapter_number}_output.json")
        save_to_json(structured_data, output_file)

        print(f"✅ Processed: {filename} → Saved as {output_file}")

# ------------------ MAIN EXECUTION ------------------
if __name__ == "__main__":
    input_folder = "textbooks"  # Folder containing chap1.pdf, chap2.pdf, etc.
    output_folder = "processed_textbook"  # Folder to save output JSONs

    process_textbook_pdfs(input_folder, output_folder)


📂 Found 8 PDFs: ['chap1.pdf', 'chap12.pdf', 'chap13.pdf', 'chap15.pdf', 'chap2.pdf', 'chap3.pdf', 'chap6.pdf', 'chap7.pdf']

🚀 Extracting text from chap1.pdf...
✅ Data saved to processed_textbook\chap1_output.json
✅ Processed: chap1.pdf → Saved as processed_textbook\chap1_output.json

🚀 Extracting text from chap12.pdf...
✅ Data saved to processed_textbook\chap12_output.json
✅ Processed: chap12.pdf → Saved as processed_textbook\chap12_output.json

🚀 Extracting text from chap13.pdf...
✅ Data saved to processed_textbook\chap13_output.json
✅ Processed: chap13.pdf → Saved as processed_textbook\chap13_output.json

🚀 Extracting text from chap15.pdf...
✅ Data saved to processed_textbook\chap15_output.json
✅ Processed: chap15.pdf → Saved as processed_textbook\chap15_output.json

🚀 Extracting text from chap2.pdf...
✅ Data saved to processed_textbook\chap2_output.json
✅ Processed: chap2.pdf → Saved as processed_textbook\chap2_output.json

🚀 Extracting text from chap3.pdf...
✅ Data saved to proces

# Refining ✅

In [6]:
import os
import re
import json

# ---------------------- CLEANING FUNCTIONS -----------------------

def remove_repeated_phrases(text):
    """Remove repeated phrases (e.g., Activity 1.1 repeated)"""
    return re.sub(r'\b(\w+(?:\s+\w+){0,4})\b(?:\s+\1\b)+', r'\1', text, flags=re.IGNORECASE)


def fix_spacing_and_formatting(text):
    """Fix bullet points, spacing, and line breaks."""
    text = re.sub(r'\bn\s*', '\n- ', text)  # Replace 'n' with bullet points
    return text


def clean_broken_words(text):
    """Fix broken words split by OCR."""
    text = re.sub(r'\bn\s+', ' ', text)  # Remove standalone 'n'
    text = re.sub(r'(\w)-\s*(\w)', r'\1\2', text)  # Fix hyphenated broken words
    return text


def clean_headings(text):
    """Clean broken headings and duplicates."""
    text = re.sub(r'(\d+(?:\.\d+)+)\s+([A-Z]+(?:\s+[A-Z]+)+)', 
                  lambda m: m.group(1) + " " + m.group(2).replace(' ', ''), text)
    text = re.sub(r'\b(\d+(?:\.\d+)+.*?)\1\b', r'\1', text)  # Remove duplicated headings
    return text


def remove_figure_activity_repeats(text):
    """Remove repeated figure and activity captions."""
    text = re.sub(r'(Figure\s+\d+\.\d+)(\s+\1)+', r'\1', text)
    text = re.sub(r'(Activity\s+\d+\.\d+)(\s+\1)+', r'\1', text)
    return text


def format_equations(text):
    """Standardize equations."""
    text = re.sub(r'([A-Za-z0-9\(\)]+)\s*[-–>]\s*([A-Za-z0-9\(\)]+)', r'\1 → \2', text)
    text = re.sub(r'((?:[A-Za-z0-9\(\)]+\s*\+\s*)+[A-Za-z0-9\(\)]+)\s*→\s*((?:[A-Za-z0-9\(\)]+\s*\+\s*)*[A-Za-z0-9\(\)]+)', r'\n\1 → \2\n', text)
    return text


def add_section_breaks(text):
    """Add line breaks before headings for better segmentation."""
    return re.sub(r'(\n*)(\d+(?:\.\d+)*\s+[A-Z][^\n]+)', r'\n\n\2\n', text)


def remove_ktbs_notices(text):
    """Remove KTBS copyright notices."""
    return re.sub(r'©KTBS Not to be re published(?: Science)?', '', text, flags=re.IGNORECASE)


def final_cleanup(text):
    """Final text cleaning for common broken words & unwanted characters."""
    text = re.sub(r'\b→\s*t', 't', text)  # Fix '→ t' as 't'
    text = re.sub(r'\b→\s*n', 'n', text)  # Fix '→ n' as 'n'
    text = re.sub(r'\b→\s*ature', 'nature', text)  # Fix '→ ature' as 'nature'
    text = re.sub(r'\b→\s*eeds', 'needs', text)  # Fix '→ eeds' as 'needs'
    text = re.sub(r'\b→\s*itrate', 'nitrate', text)  # Fix '→ itrate' as 'nitrate'
    text = re.sub(r'\bail\b', 'nail', text)  # Fix 'ail' to 'nail'
    text = re.sub(r'\blear\s*→\s*t\b', 'learnt', text)  # Fix 'lear → t' as 'learnt'
    
    # Fix broken "not"
    text = re.sub(r'→\s*ot', 'not', text)
    
    # Remove repeated figures/activities
    text = re.sub(r'(Figure\s+\d+\.\d+)(\s*\1)+', r'\1', text)
    text = re.sub(r'(Activity\s+\d+\.\d+)(\s*\1)+', r'\1', text)

    return text


def clean_full_text(text):
    """Apply all text cleaning steps."""
    text = remove_ktbs_notices(text)  
    text = remove_repeated_phrases(text)
    text = fix_spacing_and_formatting(text)
    text = clean_broken_words(text)
    text = clean_headings(text)
    text = remove_figure_activity_repeats(text)
    text = format_equations(text)
    text = add_section_breaks(text)
    text = final_cleanup(text)
    return text


# ---------------------- STRUCTURE TEXT TO JSON -----------------------

def structure_text_to_json(text):
    """
    Splits cleaned text into headings and their corresponding content, preserving order.
    """
    structured_data = {}
    current_heading = "INTRODUCTION"  # Default starting heading
    buffer = ""

    heading_pattern = re.compile(r'^(\d+(?:\.\d+)+\s+.*)', re.MULTILINE)

    lines = text.split('\n')
    for line in lines:
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        heading_match = heading_pattern.match(line)
        if heading_match:
            if buffer:
                structured_data[current_heading] = buffer.strip()
                buffer = ""
            current_heading = heading_match.group(1).strip()
        else:
            buffer += " " + line  

    if buffer:
        structured_data[current_heading] = buffer.strip()

    return structured_data


# ---------------------- MAIN EXECUTION -----------------------

if __name__ == "__main__":
    # ✅ Input and Output Paths
    input_folder = "processed_textbook"  # Folder with cleaned JSON files
    cleaned_text_folder = "cleaned_output"  # Folder to save cleaned text
    structured_output_folder = "structured_output"  # Folder to save structured JSONs

    os.makedirs(cleaned_text_folder, exist_ok=True)
    os.makedirs(structured_output_folder, exist_ok=True)

    # ✅ Loop through each JSON file in input folder
    for filename in os.listdir(input_folder):
        if filename.endswith(".json"):
            input_path = os.path.join(input_folder, filename)
            chapter_name = filename.replace("_output.json", "")  # Extract "chap1" from "chap1_output.json"

            print(f"\n🚀 Processing {filename}...")

            # Step 1: Load JSON File
            with open(input_path, "r", encoding="utf-8") as file:
                json_data = json.load(file)

            # Extract text (assumes JSON format {"Chapter X": "Text content"})
            chapter_text = list(json_data.values())[0]  

            # Step 2: Clean text
            cleaned_text = clean_full_text(chapter_text)

            # Step 3: Save cleaned text
            cleaned_text_path = os.path.join(cleaned_text_folder, f"{chapter_name}_cleaned.txt")
            with open(cleaned_text_path, "w", encoding="utf-8") as file:
                file.write(cleaned_text)

            # Step 4: Structure text into JSON
            structured_json = structure_text_to_json(cleaned_text)

            # Step 5: Save structured JSON
            structured_json_path = os.path.join(structured_output_folder, f"{chapter_name}_structured.json")
            with open(structured_json_path, "w", encoding="utf-8") as file:
                json.dump(structured_json, file, ensure_ascii=False, indent=4)

            print(f"✅ Processed {filename} → Saved cleaned text & structured JSON.")




🚀 Processing chap12_output.json...
✅ Processed chap12_output.json → Saved cleaned text & structured JSON.

🚀 Processing chap13_output.json...
✅ Processed chap13_output.json → Saved cleaned text & structured JSON.

🚀 Processing chap15_output.json...
✅ Processed chap15_output.json → Saved cleaned text & structured JSON.

🚀 Processing chap1_output.json...
✅ Processed chap1_output.json → Saved cleaned text & structured JSON.

🚀 Processing chap2_output.json...
✅ Processed chap2_output.json → Saved cleaned text & structured JSON.

🚀 Processing chap3_output.json...
✅ Processed chap3_output.json → Saved cleaned text & structured JSON.

🚀 Processing chap6_output.json...
✅ Processed chap6_output.json → Saved cleaned text & structured JSON.

🚀 Processing chap7_output.json...
✅ Processed chap7_output.json → Saved cleaned text & structured JSON.

🚀 Processing ex_output.json...
✅ Processed ex_output.json → Saved cleaned text & structured JSON.


# Chunking of chapter files ✅

In [42]:
import os
import re
import json

# ---------------------- CHUNKING FUNCTION -----------------------

def chunk_text(text):
    """
    Splits text into structured chunks based on subheadings.
    """
    structured_data = {}
    current_section = "Introduction"
    buffer = ""

    # Pattern to detect valid subheadings (e.g., "1.1 Chemical Reactions")
    heading_pattern = re.compile(r'(\d+\.\d+(?:\.\d+)*\s+[A-Z][^\n]*)')

    lines = text.split("\n")
    for line in lines:
        line_clean = line.strip()

        # Ignore empty lines
        if not line_clean:
            continue

        # Detect valid subheadings
        heading_match = heading_pattern.match(line_clean)
        if heading_match:
            new_section = heading_match.group(1).strip()

            # Ignore headings with "Activity" or "Figure"
            if "Activity" in new_section or "Figure" in new_section:
                continue

            # Store previous section content
            if buffer.strip():
                structured_data[current_section] = buffer.strip()

            # Start new section
            current_section = new_section
            buffer = ""
        else:
            buffer += " " + line_clean  # Accumulate content

    # Store last section
    if buffer.strip():
        structured_data[current_section] = buffer.strip()

    return structured_data


def process_cleaned_file(input_path, output_path, chapter_name):
    """
    Reads a cleaned text file, chunks it by subheadings, and saves it as JSON.
    """
    with open(input_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Process the text into structured chunks
    structured_data = chunk_text(text)

    # Save the structured chunked file
    with open(output_path, "w", encoding="utf-8") as file:
        json.dump({chapter_name: structured_data}, file, ensure_ascii=False, indent=4)

    print(f"✅ Chunked file saved: {output_path}")


# ---------------------- MAIN EXECUTION -----------------------

if __name__ == "__main__":
    input_folder = "cleaned_output"  # Folder with cleaned text files
    output_folder = "chunked_output"  # Folder to save chunked JSONs

    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if filename.endswith("_cleaned.txt"):
            input_path = os.path.join(input_folder, filename)
            chapter_name = filename.replace("_cleaned.txt", "")  # Extract "chap1"

            output_path = os.path.join(output_folder, f"{chapter_name}_chunked.json")
            process_cleaned_file(input_path, output_path, chapter_name)


✅ Chunked file saved: chunked_output\chap12_chunked.json
✅ Chunked file saved: chunked_output\chap13_chunked.json
✅ Chunked file saved: chunked_output\chap15_chunked.json
✅ Chunked file saved: chunked_output\chap1_chunked.json
✅ Chunked file saved: chunked_output\chap2_chunked.json
✅ Chunked file saved: chunked_output\chap3_chunked.json
✅ Chunked file saved: chunked_output\chap6_chunked.json
✅ Chunked file saved: chunked_output\chap7_chunked.json


In [35]:
import os
import re
import json

# ---------------------- FUNCTION TO CLEAN SUBHEADINGS -----------------------

def clean_subheading(subheading):
    """
    Extracts only the heading number + short title (removes long sentences, duplicate words, and unnecessary text).
    """
    match = re.match(r'(\d+\.\d+)\s+([A-Za-z\s-]+)', subheading)
    if match:
        return f"{match.group(1)} {match.group(2).strip().split()[0]}"  # Keep only main topic
    return subheading.strip()

# ---------------------- FUNCTION TO FIX CHUNKED JSON -----------------------

def fix_chunked_json(input_path, output_path):
    """
    Reads a chunked JSON file, cleans subheading names, and saves a structured output.
    """
    with open(input_path, "r", encoding="utf-8") as file:
        chunked_data = json.load(file)

    fixed_data = {}
    for chapter, sections in chunked_data.items():
        fixed_data[chapter] = {}

        for subheading, chunks in sections.items():
            cleaned_subheading = clean_subheading(subheading)
            fixed_data[chapter][cleaned_subheading] = chunks

    # Save the cleaned JSON
    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(fixed_data, out_file, ensure_ascii=False, indent=4)

    print(f"✅ Fixed chunked file saved: {output_path}")

# ---------------------- MAIN EXECUTION -----------------------

if __name__ == "__main__":
    input_folder = "chunked_output"  # Folder with chunked JSON files
    output_folder = "fixed_chunked_output"  # Folder to save fixed JSONs

    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if filename.endswith("_chunked.json"):
            input_path = os.path.join(input_folder, filename)
            output_path = os.path.join(output_folder, filename.replace("_chunked", "_chunked_fixed"))

            fix_chunked_json(input_path, output_path)


✅ Fixed chunked file saved: fixed_chunked_output\chap12_chunked_fixed.json
✅ Fixed chunked file saved: fixed_chunked_output\chap13_chunked_fixed.json
✅ Fixed chunked file saved: fixed_chunked_output\chap15_chunked_fixed.json
✅ Fixed chunked file saved: fixed_chunked_output\chap1_chunked_fixed.json
✅ Fixed chunked file saved: fixed_chunked_output\chap2_chunked_fixed.json
✅ Fixed chunked file saved: fixed_chunked_output\chap3_chunked_fixed.json
✅ Fixed chunked file saved: fixed_chunked_output\chap6_chunked_fixed.json
✅ Fixed chunked file saved: fixed_chunked_output\chap7_chunked_fixed.json


# Combining Chunked chapters ✅

In [36]:
import os
import json

# ---------------------- FUNCTION TO MERGE ALL CHAPTER JSONs -----------------------

def merge_all_chapters(input_folder, output_file):
    """
    Merges all chapter-wise chunked JSON files into a single JSON file.
    """
    merged_data = {}

    for filename in sorted(os.listdir(input_folder)):  # Ensure correct chapter order
        if filename.endswith("_chunked_fixed.json"):
            chapter_name = filename.replace("_chunked_fixed.json", "")  # Extract "chap1", "chap2", etc.
            file_path = os.path.join(input_folder, filename)

            with open(file_path, "r", encoding="utf-8") as file:
                chapter_data = json.load(file)

            merged_data[chapter_name] = chapter_data.get(chapter_name, {})  # Keep only chapter content

    # Save merged JSON
    with open(output_file, "w", encoding="utf-8") as out_file:
        json.dump(merged_data, out_file, ensure_ascii=False, indent=4)

    print(f"✅ Merged JSON saved: {output_file}")

# ---------------------- MAIN EXECUTION -----------------------

if __name__ == "__main__":
    input_folder = "fixed_chunked_output"  # Folder with fixed chunked JSON files
    output_file = "all_chapters_chunked.json"  # Final merged JSON file

    merge_all_chapters(input_folder, output_file)


✅ Merged JSON saved: all_chapters_chunked.json


# Final refinement ✅

In [38]:
import json
import re
import os

# ---------------------- FUNCTION TO CLEAN SUBHEADINGS -----------------------

def clean_subheading(subheading):
    """
    Extracts only the heading number + short title (removes long sentences, duplicate words, and unnecessary text).
    """
    match = re.match(r'(\d+\.\d+)\s+([A-Za-z\s-]+)', subheading)
    if match:
        return f"{match.group(1)} {match.group(2).strip().split()[0]}"  # Keep only the main topic word
    return subheading.strip()


# ---------------------- FUNCTION TO CLEAN CONTENT -----------------------

def clean_content(text):
    """
    Cleans text by removing extra spaces, fixing broken words, and removing redundant symbols.
    """
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    text = re.sub(r'(\w)-\s*(\w)', r'\1\2', text)  # Fix hyphenated words split by OCR
    text = text.replace("©KTBS", "").replace("Not to be republished", "").strip()  # Remove unwanted text
    return text


# ---------------------- FUNCTION TO FIX CHUNKED JSON -----------------------

def fix_chunked_json(input_path, output_path):
    """
    Reads a chunked JSON file, cleans subheading names, formats content, and saves a structured output.
    """
    with open(input_path, "r", encoding="utf-8") as file:
        chunked_data = json.load(file)

    fixed_data = {}
    for chapter, sections in chunked_data.items():
        fixed_data[chapter] = {}

        for subheading, chunks in sections.items():
            cleaned_subheading = clean_subheading(subheading)
            cleaned_chunks = [clean_content(chunk) for chunk in chunks]

            fixed_data[chapter][cleaned_subheading] = cleaned_chunks

    # Save the cleaned JSON
    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(fixed_data, out_file, ensure_ascii=False, indent=4)

    print(f"✅ Fixed RAG-ready JSON saved: {output_path}")


# ---------------------- MAIN EXECUTION -----------------------

if __name__ == "__main__":
    input_file = "all_chapters_chunked.json"  # Input merged JSON file
    output_file = "knowledgebase.json"  # Final cleaned JSON for RAG

    fix_chunked_json(input_file, output_file)


✅ Fixed RAG-ready JSON saved: knowledgebase.json
