In [7]:
import re
import json


# ------------------ CLEAN HEADING FUNCTION ------------------
def clean_heading(text):
    """
    Clean broken/duplicated headings and fix common OCR errors.
    """
    corrections = {
        "CHEMIC": "CHEMICAL",
        "CHEMIC1.1": "CHEMICAL",  # Extra broken case
        "EQUA": "EQUATION",
        "TIONSTIONS": "TIONS",
        "EQUAAL": "EQUATION",
        "AL": "",  # Remove unnecessary 'AL' if broken
    }

    # Step 1: Remove repeated number pattern
    text = re.sub(r'^(\d+(?:\.\d+)+)\s+(.*?)\1.*', r'\1 \2', text)

    # Step 2: Split words and remove exact duplicates
    words = text.split()
    cleaned_words = []
    seen_words = set()

    for word in words:
        word = corrections.get(word, word)  # Apply corrections
        if word not in seen_words:
            cleaned_words.append(word)
            seen_words.add(word)

    return ' '.join(cleaned_words)


# ------------------ PARSE FUNCTION ------------------
def parse_chapters_and_sections(pages_text):
    """
    Parses the extracted text into structured chapters and headings.
    """
    structured_content = {}
    current_chapter = None
    current_heading = None
    buffer = ""

    # Regex patterns
    chapter_pattern = re.compile(r'\b(?:Chapter\s*(\d+)|(\d+)\s+CHAPTER)\b', re.IGNORECASE)
    heading_pattern = re.compile(r'^(\d+(?:\.\d+)+)\s+([A-Z][^\n]*)', re.IGNORECASE)

    chapters_seen = set()  # Avoid TOC duplicates

    for page in pages_text:
        lines = page.split('\n')
        for line in lines:
            line_clean = line.strip()
            if not line_clean:
                continue  # Skip empty lines

            # --------- Skip TOC/Index pages ----------
            if 'contents' in line_clean.lower() or 'index' in line_clean.lower():
                continue

            # ---------- Detect Chapter ----------
            chapter_match = chapter_pattern.search(line_clean)
            if chapter_match:
                chapter_num = chapter_match.group(1) or chapter_match.group(2)
                chapter_key = f"{chapter_num} CHAPTER"

                # Avoid duplicates (TOC chapters)
                if chapter_key in chapters_seen:
                    continue
                chapters_seen.add(chapter_key)

                # Save previous chapter/heading content
                if current_chapter:
                    if current_heading and buffer.strip():
                        structured_content[current_chapter][current_heading] = buffer.strip()
                    elif buffer.strip():
                        structured_content[current_chapter]["content"] = buffer.strip()

                # Initialize new chapter
                current_chapter = chapter_key
                structured_content[current_chapter] = {}
                current_heading = None
                buffer = ""
                print(f"🟢 Detected Chapter: {current_chapter}")
                continue  # Next line

            # ---------- Detect Section/Heading ----------
            heading_match = heading_pattern.match(line_clean)
            if heading_match:
                # Save previous heading content
                if current_heading and buffer.strip():
                    structured_content[current_chapter][current_heading] = buffer.strip()
                elif not current_heading and buffer.strip():
                    structured_content[current_chapter]["content"] = buffer.strip()

                # Clean and set heading
                heading_number = heading_match.group(1)
                heading_title = clean_heading(heading_match.group(2).strip())
                current_heading = f"{heading_number} {heading_title}"
                buffer = ""  # Reset for new section
                print(f"🔵 Detected Heading: {current_heading}")
                continue  # Next line

            # ---------- Accumulate Content ----------
            buffer += " " + line_clean  # Add content

    # ---------- Final buffer save ----------
    if current_chapter:
        if current_heading and buffer.strip():
            structured_content[current_chapter][current_heading] = buffer.strip()
        elif buffer.strip():
            structured_content[current_chapter]["content"] = buffer.strip()

    return structured_content


# ------------------ PRINT FUNCTION ------------------
def print_structured_content(structured_content):
    for chapter, sections in structured_content.items():
        print(f"\n=== {chapter} ===\n")
        for heading, content in sections.items():
            print(f"--- {heading} ---\n")
            print(content[:500])  # Print first 500 chars for preview
            print("\n")


# ------------------ SAVE TO JSON FUNCTION ------------------
def save_to_json(data, file_path):
    with open(file_path, "w", encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"\n✅ Data successfully saved to {file_path}")


# ------------------ PDF TEXT EXTRACTION FUNCTION ------------------
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path):
    """
    Extract text from PDF using PyPDF2.
    """
    extracted_pages = []
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        for page_number, page in enumerate(reader.pages):
            page_text = page.extract_text()
            extracted_pages.append(page_text if page_text else "")
    return extracted_pages


# ------------------ MAIN EXECUTION ------------------
if __name__ == "__main__":
    pdf_file_path = "sciencepart1.pdf"  # ✅ Path to your PDF file
    output_json_path = "structured_science_text.json"  # ✅ Output path for JSON

    # Step 1: Extract text from PDF
    pages_text = extract_text_from_pdf(pdf_file_path)

    # Step 2: Parse chapters and sections (with cleaning)
    structured_content = parse_chapters_and_sections(pages_text)

    # Step 3: Print preview in console
    print_structured_content(structured_content)

    # Step 4: Save structured output as JSON
    save_to_json(structured_content, output_json_path)

🟢 Detected Chapter: 1 CHAPTER
🟢 Detected Chapter: 2 CHAPTER
🟢 Detected Chapter: 3 CHAPTER
🟢 Detected Chapter: 6 CHAPTER
🟢 Detected Chapter: 7 CHAPTER
🟢 Detected Chapter: 12 CHAPTER
🟢 Detected Chapter: 13 CHAPTER
🟢 Detected Chapter: 15 CHAPTER
🔵 Detected Heading: 1.1 CHEMICAL  EQUATION TIONS
🔵 Detected Heading: 1.1.2 Balanced Chemical Equations
🔵 Detected Heading: 1.2.1 Combination Reaction
🔵 Detected Heading: 1.7 double of the amount collected in other? Name this gas.
🔵 Detected Heading: 1.2.3 Displacement Reaction
🔵 Detected Heading: 1.2.4 Double Displacement Reaction
🔵 Detected Heading: 1.3.1 Corrosion
🔵 Detected Heading: 1.3.2 Rancidity
🔵 Detected Heading: 2.1.1 Acids and Bases in the Laborator y
🔵 Detected Heading: 2.1.2 How do Acids and Bases React with Metals?nRinse both cloth strips water again check their odour .
🔵 Detected Heading: 2.1.3 How do Metal Carbonates and
🔵 Detected Heading: 0.5 g of sodium hydrogencarbonate
🔵 Detected Heading: 2.1.5 Reaction of Metallic Oxides with 