<a href="https://colab.research.google.com/github/nithishkesavarapu-code/floorplan_dimension_exrtraction/blob/main/assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install PyMuPDF pdfplumber regex

Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [11]:
import pdfplumber
import os
import regex as re
import math
import json
from typing import List, Dict, Any

In [12]:
DIMENSION_REGEX_RAW = r"""
    \b
    (?:
        # Pattern A: Feet and Inches (e.g., 2' 6 1/2", 3'-6")
        (?P<feet>\d+)\s*['\u2032][\s-]* # Feet value followed by ' or prime symbol
        (?:
            (?P<whole_inches>\d*)[\s-]* # Optional whole inches (can be 0)
            (?:(?P<num>\d+)[/](?P<den>\d+))?            # Optional fraction (e.g., 1/2)
        )?
        \s*["\u2033]?                                   # Optional inches symbol or double prime symbol
    )
    |
    (?:
        # Pattern B: Inches Only (e.g., 25", 34 1/2")
        (?P<inches_only>\d+)[\s-]* # Whole inches value
        (?:(?P<num_only>\d+)[/](?P<den_only>\d+))?      # Optional fraction
        \s*["\u2033]                                    # MUST have an inches symbol
    )
    |
    (?:
        # Pattern C: Simple Feet Only (e.g., 10')
        (?P<feet_only>\d+)\s*['\u2032]
    )
    \b
    |
    # NEW Pattern D: Dimensions separated by 'x' (e.g., 14'x8', 9'3"x10'3")
    # This is common in floorplans and needs special handling for accurate bbox in a moment.
    # For now, we capture the whole string.
    (?P<room_dim>
        \d+['\u2032] (?: \d+["\u2033] )? # First dimension (e.g., 14' or 9'3")
        \s*[xX]\s* # Separator 'x'
        \d+['\u2032] (?: \d+["\u2033] )? # Second dimension (e.g., 8' or 10'3")
    )
"""
DIMENSION_REGEX = re.compile(DIMENSION_REGEX_RAW, re.VERBOSE | re.IGNORECASE)

In [13]:
CODE_REGEX = re.compile(r"""
    \b                      # Word boundary
    [A-Z]{2,}               # Two or more uppercase letters (e.g., DB, SB)
    [0-9]{2,}               # Two or more digits (e.g., 24, 42)
    [A-Z0-9]* # Optional alphanumeric suffix (e.g., FH)
    \b                      # Word boundary
""", re.VERBOSE)

In [14]:
def convert_to_inches(match: re.Match) -> float:
    """Parses a dimension match object and converts the value to a float in inches."""
    total_inches = 0.0
    group_dict = match.groupdict()

    # Helper to convert a single dimension string (e.g., "9'3\"")
    def simple_string_to_inches(dim_str: str) -> float:
        f = 0.0
        i = 0.0

        # Extract feet (e.g., 9' or 14')
        feet_match = re.search(r"(\d+)\s*['\u2032]", dim_str)
        if feet_match:
            f = float(feet_match.group(1))

        # Extract inches (e.g., 3")
        inches_match = re.search(r"(\d+)\s*[\"\u2033]", dim_str)
        if inches_match:
            i = float(inches_match.group(1))

        return (f * 12.0) + i

    # Handle the 'x' separated room dimensions
    if group_dict.get('room_dim'):
        try:
            # Match the first dimension part
            # Adjusted regex to handle unicode/ascii prime/double prime symbols
            first_dim_match = re.match(r"(?P<dim_a>\d+['\u2032] (?:\d+[\"\u2033])?)\s*[xX]", group_dict['room_dim'])
            if first_dim_match:
                # Use the helper function to convert the first dimension part
                return simple_string_to_inches(first_dim_match.group('dim_a').strip())
            return 0.0
        except Exception:
            return 0.0

    # Existing logic for single dimensions
    try:
        # Handle Feet (Pattern A or C)
        feet_str = group_dict.get('feet') or group_dict.get('feet_only')
        if feet_str:
            total_inches += float(feet_str) * 12.0

        # Handle Inches and Fractions (Pattern A: with feet)
        whole_inches_str = group_dict.get('whole_inches')
        num_str = group_dict.get('num')
        den_str = group_dict.get('den')

        if match.group('feet'):
            if whole_inches_str and whole_inches_str.strip():
                total_inches += float(whole_inches_str)
            if num_str and den_str:
                total_inches += float(num_str) / float(den_str)

        # Handle Inches Only (Pattern B: no feet)
        inches_only_str = group_dict.get('inches_only')
        num_only_str = group_dict.get('num_only')
        den_only_str = group_dict.get('den_only')

        if inches_only_str:
            total_inches += float(inches_only_str)
            if num_only_str and den_only_str:
                total_inches += float(num_only_str) / float(den_only_str)

    except (ValueError, ZeroDivisionError, TypeError) as e:
        print(f"Error converting dimension '{match.group(0)}': {e}")
        return 0.0

    return round(total_inches, 4)

In [15]:
def get_bbox_for_span(words: List[Dict[str, Any]], start_index: int, end_index: int) -> List[float]:
    """
    Calculates the combined bounding box [x0, y0, x1, y1] for a sequence of words.
    """
    if start_index >= len(words) or end_index >= len(words):
        return [0.0, 0.0, 0.0, 0.0]

    start_word = words[start_index]
    end_word = words[end_index]

    # Combine the top-left of the first word with the bottom-right of the last word
    return [
        start_word['x0'],  # Smallest X
        start_word['top'], # Smallest Y (top)
        end_word['x1'],    # Largest X
        end_word['bottom'] # Largest Y (bottom)
    ]

In [16]:
def process_pdf_and_generate_json(pdf_path: str, output_json_path: str):
    """
    Main pipeline function to extract data, bounding boxes, and generate the final JSON file.
    """
    if not os.path.exists(pdf_path):
        print(f"Error: The file at {pdf_path} was not found. Cannot proceed.")
        return []

    final_results = []

    print("Starting PDF processing for Bbox extraction and JSON generation...")

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Iterate through each page of the PDF
            for page_num, page in enumerate(pdf.pages, 1):
                page_data = {
                    "page": page_num,
                    "dimensions": [],
                    "codes": []
                }

                # Use extract_words to get text with coordinates (bbox)
                words = page.extract_words(x_tolerance=3, y_tolerance=2)

                # 1. Create a raw text string and an index map
                raw_text_list = []
                index_to_word_map = {}

                current_char_index = 0
                for i, word in enumerate(words):
                    index_to_word_map[current_char_index] = i
                    raw_text_list.append(word['text'])
                    current_char_index += len(word['text'])

                    if i < len(words) - 1:
                        raw_text_list.append(' ')
                        current_char_index += 1

                page_text = "".join(raw_text_list)

                # --- A. DIMENSION EXTRACTION ---
                print(f"  Processing page {page_num} for dimensions...")

                for match in DIMENSION_REGEX.finditer(page_text):
                    raw_dim = match.group(0).strip()
                    inches = convert_to_inches(match)

                    # Find start word index
                    start_char_index = match.start()
                    start_word_index = None
                    for i in range(start_char_index, -1, -1):
                        if i in index_to_word_map:
                            start_word_index = index_to_word_map[i]
                            break

                    # Find end word index
                    current_match_end_word_index = start_word_index
                    current_span_length = 0

                    if start_word_index is not None:
                        for i in range(start_word_index, len(words)):
                            word_len = len(words[i]['text'])
                            current_span_length += word_len

                            # Check if the combined length of words covers the raw dimension string length (ignoring spaces)
                            if current_span_length >= len(raw_dim.replace(" ", "").replace("'","").replace('"','')):
                                current_match_end_word_index = i
                                break

                            current_span_length += 1 # Account for the space between words

                    bbox = [0.0, 0.0, 0.0, 0.0]
                    if start_word_index is not None and current_match_end_word_index is not None:
                        bbox = get_bbox_for_span(words, start_word_index, current_match_end_word_index)

                    page_data["dimensions"].append({
                        "raw": raw_dim,
                        "inches": inches,
                        "bbox": [round(c, 2) for c in bbox]
                    })

                # --- B. CODE EXTRACTION ---
                print(f"  Processing page {page_num} for codes...")

                for match in CODE_REGEX.finditer(page_text):
                    code = match.group(0).strip()

                    start_char_index = match.start()
                    start_word_index = None
                    for i in range(start_char_index, -1, -1):
                        if i in index_to_word_map:
                            start_word_index = index_to_word_map[i]
                            break

                    code_bbox = [0.0, 0.0, 0.0, 0.0]
                    if start_word_index is not None:
                        code_bbox = get_bbox_for_span(words, start_word_index, start_word_index)
                        page_data["codes"].append({
                            "code": code,
                            "bbox": [round(c, 2) for c in code_bbox]
                        })
                    else:
                         page_data["codes"].append({"code": code, "bbox": [0.0, 0.0, 0.0, 0.0]})


                final_results.append(page_data)

        # Write results to the JSON file
        with open(output_json_path, 'w') as f:
            json.dump(final_results, f, indent=4)

        print(f"\nSuccessfully extracted data for {len(final_results)} pages and saved to '{output_json_path}'")

    except Exception as e:
        print(f"A fatal error occurred during PDF processing: {e}")

    return final_results

In [17]:
if __name__ == "__main__":
    pdf_file_path = "/content/sample_data/floorplan.pdf"
    output_json_file = "output.json"

    # 1. Run the main pipeline to extract data and generate JSON
    extracted_data = process_pdf_and_generate_json(pdf_file_path, output_json_file)

    # Print summary (kept for verification as per previous steps)
    if extracted_data:
        print("\n--- SUMMARY OF FIRST PAGE (FOR CONSOLE VERIFICATION) ---")
        first_page = extracted_data[0]
        print(f"Page: {first_page['page']}")

        print("\nDimensions Found (First 5):")
        for dim in first_page["dimensions"][:5]:
            print(f"  Raw: {dim['raw']}, Inches: {dim['inches']:.2f}, Bbox: {dim['bbox']}")

        print("\nCodes Found:")
        if first_page["codes"]:
            for code in first_page["codes"]:
                print(f"  Code: {code['code']}, Bbox: {code['bbox']}")
        else:
            print("  No cabinet/appliance codes found on this page.")

        print("----------------------------------------------------------")

Starting PDF processing for Bbox extraction and JSON generation...
  Processing page 1 for dimensions...
  Processing page 1 for codes...

Successfully extracted data for 1 pages and saved to 'output.json'

--- SUMMARY OF FIRST PAGE (FOR CONSOLE VERIFICATION) ---
Page: 1

Dimensions Found (First 5):
  Raw: 50', Inches: 600.00, Bbox: [424.24, 82.86, 454.42, 106.02]
  Raw: 14', Inches: 168.00, Bbox: [482.08, 227.91, 499.06, 240.94]
  Raw: 8' 9, Inches: 105.00, Bbox: [512.81, 227.91, 522.55, 240.94]
  Raw: 3", Inches: 3.00, Bbox: [590.59, 238.11, 602.46, 251.14]
  Raw: 10' 3", Inches: 123.00, Bbox: [616.22, 238.11, 633.2, 251.14]

Codes Found:
  No cabinet/appliance codes found on this page.
----------------------------------------------------------
