In [None]:
import os
import fitz
import re
import json

In [5]:
def normalize_text(text):
    """Merge wrapped lines into logical blocks, one block per output line.

    A new block starts at every line that is entirely upper-case (a heading)
    or that begins with a dotted item number such as ``1.`` or ``2.3.``.
    Every other non-empty line is appended, space-separated, to the block
    that is currently open. Blank lines are dropped.

    Args:
        text: Raw text, possibly spanning several lines.

    Returns:
        The merged blocks joined with ``"\\n"``; an empty string when the
        input contains no non-blank lines.
    """
    item_number = re.compile(r"^\d+(\.\d+)*\.")
    blocks = []
    current = ""

    for raw_line in text.splitlines():
        piece = raw_line.strip()
        if piece == "":
            continue

        opens_block = piece.isupper() or item_number.match(piece) is not None
        if not opens_block:
            # Continuation of the block that is currently open.
            current = current + " " + piece
            continue

        # Heading or numbered item: flush what was collected so far.
        if current:
            blocks.append(current.strip())
        current = piece

    if current:
        blocks.append(current.strip())

    return "\n".join(blocks)

def extract_sections(text, text_pages):
    """Split the page texts into the two chapters of the document.

    Lines are scanned in page order. A line containing
    "ОБ УТВЕРЖДЕНИИ ПРАВИЛ" opens chapter I and a line containing "ПРАВИЛА"
    opens chapter II; the heading lines themselves are not collected.
    Matching is a case-insensitive substring test on the stripped line.

    Heading markers are only honoured until chapter II has begun. After that
    every line belongs to chapter II, so a later mention of
    "об утверждении правил" (for example in a page header or a
    cross-reference) can no longer divert the remaining text back into
    chapter I.

    Args:
        text: Unused; kept so existing callers keep working.
        text_pages: Iterable of per-page text strings.

    Returns:
        A two-element list of ``(chapter_number, chapter_text, chapter_title)``
        tuples for chapters "I." and "II."; ``chapter_text`` is stripped and
        may be empty.
    """
    section1_marker = "ОБ УТВЕРЖДЕНИИ ПРАВИЛ"
    section2_marker = "ПРАВИЛА"

    collected = {1: [], 2: []}
    current = None          # chapter currently being filled; None before any heading
    section2_started = False

    for page_text in text_pages:
        for line in page_text.splitlines():
            if not section2_started:
                upper = line.strip().upper()
                if section1_marker in upper:
                    current = 1
                    continue
                if section2_marker in upper:
                    current = 2
                    section2_started = True
                    continue

            if current is not None:
                collected[current].append(line)

    section1_text = "\n".join(collected[1])
    section2_text = "\n".join(collected[2])

    return [
        ("I.", section1_text.strip(), "ОБ УТВЕРЖДЕНИИ ПРАВИЛ"),
        ("II.", section2_text.strip(), "ПРАВИЛА ГОСУЧЕТА БПЛА")
    ]

def extract_paragraphs(text):
    """Split section text into its numbered paragraphs.

    A paragraph starts at a dotted number such as ``1.`` or ``2.3.`` followed
    by whitespace, and runs until the next line that itself begins with such
    a number, or until the end of the text. Wrapped lines that merely start
    with a digit (e.g. "30 килограммов ...") stay inside the current
    paragraph instead of cutting it short.

    Args:
        text: Section text; paragraphs may span several lines.

    Returns:
        A list of dicts with keys "номер" (the paragraph number including its
        trailing dot) and "текст" (the paragraph body with line breaks
        replaced by spaces). Empty when no numbered paragraph is found.
    """
    number = r"\d+(?:\.\d+)*\."
    # The lookahead requires a complete paragraph number plus whitespace so
    # that only a genuine next paragraph terminates the current one.
    paragraph_pattern = re.compile(
        rf"({number})\s+(.*?)(?=\n{number}\s|\Z)",
        re.DOTALL,
    )
    return [
        {
            "номер": match.group(1).strip(),
            "текст": match.group(2).replace("\n", " ").strip(),
        }
        for match in paragraph_pattern.finditer(text)
    ]

def parse_pdf_to_json_25_05_2019_N_658(pdf_path, document_title):
    """Parse the PDF at ``pdf_path`` into a chapter/paragraph structure.

    Text is taken from every page except the first one (presumably a cover
    page; confirm against the source PDF), split into chapters with
    ``extract_sections`` and then into numbered paragraphs with
    ``extract_paragraphs``.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        document_title: Title stored under the "документ" key of the result.

    Returns:
        A dict with keys "документ" (the title) and "главы" (a list of dicts
        with keys "глава", "заголовок" and "пункты").
    """
    # The context manager releases the PDF handle even if extraction raises.
    with fitz.open(pdf_path) as doc:
        text_pages = [page.get_text() for page in doc[1:]]

    normalized_pages = [normalize_text(p) for p in text_pages]
    sections = extract_sections(normalized_pages, text_pages)

    result = {
        "документ": document_title,
        "главы": []
    }

    for chapter_num, section_text, chapter_title in sections:
        result["главы"].append({
            "глава": chapter_num,
            "заголовок": chapter_title,
            "пункты": extract_paragraphs(section_text)
        })

    return result


In [6]:
# Driver cell: parse the decree PDF and store the result as JSON.
document_title = "Постановление Правительства РФ от 25.05.2019 N 658 (ред. от 12.08.2022) 'Об утверждении Правил государственного учета беспилотных гражданских воздушных судов с максимальной взлетной массой от 0,15 килограмма до 30 килограммов, ввезенных в Российскую Федерацию или произведенных в Российской Федерации'"

# NOTE(review): machine-specific absolute paths; adjust them before running elsewhere.
pdf_path = "C:\\Users\\My Computer\\Desktop\\Work\\Learn\\OPK_Project\\documents_pdf\\25_05_2019_N_658.pdf"
outp_path = "C:\\Users\\My Computer\\Desktop\\Work\\Learn\\OPK_Project\\documents_json\\25_05_2019_N_658.json"

parsed_data = parse_pdf_to_json_25_05_2019_N_658(pdf_path, document_title)

# Create the output directory first so that open() does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(outp_path), exist_ok=True)

# ensure_ascii=False keeps the Cyrillic text readable in the JSON file.
with open(outp_path, "w", encoding="utf-8") as f:
    json.dump(parsed_data, f, ensure_ascii=False, indent=2)

print(f"JSON успешно сохранён в {outp_path}")

JSON успешно сохранён в C:\Users\My Computer\Desktop\Work\Learn\OPK_Project\documents_json\25_05_2019_N_658.json
