In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json

# URL of the webpage
kars_url = "https://www.riigiteataja.ee/akt/184411"

# Send a request to fetch the content of the webpage
response = requests.get(kars_url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Extract the raw text from the webpage
raw_text = soup.get_text()

# Clean up the text
cleaned_text = " ".join(raw_text.split())

# Regular expressions to identify Peatükk, Jagu, and § (paragrahvid)
peatukk_re = re.compile(r"(\d+\. peatükk .+)")
jagu_re = re.compile(r"(\d+\. jagu .+)")
paragraph_re = re.compile(r"(§ \d+\..+)")

# Structure to store the parsed data
parsed_data = []
current_peatukk = None
current_jagu = None

# Split text into lines for processing
lines = cleaned_text.split("§")

# Process each line
for line in lines:
    # Check if it's a peatükk (chapter)
    peatukk_match = peatukk_re.search(line)
    if peatukk_match:
        current_peatukk = {
            "name": peatukk_match.group(1).strip(),
            "sections": []
        }
        parsed_data.append(current_peatukk)
        continue

    # Check if it's a jagu (section)
    jagu_match = jagu_re.search(line)
    if jagu_match and current_peatukk is not None:
        current_jagu = {
            "name": jagu_match.group(1).strip(),
            "paragraphs": []
        }
        current_peatukk["sections"].append(current_jagu)
        continue

    # Process paragraphs
    paragraph_match = paragraph_re.search("§" + line)  # Add back the '§'
    if paragraph_match and current_jagu is not None:
        paragraph = paragraph_match.group(1).strip()
        # Split the paragraph into multiple parts (e.g., subsections like (1), (2))
        paragraph_parts = re.split(r'\(\d+\)', paragraph)
        paragraph_data = {"text": paragraph_parts[0].strip(), "subsections": []}
        for part in paragraph_parts[1:]:
            paragraph_data["subsections"].append(part.strip())
        current_jagu["paragraphs"].append(paragraph_data)

# Save the structured data as a JSON file
with open('kars_parsed.json', 'w', encoding='utf-8') as f:
    json.dump(parsed_data, f, ensure_ascii=False, indent=4)

# Print output for verification
print(json.dumps(parsed_data, ensure_ascii=False, indent=4))


[
    {
        "name": "1. peatükk ÜLDSÄTTED",
        "sections": []
    },
    {
        "name": "2. peatükk SÜÜTEGU 1. jagu Süüteokoosseis",
        "sections": [
            {
                "name": "2. jagu Õigusvastasuse välistamine",
                "paragraphs": [
                    {
                        "text": "§ 27. Õigusvastane tegu Õigusvastane on tegu, mis vastab seaduses sätestatud süüteokoosseisule ja mille õigusvastasus ei ole välistatud käesoleva seadustiku, muu seaduse, rahvusvahelise konventsiooni või rahvusvahelise tavaga.",
                        "subsections": []
                    },
                    {
                        "text": "§ 28. Hädakaitse",
                        "subsections": [
                            "Tegu ei ole õigusvastane, kui isik tõrjub vahetut või vahetult eesseisvat õigusvastast rünnet enda või teise isiku õigushüvedele, kahjustades ründaja õigushüvesid, ületamata seejuures hädakaitse piiri.",
                            