In [28]:
import json
from pathlib import Path
from typing import Dict, List, Literal

import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

In [3]:
# Page that is scraped by the single-law cells and the file its articles are saved to.
URL = "https://www.paragraf.rs/propisi/zakon_o_radu.html"
output_path = Path("./srb_labor_law_data.json")

# The timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status() stops the notebook before an HTTP error page gets parsed.
page = requests.get(URL, timeout=30)
page.raise_for_status()

In [4]:
# Parse the raw response bytes into a navigable tree with the lxml parser.
soup = BeautifulSoup(markup=page.content, features="lxml")

In [21]:
def check_class_element(element, class_name: Literal["normal", "clan"]) -> bool:
    """Report whether the element's ``class`` attribute is exactly the requested class.

    Args:
        element: Object exposing ``get("class")``; the returned value is
            compared against a one-item list.
        class_name: ``"normal"`` selects the ``normal`` class; any other value
            is treated as ``"clan"``.

    Returns:
        True only when the ``class`` attribute equals ``[<class>]``. An element
        that carries additional classes, or no class at all, does not match.
    """
    expected = "normal" if class_name == "normal" else "clan"
    return element.get("class") == [expected]


def run_scraper(soup, url: str) -> List[Dict]:
    """Group the ``<p>`` elements of a parsed page into articles.

    Every ``<p>`` whose class is exactly ``clan`` starts a new article and
    supplies its title; the ``<p class="normal">`` elements that follow are
    collected as that article's texts until the next ``clan`` paragraph.

    Args:
        soup: Parsed document exposing ``find_all("p")``.
        url: Address of the page; used as the base of each article link.

    Returns:
        A list of dicts with the keys ``"title"`` (str), ``"texts"`` (list of
        str) and ``"link"`` (``"<url>#<name>"`` built from the ``name``
        attribute of the first ``<a>`` inside the title paragraph, or None
        when there is no such anchor or name). Articles without any text are
        kept, including the last one on the page.
    """
    law_articles = []
    current = None  # article currently being filled; None until a titled one starts

    for el in tqdm(soup.find_all("p")):
        if check_class_element(el, "clan"):
            # A new heading closes whatever article was open before it.
            if current is not None:
                law_articles.append(current)
                current = None

            title = el.get_text(strip=True)
            if not title:
                # Heading without text: nothing to file the following paragraphs under.
                continue

            anchor = el.find("a")  # looked up once instead of twice
            name_attr = anchor.get("name") if anchor is not None else None
            current = {
                "title": title,
                "texts": [],
                "link": f"{url}#{name_attr}" if name_attr else None,
            }
        elif current is not None and check_class_element(el, "normal"):
            current["texts"].append(el.get_text(strip=True))

    # Flush the trailing article under the same rule as inside the loop, so an
    # article with no body text is not dropped just because it comes last.
    if current is not None:
        law_articles.append(current)

    return law_articles

Write the scraped articles to a JSON file

In [7]:
# Scrape the page parsed above. Without this assignment `labor_law` is
# undefined on a fresh kernel and the cell fails under Restart & Run All.
labor_law = run_scraper(soup, URL)

with open(output_path, "w", encoding="utf-8") as file:
    # ensure_ascii=False stores non-ASCII characters as UTF-8 text instead of
    # \uXXXX escapes, which keeps the file readable and smaller.
    json.dump(labor_law, file, indent=4, ensure_ascii=False)

Read the JSON file back

In [17]:
# Load the saved articles back from disk, decoding straight from the file handle.
with open(output_path, mode="r", encoding="utf-8") as file:
    data = json.load(file)

## Scrape multiple laws

In [29]:
# Pages to scrape, one law per URL.
urls = [
    "https://www.paragraf.rs/propisi/zakon_o_radu.html",
    "https://www.paragraf.rs/propisi/zakon-o-porezu-na-dohodak-gradjana.html",
    "https://www.paragraf.rs/propisi/zakon_o_zastiti_podataka_o_licnosti.html",
    "https://www.paragraf.rs/propisi/zakon_o_zastiti_potrosaca.html",
    "https://www.paragraf.rs/propisi/porodicni_zakon.html",
]

# Directory that receives one JSON file per law. mkdir(exist_ok=True) is
# already a no-op for an existing directory, so no separate exists() check is
# needed; if the path exists but is not a directory, this raises right here.
output_dir = Path("./laws")
output_dir.mkdir(parents=True, exist_ok=True)

In [30]:
for url in tqdm(urls, desc="Scraping laws", total=len(urls)):
    # Output file is named after the last URL segment with its suffix swapped,
    # e.g. ".../zakon_o_radu.html" -> "zakon_o_radu.json".
    save_path = output_dir / Path(url).with_suffix(".json").name

    # Bound the wait on a stalled connection and fail on HTTP errors, so an
    # error page is never parsed and saved as if it were the law.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "lxml")

    law_articles = run_scraper(soup, url)

    with open(save_path, "w", encoding="utf-8") as file:
        # ensure_ascii=False stores non-ASCII characters as UTF-8 text
        # instead of \uXXXX escapes.
        json.dump(law_articles, file, indent=4, ensure_ascii=False)

Scraping laws:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1451 [00:00<?, ?it/s]

  0%|          | 0/1358 [00:00<?, ?it/s]

  0%|          | 0/1095 [00:00<?, ?it/s]

  0%|          | 0/1645 [00:00<?, ?it/s]

  0%|          | 0/1808 [00:00<?, ?it/s]