In [1]:
import json
import random
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm


In [2]:
def extract_meta_data(doc_content):
    """Read the ``doc-meta`` table of a document into a dictionary.

    Args:
        doc_content: Parsed element (BeautifulSoup-style API) that contains a
            ``<table id="doc-meta">``.

    Returns:
        dict: One entry per ``<tr>``. The key is the ``<th>`` text with
        leading/trailing ``:`` characters removed; the value is the ``<td>``
        text exactly as found (no whitespace trimming). If two rows share a
        header, the later row wins.

    Raises:
        AttributeError: If the table is missing, or a row has no ``<th>`` or
            ``<td>`` cell (the lookups return ``None``).
    """
    table = doc_content.find("table", id="doc-meta")
    return {
        tr.find("th").text.strip(":"): tr.find("td").text
        for tr in table.find_all("tr")
    }


def extract_paragraphs(doc_content):
    """Group paragraph text under the ``<h2>`` heading that precedes it.

    For every ``<h2>`` found in ``doc_content`` the following siblings are
    walked until the next ``<h2>`` (or the end of the sibling chain). Only
    ``<p>`` siblings contribute text.

    Args:
        doc_content: Parsed element (BeautifulSoup-style API).

    Returns:
        list[dict]: One ``{"header": str, "paragraphs": list[str]}`` entry per
        heading, in document order. ``header`` is the stripped heading text.
        ``paragraphs`` holds the text fragments of each ``<p>``'s direct
        children, minus whitespace-only fragments and fragments containing a
        run of 13 dashes.
    """
    sections = []

    for heading in doc_content.find_all("h2"):
        title = heading.text.strip()
        collected = []

        node = heading.next_sibling
        while node and node.name != "h2":
            if node.name == "p":
                fragments = []
                for child in node.contents:
                    # A <strong> child ends this paragraph: it and everything
                    # after it inside the same <p> is ignored.
                    if child.name == "strong":
                        break
                    # Text nodes are kept as-is; other tags contribute .text.
                    fragments.append(child if isinstance(child, str) else child.text)

                # Drop blank fragments and dashed separator lines.
                collected += [
                    piece
                    for piece in fragments
                    if piece.strip() and "-------------" not in piece
                ]
            node = node.next_sibling

        sections.append({"header": title, "paragraphs": collected})

    return sections


def get_content(doc_content):
    """Flatten the document's sections into a ``{header: text}`` mapping.

    Args:
        doc_content: Parsed element passed straight to ``extract_paragraphs``.

    Returns:
        dict[str, str]: Lower-cased section header mapped to that section's
        paragraphs joined by a blank line (``"\\n\\n"``). When two sections
        share the same lower-cased header, the later one overwrites the
        earlier one.
    """
    content = {}
    for section in extract_paragraphs(doc_content):
        key = section["header"].lower()
        content[key] = "\n\n".join(section["paragraphs"])
    return content


def get_zveza(doc_content):
    """Collect the document's "zveza" (connection) entries by category.

    Each ``<div class="connection-category-wrapper">`` is paired with the next
    sibling ``<div class="connection-detail1">``. If no category wrapper is
    present, the whole ``<div id="doc-connection">`` text is used instead.

    Args:
        doc_content: Parsed element (BeautifulSoup-style API).

    Returns:
        dict[str, str]: Stripped category title mapped to stripped detail
        text. A category with no following detail element maps to ``""``.
        When there are no categories but a ``doc-connection`` element exists,
        the result is ``{"Without category": <its text>}``; otherwise ``{}``.
    """
    zveza = {}
    for zveza_title in doc_content.find_all(
        "div", class_="connection-category-wrapper"
    ):
        zveza_detail = zveza_title.find_next_sibling("div", class_="connection-detail1")
        # find_next_sibling returns None when no matching sibling follows;
        # keep the category with an empty value instead of raising and
        # losing the whole document.
        detail_text = zveza_detail.text.strip() if zveza_detail is not None else ""
        zveza[zveza_title.text.strip()] = detail_text

    if not zveza:
        connection = doc_content.find("div", id="doc-connection")

        if connection:
            zveza["Without category"] = connection.text.strip()

    return zveza


In [3]:
def get_table_links_from_page(page_number, timeout=1):
    """Return absolute document URLs listed on one search-results page.

    The results page is loaded in headless Chrome, and the HTML is read once
    the ``results-table`` element is present.

    Args:
        page_number: Value substituted into the ``page`` query parameter.
        timeout: Seconds to wait for the results table to appear
            (default ``1``, the value that was previously hard-coded).

    Returns:
        list[str]: ``https://www.sodnapraksa.si`` + ``href`` for every
        ``<a href=...>`` inside the results table, with spaces in the href
        replaced by ``%20``.

    Raises:
        selenium.common.exceptions.TimeoutException: If the table does not
            appear within ``timeout`` seconds.
    """
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode

    table_url = f"https://www.sodnapraksa.si/?q=(podrocje:%22civilno%20procesno%20pravo%22)%20IN%20(oddelek:%22Civilni%20oddelek%22%20ALI%20oddelek:%22Gospodarski%20oddelek%22)%20IN%20(sodisce:%22Vrhovno%20sodi%C5%A1%C4%8De%22)&database[SOVS]=SOVS&_submit=i%C5%A1%C4%8Di&showType=table&order=date&direction=desc&page={page_number}&rowsPerPage=20"
    # Alternative query kept for reference:
    # table_url = f"https://www.sodnapraksa.si/?q=(podrocje:%22kazensko%20procesno%20pravo%22)%20IN%20(oddelek:%22Kazenski%20oddelek%22)&database[SOVS]=SOVS&_submit=i%C5%A1%C4%8Di&showType=table&order=date&direction=desc&page={page_number}&rowsPerPage=20"

    # Initialize the Chrome WebDriver
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(table_url)

        # Wait for the table to load
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, "results-table"))
        )

        # Get the page source after JavaScript has run
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even when loading or waiting fails;
        # otherwise every failed page leaves a Chrome process behind.
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")
    table_content = soup.find("table", id="results-table")
    links = table_content.find_all("a", href=True)
    hrefs = [
        f"https://www.sodnapraksa.si{link.get('href').replace(' ', '%20')}"
        for link in links
    ]
    return hrefs


def extract_data_from_table_link(url):
    """Download one document page and extract its metadata, text and links.

    Args:
        url: Absolute URL of the document page.

    Returns:
        dict | None: ``{"meta": ..., "content": ..., "zveza": ...}`` built by
        ``extract_meta_data``, ``get_content`` and ``get_zveza``, with the
        source ``url`` stored under ``meta["url"]``. Returns ``None`` (after
        printing the error) if the download or any extraction step fails.
    """
    try:
        # Add random delay between 0-1000ms so parallel workers do not hit
        # the server at the same instant.
        time.sleep(random.randint(0, 1000) / 1000)

        # Without a timeout a stalled connection would block this worker
        # (and therefore the whole page batch) indefinitely.
        response = requests.get(url, timeout=30)
        # Fail fast on HTTP errors instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        doc_content = soup.find("div", id="doc-content")
        if doc_content is None:
            raise ValueError("page has no <div id=\"doc-content\"> element")

        meta = extract_meta_data(doc_content)
        content = get_content(doc_content)
        zveza = get_zveza(doc_content)

        meta["url"] = url
        return {"meta": meta, "content": content, "zveza": zveza}
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip
        # this document rather than abort the whole run.
        print(f"Error extracting data from {url}: {e}")
        return None


In [4]:
# Scrape configuration.
SUBFOLDER = "cpp_in__co_ali_go__in_vs"
START_PAGE = 193  # first results page to fetch (presumably a resume offset - confirm)
END_PAGE = 816  # exclusive upper bound of the page range
OUTPUT_DIR = Path("./data/scraped") / SUBFOLDER

# Make sure the target folder exists so the first write cannot fail with
# FileNotFoundError.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

for page_num in tqdm(range(START_PAGE, END_PAGE)):
    links = get_table_links_from_page(page_num)

    # Fetch the documents of one results page with a small worker pool.
    with ThreadPoolExecutor(max_workers=3) as executor:
        results = list(executor.map(extract_data_from_table_link, links))

    for result in results:
        if not result:
            # Extraction failed; the error was already printed by the worker.
            continue

        evidencna = result["meta"].get("Evidenčna številka")
        if not evidencna:
            # Without the record number there is no file name; skip this
            # document instead of aborting the whole run with a KeyError.
            print(f"Skipping {result['meta'].get('url')}: no 'Evidenčna številka' in metadata")
            continue

        filename = OUTPUT_DIR / f"{evidencna}.json"

        # json.dump escapes non-ASCII characters by default (ensure_ascii).
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=4)


100%|██████████| 623/623 [1:16:13<00:00,  7.34s/it]


In [5]:
# url = "https://www.sodnapraksa.si/?q=VS2006070&database[SOVS]=SOVS&_submit=i%C5%A1%C4%8Di&rowsPerPage=20&page=0&id=2012032113046733"


In [6]:
# extract_data_from_table_link(url)