In [1]:
import json

import classla
from pathlib import Path
from src.nlp.model import EntType
from src.nlp.text_processor import TextProcessor
from src.parse import (
    _find_entity,
    _map_titles_to_abbreviations,
    _normalize_input_for_clen,
    pdf_to_text,
)
import re
from spacy import displacy
from collections import namedtuple

classla.download("sl")

Downloading https://raw.githubusercontent.com/clarinsi/classla-resources/main/resources_1.0.1.json: 10.3kB [00:00, 13.2MB/s]                   
2025-02-26 14:17:01 INFO: Downloading these customized packages for language: sl (Slovenian)...
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |
| depparse  | standard |
| ner       | standard |
| pretrain  | standard |

2025-02-26 14:17:02 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/pos/standard.pt.
2025-02-26 14:17:02 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/lemma/standard.pt.
2025-02-26 14:17:02 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/depparse/standard.pt.
2025-02-26 14:17:02 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/ner/standard.pt.
2025-02-26 14:17:02 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/pretrain/standard.pt.
2025-02-26 14:17:02 INFO: Finished downloading models and saved to 

In [2]:
tp = TextProcessor(mappings_path=Path("./data/mappings/mappings_all.parquet"))

2025-02-26 14:17:03 INFO: Loading these models for language: sl (Slovenian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |

2025-02-26 14:17:03 INFO: Use device: cpu
2025-02-26 14:17:03 INFO: Loading: tokenize
2025-02-26 14:17:03 INFO: Loading: pos
2025-02-26 14:17:06 INFO: Loading: lemma
2025-02-26 14:17:09 INFO: Done loading processors!


In [3]:
# https://www.sodnapraksa.si/?q=Ips%2023638/2021&database[SOVS]=SOVS&_submit=i%C5%A1%C4%8Di&rowsPerPage=20&page=0&id=2015081111479996
# file_path = "./data/test/I Ips 23638-2021.pdf"

# https://www.sodnapraksa.si/?q=II%20Ips%2032/2024&database[SOVS]=SOVS&_submit=i%C5%A1%C4%8Di&rowsPerPage=20&page=0&id=2015081111477717
# file_path = "./data/test/II Ips 32-2024.pdf"

# https://www.sodnapraksa.si/?q=III%20Ips%2011/2024&database[SOVS]=SOVS&_submit=i%C5%A1%C4%8Di&rowsPerPage=20&page=0&id=2015081111480014
file_path = "./data/test/III Ips 11-2024.pdf"

text_to_process = pdf_to_text(file_path=file_path)
text_to_process = text_to_process.replace("\n", " ")

In [4]:
text_to_process

'SODBA V IMENU LJUDSTVA Vrhovno sodišče Republike Slovenije je v senatu, ki so ga sestavljali vrhovna sodnica in sodniki Franc Seljak kot predsednik ter Magda Teppey, mag. Matej Čujovič, dr. Miodrag Đorđevič in dr. Damjan Orož kot člani, v gospodarskem sporu tožeče stranke: ... zoper toženo stranko: ... zaradi plačila 69.674,98 EUR s pripadki, o reviziji tožene stranke zoper sodbo Višjega sodišča v Mariboru I Cpg 188/2023 z dne 12. 10. 2023  v zvezi s sodbo Okrožnega sodišča v Mariboru I Pg 563/2021 z dne 2. 6. 2023, na seji 10. decembra 2024 1/9 III Ips 11/2024  RAZSODILO: I. Revizija se zavrne. II. Tožena stranka je dolžna tožeči stranki povrniti stroške odgovora na revizijo v znesku 1.116,30 EUR v roku 15 dni od prejema te sodbe, v primeru zamude z zakonskimi zamudnimi obrestmi od dne poteka roka za prostovoljno izpolnitev obveznosti dalje do plačila. OBRAZLOŽITEV: Odločitvi in nosilni razlogi sodbe sodišč prve in druge stopnje 1. Sodišče prve stopnje je toženi stranki naložilo plač

In [5]:
test = tp.nlp(text_to_process)

In [6]:
entities_types_to_parse = [
    EntType.DOC_TITLE,
    EntType.DOC_ABBR,
    EntType.CLEN,
    EntType.CLEN_LEFT,
]

type_to_found_entities = {x: _find_entity(test, x) for x in entities_types_to_parse}

In [7]:
SpanTmp = namedtuple("Span", ["span", "start", "end"])

_spans = [SpanTmp(x, x.start, x.end) for x in test.spans["spans"]]

spans_filtered = []
while _spans:
    is_contained = False
    x = _spans.pop()
    for other in spans_filtered:
        if x.start >= other.start and x.end <= other.end:
            is_contained = True

    if is_contained:
        continue

    spans_filtered.append(x.span)

In [8]:
test.spans["sc"] = spans_filtered

colors = {"NAVEDBA_ZAKONA": "orange", "ODLOCITEV": "lightblue"}
options = {"ents": ["NAVEDBA_ZAKONA", "ODLOCITEV"], "colors": colors}
displacy.render(test, style="span", options=options)

In [9]:
# colors = {"NAVEDBA_ZAKONA": "lightblue", "ODLOCITEV": "red"}
# options = {"ents": ["NAVEDBA_ZAKONA", "ODLOCITEV"], "colors": colors}

# displacy.serve(test, style="span", port=8000, options=options)

In [10]:
zveza_found = []

navedbe_zakona = [x for x in test.spans["spans"] if x.label_ == "navedba_zakona"]
for span in navedbe_zakona:
    entities_types_to_parse = [
        EntType.DOC_TITLE,
        EntType.DOC_ABBR,
        EntType.CLEN,
        EntType.CLEN_LEFT,
    ]

    # Extract abbreviations
    type_to_found_entities = {x: _find_entity(span, x) for x in entities_types_to_parse}
    _abbr = _map_titles_to_abbreviations(
        titles=type_to_found_entities[EntType.DOC_TITLE], tp=tp
    )
    _abbr.extend(type_to_found_entities[EntType.DOC_ABBR])
    found_abbr = max([str(x) for x in _abbr], key=len)

    # Extract clens
    nums = []
    for x in span.text.split(" "):
        pattern = r"\d{1,4}\."
        matches = []
        for word in x:
            if re.match(pattern, word):
                matches.append(word)

    if len(nums) > 1:
        for num in nums:
            zveza_found.append(f"{num} člen {found_abbr}")
    else:
        clens = [
            *[str(x) for x in type_to_found_entities[EntType.CLEN]],
            *[str(x) for x in type_to_found_entities[EntType.CLEN_LEFT]],
        ]

        if len(clens) != 1:
            print(clens)
            raise ValueError

        for x in clens:
            found_clen = _normalize_input_for_clen(str(x))
            zveza_found.append(f"{found_clen} {found_abbr}")

        if not found_clen.split(" ")[0].strip(".").isdigit():
            found_num = found_clen.split(".")[0]
            zveza_found.append(f"{found_num}. člen {found_abbr}")

zveza_found = sorted(list(set(zveza_found)), key=lambda x: int(x.split(".")[0]))


In [11]:
zveza_found

['4. člen OZ',
 '5. člen OZ',
 '7. člen ZPP',
 '82. člen OZ',
 '165. člen ZPP',
 '313. člen ZPP',
 '324. člen ZPP',
 '339. člen ZPP',
 '367.c člen ZPP',
 '367. člen ZPP',
 '367.a člen ZPP',
 '367.b člen ZPP',
 '378. člen ZPP']