In [1]:
import json

import classla
from pathlib import Path
from src.nlp.model import EntType
from src.nlp.text_processor import TextProcessor
from src.parse import (
    pdf_to_text,
    _find_entity,
    _map_titles_to_abbreviations,
    _normalize_input_for_clen,
)

classla.download("sl")

Downloading https://raw.githubusercontent.com/clarinsi/classla-resources/main/resources_1.0.1.json: 10.3kB [00:00, 10.7MB/s]                   
2025-02-21 12:33:45 INFO: Downloading these customized packages for language: sl (Slovenian)...
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |
| depparse  | standard |
| ner       | standard |
| pretrain  | standard |

2025-02-21 12:33:47 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/pos/standard.pt.
2025-02-21 12:33:47 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/lemma/standard.pt.
2025-02-21 12:33:47 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/depparse/standard.pt.
2025-02-21 12:33:47 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/ner/standard.pt.
2025-02-21 12:33:47 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/pretrain/standard.pt.
2025-02-21 12:33:47 INFO: Finished downloading models and saved to 

In [2]:
with open("data/training/data.json", "r", encoding="utf-8") as f:
    training_data = json.load(f)


In [3]:
tp = TextProcessor(mappings_path=Path("./data/mappings/legislation_laws.parquet"))

2025-02-21 12:33:47 INFO: Loading these models for language: sl (Slovenian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |

2025-02-21 12:33:47 INFO: Use device: cpu
2025-02-21 12:33:47 INFO: Loading: tokenize
2025-02-21 12:33:47 INFO: Loading: pos
2025-02-21 12:33:51 INFO: Loading: lemma
2025-02-21 12:33:54 INFO: Done loading processors!


In [4]:
TRAINING_DOCUMENT_ID = 2
jedro = training_data[TRAINING_DOCUMENT_ID]["jedro"]
zveza = training_data[TRAINING_DOCUMENT_ID]["zveza"]
path = Path(training_data[TRAINING_DOCUMENT_ID]["path"])

text_to_process = pdf_to_text(path)
text_to_process = text_to_process.replace("\n", " ").replace(
    "  ", " "
)  # only in processing

# text_to_process = "1. clen KZ bistvenih kršitev določb kazenskega postopka iz prvega odstavka 371. clena in 375. člena ZKP in drugih kršitev določb"

In [5]:
test = tp.nlp(text_to_process)

In [6]:
entities_types_to_parse = [
    EntType.DOC_TITLE,
    EntType.DOC_ABBR,
    EntType.CLEN,
    EntType.CLEN_LEFT,
]

type_to_found_entities = {x: _find_entity(test, x) for x in entities_types_to_parse}
type_to_found_entities

{<EntType.DOC_TITLE: 'DOC_TITLE'>: [Kazenskega zakonika,
  Zakona o kazenskem postopku,
  Kazenskega zakonika,
  Kazenskega zakonika],
 <EntType.DOC_ABBR: 'DOC_ABBR'>: [KZ-1,
  KZ-1,
  ZKP,
  ZKP,
  KZ-1,
  KZ-1,
  KZ,
  ZKP,
  KZ-1,
  KZ-1,
  KZ-1,
  KZ,
  KZ-1,
  KZ-1,
  ZKP,
  ZKP,
  ZKP,
  KZ,
  ZKP,
  KZ-1,
  KZ-1,
  KZ-1,
  KZ-1,
  KZ-1,
  ZKP,
  KZ-1,
  KZ-1],
 <EntType.CLEN: 'CLEN'>: [116. člena,
  34. člena,
  220. člena,
  420. člena,
  423. člena,
  116. člena,
  34. člena,
  116. člena,
  116. člena,
  372. člena,
  116. člena,
  34. člena,
  115. člena,
  116. členu,
  116. člena,
  115. člena,
  115. člena,
  115. člena,
  34. členom,
  6. člena,
  6. člena,
  29. člena,
  258. člena,
  372. člena,
  420. člena,
  116. člena,
  34. člena,
  426. člena,
  115. člena,
  34. člena,
  53. člena,
  220. člena,
  115. člena,
  49. člena,
  425. člen],
 <EntType.CLEN_LEFT: 'CLEN_LEFT'>: []}

In [7]:
test.spans["sc"] = test.spans["spans"]

In [8]:
from spacy import displacy

In [9]:
displacy.render(test, style="span")

In [10]:
zveza_found = []

navedbe_zakona = [x for x in test.spans["spans"] if x.label_ == "navedba_zakona"]
for span in navedbe_zakona:
    entities_types_to_parse = [
        EntType.DOC_TITLE,
        EntType.DOC_ABBR,
        EntType.CLEN,
        EntType.CLEN_LEFT,
    ]

    # Extract abbreviations
    type_to_found_entities = {x: _find_entity(span, x) for x in entities_types_to_parse}
    _abbr = _map_titles_to_abbreviations(
        titles=type_to_found_entities[EntType.DOC_TITLE], tp=tp
    )
    _abbr.extend(type_to_found_entities[EntType.DOC_ABBR])
    if len(_abbr) != 1:
        print(_abbr)
        raise ValueError
    found_abbr = str(_abbr[0])

    # Extract clens
    clens = [
        *[str(x) for x in type_to_found_entities[EntType.CLEN]],
        *[str(x) for x in type_to_found_entities[EntType.CLEN_LEFT]],
    ]
    if len(clens) != 1:
        print(clens)
        raise ValueError
    found_clen = _normalize_input_for_clen(str(clens[0]))
    zveza_found.append(f"{found_clen} {found_abbr}")

    if not found_clen.split(" ")[0].strip(".").isdigit():
        found_num = found_clen.split(".")[0]
        zveza_found.append(f"{found_num}. člen {found_abbr}")

zveza_found = sorted(list(set(zveza_found)), key=lambda x: int(x.split(".")[0]))


In [11]:
zakon, abbr, clens_unformatted = zveza.split(" - ")
zakon, abbr, clens_unformatted

('Kazenski zakonik (2008)', 'KZ-1', 'člen 115, 115/1, 116, 116-1')

In [12]:
def _format_clen_from_docs(clen):
    if clen.isdigit():
        return f"{clen}. člen"
    digits = "".join([x for x in clen if x.isdigit()])
    alphas = "".join([x for x in clen if x.isalpha()])
    return f"{digits}.{alphas} člen"


zveza_true = []
for clen in clens_unformatted.split(","):
    clen = clen.strip()
    clen = clen.split("/")[0]
    clen = clen.split("-")[0]

    if _split := clen.split(" "):
        clen = [x for x in _split if not all(y.isalpha() for y in x)][0]
    zveza_true.append(clen)
zveza_true = sorted(
    [f"{_format_clen_from_docs(x)} {abbr}" for x in list(set(zveza_true))],
    key=lambda x: int(x.split(".")[0]),
)


In [13]:
zveza_found

['34. člen KZ-1',
 '34. člen KZ',
 '49. člen KZ-1',
 '53. člen KZ-1',
 '115. člen KZ-1',
 '116. člen KZ-1',
 '116. člen KZ',
 '220. člen KZ-1',
 '372. člen ZKP',
 '420. člen ZKP',
 '423. člen ZKP',
 '425. člen ZKP',
 '426. člen ZKP']

In [14]:
zveza_true

['115. člen KZ-1', '116. člen KZ-1']

In [15]:
set(zveza_true).issubset(set(zveza_found))

True