In [1]:
import json

import classla
from pathlib import Path
from src.nlp.model import EntType
from src.nlp.text_processor import TextProcessor
from src.parse import (
    pdf_to_text,
    _find_entity,
    _map_titles_to_abbreviations,
    _normalize_input_for_clen,
)
from spacy import displacy

classla.download("sl")

Downloading https://raw.githubusercontent.com/clarinsi/classla-resources/main/resources_1.0.1.json: 10.3kB [00:00, 18.2MB/s]                   
2025-02-26 08:48:54 INFO: Downloading these customized packages for language: sl (Slovenian)...
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |
| depparse  | standard |
| ner       | standard |
| pretrain  | standard |

2025-02-26 08:48:55 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/pos/standard.pt.
2025-02-26 08:48:55 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/lemma/standard.pt.
2025-02-26 08:48:56 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/depparse/standard.pt.
2025-02-26 08:48:56 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/ner/standard.pt.
2025-02-26 08:48:56 INFO: File exists: /Users/tadejkrivec/classla_resources/sl/pretrain/standard.pt.
2025-02-26 08:48:56 INFO: Finished downloading models and saved to 

In [2]:
with open("./data/training/VS00000317.json", "r", encoding="utf-8") as f:
    training_document = json.load(f)


In [3]:
tp = TextProcessor(mappings_path=Path("./data/mappings/mappings_all.parquet"))

2025-02-26 08:48:56 INFO: Loading these models for language: sl (Slovenian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |

2025-02-26 08:48:56 INFO: Use device: cpu
2025-02-26 08:48:56 INFO: Loading: tokenize
2025-02-26 08:48:56 INFO: Loading: pos
2025-02-26 08:48:59 INFO: Loading: lemma
2025-02-26 08:49:02 INFO: Done loading processors!


In [4]:
TRAINING_DOCUMENT_ID = 2
jedro = training_document["content"]["jedro"]
zveza_true = training_document["processed"]["zveza"]

text_to_process = training_document["content"]["obrazložitev"]

In [5]:
test = tp.nlp(text_to_process)

In [6]:
entities_types_to_parse = [
    EntType.DOC_TITLE,
    EntType.DOC_ABBR,
    EntType.CLEN,
    EntType.CLEN_LEFT,
]

type_to_found_entities = {x: _find_entity(test, x) for x in entities_types_to_parse}
type_to_found_entities

{<EntType.DOC_TITLE: 'DOC_TITLE'>: [Kazenskega zakonika,
  Zakonu o kazenskem postopku],
 <EntType.DOC_ABBR: 'DOC_ABBR'>: [KZ-1,
  KZ-1,
  KZ,
  KZ,
  KZ,
  KZ,
  KZ-1,
  KZ-1,
  KZ-1,
  KZ-1,
  KZ-1,
  KZ-1,
  KZ-1,
  KZ-1,
  KZ-1,
  ZKP,
  ZKP,
  ZKP,
  ZKP],
 <EntType.CLEN: 'CLEN'>: [240. člena,
  20. členom,
  244. člena,
  25. členom,
  244. člena,
  27. členom,
  240. člena,
  38. členom,
  240. člena,
  38. členom,
  242. člena,
  211. člena,
  34. členom,
  211. člena,
  38. členom,
  26. člena,
  26. člena,
  26. člena],
 <EntType.CLEN_LEFT: 'CLEN_LEFT'>: []}

In [7]:
test.spans["sc"] = test.spans["spans"]
displacy.render(test, style="span")

In [8]:
zveza_found = []

navedbe_zakona = [x for x in test.spans["spans"] if x.label_ == "navedba_zakona"]
for span in navedbe_zakona:
    entities_types_to_parse = [
        EntType.DOC_TITLE,
        EntType.DOC_ABBR,
        EntType.CLEN,
        EntType.CLEN_LEFT,
    ]

    # Extract abbreviations
    type_to_found_entities = {x: _find_entity(span, x) for x in entities_types_to_parse}
    _abbr = _map_titles_to_abbreviations(
        titles=type_to_found_entities[EntType.DOC_TITLE], tp=tp
    )
    _abbr.extend(type_to_found_entities[EntType.DOC_ABBR])
    if len(_abbr) != 1:
        print(_abbr)
        raise ValueError
    found_abbr = str(_abbr[0])

    # Extract clens
    clens = [
        *[str(x) for x in type_to_found_entities[EntType.CLEN]],
        *[str(x) for x in type_to_found_entities[EntType.CLEN_LEFT]],
    ]
    if len(clens) != 1:
        print(clens)
        raise ValueError
    found_clen = _normalize_input_for_clen(str(clens[0]))
    zveza_found.append(f"{found_clen} {found_abbr}")

    if not found_clen.split(" ")[0].strip(".").isdigit():
        found_num = found_clen.split(".")[0]
        zveza_found.append(f"{found_num}. člen {found_abbr}")

zveza_found = sorted(list(set(zveza_found)), key=lambda x: int(x.split(".")[0]))


['KZ-1', 'KZ']


ValueError: 

In [9]:
zveza_found

[]

In [10]:
zveza_true

['26. člen ZKP']

In [None]:
set(zveza_true).issubset(set(zveza_found))