In [None]:
import json

import classla
from pathlib import Path
from src.nlp.model import EntType
from src.nlp.text_processor import TextProcessor
from src.parse import (
    pdf_to_text,
    _find_entity,
    _map_titles_to_abbreviations,
    _normalize_input_for_clen,
)
import re
from spacy import displacy
from collections import namedtuple

# Fetch the classla resources for the "sl" language code before the pipeline is built.
# NOTE(review): presumably the Slovenian models, downloaded over the network on
# every run unless classla caches them — confirm against the classla docs.
classla.download("sl")

In [2]:
# Read one training document from disk and parse it as JSON.
training_document = json.loads(
    Path("./data/training/VS00000317.json").read_text(encoding="utf-8")
)


In [None]:
# Build the project's text processor from the parquet mappings file.
mappings_path = Path("./data/mappings/mappings_all.parquet")
tp = TextProcessor(mappings_path=mappings_path)

In [4]:
# Pull the fields used below out of the loaded training document.
_content = training_document["content"]
_processed = training_document["processed"]

jedro = _content["jedro"]
zveza_true = _processed["zveza"]  # reference list the extracted result is compared against
text_to_process = _content["obrazložitev"]

# Small hand-written inputs for quick experiments (uncomment one to override):
# text_to_process = "nek test 13. in 14. clen ZPP in nek test"
# text_to_process = "nek test 13. in 14.a clen ZPP in nek test"

In [5]:
# Run the text processor's `nlp` pipeline over the selected text.
# Later cells read `test.spans[...]` and pass `test` to displaCy, so the result
# is presumably a spaCy-style Doc — TODO confirm against TextProcessor.
test = tp.nlp(text_to_process)

In [None]:
# Entity types to look up in the processed document.
entities_types_to_parse = [
    EntType.DOC_TITLE,
    EntType.DOC_ABBR,
    EntType.CLEN,
    EntType.CLEN_LEFT,
]

# Map every entity type to whatever `_find_entity` returns for it on the whole document.
type_to_found_entities = dict(
    (ent_type, _find_entity(test, ent_type)) for ent_type in entities_types_to_parse
)

In [7]:
# Keep only the outermost spans: every span whose [start, end] range lies
# entirely inside another span's range is dropped.
SpanTmp = namedtuple("Span", ["span", "start", "end"])

# Visit the longest spans first. A span can only be contained in a span that is
# at least as long, so an enclosing span is always accepted before the spans it
# contains are examined. (The previous pop-from-the-end order kept a nested span
# whenever it happened to be visited before its enclosing span.)
_spans = sorted(
    (SpanTmp(x, x.start, x.end) for x in test.spans["spans"]),
    key=lambda s: (s.start - s.end, s.start),
)

_kept = []
for candidate in _spans:
    # `any` stops at the first enclosing span; exact duplicates count as contained.
    is_contained = any(
        candidate.start >= other.start and candidate.end <= other.end
        for other in _kept
    )
    if not is_contained:
        _kept.append(candidate)

# Hand back the original span objects in document order.
spans_filtered = [s.span for s in sorted(_kept, key=lambda s: (s.start, s.end))]

In [None]:
# Store the filtered spans under the "sc" key and render them inline.
# NOTE(review): "sc" is, per the spaCy docs, the default `spans_key` that
# displaCy's span style reads — confirm if the spaCy version changes.
test.spans["sc"] = spans_filtered
displacy.render(test, style="span")

In [None]:
# Serve the same span visualisation over HTTP on port 8000.
# NOTE(review): per the spaCy docs `displacy.serve` starts a web server and
# keeps running until interrupted, so this cell likely blocks "Restart & Run
# All"; the `displacy.render` cell above already shows the result inline —
# confirm whether this cell is still needed.
displacy.serve(test, style="span", port=8000)

In [9]:
# Build `zveza_found`: one "<article> <law abbreviation>" string for every
# article referenced inside a span labelled "navedba_zakona".
zveza_found = []

# Loop-invariant configuration, defined once instead of once per span.
entities_types_to_parse = [
    EntType.DOC_TITLE,
    EntType.DOC_ABBR,
    EntType.CLEN,
    EntType.CLEN_LEFT,
]
# An article number as written in the text: 1-4 digits followed by a dot ("13.").
clen_number_pattern = re.compile(r"\d{1,4}\.")

navedbe_zakona = [x for x in test.spans["spans"] if x.label_ == "navedba_zakona"]
for span in navedbe_zakona:
    # Extract abbreviations: titles are mapped to abbreviations, explicit
    # abbreviations are added as-is, and the longest candidate is used.
    type_to_found_entities = {
        ent_type: _find_entity(span, ent_type) for ent_type in entities_types_to_parse
    }
    _abbr = _map_titles_to_abbreviations(
        titles=type_to_found_entities[EntType.DOC_TITLE], tp=tp
    )
    _abbr.extend(type_to_found_entities[EntType.DOC_ABBR])
    if not _abbr:
        # Same exception type that max() raised on an empty list, with context.
        raise ValueError(
            f"No law title or abbreviation found in span: {span.text!r}"
        )
    found_abbr = max([str(x) for x in _abbr], key=len)

    # Extract clens: collect every whitespace-separated token that starts with
    # an article number. The pattern is applied to whole tokens (the previous
    # code applied it to single characters, which can never match, and never
    # filled `nums`). Trailing list punctuation is dropped so "13.," yields "13.".
    nums = [
        token.rstrip(",;")
        for token in span.text.split()
        if clen_number_pattern.match(token)
    ]

    if len(nums) > 1:
        # Enumeration such as "13. in 14. člen ZPP": one entry per number.
        for num in nums:
            zveza_found.append(f"{num} člen {found_abbr}")
    else:
        clens = [
            *[str(x) for x in type_to_found_entities[EntType.CLEN]],
            *[str(x) for x in type_to_found_entities[EntType.CLEN_LEFT]],
        ]

        if len(clens) != 1:
            raise ValueError(
                f"Expected exactly one article entity in span {span.text!r}, "
                f"found {len(clens)}: {clens}"
            )

        found_clen = _normalize_input_for_clen(clens[0])
        zveza_found.append(f"{found_clen} {found_abbr}")

        # When the leading token is not a plain number (e.g. "14.a"), also
        # record the plain numbered article ("14. člen ...").
        if not found_clen.split(" ")[0].strip(".").isdigit():
            found_num = found_clen.split(".")[0]
            zveza_found.append(f"{found_num}. člen {found_abbr}")

# De-duplicate and order by article number; the full string is the tie-breaker
# so the order is deterministic across runs (set iteration order is not).
# NOTE(review): int() raises ValueError if an entry does not start with
# "<digits>." — this relies on the output format of _normalize_input_for_clen.
zveza_found = sorted(
    set(zveza_found), key=lambda ref: (int(ref.split(".")[0]), ref)
)


In [None]:
# Display the references extracted from the text by the cell above.
zveza_found

In [None]:
# Display the reference list read from training_document["processed"]["zveza"].
zveza_true

In [None]:
# True when every expected reference also appears among the extracted ones.
set(zveza_true) <= set(zveza_found)