In [1]:
import pandas as pd
from src.parse import _get_law_abbreviation, _strip_law_abbreviation
from src.nlp.text_processor import TextProcessor

# Shared text processor; used further down to lemmatize the stripped law titles.
# NOTE(review): the log output below shows Slovenian tokenize/pos/lemma models
# being loaded over several seconds — presumably triggered by this constructor
# rather than by the import; confirm.
tp = TextProcessor()

2025-02-26 10:46:01 INFO: Loading these models for language: sl (Slovenian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |

2025-02-26 10:46:01 INFO: Use device: cpu
2025-02-26 10:46:01 INFO: Loading: tokenize
2025-02-26 10:46:01 INFO: Loading: pos
2025-02-26 10:46:05 INFO: Loading: lemma
2025-02-26 10:46:08 INFO: Done loading processors!


In [2]:
# Load the staged law-mapping table from parquet and preview its first five rows.
laws_staging = pd.read_parquet(
    "./data/mappings/legislation_laws_staging.parquet"
)
laws_staging.head(5)

Unnamed: 0,entity_id,document_title,document_id,abbreviation,document_title_striped,document_title_lemmatized
0,af476a961-4504-49d7-89f8-5a31d59b1c15,Ustava Republike Slovenije (URS),RS 33-1409/1991I,URS,Ustava Republike Slovenije,ustava republika Slovenija
0,b31c19b7-f5e9-492a-be47-6939e80c9b58,Zakon o trgu vrednostnih papirjev (ZTVP),RS 6-267/1994,ZTVP,Zakon o trgu vrednostnih papirjev,zakon o trg vrednosten papir
1,020d8be8-3c2a-4e25-abe6-273147bcdd1c,Zakon o čeku (ZC),FLRJ 105-735/1946,ZC,Zakon o čeku,zakon o ček
3,a2e29b0f0-e376-4fbf-9a86-cfae5082809d,Zakon o menici (ZM),FLRJ 104-728/1946,ZM,Zakon o menici,zakon o menica
4,aE_9f1c1c45-24bd-44d0-a63b-1f922a601203,Zakon o trgu finančnih instrumentov (ZTFI),RS 67-3699/2007,ZTFI,Zakon o trgu finančnih instrumentov,zakon o trg finančen instrument


In [3]:
# Quick size check of the staging table: (number of rows, number of columns).
laws_staging.shape

(4956, 6)

In [4]:
# Load the document overview (title + document type class) and preview its
# first five rows.
docs = pd.read_parquet(
    "./data/mappings/document_overview.parquet"
)
docs.head(5)

Unnamed: 0,document_title,document_type_class
0,Uredba o ratifikaciji Protokola med Ministrstv...,predpis_tfl
1,Zakon o Pomurski akademsko-znanstveni uniji (P...,zakon_tfl
2,"Sklep o glavnih znamenjih bankovcev za 1000, 5...",predpis_tfl
3,"Pravilnik o merilih, postopku in načinu delitv...",predpis_tfl
4,Zakon o slovenski razvojni družbi in programu ...,zakon_tfl


In [5]:
# Select the titles of all documents classified as laws ("zakon_tfl") exactly
# once. Every list below is derived from this one list, so the four results are
# guaranteed to stay aligned position-by-position.
new_db_titles = docs.loc[
    docs["document_type_class"] == "zakon_tfl", "document_title"
].to_list()

# Plain comprehensions instead of a row-wise DataFrame.apply(axis=1):
#   * only the title column is needed, so building a Series per row is wasted work;
#   * pandas special-cases apply() on a frame with no rows and may hand back a
#     DataFrame, which has no .to_list() — a comprehension simply yields [].
new_db_abbrs = [_get_law_abbreviation(title) for title in new_db_titles]

new_db_titles_stripped = [_strip_law_abbreviation(title) for title in new_db_titles]

# Lemmatize the abbreviation-free titles with the shared text processor.
new_db_titles_lemmatized = [tp.lemmatize_text(x) for x in new_db_titles_stripped]


In [6]:
# Turn the four title-derived lists into one DataFrame, one record per law.
# zip() pairs the lists positionally, so they are expected to be index-aligned.
new_entries = pd.DataFrame.from_records(
    [
        dict(
            document_title=full_title,
            abbreviation=short_form,
            document_title_striped=bare_title,
            document_title_lemmatized=lemmas,
        )
        for full_title, short_form, bare_title, lemmas in zip(
            new_db_titles,
            new_db_abbrs,
            new_db_titles_stripped,
            new_db_titles_lemmatized,
        )
    ]
)

In [17]:
# Combine the staged mappings with the newly derived ones, keep only the four
# mapping columns, drop exact duplicate rows, sort by title and persist.
#
# pd.concat keeps each input's own row labels, so the combined frame carries
# repeated, meaningless index values. to_parquet stores any non-Range index
# alongside the data by default, which would bake those labels into the output
# file. Resetting to a fresh RangeIndex after sorting keeps the file clean; the
# columns, rows and their order are unchanged.
(
    pd.concat([laws_staging, new_entries])[
        [
            "document_title",
            "document_title_striped",
            "document_title_lemmatized",
            "abbreviation",
        ]
    ]
    .drop_duplicates()
    .sort_values(by="document_title")
    .reset_index(drop=True)
    .to_parquet("./data/mappings/legislation_mappings_from_db.parquet")
)