In [1]:
import pandas as pd
from src.nlp.text_processor import TextProcessor

# Create the project's text processor once; it is reused later to lemmatize every title.
# NOTE(review): the log captured under this cell suggests construction loads the
# Slovenian tokenize/pos/lemma models on CPU and takes a few seconds — confirm in TextProcessor.
tp = TextProcessor()

2025-02-26 08:23:17 INFO: Loading these models for language: sl (Slovenian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |
| lemma     | standard |

2025-02-26 08:23:17 INFO: Use device: cpu
2025-02-26 08:23:17 INFO: Loading: tokenize
2025-02-26 08:23:17 INFO: Loading: pos
2025-02-26 08:23:20 INFO: Loading: lemma
2025-02-26 08:23:23 INFO: Done loading processors!


In [2]:
# Locations of the three legislation-mapping tables used by this notebook:
# two scraped ones (KZP, CPP) and one that, judging by its filename, comes from the database.
KZP_MAPPINGS_PATH = "./data/mappings/legislation_mappings_from_kzp_scraped.parquet"
CPP_MAPPINGS_PATH = "./data/mappings/legislation_mappings_from_cpp_scraped.parquet"
DB_MAPPINGS_PATH = "./data/mappings/legislation_mappings_from_db.parquet"

# Load each table into its own DataFrame, in the same order as before.
kzp_mappings = pd.read_parquet(KZP_MAPPINGS_PATH)
cpp_mappings = pd.read_parquet(CPP_MAPPINGS_PATH)
existing_mappings = pd.read_parquet(DB_MAPPINGS_PATH)

In [3]:
# Twenty most frequent abbreviations in the KZP scrape (value_counts sorts descending).
kzp_mappings["abbr"].value_counts().head(20)

abbr
ZKP           2934
KZ-1           593
URS            505
KZ             249
EKČP            30
ZPol            21
ZSKZDČEU-1      20
ZS              19
ZIKS-1          17
ZP-1            11
ZNPPol          11
OZ               8
ZZUSUDJZ         7
ZOPOKD           7
                 7
ZFPPIPP          6
KZ-1B            6
ZGD-1            6
ZOdv             5
ZSKZDČEU         5
Name: count, dtype: int64

In [4]:
# Twenty most frequent abbreviations in the CPP scrape (value_counts sorts descending).
cpp_mappings["abbr"].value_counts().head(20)

abbr
ZPP        15503
OZ          2175
ZOR         1703
URS          875
ZIZ          841
ZZZDR        498
SPZ          486
ZD           332
ZNP          317
ZTLR         282
ZS           229
ZNP-1        192
ZDen         176
ZFPPIPP      172
ZZK-1        155
SZ           153
ZGD-1        123
ZMZPP         90
ZGD           86
ZVPot         75
Name: count, dtype: int64

In [5]:
# Stack both scraped tables and keep one row per distinct (title, abbr) pair,
# retaining only those two columns under a fresh 0..n-1 index.
scraped_combined = pd.concat([kzp_mappings, cpp_mappings])
scraped_unique = scraped_combined.drop_duplicates(subset=["title", "abbr"])
scraped_mappings = scraped_unique[["title", "abbr"]].reset_index(drop=True)


def _court_mapping_record(title, abbr):
    """Build one output record from a scraped (title, abbr) pair.

    The display title gets the abbreviation appended in parentheses only when
    both the abbreviation and the title are truthy; otherwise the title is
    passed through unchanged. The bare title is kept alongside its lemmatized
    form as produced by ``tp.lemmatize_text``.
    """
    if abbr and title:
        display_title = title + f" ({abbr})"
    else:
        display_title = title

    return {
        "document_title": display_title,
        "document_title_striped": title,
        "document_title_lemmatized": tp.lemmatize_text(title),
        "abbreviation": abbr,
    }


# One record per scraped row, in row order, assembled into a DataFrame in a single step.
court_mappings_processed = pd.DataFrame(
    [
        _court_mapping_record(scraped_title, scraped_abbr)
        for scraped_title, scraped_abbr in zip(
            scraped_mappings["title"], scraped_mappings["abbr"]
        )
    ]
)


In [6]:
# Show the assembled mapping table; the rendered output elides the middle rows.
court_mappings_processed

Unnamed: 0,document_title,document_title_striped,document_title_lemmatized,abbreviation
0,Zakon o kazenskem postopku (ZKP),Zakon o kazenskem postopku,zakon o kazenski postopek,ZKP
1,Kazenski zakonik (KZ-1),Kazenski zakonik,kazenski zakonik,KZ-1
2,Ustava Republike Slovenije (URS),Ustava Republike Slovenije,ustava republika Slovenija,URS
3,Kazenski zakonik (KZ),Kazenski zakonik,kazenski zakonik,KZ
4,Zakon o trgu finančnih instrumentov (ZTFI),Zakon o trgu finančnih instrumentov,zakon o trg finančen instrument,ZTFI
...,...,...,...,...
693,,,,ZJShemFO
694,,,,ZGO-1A
695,,,,ZNZZ
696,,,,EGS


In [37]:
# Combine the previously stored mappings with the newly scraped ones, keeping the
# first occurrence of every (document_title, abbreviation) pair. Because
# existing_mappings comes first in the concat, its rows win over scraped duplicates.
mappings_all = pd.concat([existing_mappings, court_mappings_processed]).drop_duplicates(
    subset=["document_title", "abbreviation"]
)

# Keep only rows where both sides of the mapping are present. The filter used to
# test the bare-title length twice, so the second condition had no effect and rows
# with an empty abbreviation were kept. The second predicate now checks the
# abbreviation. Missing values compare as False and are dropped as well.
# NOTE(review): assumes rows without an abbreviation are not wanted in the output
# file — confirm against the consumers of mappings_all.parquet.
has_title = mappings_all["document_title_striped"].str.len() > 0
has_abbreviation = mappings_all["abbreviation"].str.len() > 0
mappings_all = mappings_all[has_title & has_abbreviation]

# Persist the merged, filtered table for downstream use.
mappings_all.to_parquet("./data/mappings/mappings_all.parquet")