In [1]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import os
import re

In [2]:
# Relative directory names used by this notebook.
# NOTE(review): INPUT_PATH and TAG_PATH are not referenced in the visible cells —
# presumably the raw and the tagged texts; confirm against the rest of the project.
INPUT_PATH = "laws"
OUTPUT_PATH = "laws_clean"  # listed below to compute per-file length statistics
TAG_PATH = "laws_tagged"

In [3]:
# Per-file table; its NOUN/VERB/AUX/... count columns and "file_name" are used below.
data = pl.read_csv("laws_summary_pos.csv")
# NOTE(review): the same CSV is read again into `data_names` further down, and
# `names` itself is not used in the visible cells.
names = pl.read_csv("file_names.csv")

In [4]:
# Two noun-to-verb ratios per row, each rounded to two decimals:
#   nominality       = NOUN / VERB
#   nominality_wider = (NOUN + PROPN + PRON + ADJ + NUM) / (VERB + AUX)
nominal_tokens = pl.col("NOUN") + pl.col("PROPN") + pl.col("PRON") + pl.col("ADJ") + pl.col("NUM")
verbal_tokens = pl.col("VERB") + pl.col("AUX")

data = data.with_columns(
    (pl.col("NOUN") / pl.col("VERB")).round(2).alias("nominality"),
    (nominal_tokens / verbal_tokens).round(2).alias("nominality_wider"),
)

In [5]:
def _rounded_mean(values):
    """Return the mean of `values` rounded to 2 decimals, or None when `values` is empty."""
    if not values:
        return None
    return round(sum(values) / len(values), 2)


def _text_length_stats(text):
    """Compute word-based length statistics for the text of one file.

    Returns a dict with:
      avg_article_length -- mean word count of the chunks that follow each "Art. " marker
      avg_section_length -- mean word count of numbered sections ("<newline> N. ");
                            an article without numbered sections counts as one section
      len_in_words       -- total number of whitespace-separated tokens in `text`

    Both averages are None when the text contains no "Art. " marker, instead of
    raising ZeroDivisionError.
    """
    # Average article length: everything before the first "Art. " marker is dropped.
    articles = re.split(r"\bArt\.\s", text)[1:]
    article_lengths = [len(article.split()) for article in articles]

    # Average section length.
    section_lengths = []
    for article in articles:
        sections = re.split(r"\n\s*\d+\.\s", article)
        if len(sections) == 1:
            # No numbered sections: the whole article is treated as a single section.
            section_lengths.append(len(article.split()))
        else:
            # Empty fragments produced by the split are ignored.
            word_counts = (len(section.split()) for section in sections)
            section_lengths.extend(count for count in word_counts if count > 0)

    return {
        "avg_article_length": _rounded_mean(article_lengths),
        "avg_section_length": _rounded_mean(section_lengths),
        # Word count of the whole text.
        "len_in_words": len(text.split()),
    }


data_analysed = []

# os.listdir returns entries in arbitrary order; sort so the CSV is reproducible.
for filename in sorted(os.listdir(OUTPUT_PATH)):
    file_path = os.path.join(OUTPUT_PATH, filename)
    # Skip sub-directories (e.g. ".ipynb_checkpoints"), which cannot be opened as text.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    data_analysed.append({"file_name": filename, **_text_length_stats(text)})

df_lengths = pl.DataFrame(data_analysed)
df_lengths.write_csv("laws_lengths.csv")

In [6]:
# Lookup table joined onto `data` by "file_name" in the next cell.
# NOTE(review): this re-reads the file already loaded as `names` near the top.
data_names = pl.read_csv("file_names.csv")

In [7]:
# Year = the part of "file_name" before the first underscore, cast to Int32.
year_from_file_name = (
    pl.col("file_name")
    .str.split("_")
    .list.get(0)
    .cast(pl.Int32)
    .alias("year")
)

# Left joins keep every row of `data`; the length statistics and the name table
# are both matched on "file_name".
# NOTE: this cell rebinds `data`, so running it a second time joins again on top
# of the already-joined result.
data = (
    data
    .join(df_lengths, on="file_name", how="left")
    .join(data_names, on="file_name", how="left")
    .with_columns(year_from_file_name)
)

In [8]:
# Persist the combined table (counts, ratios, length statistics, joined name columns, year) as CSV.
data.write_csv("data.csv")

In [9]:
# Preview the first rows of the combined table.
data.head()

NOUN,ADP,PUNCT,DET,VERB,ADJ,AUX,CCONJ,NUM,PRON,ADV,PROPN,SCONJ,X,file_name,SYM,SPACE,INTJ,nominality,nominality_wider,avg_article_length,avg_section_length,len_in_words,name,type,year
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,f64,f64,f64,f64,f64,f64,f64,i64,str,str,i32
1433,979,619,511,422,398,200,155,144,109,100,47,44,17,"""1927_1443.txt""",0.0,0.0,0.0,3.4,3.43,68.23,67.17,4431,"""geologic and mining law""","""administrative""",1927
19159,14138,12082,6564,6046,5924,2413,3697,3504,2566,2007,938,1057,8,"""1930_1398.txt""",1.0,0.0,0.0,3.17,3.79,94.28,94.01,66694,"""penal code""","""penal""",1930
13563,9178,8935,4848,4155,3426,1902,1732,2135,1729,1474,571,834,5,"""1940_1443.txt""",1.0,0.0,0.0,3.26,3.54,89.18,88.82,44189,"""civil procedure""","""civil""",1940
13485,10116,5918,4330,3467,3982,1244,2294,1787,1296,1065,463,411,120,"""1941_633.txt""",3.0,0.0,0.0,3.89,4.46,139.54,60.38,42723,"""author and ip""","""civil""",1941
23239,15881,13418,8634,7668,5675,3896,3264,2357,3502,2489,780,1380,20,"""1942_262.txt""",1.0,1.0,0.0,3.03,3.07,61.07,60.81,75968,"""civil code""","""civil""",1942
