In [1]:
import re
import os
import pandas as pd

In [2]:
INPUT_PATH = "laws"
OUTPUT_PATH = "laws_clean"
TAG_PATH = "laws_tagged"

In [3]:
def clean_text(text):
    cleaned_text = re.sub(r"AGGIORNAMENTO \(\d+\).*?(?=Art\.)", "", text, flags = re.DOTALL)
    return cleaned_text

In [6]:
for file in os.listdir(INPUT_PATH):
    with open(os.path.join(INPUT_PATH, file), "r", encoding="utf-8") as f:
        text = f.read()
    cleaned_text = clean_text(text)
    with open(os.path.join(OUTPUT_PATH, file), "w", encoding="utf-8") as f:
        f.write(cleaned_text)

In [7]:
import spacy

In [9]:
nlp = spacy.load("it_core_news_sm")

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [10]:
def tag_text(text):
    doc = nlp(text)
    tagged_sentences = []

    for sent in doc.sents:
        tagged_tokens = [
            f"{token.text}\t{token.lemma_}\t{token.pos_}\t{token.tag_}\t{token.dep_}" for token in sent
        ]
        tagged_sentences.append("\n".join(tagged_tokens))
    return "\n\n".join(tagged_sentences)

In [12]:
for file in os.listdir(INPUT_PATH):
    with open(os.path.join(OUTPUT_PATH, file), "r", encoding="utf-8") as f:
        text = f.read()
    tagged_text = tag_text(text)
    with open(os.path.join(TAG_PATH, file), "w", encoding="utf-8") as f:
        f.write(tagged_text)

In [14]:
data = []

for file in os.listdir(TAG_PATH):
    df = pd.read_csv(os.path.join(TAG_PATH, file), sep=";", header=None, names = ['token', 'lemma', 'pos', 'tag', 'dep'], keep_default_na=False)
    pos_counts = df['pos'].value_counts().to_dict()
    pos_counts['file_name'] = file
    data.append(pos_counts)

df_summary = pd.DataFrame(data).fillna(0)
output_path = "laws_summary.csv"
df_summary.to_csv(output_path, index=False)

In [15]:
import os
import pandas as pd
from collections import Counter

# Ścieżka do katalogu z plikami
folder_path = "laws_tagged"  # <-- Podmień na właściwą ścieżkę

# Licznik dla liczby pól w wierszach
field_counts = Counter()

# Sprawdzanie liczby pól w każdej linii w plikach
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):  # Tylko pliki tekstowe
        file_path = os.path.join(folder_path, filename)

        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                num_fields = line.strip().count("\t") + 1  # Liczymy kolumny (ilość tabulatorów + 1)
                field_counts[num_fields] += 1  # Zliczamy różne przypadki

# Tworzenie DataFrame dla analizy
df_fields = pd.DataFrame(field_counts.items(), columns=["num_fields", "num_lines"]).sort_values(by="num_fields")

print(df_fields)

   num_fields  num_lines
1           1     254886
2           3      60010
0           5    1931674


In [17]:
data = []
for filename in os.listdir(TAG_PATH):
    file_path = os.path.join(TAG_PATH, filename)
    rows = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split("\t")
            if len(fields) == 5:
                token, lemma, pos, tag, dep = fields
                rows.append(pos)
    df = pd.DataFrame(rows, columns = ['pos'])
    pos_counts = df['pos'].value_counts().to_dict()
    pos_counts['file_name'] = filename
    data.append(pos_counts)

In [18]:
df_summary = pd.DataFrame(data).fillna(0)
output_path = "laws_summary_pos.csv"
df_summary.to_csv(output_path, index = False)

In [20]:

df_summary.head(100)

Unnamed: 0,NOUN,ADP,PUNCT,DET,VERB,ADJ,AUX,CCONJ,NUM,PRON,ADV,PROPN,SCONJ,X,file_name,SYM,SPACE,INTJ
0,1433,979,619,511,422,398,200,155,144,109,100,47,44,17,1927_1443.txt,0.0,0.0,0.0
1,19159,14138,12082,6564,6046,5924,2413,3697,3504,2566,2007,938,1057,8,1930_1398.txt,1.0,0.0,0.0
2,13563,9178,8935,4848,4155,3426,1902,1732,2135,1729,1474,571,834,5,1940_1443.txt,1.0,0.0,0.0
3,13485,10116,5918,4330,3467,3982,1244,2294,1787,1296,1065,463,411,120,1941_633.txt,3.0,0.0,0.0
4,23239,15881,13418,8634,7668,5675,3896,3264,2357,3502,2489,780,1380,20,1942_262.txt,1.0,1.0,0.0
5,23844,17704,12631,5399,5281,7238,1621,3739,5535,2415,1665,1990,536,385,1972_633.txt,7.0,3.0,1.0
6,3503,2674,1303,928,750,1012,341,522,640,341,262,109,73,9,1975_110.txt,0.0,0.0,0.0
7,8194,6526,3071,2213,1731,3496,499,1319,922,650,375,232,57,110,1978_833.txt,0.0,0.0,0.0
8,32981,27952,15800,6874,5851,8933,1939,3622,11592,2396,1351,1226,165,220,197_2022.txt,2.0,0.0,0.0
9,882,714,337,202,148,281,47,140,136,63,35,40,5,5,1982_752.txt,0.0,0.0,0.0


In [21]:
df_summary = df_summary.assign(
    nominality = lambda x: round(x.NOUN / x.VERB, 2),
    nominality_wider = lambda x: round((x.NOUN + x.PROPN + x.PRON + x.ADJ + x.NUM) / (x.VERB + x.AUX), 2)
)

In [22]:
data = []

for filename in os.listdir(OUTPUT_PATH):
    file_path = os.path.join(OUTPUT_PATH, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    articles = re.split(r"\bArt\.\s", text)[1:]
    lengths = [len(article.split()) for article in articles]
    avg_length = round(sum(lengths) / len(lengths), 2)
    data.append({"file_name": filename, "avg_length": avg_length})

df_avg_length_abs = pd.DataFrame(data)

In [23]:
df_avg_length_abs.head(10)

Unnamed: 0,file_name,avg_length
0,1927_1443.txt,68.23
1,1930_1398.txt,94.28
2,1940_1443.txt,89.18
3,1941_633.txt,139.54
4,1942_262.txt,61.07
5,1972_633.txt,433.87
6,1975_110.txt,244.82
7,1978_833.txt,306.59
8,197_2022.txt,3640.32
9,1982_752.txt,143.22


In [25]:
df_merged = (
    df_avg_length_abs
    .merge(df_avg_length_abs, on="file_name", suffixes=("_rel", "_abs"))
    .merge(df_summary, on="file_name")
)

columns_order = ["file_name"] + [col for col in df_merged.columns if col != "file_name"]
df_merged = df_merged[columns_order]

df_merged.head()

Unnamed: 0,file_name,avg_length_rel,avg_length_abs,NOUN,ADP,PUNCT,DET,VERB,ADJ,AUX,...,PRON,ADV,PROPN,SCONJ,X,SYM,SPACE,INTJ,nominality,nominality_wider
0,1927_1443.txt,68.23,68.23,1433,979,619,511,422,398,200,...,109,100,47,44,17,0.0,0.0,0.0,3.4,3.43
1,1930_1398.txt,94.28,94.28,19159,14138,12082,6564,6046,5924,2413,...,2566,2007,938,1057,8,1.0,0.0,0.0,3.17,3.79
2,1940_1443.txt,89.18,89.18,13563,9178,8935,4848,4155,3426,1902,...,1729,1474,571,834,5,1.0,0.0,0.0,3.26,3.54
3,1941_633.txt,139.54,139.54,13485,10116,5918,4330,3467,3982,1244,...,1296,1065,463,411,120,3.0,0.0,0.0,3.89,4.46
4,1942_262.txt,61.07,61.07,23239,15881,13418,8634,7668,5675,3896,...,3502,2489,780,1380,20,1.0,1.0,0.0,3.03,3.07


In [26]:
data = []

for filename in os.listdir(OUTPUT_PATH):
    file_path = os.path.join(OUTPUT_PATH, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    articles = re.split(r"\bArt\.\s", text)[1:]
    lengths = []

    for article in articles:
        sections = re.split(r"\n\s*\d+\.\s", article)
        if len(sections) == 1:
            lengths.append(len(article.split()))
        else:
            for section in sections:
                section_length = len(section.split())
                if section_length > 0:
                    lengths.append(section_length)

    avg_length = round(sum(lengths) / len(lengths), 2)
    data.append({"file_name": filename, "avg_length": avg_length})

df_avg_length_rel = pd.DataFrame(data)

In [27]:
df_avg_length_rel.head(10)

Unnamed: 0,file_name,avg_length
0,1927_1443.txt,67.17
1,1930_1398.txt,94.01
2,1940_1443.txt,88.82
3,1941_633.txt,60.38
4,1942_262.txt,60.81
5,1972_633.txt,148.62
6,1975_110.txt,185.48
7,1978_833.txt,302.93
8,197_2022.txt,99.65
9,1982_752.txt,143.22


In [28]:
df_merged[['file_name', 'avg_length_abs', 'avg_length_rel', 'nominality', 'nominality_wider']].head(10)


Unnamed: 0,file_name,avg_length_abs,avg_length_rel,nominality,nominality_wider
0,1927_1443.txt,68.23,68.23,3.4,3.43
1,1930_1398.txt,94.28,94.28,3.17,3.79
2,1940_1443.txt,89.18,89.18,3.26,3.54
3,1941_633.txt,139.54,139.54,3.89,4.46
4,1942_262.txt,61.07,61.07,3.03,3.07
5,1972_633.txt,433.87,433.87,4.52,5.94
6,1975_110.txt,244.82,244.82,4.67,5.14
7,1978_833.txt,306.59,306.59,4.73,6.05
8,197_2022.txt,3640.32,3640.32,5.64,7.33
9,1982_752.txt,143.22,143.22,5.96,7.19
