In [23]:
import re
import os
import pandas as pd

In [16]:
INPUT_PATH = "laws"
OUTPUT_PATH = "laws_clean"
TAG_PATH = "laws_tagged"

In [10]:
def clean_text(text):
    cleaned_text = re.sub(r"AGGIORNAMENTO \(\d+\).*?(?=Art\.)", "", text, flags = re.DOTALL)
    return cleaned_text

In [11]:
for file in os.listdir(INPUT_PATH):
    with open(os.path.join(INPUT_PATH, file), "r") as f:
        text = f.read()
    cleaned_text = clean_text(text)
    with open(os.path.join(OUTPUT_PATH, file), "w") as f:
        f.write(cleaned_text)

In [12]:
import spacy

In [14]:
nlp = spacy.load("it_core_news_sm")

In [32]:
def tag_text(text):
    doc = nlp(text)
    tagged_sentences = []

    for sent in doc.sents:
        tagged_tokens = [
            f"{token.text}\t{token.lemma_}\t{token.pos_}\t{token.tag_}\t{token.dep_}" for token in sent
        ]
        tagged_sentences.append("\n".join(tagged_tokens))
    return "\n\n".join(tagged_sentences)

In [33]:
for file in os.listdir(INPUT_PATH):
    with open(os.path.join(OUTPUT_PATH, file), "r") as f:
        text = f.read()
    tagged_text = tag_text(text)
    with open(os.path.join(TAG_PATH, file), "w") as f:
        f.write(tagged_text)

In [34]:
data = []

for file in os.listdir(TAG_PATH):
    df = pd.read_csv(os.path.join(TAG_PATH, file), sep="\t", header=None, names = ['token', 'lemma', 'pos', 'tag', 'dep'], keep_default_na=False)
    pos_counts = df['pos'].value_counts().to_dict()
    pos_counts['file_name'] = file
    data.append(pos_counts)

df_summary = pd.DataFrame(data).fillna(0)
output_path = "laws_summary.csv"
df_summary.to_csv(output_path, index=False)

ParserError: Error tokenizing data. C error: Expected 5 fields in line 95050, saw 7


In [36]:
import os
import pandas as pd
from collections import Counter

# Ścieżka do katalogu z plikami
folder_path = "laws_tagged"  # <-- Podmień na właściwą ścieżkę

# Licznik dla liczby pól w wierszach
field_counts = Counter()

# Sprawdzanie liczby pól w każdej linii w plikach
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):  # Tylko pliki tekstowe
        file_path = os.path.join(folder_path, filename)

        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                num_fields = line.strip().count("\t") + 1  # Liczymy kolumny (ilość tabulatorów + 1)
                field_counts[num_fields] += 1  # Zliczamy różne przypadki

# Tworzenie DataFrame dla analizy
df_fields = pd.DataFrame(field_counts.items(), columns=["num_fields", "num_lines"]).sort_values(by="num_fields")

print(df_fields)

   num_fields  num_lines
1           1      30413
2           3       8078
0           5     404580


In [38]:
data = []
for filename in os.listdir(TAG_PATH):
    file_path = os.path.join(TAG_PATH, filename)
    rows = []
    with open(file_path, "r") as f:
        for line in f:
            fields = line.strip().split("\t")
            if len(fields) == 5:
                token, lemma, pos, tag, dep = fields
                rows.append(pos)
    df = pd.DataFrame(rows, columns = ['pos'])
    pos_counts = df['pos'].value_counts().to_dict()
    pos_counts['file_name'] = filename
    data.append(pos_counts)

In [39]:
df_summary = pd.DataFrame(data).fillna(0)
output_path = "laws_summary_pos.csv"
df_summary.to_csv(output_path, index = False)

In [50]:

df_summary.head(10)

Unnamed: 0,NOUN,ADP,PUNCT,NUM,ADJ,DET,VERB,CCONJ,PRON,PROPN,AUX,ADV,SCONJ,X,SYM,file_name,nominality,nominality_wider
0,5821,4607,2963,1871,1797,1321,1122,684,532,395,366,345,95,57,5.0,446_1997.txt,5.19,7.0
1,24002,17780,12676,5491,7195,5359,5308,3741,2408,2073,1569,1662,540,186,4.0,633_1972.txt,4.52,5.99
2,2191,1599,1156,367,745,576,506,286,178,56,183,214,38,23,0.0,212_2000.txt,4.33,5.13
3,32981,27952,15800,11592,8933,6874,5851,3622,2396,1226,1939,1351,165,220,2.0,197_2022.txt,5.64,7.33
4,24716,20577,11917,8498,6562,4991,4146,2541,1659,1031,1441,876,154,127,16.0,213_2023.txt,5.96,7.6
5,19842,15812,9569,4457,6202,4572,4525,2749,2252,854,1249,1568,419,261,0.0,917_1986.txt,4.38,5.82


In [49]:
df_summary = df_summary.assign(
    nominality = lambda x: round(x.NOUN / x.VERB, 2),
    nominality_wider = lambda x: round((x.NOUN + x.PROPN + x.PRON + x.ADJ + x.NUM) / (x.VERB + x.AUX), 2)
)

In [57]:
data = []

for filename in os.listdir(OUTPUT_PATH):
    file_path = os.path.join(OUTPUT_PATH, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    articles = re.split(r"\bArt\.\s", text)[1:]
    lengths = [len(article.split()) for article in articles]
    avg_length = round(sum(lengths) / len(lengths), 2)
    data.append({"file_name": filename, "avg_length": avg_length})

df_avg_length_abs = pd.DataFrame(data)

In [58]:
df_avg_length_abs.head(10)

Unnamed: 0,file_name,avg_length
0,446_1997.txt,247.07
1,633_1972.txt,433.87
2,212_2000.txt,182.08
3,197_2022.txt,3640.32
4,213_2023.txt,1823.41
5,917_1986.txt,400.65


In [61]:
df_merged = (
    df_avg_length_rel
    .merge(df_avg_length_abs, on="file_name", suffixes=("_rel", "_abs"))
    .merge(df_summary, on="file_name")
)

columns_order = ["file_name"] + [col for col in df_merged.columns if col != "file_name"]
df_merged = df_merged[columns_order]

df_merged.head()

Unnamed: 0,file_name,avg_length_rel,avg_length_abs,NOUN,ADP,PUNCT,NUM,ADJ,DET,VERB,CCONJ,PRON,PROPN,AUX,ADV,SCONJ,X,SYM,nominality,nominality_wider
0,446_1997.txt,63.19,247.07,5821,4607,2963,1871,1797,1321,1122,684,532,395,366,345,95,57,5.0,5.19,7.0
1,633_1972.txt,148.62,433.87,24002,17780,12676,5491,7195,5359,5308,3741,2408,2073,1569,1662,540,186,4.0,4.52,5.99
2,212_2000.txt,43.27,182.08,2191,1599,1156,367,745,576,506,286,178,56,183,214,38,23,0.0,4.33,5.13
3,197_2022.txt,99.65,3640.32,32981,27952,15800,11592,8933,6874,5851,3622,2396,1226,1939,1351,165,220,2.0,5.64,7.33
4,213_2023.txt,98.34,1823.41,24716,20577,11917,8498,6562,4991,4146,2541,1659,1031,1441,876,154,127,16.0,5.96,7.6


In [59]:
data = []

for filename in os.listdir(OUTPUT_PATH):
    file_path = os.path.join(OUTPUT_PATH, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    articles = re.split(r"\bArt\.\s", text)[1:]
    lengths = []

    for article in articles:
        sections = re.split(r"\n\s*\d+\.\s", article)
        if len(sections) == 1:
            lengths.append(len(article.split()))
        else:
            for section in sections:
                section_length = len(section.split())
                if section_length > 0:
                    lengths.append(section_length)

    avg_length = round(sum(lengths) / len(lengths), 2)
    data.append({"file_name": filename, "avg_length": avg_length})

df_avg_length_rel = pd.DataFrame(data)

In [60]:
df_avg_length_rel.head(10)

Unnamed: 0,file_name,avg_length
0,446_1997.txt,63.19
1,633_1972.txt,148.62
2,212_2000.txt,43.27
3,197_2022.txt,99.65
4,213_2023.txt,98.34
5,917_1986.txt,97.97


In [63]:
df_merged[['file_name', 'avg_length_abs', 'avg_length_rel', 'nominality', 'nominality_wider']].head(10)


Unnamed: 0,file_name,avg_length_abs,avg_length_rel,nominality,nominality_wider
0,446_1997.txt,247.07,63.19,5.19,7.0
1,633_1972.txt,433.87,148.62,4.52,5.99
2,212_2000.txt,182.08,43.27,4.33,5.13
3,197_2022.txt,3640.32,99.65,5.64,7.33
4,213_2023.txt,1823.41,98.34,5.96,7.6
5,917_1986.txt,400.65,97.97,4.38,5.82
