In [None]:
# Smoke test: run clean_title on one real act title and show the result as the cell output.
# clean_title is not defined in the cells visible here; it must already exist in the kernel.
clean_title("Rozporządzenie Rady Ministrów z dnia 7 sierpnia 2023 r. zmieniające rozporządzenie w sprawie Krajowej Tablicy Przeznaczeń Częstotliwości")

In [None]:
# Load the act titles from `acts.csv`, clean them, and persist the cleaned copy.
import pandas as pd

acts = pd.read_csv("acts.csv")
acts['title'] = acts['title'].apply(clean_title)

# Persist the cleaned titles so later cells can reload them without re-cleaning.
acts.to_csv("acts_cleaned.csv", index=False)

# Keep the working sample small: reload only the first 40 cleaned rows.
acts = pd.read_csv("acts_cleaned.csv", nrows=40)

# The titles read back from `acts_cleaned.csv` have already been through
# clean_title, so they are used as-is. The earlier version iterated over a
# `titles` variable that this cell never defines, which raises NameError on a
# fresh kernel (Restart & Run All).
cleaned_titles = acts["title"].tolist()
documents = cleaned_titles
print(len(documents))
print(documents[:5])

In [None]:
import pandas as pd
from collections import defaultdict
import re
from tqdm import tqdm

def find_common_substrings(df, min_length=20, min_occurrences=10, show_progress=True):
    """Count word-aligned phrases that recur across the titles in ``df['title']``.

    Every contiguous run of words in a title (after an optional leading
    "Rozporządzenie/Obwieszczenie ... z dnia" prefix has been cut off) is a
    candidate phrase. A phrase is counted at most once per title, so
    ``occurrences`` is the number of titles containing it and ``percentage``
    can never exceed 100.

    Args:
        df: DataFrame with a ``title`` column. Missing titles are ignored.
        min_length: Minimum phrase length in characters.
        min_occurrences: Minimum number of titles a phrase must appear in.
        show_progress: Wrap the loop in ``tqdm`` when True (the default);
            pass False to iterate silently.

    Returns:
        ``(result_df, total_titles)`` where ``result_df`` has the columns
        ``substring``, ``occurrences``, ``length`` and ``percentage`` sorted by
        occurrences and then length (both descending), and ``total_titles`` is
        the number of non-missing titles analysed.
    """
    # Missing titles are dropped up front: NaN is a float, and handing it to
    # the compiled regex below would raise TypeError.
    titles = df['title'].dropna().astype(str).str.strip().tolist()
    total_titles = len(titles)

    # Matches the administrative lead-in "Rozporządzenie/Obwieszczenie X z dnia".
    skip_pattern = re.compile(r'^(?:Rozporządzenie|Obwieszczenie)\s+.*?\s+z\s+dnia')

    substring_counts = defaultdict(int)

    print(f"Analyzing substrings in {total_titles} documents...")
    for title in (tqdm(titles) if show_progress else titles):
        # Skip administrative part of the title
        skip_match = skip_pattern.match(title)
        if skip_match:
            title = title[skip_match.end():]

        words = title.split()
        # A dict is used as an insertion-ordered set: it de-duplicates phrases
        # within one title while keeping the output order deterministic.
        seen_in_title = {}
        for i in range(len(words)):
            for j in range(i + 1, len(words) + 1):
                substring = ' '.join(words[i:j])
                if len(substring) >= min_length:
                    seen_in_title[substring] = None

        for substring in seen_in_title:
            substring_counts[substring] += 1

    # Filter by minimum occurrences; percentage is relative to analysed titles.
    common_substrings = [
        (substr, count, len(substr), round(count/total_titles * 100, 2))
        for substr, count in substring_counts.items()
        if count >= min_occurrences
    ]

    result_df = pd.DataFrame(
        common_substrings,
        columns=['substring', 'occurrences', 'length', 'percentage']
    )

    # Sort by occurrences (descending) and then by length (descending)
    result_df = result_df.sort_values(
        by=['occurrences', 'length'],
        ascending=[False, False]
    ).reset_index(drop=True)

    return result_df, total_titles

def analyze_titles(csv_path, min_length=20, min_occurrences=5):
    """Read titles from a CSV file and print a report of their most common phrases.

    Args:
        csv_path: Path to a CSV file that is passed to ``find_common_substrings``
            (which reads its ``title`` column).
        min_length: Minimum phrase length in characters, forwarded unchanged.
        min_occurrences: Minimum occurrence count, forwarded unchanged.

    Returns:
        The DataFrame of common substrings produced by ``find_common_substrings``.
        The top 20 rows and a few summary statistics are printed as a side effect.
    """
    titles_df = pd.read_csv(csv_path)

    report_df, doc_count = find_common_substrings(
        titles_df,
        min_length=min_length,
        min_occurrences=min_occurrences,
    )

    # Global pandas option: never truncate wide text columns when displaying frames.
    pd.set_option('display.max_colwidth', None)

    divider = "\n" + "="*80

    print(f"\nAnalyzed {doc_count} documents")
    print(f"Found {len(report_df)} common substrings")
    print("\nMost common substrings (sorted by number of occurrences):")

    # One banner-separated entry per row; ranks are 1-based.
    top_rows = report_df.head(20)
    for idx, row in top_rows.iterrows():
        print(divider)
        print(f"#{idx + 1}: Occurs in {row['occurrences']} documents ({row['percentage']}%)")
        print(f"Length: {row['length']} characters")
        print(f"Substring: {row['substring']}")

    # Aggregate statistics over every reported substring, not just the top 20.
    occurrence_column = report_df['occurrences']
    print(divider)
    print("\nOccurrence Statistics:")
    print(f"Most frequent: {occurrence_column.max()} occurrences")
    print(f"Median occurrences: {occurrence_column.median()}")
    print(f"Mean occurrences: {occurrence_column.mean():.2f}")

    return report_df

# Usage:
# results_df = analyze_titles('acts_cleaned.csv', min_length=20, min_occurrences=5)

In [None]:
# Print the common-phrase report for the cleaned titles; the returned DataFrame
# is the last expression, so it is also rendered as the cell output.
analyze_titles('acts_cleaned.csv')

In [None]:
from __future__ import annotations
import os
from pathlib import Path
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
import vertexai

# The service-account key file is looked up in the parent of the current
# working directory and advertised through GOOGLE_APPLICATION_CREDENTIALS.
credentials_path = Path.cwd().parent / "sejm-stats-439117-39efc9d2f8b8.json"
print(credentials_path)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.fspath(credentials_path)

# Initialise the Vertex AI SDK for the project; no location is passed here.
vertexai.init(project="sejm-stats-439117")



def embed_text(
    texts: list,
    task: str = "RETRIEVAL_DOCUMENT",
    dimensionality: int | None = 512,
    batch_size: int = 250,
) -> list[list[float]]:
    """Embed texts with the ``text-multilingual-embedding-002`` Vertex AI model.

    Args:
        texts: Strings to embed. An empty input returns ``[]`` without
            contacting the service.
        task: Embedding task type passed to ``TextEmbeddingInput``. The default
            matches the previous hard-coded value; pass a query-oriented task
            (e.g. ``"RETRIEVAL_QUERY"``) when embedding search phrases.
        dimensionality: Requested output vector size; a falsy value leaves the
            model's default size in place.
        batch_size: Maximum number of texts sent per request.
            NOTE(review): 250 follows the per-request input cap documented for
            Vertex AI text embeddings in us-central1 — confirm for the region
            in use and lower it if requests are rejected.

    Returns:
        One embedding (list of floats) per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    texts = list(texts)
    if not texts:
        return []
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    model = TextEmbeddingModel.from_pretrained("text-multilingual-embedding-002")
    kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}

    vectors: list[list[float]] = []
    # Send the texts in slices so a long list does not end up in a single request.
    for start in range(0, len(texts), batch_size):
        inputs = [
            TextEmbeddingInput(text, task)
            for text in texts[start:start + batch_size]
        ]
        embeddings = model.get_embeddings(inputs, **kwargs)
        vectors.extend(embedding.values for embedding in embeddings)

    return vectors

In [None]:
import numpy as np
# Take the first 100 cleaned titles and embed them; the list of vectors
# returned by embed_text is converted to a NumPy array in the same step.
documents = pd.read_csv("acts_cleaned.csv")["title"].tolist()[:100]
embeddings = np.array(embed_text(documents))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
test = "samolot"
em = embed_text([test])

# Cosine similarity between the single query embedding and every document embedding.
cosine_similarities = cosine_similarity(em, embeddings)
# Report only the single highest-scoring document, then show its score as the cell output.
# NOTE(review): the query is embedded with embed_text's document task type — confirm that is intended.
best_match = cosine_similarities.argmax()
print(f'Best match: {documents[best_match]}')
cosine_similarities[0, best_match]



In [None]:
from pydantic import BaseModel
import requests
import pdfplumber
import re
import tempfile
from loguru import logger
class DocumentChunk(BaseModel):
    """Text of a run of consecutive PDF pages, as produced by downloadAndParsePdf."""
    # Index of the first page in the chunk; downloadAndParsePdf fills it with a 0-based index.
    start_page: int
    # Index of the last page in the chunk (inclusive, as written by downloadAndParsePdf).
    end_page: int
    # Cleaned text of the chunk's pages joined with newlines.
    content: str
    chapter_title: str| None = None  # Heading found in the chunk text, or None when no heading matched
    # Number of whitespace-separated tokens in `content` (defaults to 0 when not supplied).
    word_count: int = 0
    
def clean_text(text: str) -> str:
    """Normalise page text extracted from a Polish legal-act PDF.

    Steps, in order:
      1. collapse runs of spaces (newlines are kept);
      2. collapse blank-line runs to a single paragraph break;
      3. turn line-leading ``§ N`` / ``Art. N`` into ``### `` headings and
         ``Tabela N`` / ``Załącznik nr N`` into ``## `` headings;
      4. drop ``©`` lines and the ``Dziennik Ustaw ...`` / ``Poz. N ...``
         running-header fragments, including when they sit on the last line;
      5. collapse blank-line runs again, because steps 3 and 4 can create
         three or more consecutive newlines.

    Args:
        text: Raw text of one page (or any fragment).

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    logger.debug("Cleaning text of length {}", len(text))

    # Replace multiple spaces with single space, but preserve newlines
    text = re.sub(r' +', ' ', text)

    # Clean up newlines but preserve paragraph structure
    text = re.sub(r'\n\s*\n+', '\n\n', text)

    # Format section headers (only when they start a line)
    text = re.sub(r'(^|\n)§\s*(\d+)', r'\n\n### § \2', text)
    text = re.sub(r'(^|\n)Art\.\s*(\d+)', r'\n\n### Art. \2', text)

    # Format table headers
    text = re.sub(r'(^|\n)Tabela\s+(\d+)', r'\n\n## Tabela \2', text)

    # Format attachments
    text = re.sub(r'(^|\n)Załącznik\s+[Nn]r\s*(\d+)', r'\n\n## Załącznik nr \2', text)

    # Clean up common artifacts. `[^\n]*` consumes the rest of the line whether
    # or not a newline follows, so an artifact on the final line is removed too
    # (the earlier patterns required a trailing "\n" and skipped that case).
    text = re.sub(r'©[^\n]*\n?', '', text)
    text = re.sub(r'Dziennik\s+Ustaw[^\n]*', '', text)
    # NOTE(review): this also removes body text that happens to contain "Poz. <digits>".
    text = re.sub(r'Poz\.\s*\d+[^\n]*', '', text)

    # Heading insertion and artifact removal can stack up blank lines again.
    text = re.sub(r'\n\s*\n+', '\n\n', text)

    return text.strip()

In [None]:
def find_chapter_title(text: str) -> str | None:
    """Return the heading text that follows a "Rozdział"/"DZIAŁ" marker, if any.

    The "Rozdział <number>" pattern is tried first and "DZIAŁ <number>" second;
    the first pattern that matches anywhere in ``text`` wins, even when a match
    for the other pattern occurs earlier in the text. Because the pattern allows
    whitespace (including a line break) after the number, a heading printed on
    the line below the marker is captured as well.

    Args:
        text: Text to search.

    Returns:
        The stripped heading text, or ``None`` when neither pattern matches.
    """
    patterns = [
        r'Rozdział\s+[\dIVXLC]+\.?\s*([^\n]+)',
        r'DZIAŁ\s+[\dIVXLC]+\.?\s*([^\n]+)'
    ]

    for pattern in patterns:
        if match := re.search(pattern, text):
            title = match.group(1).strip()
            logger.debug("Found chapter title: {}", title)
            return title

    logger.debug("No chapter title found")
    return None
def downloadAndParsePdf(eli: str, chunk_size: int = 10, timeout: float = 60.0) -> list[DocumentChunk]:
    """Download an act's PDF from the Sejm ELI API and split it into page chunks.

    The PDF is written to a temporary file, read with pdfplumber in groups of
    ``chunk_size`` pages, and each group whose pages yield any text becomes one
    ``DocumentChunk``. The temporary file is deleted before returning, whether
    or not parsing succeeded.

    Args:
        eli: ELI identifier inserted into the API URL, e.g. ``"DU/2024/1573"``.
        chunk_size: Number of pages per chunk.
        timeout: Seconds to wait for the HTTP download before giving up.

    Returns:
        Chunks in page order. ``start_page`` and ``end_page`` are 0-based page
        indices and ``end_page`` is inclusive.

    Raises:
        requests.HTTPError: If the download returns an error status.
        requests.RequestException: On network failure or timeout.
    """
    import os  # local import: only needed here to delete the temporary PDF

    url = f"https://api.sejm.gov.pl/eli/acts/{eli}/text.pdf"
    chunks = []

    logger.info("Starting download of PDF for ELI: {}", eli)
    logger.debug("Downloading from URL: {}", url)

    # Download PDF (with a timeout so a stalled connection cannot hang the kernel)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    logger.success("Downloaded PDF successfully")

    # Save to temp file; delete=False lets pdfplumber reopen it by name after it is closed
    with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_pdf:
        temp_pdf.write(response.content)
        pdf_path = temp_pdf.name
        logger.debug("Saved PDF to temporary file: {}", pdf_path)

    try:
        # Process PDF
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            logger.info("Processing PDF with {} pages", total_pages)

            for start_page in range(0, total_pages, chunk_size):
                # end_page is exclusive inside the loop and stored as inclusive on the chunk
                end_page = min(start_page + chunk_size, total_pages)
                logger.debug("Processing chunk pages {}-{}", start_page, end_page - 1)
                chunk_text = []

                # Extract text from page range; pages without extractable text are skipped
                for page_num in range(start_page, end_page):
                    logger.trace("Extracting text from page {}", page_num)
                    page = pdf.pages[page_num]
                    text = page.extract_text()
                    if text:
                        chunk_text.append(clean_text(text))

                if chunk_text:
                    combined_text = '\n'.join(chunk_text)
                    logger.debug("Created chunk with {} characters", len(combined_text))

                    chunk = DocumentChunk(
                        start_page=start_page,
                        end_page=end_page - 1,
                        content=combined_text,
                        chapter_title=find_chapter_title(combined_text),
                        word_count=len(combined_text.split())
                    )
                    chunks.append(chunk)
                    logger.info(
                        "Added chunk: pages {}-{}, {} words",
                        chunk.start_page,
                        chunk.end_page,
                        chunk.word_count
                    )
    finally:
        # The file was created with delete=False, so it must be removed explicitly;
        # previously every call left a PDF behind in the temp directory.
        try:
            os.remove(pdf_path)
        except OSError as exc:
            logger.warning("Could not remove temporary file {}: {}", pdf_path, exc)

    logger.success("Completed processing PDF. Created {} chunks", len(chunks))
    return chunks


In [None]:
# Download and chunk one act; the returned list of chunks is rendered as the cell output.
downloadAndParsePdf("DU/2024/1573")

In [None]:
from pprint import pprint
import pandas as pd

# Load the previously saved table and keep its ELI column as a plain list.
em_df = pd.read_csv("embeded.csv")
ELIS = em_df["ELI"].tolist()

# Show the first title, then pretty-print the chunks parsed for one fixed act.
print(em_df['title'].iloc[0])
pprint(downloadAndParsePdf("DU/2024/1568"))

In [3]:
import base64
from dataclasses import dataclass
from typing import List, NamedTuple
import vertexai
from vertexai.generative_models import GenerativeModel
from vertexai.preview.prompts import Prompt
import pdfplumber
import requests
import tempfile
import re
import csv
from time import sleep
from loguru import logger


@dataclass
class ProcessedChunk:
    """Record pairing a page range with its chapter title and generated summary.

    NOTE(review): nothing in the cells visible here instantiates this class.
    """
    # First page of the chunk.
    start_page: int
    # Last page of the chunk; presumably follows the same convention as the CSV columns — confirm.
    end_page: int
    # Heading associated with the chunk, or None when there is none.
    chapter_title: str | None
    # Summary text for the chunk.
    summary: str


class ChapterTitle(NamedTuple):
    """One chapter/division heading found in an act's PDF.

    NOTE(review): a class with the same name and fields is defined again in a
    later cell; whichever cell runs last wins.
    """
    # ELI identifier of the act the heading belongs to.
    eli: str
    # Page on which the heading was found.
    page_number: int
    # Heading text.
    title: str


# System instruction passed to the model by create_prompt. The text is sent to
# the model verbatim, so any change to its wording changes the generated summaries.
SYSTEM_MESSAGE = """
You are a Polish legal assistant specializing in summarizing legal documents for vector models. Summaries must capture the essential legal provisions and their implications, phrased concisely without procedural or formal terminology.You don't include informations about author of act or its type. Responses should be provided as brief, points without bullet points or formatting.Max 5 sentences. You always respond in Polish language

Example of a correct response format:
Ustalenie limitów emisji dla przemysłu ciężkiego, których zakłady muszą przestrzegać, obowiązek kwartalnego monitorowania i raportowania poziomu zanieczyszczeń przez przedsiębiorstwa, wprowadzenie kar za przekroczenie limitów emisji w celu ochrony środowiska.
"""


def create_prompt(chunk_text: str) -> Prompt:
    """Build the summarisation ``Prompt`` for one fragment of an act.

    The fragment is bound to the ``fragment_aktu`` template variable and
    ``SYSTEM_MESSAGE`` is used as the system instruction.

    Args:
        chunk_text: Text of the fragment to summarise.

    Returns:
        A ``Prompt`` configured for ``gemini-1.5-flash-001``.
    """
    # The template is just the variable placeholder, preceded by a newline and four spaces.
    template = "\n    {fragment_aktu}"
    substitutions = {"fragment_aktu": [chunk_text]}
    sampling = {
        "max_output_tokens": 300,
        "temperature": 1,
        "top_p": 0.95,
    }

    return Prompt(
        prompt_data=[template],
        variables=[substitutions],
        system_instruction=SYSTEM_MESSAGE,
        model_name="gemini-1.5-flash-001",
        generation_config=sampling,
    )


def generate_summary(prompt: Prompt) -> str:
    """Run the prompt in streaming mode and return the reply as one tidy line.

    The prompt's first variable set is assembled into the request contents,
    the streamed response pieces are concatenated, and every run of
    whitespace (including newlines) is squeezed to a single space.

    Args:
        prompt: Prompt whose ``variables[0]`` holds the values to substitute.

    Returns:
        The whitespace-normalised response text.
    """
    contents = prompt.assemble_contents(**prompt.variables[0])
    stream = prompt.generate_content(contents=contents, stream=True)

    pieces = []
    for part in stream:
        pieces.append(part.text)

    raw_reply = "".join(pieces)
    return " ".join(raw_reply.split())


def process_legal_documents(
    elis: List[str],
    summaries_csv: str,
    titles_csv: str,
    chunk_size: int = 10,
    request_delay: float = 10,
    timeout: float = 60,
):
    """Summarise acts chunk by chunk and record their chapter headings.

    For every ELI the act's metadata and PDF are fetched from the Sejm API.
    Each "Rozdział"/"DZIAŁ" heading found on a page is written to
    ``titles_csv``; every group of ``chunk_size`` pages is summarised by the
    model and written to ``summaries_csv``. Both files are overwritten.

    A failure while processing one ELI is logged and the loop continues with
    the next one.

    Args:
        elis: ELI identifiers, e.g. ``["DU/2024/1583"]``.
        summaries_csv: Output path for the chunk summaries.
        titles_csv: Output path for the chapter headings.
        chunk_size: Number of pages per summarised chunk.
        request_delay: Seconds to sleep after each model call.
        timeout: Seconds to wait for each HTTP request.

    Notes:
        ``chunk_start_page`` is a 0-based page index; ``chunk_end_page`` is
        written as the exclusive end of the range (unchanged from before).
        NOTE(review): downloadAndParsePdf stores an inclusive end page —
        confirm which convention consumers of the CSV expect.
        The ``chapter_title`` column is always written empty.
    """
    import os  # local import: only needed to delete the temporary PDF

    vertexai.init(project="sejm-stats-439117", location="us-central1")
    # Chunks already summarised per ELI, so an ELI listed twice is not summarised twice.
    processed_chunks = {}

    title_patterns = [
        r"Rozdział\s+[\dIVXLC]+\.?\s*([^\n]+)",
        r"DZIAŁ\s+[\dIVXLC]+\.?\s*([^\n]+)",
    ]

    with open(summaries_csv, "w", newline="", encoding="utf-8") as f1, open(
        titles_csv, "w", newline="", encoding="utf-8"
    ) as f2:
        summary_writer = csv.writer(f1)
        titles_writer = csv.writer(f2)

        summary_writer.writerow(
            [
                "ELI",
                "chunk_start_page",
                "chunk_end_page",
                "chapter_title",
                "summary",
                "keywords",
                "title",
            ]
        )
        titles_writer.writerow(["ELI", "page_number", "title", "keywords", "doc_title"])

        for eli in elis:
            pdf_path = None
            try:
                # Fetch metadata
                metadata_response = requests.get(
                    f"https://api.sejm.gov.pl/eli/acts/{eli}/", timeout=timeout
                )
                metadata_response.raise_for_status()
                metadata = metadata_response.json()
                # `or []` also covers a metadata record whose keywords value is null.
                keywords = ",".join(metadata.get("keywords") or [])
                doc_title = metadata.get("title", "")

                # Download PDF
                response = requests.get(
                    f"https://api.sejm.gov.pl/eli/acts/{eli}/text.pdf", timeout=timeout
                )
                response.raise_for_status()

                # Close the temp file before reading it back, so all bytes are
                # on disk when pdfplumber opens it by name.
                with tempfile.NamedTemporaryFile(
                    suffix=".pdf", delete=False
                ) as temp_pdf:
                    temp_pdf.write(response.content)
                    pdf_path = temp_pdf.name

                # Extract each page once and reuse the text for both outputs.
                with pdfplumber.open(pdf_path) as pdf:
                    page_texts = [page.extract_text() or "" for page in pdf.pages]

                # Process chapter titles
                for page_num, text in enumerate(page_texts):
                    for pattern in title_patterns:
                        for match in re.finditer(pattern, text):
                            titles_writer.writerow(
                                [
                                    eli,
                                    page_num,
                                    match.group(1).strip(),
                                    keywords,
                                    doc_title,
                                ]
                            )

                # Process chunks for summaries
                done_chunks = processed_chunks.setdefault(eli, set())
                total_pages = len(page_texts)
                for start_page in range(0, total_pages, chunk_size):
                    end_page = min(start_page + chunk_size, total_pages)
                    chunk_key = (start_page, end_page)
                    if chunk_key in done_chunks:
                        continue

                    # Join pages with a newline so the last word of one page is
                    # not fused with the first word of the next.
                    chunk_text = "\n".join(
                        text for text in page_texts[start_page:end_page] if text
                    )
                    if not chunk_text.strip():
                        # Nothing extractable (e.g. image-only pages): do not
                        # send an empty prompt to the model.
                        logger.warning(
                            f"No text in pages {start_page}-{end_page - 1} of {eli}; skipping summary"
                        )
                        done_chunks.add(chunk_key)
                        continue

                    summary = generate_summary(create_prompt(chunk_text))
                    summary_writer.writerow(
                        [
                            eli,
                            start_page,
                            end_page,
                            "",
                            summary,
                            keywords,
                            doc_title,
                        ]
                    )
                    done_chunks.add(chunk_key)
                    # Pause between model calls.
                    sleep(request_delay)

                logger.success(f"Completed processing: {eli}")

            except Exception as e:
                # Best effort: report the failure and carry on with the next ELI.
                logger.error(f"Error processing {eli}: {str(e)}")
                continue
            finally:
                # delete=False means the file has to be removed explicitly.
                if pdf_path is not None:
                    try:
                        os.remove(pdf_path)
                    except OSError as cleanup_error:
                        logger.warning(
                            f"Could not remove temporary file {pdf_path}: {cleanup_error}"
                        )


# Entry point of this cell: summarise the listed acts and write both CSV files.
# In a notebook kernel __name__ is "__main__", so this runs whenever the cell is executed.
if __name__ == "__main__":
    elis_to_process = [
        "DU/2024/1583",
        # "DU/2024/1575",
        # "DU/2024/1576",
    ]
    process_legal_documents(
        elis_to_process, "legal_acts_summaries.csv", "legal_acts_titles.csv"
    )

Assembled prompt replacing: 1 instances of variable fragment_aktu
Assembled prompt replacing: 1 instances of variable fragment_aktu
Assembled prompt replacing: 1 instances of variable fragment_aktu
Assembled prompt replacing: 1 instances of variable fragment_aktu
Assembled prompt replacing: 1 instances of variable fragment_aktu


[32m2024-10-31 15:07:43.026[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mprocess_legal_documents[0m:[36m164[0m - [32m[1mCompleted processing: DU/2024/1583[0m


In [39]:
from typing import List, NamedTuple
import re
import requests
import pdfplumber
import tempfile
from loguru import logger

class ChapterTitle(NamedTuple):
    """One chapter/division heading found in an act's PDF.

    NOTE(review): an identical class is also defined in an earlier cell;
    running this cell rebinds the name.
    """
    # ELI identifier of the act the heading belongs to.
    eli: str
    # 0-based index of the page the heading was found on (taken from enumerate over the PDF pages).
    page_number: int
    # Heading text captured after the "Rozdział"/"DZIAŁ" marker.
    title: str

def extract_chapter_titles(eli: str, timeout: float = 60.0) -> List[ChapterTitle]:
    """
    Extract all chapter titles from a legal document PDF.

    The PDF is downloaded from the Sejm ELI API into a temporary file, every
    page is searched for "Rozdział <number>" and "DZIAŁ <number>" markers, and
    the text following each marker is recorded. The temporary file is removed
    before returning.

    Args:
        eli: ELI identifier (e.g., 'DU/2024/1583')
        timeout: Seconds to wait for the HTTP download.

    Returns:
        List of ChapterTitle objects containing title and 0-based page index.
        For each page, all "Rozdział" matches come before all "DZIAŁ" matches.
        Any error (network, HTTP status, PDF parsing) is logged and results in
        an empty list.
    """
    import os  # local import: only needed to delete the temporary PDF

    url = f"https://api.sejm.gov.pl/eli/acts/{eli}/text.pdf"
    titles = []

    # Patterns for matching different types of section titles
    patterns = [
        r'Rozdział\s+[\dIVXLC]+\.?\s*([^\n]+)',
        r'DZIAŁ\s+[\dIVXLC]+\.?\s*([^\n]+)'
    ]

    pdf_path = None
    try:
        # Download PDF
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Save to temp file; it is closed before pdfplumber reopens it by name
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_pdf:
            temp_pdf.write(response.content)
            pdf_path = temp_pdf.name

        # Process PDF page by page
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages):
                text = page.extract_text()
                if not text:
                    continue
                for pattern in patterns:
                    for match in re.finditer(pattern, text):
                        titles.append(ChapterTitle(
                            eli=eli,
                            page_number=page_num,
                            title=match.group(1).strip()
                        ))

        return titles

    except Exception as e:
        logger.error(f"Error processing {eli}: {str(e)}")
        return []

    finally:
        # delete=False means the file has to be removed explicitly; previously
        # every call left a PDF behind in the temp directory.
        if pdf_path is not None:
            try:
                os.remove(pdf_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary file {pdf_path}: {cleanup_error}")

def extract_titles_batch(elis: List[str], output_csv_path: str):
    """
    Write the chapter titles of several acts into a single CSV file.

    The file is overwritten and starts with the header row
    ``ELI, page_number, title``; one row follows per heading returned by
    ``extract_chapter_titles``, in the order the ELIs were given.

    Args:
        elis: List of ELI identifiers
        output_csv_path: Path to output CSV file
    """
    import csv

    with open(output_csv_path, 'w', newline='', encoding='utf-8') as out_file:
        rows = csv.writer(out_file)
        rows.writerow(['ELI', 'page_number', 'title'])

        for eli in elis:
            found = extract_chapter_titles(eli)
            rows.writerows(
                [hit.eli, hit.page_number, hit.title] for hit in found
            )
            logger.info(f"Processed {eli}: found {len(found)} titles")

# Entry point of this cell: extract chapter titles for the listed acts into one CSV.
# In a notebook kernel __name__ is "__main__", so this runs whenever the cell is executed.
if __name__ == "__main__":
    # Acts to scan for chapter headings
    elis_to_process = [
        "DU/2024/1583",
        "DU/2024/1575",
        "DU/2024/1587",
        "DU/2024/1576",
        "DU/2024/1578",
        "DU/2024/1584",
        "DU/2024/1569",
        "DU/2024/1570",
        "DU/2024/1557",
        "DU/2024/1556",
        "DU/2022/1225",
    ]
    # NOTE(review): the summarisation cell writes its titles output to this same
    # file name with a different set of columns — confirm which output should win.
    output_csv = "legal_acts_titles.csv"
    extract_titles_batch(elis_to_process, output_csv)

[32m2024-10-31 14:02:18.720[0m | [1mINFO    [0m | [36m__main__[0m:[36mextract_titles_batch[0m:[36m84[0m - [1mProcessed DU/2024/1583: found 19 titles[0m
[32m2024-10-31 14:02:19.324[0m | [1mINFO    [0m | [36m__main__[0m:[36mextract_titles_batch[0m:[36m84[0m - [1mProcessed DU/2024/1575: found 0 titles[0m
[32m2024-10-31 14:02:20.715[0m | [1mINFO    [0m | [36m__main__[0m:[36mextract_titles_batch[0m:[36m84[0m - [1mProcessed DU/2024/1587: found 0 titles[0m
[32m2024-10-31 14:02:25.489[0m | [1mINFO    [0m | [36m__main__[0m:[36mextract_titles_batch[0m:[36m84[0m - [1mProcessed DU/2024/1576: found 7 titles[0m
[32m2024-10-31 14:02:26.863[0m | [1mINFO    [0m | [36m__main__[0m:[36mextract_titles_batch[0m:[36m84[0m - [1mProcessed DU/2024/1578: found 0 titles[0m
[32m2024-10-31 14:02:29.166[0m | [1mINFO    [0m | [36m__main__[0m:[36mextract_titles_batch[0m:[36m84[0m - [1mProcessed DU/2024/1584: found 8 titles[0m
[32m2024-10-31 14:02

In [None]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting


def generate(contents=None, safety_settings=None):
    """Stream a Gemini response for ``contents`` and print it as it arrives.

    Args:
        contents: Prompt contents handed to ``GenerativeModel.generate_content``
            (for example a string or a list of parts). When nothing is
            supplied the function prints a notice and returns without
            contacting the service.
        safety_settings: Optional safety settings forwarded to the model;
            ``None`` leaves the service defaults in place.

    Returns:
        None. The generated text is written to stdout.

    Notes:
        Reads the module-level ``generation_config`` at call time, so that
        name must be defined before the function is called.
        Previously the function referenced a ``safety_settings`` name that was
        never defined and always sent an empty contents list, so every call
        failed.
    """
    if not contents:
        print("generate(): no prompt contents supplied; skipping the model call.")
        return

    vertexai.init(project="sejm-stats-439117", location="us-central1")
    model = GenerativeModel(
        "gemini-1.5-flash-002",
    )
    responses = model.generate_content(
        contents,
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=True,
    )

    # Print each streamed piece without adding line breaks between them.
    for response in responses:
        print(response.text, end="")


# Sampling settings that generate() reads from module scope when it is called.
generation_config = {
    "max_output_tokens": 474,
    "temperature": 1,
    "top_p": 0.95,
}

# Invoked without any prompt text.
generate()

In [None]:
# Reload the generated summaries and show the summary column as the cell output.
# Relies on `pd` having been imported by an earlier cell.
summaries = pd.read_csv("legal_acts_summaries.csv")
summaries["summary"]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): `phrase_embedding` and `titles` are not defined in any cell
# visible here — confirm they exist in the kernel and that `titles` lines up
# row-for-row with `embeddings`.
similarities = cosine_similarity(phrase_embedding, embeddings)

# Indices of the (up to) five highest similarity scores, best first.
top_5_indices = similarities[0].argsort()[::-1][:5]

# For each hit, print the title on one line and its similarity score on the next.
for index in top_5_indices:
    print(titles[index], similarities[0][index], sep="\n")