In [None]:
import os
import re

import pypdf
import torch
import transformers

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Pages are joined with a newline so that the last line of one page and the
    first line of the next remain separate lines. Line-anchored clean-up
    (for example removing lines that contain only a page number) relies on
    page boundaries also being line boundaries.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The text of all pages in document order, separated by a newline.
        A page for which no text is extracted contributes an empty string.
    """
    with open(pdf_path, "rb") as file:
        pdf_reader = pypdf.PdfReader(file)
        # `or ""` keeps the join safe if a page yields nothing (None or "").
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "\n".join(page_texts)

def clean_text(text, sections_to_remove=None):
    """Strip page furniture and trailing attachments from extracted bill text.

    Processing order:
      1. Lines consisting only of digits (page numbers) are blanked.
      2. A "Projekt" marker at the end of a line is removed.
      3. The text is cut off at the first occurrence of each heading in
         ``sections_to_remove``; everything from the heading onward is dropped.
      4. All whitespace runs are collapsed to a single space and the result
         is stripped.

    Headings are matched case-insensitively and with flexible whitespace
    between their words, because PDF extraction may wrap a heading across
    lines or insert extra spaces before whitespace is normalized in step 4.

    Args:
        text: Raw text to clean.
        sections_to_remove: Optional iterable of literal heading strings at
            which the text is truncated. Defaults to the headings of the
            justification and regulatory-impact attachments.

    Returns:
        str: The cleaned, single-spaced text.
    """
    if sections_to_remove is None:
        sections_to_remove = (
            'UZASADNIENIE',
            'OCENA SKUTKÓW REGULACJI',
            'Nazwa projektu',
            'Ministerstwo wiodące i ministerstwa współpracujące',
        )

    # Remove headers, footers, and page numbers
    cleaned_text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    cleaned_text = re.sub(r'Projekt\s*$', '', cleaned_text, flags=re.MULTILINE)

    # Truncate at each heading; later headings only see what is still left.
    for heading in sections_to_remove:
        words = heading.split()
        if not words:
            # An empty heading would match at position 0 and erase everything.
            continue
        # Allow any whitespace (including line breaks) between heading words.
        pattern = r'\s+'.join(re.escape(word) for word in words)
        match = re.search(pattern, cleaned_text, flags=re.IGNORECASE)
        if match:
            cleaned_text = cleaned_text[:match.start()]

    # Remove extra whitespace
    return re.sub(r'\s+', ' ', cleaned_text).strip()

def create_ai_prompt(cleaned_text):
    """Build the Polish summarization prompt around the cleaned bill text.

    The prompt has three parts separated by blank lines: the instruction,
    the bill text under a "Projekt ustawy:" label, and a trailing
    "Podsumowanie:" cue for the model to continue from.

    Args:
        cleaned_text: Bill text to embed in the prompt.

    Returns:
        str: The assembled prompt.
    """
    instruction = (
        "Podsumuj poniższy projekt ustawy, koncentrując się na głównych zmianach "
        "i ich potencjalnym wpływie. Ogranicz podsumowanie do około 200 słów."
    )
    bill_section = f"Projekt ustawy:\n{cleaned_text}"
    answer_cue = "Podsumowanie:"
    return "\n\n".join([instruction, bill_section, answer_cue])


: 

In [None]:

# Text-generation pipelines already loaded in this kernel, keyed by model id,
# so repeated calls reuse the loaded model instead of reloading it each time.
_PIPELINE_CACHE = {}


def _get_pipeline(model_id):
    """Return the text-generation pipeline for ``model_id``, loading it on first use."""
    if model_id not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[model_id] = transformers.pipeline(
            task="text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            # Device placement is a pipeline-level argument. Putting
            # "device" inside model_kwargs forwards it to model loading,
            # where it is not an accepted option.
            device_map="auto",
        )
    return _PIPELINE_CACHE[model_id]


def run_ai_model(prompt, model_id="meta-llama/Meta-Llama-3.1-8B"):
    """Generate a completion for ``prompt`` with a chat-formatted text-generation model.

    The prompt is wrapped in a system + user message pair, rendered with the
    tokenizer's chat template, and sampled with temperature 0.6 / top-p 0.9
    for at most 256 new tokens.

    Args:
        prompt: The user message content to send to the model.
        model_id: Hugging Face model identifier to load. The pipeline for
            each identifier is loaded once and then reused.

    Returns:
        str: Only the newly generated text (the prompt is not included).
    """
    # NOTE(review): the previous code first assigned
    # "RemekLlama-3-8B-Omnibus-1-PL-v01-INSTRUCT" and immediately overwrote
    # it, so it was never used. Pass it via `model_id` if it is the intended
    # model. The default is a base (non-Instruct) checkpoint; confirm its
    # tokenizer ships a chat template.
    pipeline = _get_pipeline(model_id)

    messages = [
        {"role": "system", "content": "You are a helpful, smart, kind, and efficient AI assistant."},
        {"role": "user", "content": prompt},
    ]

    full_prompt = pipeline.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    outputs = pipeline(
        full_prompt,
        max_new_tokens=256,
        eos_token_id=[pipeline.tokenizer.eos_token_id],
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        # Ask the pipeline for the continuation only, instead of slicing the
        # prompt off the returned string by length.
        return_full_text=False,
    )

    return outputs[0]['generated_text']



In [None]:
def summarize_projekt_ustawy(pdf_path):
    """Summarize a draft bill PDF end to end.

    Runs the four pipeline stages in order: extract the PDF text, clean it,
    wrap it in the summarization prompt, and send that prompt to the model.

    Args:
        pdf_path: Path of the bill PDF to summarize.

    Returns:
        Whatever ``run_ai_model`` returns for the generated prompt.
    """
    # Extract, then clean, the bill text.
    bill_text = clean_text(extract_text_from_pdf(pdf_path))

    # Build the prompt and hand it to the model.
    return run_ai_model(create_ai_prompt(bill_text))

# Example usage
# Replace the placeholder with the path of a real bill PDF. The guard keeps
# "Restart & Run All" from failing with FileNotFoundError while the
# placeholder is still in place.
pdf_path = "path/to/your/projekt_ustawy.pdf"
if os.path.isfile(pdf_path):
    summary = summarize_projekt_ustawy(pdf_path)
    print(summary)
else:
    summary = None
    print(f"PDF not found: {pdf_path!r}. Set pdf_path to an existing file to generate a summary.")