In [None]:
import requests
import os
import time
import json

import warnings
# Silences every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and library warnings that may matter
# here; consider narrowing the filter to a specific category or module.
warnings.filterwarnings('ignore')

In [None]:
# Root URL of the acts API; publisher/year path segments are appended to it.
BASE = "https://api.sejm.gov.pl/eli/acts"

# Directory where downloaded files are written. It can be overridden without
# editing the notebook by setting the RAG_LEX_RAW_DATA environment variable;
# when the variable is unset the original location is used.
RAW_DATA = os.environ.get(
    "RAG_LEX_RAW_DATA",
    "/Users/mateuszbulanda-gorol/Desktop/Projects/rag_lex_project/data/raw_data",
)

In [None]:
def mkdir_if_not(path):
    """
    Create the directory `path`, including missing parents, if it is absent.

    Calling it on an already existing directory is a no-op.

    Raises:
        FileExistsError: if `path` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test and still fails loudly when a regular file
    # occupies the target path.
    os.makedirs(path, exist_ok=True)

In [None]:
def fetch_acts_for_year(publisher:str, year:int, limit:int = 100, offset:int = 0, timeout:float = 30):
    """
    Retrieve one page of the act listing for a given publisher and year.

    Args:
        publisher: Publisher code used as a path segment of the request URL.
        year: Year used as a path segment of the request URL.
        limit: Value sent as the `limit` query parameter.
        offset: Value sent as the `offset` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within `timeout`.
    """
    url = f"{BASE}/{publisher}/{year}"
    params = {
        "limit": limit,
        "offset": offset
    }
    # Without an explicit timeout requests may wait indefinitely on a stalled
    # connection, which would hang the whole notebook run.
    resp = requests.get(
        url,
        params=params,
        headers={"Accept": "application/json"},
        timeout=timeout,
    )
    resp.raise_for_status()
    return resp.json()

In [None]:
def fetch_act_text(publisher:str, year:int, num:int, as_pdf:bool = False, timeout:float = 30):
    """
    Retrieve the text of a single act as HTML or PDF.

    Args:
        publisher: Publisher code used as a path segment of the request URL.
        year: Year used as a path segment of the request URL.
        num: Act number used as a path segment of the request URL.
        as_pdf: When True request `text.pdf`, otherwise `text.html`.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        `bytes` (raw response body) when `as_pdf` is True, otherwise `str`
        (response body decoded by requests).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within `timeout`.
    """
    ext = "pdf" if as_pdf else "html"
    url = f"{BASE}/{publisher}/{year}/{num}/text.{ext}"
    accept = "application/octet-stream" if as_pdf else "text/html"
    # Without an explicit timeout requests may wait indefinitely on a stalled
    # connection, which would hang the whole download loop.
    resp = requests.get(url, headers={"Accept": accept}, timeout=timeout)
    resp.raise_for_status()
    # NOTE(review): resp.text relies on the charset requests derives from the
    # response headers; confirm the API declares one, otherwise non-ASCII
    # characters may be decoded incorrectly.
    return resp.content if as_pdf else resp.text

In [None]:
def fetch_and_save(publisher:str, year:int, limit:int=100, offset:int=0, save_dir:str="data_raw"):
    """
    Download one page of acts for a publisher/year and save their texts locally.

    For every listed act the HTML text is preferred; the PDF is used only when
    no HTML is flagged as available. Acts with neither are skipped. A failure
    on one act is reported on stdout and does not stop the remaining downloads.
    A JSON manifest describing the saved files is written to
    `<save_dir>/<publisher>_<year>_manifest.json`.

    Args:
        publisher: Publisher code passed on to the API helpers.
        year: Year passed on to the API helpers.
        limit: Page size passed to `fetch_acts_for_year`.
        offset: Page offset passed to `fetch_acts_for_year`.
        save_dir: Directory that receives the files; created when missing.

    Returns:
        A list of dicts, one per successfully saved act, with the keys
        `publisher`, `year`, `pos`, `ELI`, `title`, `filename`, `path`,
        `has_html` and `has_pdf`.

    Raises:
        Whatever `fetch_acts_for_year` raises when the listing request fails;
        only per-act errors are caught.
    """
    # make sure the output directory exists
    mkdir_if_not(save_dir)
    # fetch the list of acts
    acts_data = fetch_acts_for_year(publisher, year, limit, offset)
    # `or []` also covers a response where "items" is present but null
    items = acts_data.get("items") or []
    manifest = []
    for act in items:
        # fields read from each entry: 'pos', 'ELI', 'title', 'textHTML', 'textPDF'
        pos = act.get("pos")
        eli = act.get("ELI")
        title = act.get("title")
        has_html = act.get("textHTML", False)
        has_pdf = act.get("textPDF", False)

        # prefer HTML when available, otherwise fall back to PDF
        if has_html:
            as_pdf, ext, mode, encoding = False, "html", "w", "utf-8"
        elif has_pdf:
            as_pdf, ext, mode, encoding = True, "pdf", "wb", None
        else:
            # no text at all, skip (no request was made, so no delay needed)
            print(f"No text for act {publisher} {year} {pos}")
            continue

        fname = f"{publisher}_{year}_{pos}.{ext}"
        path = os.path.join(save_dir, fname)
        try:
            content = fetch_act_text(publisher, year, pos, as_pdf=as_pdf)
            # encoding=None is the correct value for binary mode, so a single
            # open() call serves both the text and the bytes case
            with open(path, mode, encoding=encoding) as f:
                f.write(content)

            manifest.append({
                "publisher": publisher,
                "year": year,
                "pos": pos,
                "ELI": eli,
                "title": title,
                "filename": fname,
                "path": path,
                "has_html": has_html,
                "has_pdf": has_pdf
            })
        except Exception as e:
            # best effort: report the failure and carry on with the next act
            print(f"Error fetching act {publisher} {year} {pos}: {e}")
        finally:
            # small delay so the API is not overloaded; in `finally` so that a
            # run of failing requests is throttled as well
            time.sleep(0.2)

    # write the manifest
    manifest_path = os.path.join(save_dir, f"{publisher}_{year}_manifest.json")
    with open(manifest_path, "w", encoding="utf-8") as mf:
        json.dump(manifest, mf, ensure_ascii=False, indent=2)

    return manifest

In [None]:
# Download up to 50 acts for the chosen publisher/year into RAW_DATA and
# report how many were actually saved.
publisher = "DU"
year = 2020
manifest = fetch_and_save(
    publisher,
    year,
    limit=50,
    offset=0,
    save_dir=RAW_DATA,
)
print("Fetched acts:", len(manifest))