In [38]:
import camelot
import pandas as pd

In [118]:
def merge_multipage_table_to_csv(
    pdf_path,
    pages="all",
    flavor="lattice",
    output_file="merged_table.csv",
    table_areas=("28,813,567,27",),
    columns=("60,144,267,332,422,497",),
    row_tol=9,
):
    """
    Parse a table that spans multiple pages and merge into a single CSV file

    Every detected table is cleaned (line breaks removed, whitespace
    stripped), repeated header rows are skipped, the first remaining row is
    promoted to column names, blank rows are removed and the result is
    written to ``output_file``.

    Parameters:
    -----------
    pdf_path : str
        Path to the PDF file
    pages : str
        Pages to parse. Can be 'all', '1,2,3', or '1-3'
    flavor : str
        'lattice' for tables with lines, 'stream' for tables without lines
    output_file : str
        Name of the output CSV file
    table_areas : sequence of str, str or None
        Table regions handed to camelot as ``table_areas``. ``None`` lets
        camelot detect the regions itself.
    columns : sequence of str, str or None
        Column separator x-coordinates handed to camelot as ``columns``.
        Only forwarded when ``flavor`` is not 'lattice'.
    row_tol : int or None
        Row tolerance handed to camelot as ``row_tol``. Only forwarded when
        ``flavor`` is not 'lattice'.

    Returns:
    --------
    tuple
        ``(merged_df, tables)`` where ``merged_df`` is the merged
        pandas.DataFrame and ``tables`` is the object returned by
        ``camelot.read_pdf``. ``(None, None)`` when no table or no data row
        was found.
    """

    def _as_list(value):
        # A bare string must stay one entry, not be split into characters.
        return [value] if isinstance(value, str) else list(value)

    read_kwargs = {"pages": pages, "flavor": flavor}
    if table_areas is not None:
        read_kwargs["table_areas"] = _as_list(table_areas)
    # `columns` and `row_tol` are text-based parser options; camelot refuses
    # them together with flavor="lattice", so only forward them otherwise.
    if flavor != "lattice":
        if columns is not None:
            read_kwargs["columns"] = _as_list(columns)
        if row_tol is not None:
            read_kwargs["row_tol"] = row_tol

    # Read PDF tables
    print(f"Reading tables from {pdf_path}...")
    tables = camelot.read_pdf(pdf_path, **read_kwargs)

    if len(tables) == 0:
        print("No tables found in the PDF")
        return None, None

    print(f"Found {len(tables)} table(s) across pages")

    # Collect the per-page frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the accumulated rows on every page.
    page_frames = []
    for i, table in enumerate(tables):
        df = table.df.apply(_clean_text_column)

        print(f"  Page table {i+1}: Shape {df.shape}")

        if i == 0:
            # First table: drop the two leading rows so the header row comes
            # first (presumably title lines above the header; confirm
            # against the PDF layout).
            page_frames.append(df.iloc[2:])
        else:
            # Later tables: skip the repeated header row.
            page_frames.append(df.iloc[1:])

    merged_df = pd.concat(page_frames, ignore_index=True)

    if merged_df.empty:
        # Nothing left after header skipping; there is no row to promote.
        print("No data rows found in the extracted tables")
        return None, None

    # Promote the first remaining row to column names. A plain list keeps the
    # row label from leaking into the columns' name.
    header = merged_df.iloc[0].astype(str).tolist()
    merged_df = merged_df.iloc[1:].reset_index(drop=True)
    merged_df.columns = header

    # Remove any completely empty rows. Extracted cells are strings, so an
    # empty cell is "" rather than NaN and dropna() alone never matches.
    is_blank_row = (merged_df.isna() | (merged_df == "")).all(axis=1)
    merged_df = merged_df.loc[~is_blank_row].reset_index(drop=True)

    # Save to CSV
    merged_df.to_csv(output_file, index=False)
    print(f"\nMerged table saved to: {output_file}")
    print(f"Final table shape: {merged_df.shape}")

    return merged_df, tables


def _clean_text_column(col):
    """Remove line breaks and surrounding whitespace from a text column."""
    # Accept object columns as well as pandas' dedicated string dtypes.
    is_text = pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col)
    if not is_text:
        return col
    return (
        col.astype(str)
        .str.replace("\n", "", regex=False)
        .str.replace("\r", "", regex=False)
        .str.strip()
    )

In [119]:
if __name__ == "__main__":
    # Parse pages 3-350 of the PNA listing with the stream parser and write
    # the merged table to merged_output.csv. `tables` keeps the raw camelot
    # results for inspection.
    df, tables = merge_multipage_table_to_csv(
        "oficjalny_spis_pna_2025.pdf",
        pages="3-350",
        flavor="stream",
        output_file="merged_output.csv",
    )

Reading tables from oficjalny_spis_pna_2025.pdf...
Found 348 table(s) across pages
  Page table 1: Shape (76, 7)
  Page table 2: Shape (81, 7)
  Page table 3: Shape (81, 7)
  Page table 4: Shape (81, 7)
  Page table 5: Shape (81, 7)
  Page table 6: Shape (80, 7)
  Page table 7: Shape (81, 7)
  Page table 8: Shape (81, 7)
  Page table 9: Shape (81, 7)
  Page table 10: Shape (81, 7)
  Page table 11: Shape (81, 7)
  Page table 12: Shape (81, 7)
  Page table 13: Shape (81, 7)
  Page table 14: Shape (81, 7)
  Page table 15: Shape (81, 7)
  Page table 16: Shape (81, 7)
  Page table 17: Shape (81, 7)
  Page table 18: Shape (81, 7)
  Page table 19: Shape (79, 7)
  Page table 20: Shape (81, 7)
  Page table 21: Shape (81, 7)
  Page table 22: Shape (81, 7)
  Page table 23: Shape (79, 7)
  Page table 24: Shape (80, 7)
  Page table 25: Shape (81, 7)
  Page table 26: Shape (80, 7)
  Page table 27: Shape (81, 7)
  Page table 28: Shape (81, 7)
  Page table 29: Shape (80, 7)
  Page table 30: Shape (81,

In [120]:
# Reload the merged table from the CSV written by the parsing step
df = pd.read_csv("merged_output.csv")

# Keep only the rows whose PNA cell is NaN or an empty string
missing_pna = df.loc[df["PNA"].isna() | df["PNA"].eq("")]

missing_pna

Unnamed: 0,PNA,Miejscowość,Ulica,Numery,Gmina,Powiat,Województwo
133,,,,12-15,,,
1693,,,,"28-102(p), 115",,,
1718,,,,94-392(p),,,
2061,,,,69-DK(n),,,
2065,,,,4-26a(p),,,
...,...,...,...,...,...,...,...
27171,,,,6-14(p),,,
27228,,,,29-39a(n),,,
27413,,,,"180-266(p),",,,
27414,,,,"199-DK,",,,


In [128]:
postal_code_pattern = r"^\d{2}-\d{3}$"

# Rows with a PNA value present whose stripped text is not of the form NN-NNN.
# Written as "not (missing or matching)", which selects the same rows as
# "present and not matching".
invalid_pna = df.loc[
    ~(
        df["PNA"].isna()
        | df["PNA"].astype(str).str.strip().str.match(postal_code_pattern)
    )
]

invalid_pna

Unnamed: 0,PNA,Miejscowość,Ulica,Numery,Gmina,Powiat,Województwo
