# Polish Postal Codes PDF Parser - Clean Version

This notebook parses the Polish postal codes PDF and converts it to CSV format.
It handles complex address ranges and compound municipality names such as 'Nowe Miasto'.


In [1]:
import pdfplumber
import pandas as pd
import re
from typing import List, Dict, Optional
import numpy as np

# Confirm that every import above succeeded before later cells run.
print("Libraries imported successfully")

Libraries imported successfully


In [2]:
# Column structure of the exported CSV, in output order. The parser stores the
# postal code under "PNA", the locality under "Miejscowość", the street under
# "Ulica" and the house numbers under "Numery", followed by gmina
# (municipality), powiat and voivodeship.
COLUMNS = ["PNA", "Miejscowość", "Ulica", "Numery", "Gmina", "Powiat", "Województwo"]


def clean_text(text: str) -> str:
    """Normalise whitespace in text extracted from the PDF.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed into one space. Falsy
    input (``None`` or an empty string) yields ``""``.
    """
    if text:
        trimmed = text.strip()
        return re.sub(r"\s+", " ", trimmed)
    return ""


def is_postal_code(text: str) -> bool:
    """Tell whether ``text`` is a Polish postal code (XX-XXX).

    Surrounding whitespace is ignored; the remainder must be exactly two
    digits, a hyphen and three digits. Falsy input returns ``False``.
    """
    if text:
        candidate = text.strip()
        return re.match(r"^\d{2}-\d{3}$", candidate) is not None
    return False


# Confirm that the helper definitions in this cell were executed.
print("Helper functions defined")

Helper functions defined


In [3]:
# Substrings that identify page headers/footers rather than data rows.
_SKIP_PATTERNS = (
    "Poczta Polska",
    "Oficjalny Spis",
    "Strona",
    "Copyright",
    "PNA Miejscowość",
    "Część 1",
    "miejscowości i ulic",
)

# The sixteen voivodeship names recognised in the last column.
_VOIVODESHIPS = frozenset(
    {
        "mazowieckie",
        "śląskie",
        "wielkopolskie",
        "małopolskie",
        "lubelskie",
        "podkarpackie",
        "dolnośląskie",
        "kujawsko-pomorskie",
        "pomorskie",
        "łódzkie",
        "zachodniopomorskie",
        "lubuskie",
        "podlaskie",
        "świętokrzyskie",
        "opolskie",
        "warmińsko-mazurskie",
    }
)

# Two-word gmina names that must be kept together. This list is not
# exhaustive: any other multi-word gmina is still split at the last space.
_COMPOUND_GMINAS = frozenset(
    {
        "Nowe Miasto",
        "Stare Miasto",
        "Biała Rawska",
        "Góra Kalwaria",
        "Nowa Dęba",
        "Stary Sącz",
        "Nowy Dwór",
        "Biała Podlaska",
        "Pruszcz Gdański",
        "Nowy Tomyśl",
        "Stary Dzierzgoń",
    }
)

# A token belongs to the house-number column when it contains a digit, a
# parenthesised qualifier such as "(n)", or is exactly "DK" ...
_NUMBER_TOKEN_RE = re.compile(r"\d|\(.*\)|^DK$")
# ... or when it is one of these bare separator/marker tokens.
_NUMBER_MARKERS = frozenset({",", "-", "n", "p"})


def _is_number_token(token: str) -> bool:
    """Return True when ``token`` looks like part of a house-number range."""
    return bool(_NUMBER_TOKEN_RE.search(token)) or token in _NUMBER_MARKERS


def _split_street_and_numbers(tokens: List[str]) -> tuple:
    """Split the tokens between locality and gmina into ``(street, numbers)``.

    Everything from the first to the last number-like token is the number
    range (words embedded inside a range stay with it). Tokens before the
    first number-like token are the street name. Tokens after the last one
    are also kept in the street, as before.
    """
    numeric_at = [idx for idx, tok in enumerate(tokens) if _is_number_token(tok)]
    if not numeric_at:
        return " ".join(tokens), ""

    first, last = numeric_at[0], numeric_at[-1]
    street_tokens = tokens[:first] + tokens[last + 1 :]
    number_tokens = tokens[first : last + 1]
    return " ".join(street_tokens), " ".join(number_tokens)


def parse_postal_record(line: str) -> Optional[Dict[str, str]]:
    """Parse one text line of the postal-code table into a record.

    A data row reads, left to right: postal code, locality, optional street,
    optional house numbers, gmina, powiat, voivodeship. The columns are only
    separated by spaces, so the split is heuristic:

    * the voivodeship is located by scanning tokens from the end of the line;
    * the single token before it is taken as the powiat;
    * the gmina is the token before that, or two tokens when they form one of
      the known compound gmina names;
    * the first remaining token is the locality;
    * what is left is divided into street and house numbers.

    Known limitations: multi-word localities and multi-word powiats are not
    recognised (their extra words end up in neighbouring columns), and a
    street whose name starts with a number is treated as part of the numbers.

    Args:
        line: Raw text line extracted from the PDF.

    Returns:
        A dict with the keys "PNA", "Miejscowość", "Ulica", "Numery", "Gmina",
        "Powiat" and "Województwo" ("Ulica"/"Numery" may be empty strings), or
        ``None`` when the line is empty, matches a header/footer pattern, does
        not start with a postal code, has no recognisable voivodeship, or has
        too few tokens to fill locality, gmina and powiat.
    """
    line = clean_text(line)
    if not line:
        return None

    # Skip header/footer lines
    if any(pattern in line for pattern in _SKIP_PATTERNS):
        return None

    parts = line.split()
    if len(parts) < 4 or not is_postal_code(parts[0]):
        return None
    postal_code = parts[0]

    # Find the voivodeship, scanning from the end of the line.
    voivodeship = ""
    voiv_idx = -1
    for i in range(len(parts) - 1, -1, -1):
        # Try the two-token form first: "pomorskie" is a voivodeship in its
        # own right, so checking single tokens first would mis-read a split
        # "kujawsko pomorskie" as "pomorskie" with powiat "kujawsko".
        if i > 0:
            compound = parts[i - 1] + "-" + parts[i]
            if compound in _VOIVODESHIPS:
                voivodeship, voiv_idx = compound, i - 1
                break
        if parts[i] in _VOIVODESHIPS:
            voivodeship, voiv_idx = parts[i], i
            break

    # Need at least postal code, locality/gmina token and powiat before it.
    if not voivodeship or voiv_idx < 3:
        return None

    # Powiat is the token just before the voivodeship.
    powiat_idx = voiv_idx - 1
    powiat = parts[powiat_idx]

    # Everything between postal code and powiat (never empty: voiv_idx >= 3).
    remaining_parts = parts[1:powiat_idx]

    # Gmina is the last token, or the last two when they form a known name.
    gmina_word_count = 1
    if (
        len(remaining_parts) >= 2
        and " ".join(remaining_parts[-2:]) in _COMPOUND_GMINAS
    ):
        gmina_word_count = 2
    gmina = " ".join(remaining_parts[-gmina_word_count:])

    # Everything before the gmina is address info; a locality is mandatory.
    address_parts = remaining_parts[:-gmina_word_count]
    if not address_parts:
        return None

    locality = address_parts[0]
    street, numbers = _split_street_and_numbers(address_parts[1:])

    return {
        "PNA": postal_code,
        "Miejscowość": locality,
        "Ulica": street,
        "Numery": numbers,
        "Gmina": gmina,
        "Powiat": powiat,
        "Województwo": voivodeship,
    }


# Confirm that the parser definition in this cell was executed.
print("Parser function defined")

Parser function defined


In [4]:
# Run the parser over a handful of tricky sample lines and print each outcome
test_lines = [
    "05-192 Aleksandria 4-6, 7-9(n), 12-15 Nowe Miasto płoński mazowieckie",
    "09-120 Aleksandria 8-10(p), 11, 31 Nowe Miasto płoński mazowieckie",
    "83-440 Abisynia Karsin kościerski pomorskie",
    "20-388 Abramowice Kościelne Głusk lubelski lubelskie",
]

print("Testing parser on sample lines:")
print()

for position, sample in enumerate(test_lines, start=1):
    print(f"{position}. Input: {sample}")
    result = parse_postal_record(sample)
    if not result:
        print("   ✗ Failed to parse")
    else:
        # One pipe-separated row per parsed record, padded for alignment
        print(
            f"   ✓ {result['PNA']} | {result['Miejscowość']:12} | {result['Ulica']:15} | {result['Numery']:20} | {result['Gmina']:15} | {result['Powiat']:12} | {result['Województwo']}"
        )
    print()

Testing parser on sample lines:

1. Input: 05-192 Aleksandria 4-6, 7-9(n), 12-15 Nowe Miasto płoński mazowieckie
   ✓ 05-192 | Aleksandria  |                 | 4-6, 7-9(n), 12-15   | Nowe Miasto     | płoński      | mazowieckie

2. Input: 09-120 Aleksandria 8-10(p), 11, 31 Nowe Miasto płoński mazowieckie
   ✓ 09-120 | Aleksandria  |                 | 8-10(p), 11, 31      | Nowe Miasto     | płoński      | mazowieckie

3. Input: 83-440 Abisynia Karsin kościerski pomorskie
   ✓ 83-440 | Abisynia     |                 |                      | Karsin          | kościerski   | pomorskie

4. Input: 20-388 Abramowice Kościelne Głusk lubelski lubelskie
   ✓ 20-388 | Abramowice   | Kościelne       |                      | Głusk           | lubelski     | lubelskie



In [5]:
def parse_pdf_to_records(
    pdf_path: str, start_page: int = 1, end_page: Optional[int] = None
) -> tuple:
    """Extract postal records from a range of pages in a PDF.

    Each page's text is split into lines and every line is handed to
    ``parse_postal_record``. Progress is printed per page. A page that raises
    during extraction or parsing is reported and skipped so that one bad page
    does not abort the whole run.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        start_page: First page to process, 1-based.
        end_page: Last page to process, 1-based and inclusive. ``None`` or a
            value beyond the end of the document means "through the last page".

    Returns:
        A ``(records, unparsed_lines)`` tuple. ``records`` is the list of
        dicts returned by ``parse_postal_record``. ``unparsed_lines`` is a
        list of ``{"page", "line", "content"}`` dicts, one for every line that
        starts with a postal code but could not be parsed.

    Raises:
        ValueError: If ``start_page`` is less than 1.
    """
    if start_page < 1:
        # A zero or negative start would become a negative page index below
        # and silently read pages from the end of the document.
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    results = []
    unparsed_lines = []

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)

        # Clamp so the progress message reports the pages actually visited.
        last_page = total_pages if end_page is None else min(end_page, total_pages)

        print(f"Processing pages {start_page} to {last_page} of {total_pages}")

        for page_num in range(start_page - 1, last_page):
            page = pdf.pages[page_num]
            current_page = page_num + 1

            print(f"Page {current_page}...", end=" ")

            try:
                text = page.extract_text()
                if not text:
                    print("(no text)")
                    continue

                page_records = 0

                for line_num, line in enumerate(text.split("\n")):
                    parsed = parse_postal_record(line)

                    if parsed:
                        results.append(parsed)
                        page_records += 1
                        continue

                    # Anything that starts with a postal code was meant to be
                    # a data row. Report it regardless of its length so that
                    # truncated or wrapped rows are not dropped silently.
                    line_clean = clean_text(line)
                    tokens = line_clean.split()
                    if tokens and is_postal_code(tokens[0]):
                        unparsed_lines.append(
                            {
                                "page": current_page,
                                "line": line_num + 1,
                                "content": line_clean,
                            }
                        )

                print(f"({page_records} records)")

            except Exception as e:
                # Deliberate best effort: report the page and carry on.
                print(f"Error: {e}")
                continue

    return results, unparsed_lines


# Confirm that the PDF parsing function in this cell was defined.
print("PDF parsing function defined")

PDF parsing function defined


In [6]:
# Parse the sample PDF
# Only pages 1-5 are processed here as a quick check of the parser.
pdf_path = "pages_3_to_22.pdf"

print("Parsing PDF...")
# `records` and `unparsed` are notebook-level names read by the following cells.
records, unparsed = parse_pdf_to_records(pdf_path, start_page=1, end_page=5)

# Summarise how many lines parsed and how many postal-code lines were rejected.
print(f"\n✓ Successfully parsed: {len(records)} records")
print(f"✗ Could not parse: {len(unparsed)} lines")

Parsing PDF...
Processing pages 1 to 5 of 20
Page 1... (73 records)
Page 2... (79 records)
Page 3... (80 records)
Page 4... (80 records)
Page 5... (80 records)

✓ Successfully parsed: 392 records
✗ Could not parse: 0 lines


In [7]:
# Preview the parsed records and any lines that were rejected
if records:
    print("\n=== FIRST 15 PARSED RECORDS ===")
    for rank, rec in enumerate(records[:15], start=1):
        print(
            f"{rank:2d}. {rec['PNA']} | {rec['Miejscowość']:15} | {rec['Ulica']:20} | {rec['Numery']:20} | {rec['Gmina']:15} | {rec['Powiat']:12} | {rec['Województwo']}"
        )

    # Pull out the Aleksandria rows to check the compound gmina handling
    aleksandria_records = list(
        filter(lambda rec: "Aleksandria" in rec.get("Miejscowość", ""), records)
    )

    if aleksandria_records:
        print("\n🎯 ALEKSANDRIA RECORDS (should show 'Nowe Miasto' as gmina):")
        for rank, rec in enumerate(aleksandria_records, start=1):
            print(
                f"{rank}. {rec['PNA']} | {rec['Miejscowość']} | '{rec['Ulica']}' | '{rec['Numery']}' | {rec['Gmina']} | {rec['Powiat']} | {rec['Województwo']}"
            )

if unparsed:
    print("\n=== UNPARSED LINES (first 5) ===")
    for entry in unparsed[:5]:
        # Show at most the first 80 characters of each rejected line
        print(f"Page {entry['page']}: {entry['content'][:80]}...")


=== FIRST 15 PARSED RECORDS ===
 1. 83-440 | Abisynia        |                      |                      | Karsin          | kościerski   | pomorskie
 2. 20-388 | Abramowice      | Kościelne            |                      | Głusk           | lubelski     | lubelskie
 3. 20-388 | Abramowice      | Prywatne             |                      | Głusk           | lubelski     | lubelskie
 4. 23-450 | Abramów         |                      |                      | Goraj           | biłgorajski  | lubelskie
 5. 21-143 | Abramów         |                      |                      | Abramów         | lubartowski  | lubelskie
 6. 05-310 | Abramy          |                      |                      | Kałuszyn        | miński       | mazowieckie
 7. 16-123 | Achrymowce      |                      |                      | Kuźnica         | sokólski     | podlaskie
 8. 27-640 | Adamczowice     |                      |                      | Klimontów       | sandomierski | świętokrzyskie


In [8]:
# Export to CSV
def export_to_csv(
    records: List[Dict[str, str]], filename: str = "polish_postal_codes.csv"
):
    """Write ``records`` to ``filename`` as a UTF-8 CSV and print statistics.

    Columns are written in ``COLUMNS`` order. A column missing from the
    records is created and filled with empty strings; columns not listed in
    ``COLUMNS`` are dropped. Returns the DataFrame that was written, or
    ``None`` (after printing a notice) when ``records`` is empty.
    """
    if not records:
        print("No records to export")
        return None

    frame = pd.DataFrame(records)

    # Add whichever expected columns the records did not supply
    absent = [name for name in COLUMNS if name not in frame.columns]
    for name in absent:
        frame[name] = ""

    # Keep only the expected columns, in canonical order
    frame = frame[COLUMNS]

    frame.to_csv(filename, index=False, encoding="utf-8")

    print(f"\n✓ Exported {len(records)} records to {filename}")

    # Summary of what was written
    print(f"\n=== STATISTICS ===")
    print(f"Total records: {len(frame)}")
    print(f"Unique postal codes: {frame['PNA'].nunique()}")
    print(f"Unique voivodeships: {frame['Województwo'].nunique()}")

    print(f"\nRecords per voivodeship:")
    per_voivodeship = frame["Województwo"].value_counts()
    for voiv, count in per_voivodeship.items():
        print(f"  {voiv}: {count}")

    return frame


# Write the sample records to disk, then preview the first exported rows
if not records:
    print("No records to export")
else:
    df = export_to_csv(records, "postal_codes_sample.csv")

    print(f"\n=== SAMPLE OF EXPORTED DATA ===")
    sample_text = df.head(10).to_string(index=False)
    print(sample_text)


✓ Exported 392 records to postal_codes_sample.csv

=== STATISTICS ===
Total records: 392
Unique postal codes: 307
Unique voivodeships: 16

Records per voivodeship:
  mazowieckie: 104
  łódzkie: 73
  lubelskie: 58
  wielkopolskie: 46
  kujawsko-pomorskie: 28
  podlaskie: 17
  świętokrzyskie: 17
  śląskie: 12
  warmińsko-mazurskie: 11
  pomorskie: 7
  małopolskie: 7
  podkarpackie: 5
  zachodniopomorskie: 3
  opolskie: 2
  dolnośląskie: 1
  lubuskie: 1

=== SAMPLE OF EXPORTED DATA ===
   PNA Miejscowość        Ulica Numery     Gmina       Powiat    Województwo
83-440    Abisynia                        Karsin   kościerski      pomorskie
20-388  Abramowice    Kościelne            Głusk     lubelski      lubelskie
20-388  Abramowice     Prywatne            Głusk     lubelski      lubelskie
23-450     Abramów                         Goraj  biłgorajski      lubelskie
21-143     Abramów                       Abramów  lubartowski      lubelskie
05-310      Abramy                      Kałuszyn 

In [9]:
# Function to process the full PDF when ready
def process_full_pdf(
    pdf_path: str,
    output_file: str = "complete_postal_codes.csv",
    confirm: bool = True,
    unparsed_file: str = "unparsed_lines.csv",
):
    """Parse every page of ``pdf_path`` and export the results.

    Args:
        pdf_path: Path of the PDF to process.
        output_file: CSV file that receives the parsed records.
        confirm: When True (the default) ask for a "y" on stdin before
            starting. Pass False for unattended runs such as
            "Restart & Run All", where an ``input()`` prompt would block.
        unparsed_file: CSV file that receives the lines which could not be
            parsed, for manual review.

    Returns:
        ``(dataframe, unparsed_lines)``. ``dataframe`` is ``None`` when no
        record was parsed. If the user declines the confirmation prompt the
        function returns a bare ``None``; that behaviour is kept for backward
        compatibility, so check for it before unpacking.
    """
    print("=== PROCESSING FULL PDF ===")
    print("This will process the entire PDF...")

    if confirm:
        response = input("Continue? (y/n): ").strip().lower()
        if response != "y":
            print("Cancelled.")
            return None

    all_records, all_unparsed = parse_pdf_to_records(pdf_path)

    print(f"\n=== FINAL RESULTS ===")
    print(f"Total records: {len(all_records)}")
    print(f"Unparsed lines: {len(all_unparsed)}")

    final_df = export_to_csv(all_records, output_file) if all_records else None

    # Save unparsed lines for review. This is done even when nothing parsed,
    # because a total failure is when these diagnostics are needed most.
    if all_unparsed:
        unparsed_df = pd.DataFrame(all_unparsed)
        unparsed_df.to_csv(unparsed_file, index=False)
        print(f"Saved unparsed lines to {unparsed_file}")

    return final_df, all_unparsed


# Only prints the command for a full run; nothing is processed in this cell.
print("\nTo process the full PDF, run:")
print("process_full_pdf('oficjalny_spis_pna_2025.pdf')")
print("\nReady to process!")


To process the full PDF, run:
process_full_pdf('oficjalny_spis_pna_2025.pdf')

Ready to process!
