In [None]:
# CLEAN VERSION - Enhanced parser with multiline support
def enhanced_parse_pdf_clean(pdf_path: str, start_page: int = 1, end_page: int = None,
                            max_unparsed: int = 10) -> tuple:
    """
    Parse postal-code rows from a range of PDF pages.

    The text of each page is split into lines. Every non-empty line that is
    not a header/footer is passed to the single-line parser
    (``fixed_advanced_parse_row`` when it is defined in the module globals,
    ``improved_parse_row`` otherwise). When that fails and
    ``handle_multiline_records`` is defined, the multiline parser is tried
    starting at the same line.

    Args:
        pdf_path: Path of the PDF, opened with ``pdfplumber``.
        start_page: First page to process (1-based). Values below 1 are
            treated as 1.
        end_page: Last page to process (1-based, inclusive). ``None`` means
            the last page of the document.
        max_unparsed: Maximum number of unparsed lines to collect.

    Returns:
        ``(parsed_records, unparsed_lines)``. ``unparsed_lines`` contains
        dicts with the keys ``page``, ``line_number`` and ``content`` for
        lines that start with a postal code but could not be parsed. They
        are only collected when ``handle_multiline_records`` is available.
    """
    results = []
    unparsed_lines = []

    # Lines containing any of these markers are headers/footers, not data.
    skip_keywords = ('Poczta Polska', 'Strona', 'Copyright', 'PNA Miejscowość Ulica', 'Część 1')

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)

        if end_page is None:
            end_page = total_pages

        print(f"Processing pages {start_page} to {end_page} of {total_pages}")

        # Clamp the lower bound: a start_page below 1 would otherwise produce
        # a negative index and read pages from the end of the document.
        first_index = max(start_page - 1, 0)

        for page_num in range(first_index, min(end_page, total_pages)):
            page = pdf.pages[page_num]
            current_page = page_num + 1

            print(f"Processing page {current_page}...", end=' ')

            # Best effort per page: a failure is reported and the next page
            # is still processed.
            try:
                text = page.extract_text()
                if not text:
                    print("(no text)")
                    continue

                # Split on real newlines (the previous '\\n' literal matched
                # a backslash followed by "n" and never split anything).
                lines = text.split('\n')
                i = 0
                page_records = 0

                while i < len(lines):
                    line = clean_text(lines[i])

                    if not line:
                        i += 1
                        continue

                    # Skip header/footer lines
                    if any(keyword in line for keyword in skip_keywords):
                        i += 1
                        continue

                    # Prefer the fixed advanced parser when it is defined
                    if 'fixed_advanced_parse_row' in globals():
                        parsed = fixed_advanced_parse_row(line)
                    else:
                        parsed = improved_parse_row(line)

                    if parsed:
                        results.append(parsed)
                        page_records += 1
                        i += 1
                        continue

                    if 'handle_multiline_records' not in globals():
                        # No multiline function available, just skip
                        i += 1
                        continue

                    # Try multiline parsing
                    parsed_multi, lines_consumed = handle_multiline_records(lines, i)

                    if parsed_multi:
                        results.append(parsed_multi)
                        page_records += 1
                        # Always advance by at least one line; a reported
                        # count of 0 would otherwise loop forever.
                        i += max(lines_consumed, 1)
                        continue

                    # This line couldn't be parsed; remember it only when it
                    # looks like a data row (starts with a postal code).
                    parts = line.split()
                    if parts and is_postal_code(parts[0]) and len(unparsed_lines) < max_unparsed:
                        unparsed_lines.append({
                            'page': current_page,
                            'line_number': i + 1,
                            'content': line
                        })
                    i += 1

                print(f"({page_records} records)")

            except Exception as e:
                print(f"Error on page {current_page}: {str(e)}")
                continue

    return results, unparsed_lines

# Test the clean enhanced parser on the first five pages and print a summary.
print("Testing CLEAN enhanced parser...")
try:
    data_clean, unparsed_clean = enhanced_parse_pdf_clean(pdf_path, start_page=1, end_page=5)

    print(f"\n✓ Successfully parsed: {len(data_clean)} records")
    print(f"✗ Could not parse: {len(unparsed_clean)} lines")

    if data_clean:
        print("\n=== SAMPLE PARSED RECORDS ===")
        for i, record in enumerate(data_clean[:10]):
            print(f"{i+1:2d}. {record['PNA']} | {record['Miejscowość'][:15]:15} | {record['Ulica'][:15]:15} | {record['Gmina'][:12]:12} | {record['Powiat'][:12]:12} | {record['Województwo']}")

        # Look for Aleksandria records specifically
        aleksandria_records = [r for r in data_clean if 'Aleksandria' in r.get('Miejscowość', '')]
        if aleksandria_records:
            print("\n=== ALEKSANDRIA RECORDS ===")
            for i, record in enumerate(aleksandria_records):
                print(f"{i+1}. {record['PNA']} | {record['Miejscowość']} | '{record['Ulica']}' | '{record['Numery']}' | {record['Gmina']} | {record['Powiat']}")

    if unparsed_clean:
        print("\n=== UNPARSED LINES ===")
        for item in unparsed_clean[:5]:  # Show first 5
            print(f"Page {item['page']}: {item['content'][:80]}...")

except Exception as e:
    # Report the failure with a full traceback instead of aborting the run.
    print(f"Error running parser: {e}")
    import traceback
    traceback.print_exc()

In [None]:
# Re-run the full parser with the fixed version
def final_enhanced_parse_pdf(pdf_path: str, start_page: int = 1, end_page: int = None,
                           max_unparsed: int = 10) -> tuple:
    """
    Parse postal-code rows from a range of PDF pages with the fixed parser.

    Each non-empty, non-header line is parsed with
    ``fixed_advanced_parse_row``. When that fails,
    ``handle_multiline_records`` is tried from the same line; on success the
    consumed lines are joined and re-parsed with the fixed parser, falling
    back to the multiline result if the re-parse fails.

    Args:
        pdf_path: Path of the PDF, opened with ``pdfplumber``.
        start_page: First page to process (1-based). Values below 1 are
            treated as 1.
        end_page: Last page to process (1-based, inclusive). ``None`` means
            the last page of the document.
        max_unparsed: Maximum number of unparsed lines to collect.

    Returns:
        ``(parsed_records, unparsed_lines)``. ``unparsed_lines`` contains
        dicts with the keys ``page``, ``line_number`` and ``content`` for
        lines that start with a postal code but could not be parsed.
    """
    results = []
    unparsed_lines = []

    # Lines containing any of these markers are headers/footers, not data.
    skip_keywords = ('Poczta Polska', 'Strona', 'Copyright', 'PNA Miejscowość Ulica', 'Część 1')

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)

        if end_page is None:
            end_page = total_pages

        print(f"Processing pages {start_page} to {end_page} of {total_pages} (using FIXED parser)")

        # Clamp the lower bound: a start_page below 1 would otherwise produce
        # a negative index and read pages from the end of the document.
        first_index = max(start_page - 1, 0)

        for page_num in range(first_index, min(end_page, total_pages)):
            page = pdf.pages[page_num]
            current_page = page_num + 1

            print(f"Processing page {current_page}...", end=' ')

            # Best effort per page: a failure is reported and the next page
            # is still processed.
            try:
                text = page.extract_text()
                if not text:
                    print("(no text)")
                    continue

                lines = text.split('\n')
                i = 0
                page_records = 0

                while i < len(lines):
                    line = clean_text(lines[i])

                    if not line:
                        i += 1
                        continue

                    # Skip header/footer lines
                    if any(keyword in line for keyword in skip_keywords):
                        i += 1
                        continue

                    # Try the fixed advanced parsing first
                    parsed = fixed_advanced_parse_row(line)

                    if parsed:
                        results.append(parsed)
                        page_records += 1
                        i += 1
                        continue

                    # Try multiline parsing as fallback
                    parsed_multi, lines_consumed = handle_multiline_records(lines, i)

                    if parsed_multi:
                        # Re-process the multiline result with fixed parser.
                        # Slicing cannot run past the end of the page, unlike
                        # indexing lines[i + j] for each consumed line.
                        combined_line = ' '.join(
                            raw.strip() for raw in lines[i:i + lines_consumed]
                        )
                        reparsed = fixed_advanced_parse_row(combined_line)
                        results.append(reparsed if reparsed else parsed_multi)
                        page_records += 1
                        # Always advance by at least one line; a reported
                        # count of 0 would otherwise loop forever.
                        i += max(lines_consumed, 1)
                        continue

                    # This line couldn't be parsed; remember it only when it
                    # looks like a data row (starts with a postal code).
                    parts = line.split()
                    if parts and is_postal_code(parts[0]) and len(unparsed_lines) < max_unparsed:
                        unparsed_lines.append({
                            'page': current_page,
                            'line_number': i + 1,
                            'content': line
                        })
                    i += 1

                print(f"({page_records} records)")

            except Exception as e:
                print(f"Error on page {current_page}: {str(e)}")
                continue

    return results, unparsed_lines

# Run the final parser on the first five pages, print a summary and export.
print("Running FINAL enhanced parser...")
final_data, final_unparsed = final_enhanced_parse_pdf(pdf_path, start_page=1, end_page=5)

print(f"\n✓ Successfully parsed: {len(final_data)} records")
print(f"✗ Could not parse: {len(final_unparsed)} lines")

if final_data:
    print("\n=== FINAL PARSING RESULTS (first 15 records) ===")
    for i, record in enumerate(final_data[:15]):
        print(f"{i+1:2d}. {record['PNA']} | {record['Miejscowość']:15} | {record['Ulica']:20} | {record['Numery']:20} | {record['Gmina']:15} | {record['Powiat']:12} | {record['Województwo']}")

    # Look for the specific Aleksandria records
    aleksandria_fixed = [r for r in final_data if 'Aleksandria' in r.get('Miejscowość', '') and 'Nowe Miasto' in r.get('Gmina', '')]

    if aleksandria_fixed:
        print("\n🎉 ALEKSANDRIA RECORDS - FIXED:")
        for i, record in enumerate(aleksandria_fixed):
            print(f"{i+1}. {record['PNA']} | {record['Miejscowość']} | '{record['Ulica']}' | '{record['Numery']}' | {record['Gmina']} | {record['Powiat']} | {record['Województwo']}")

# Export the final results
if final_data:
    print("\nExporting FINAL results...")
    final_df = export_to_csv(final_data, "postal_codes_FINAL.csv")

if final_unparsed:
    print("\n=== FINAL UNPARSED LINES (still need help) ===")
    for item in final_unparsed:
        print(f"Page {item['page']}: {item['content'][:80]}...")

In [None]:
# Fixed advanced parser for compound names and complex addresses
def fixed_advanced_parse_row(line: str) -> Optional[Dict[str, str]]:
    """
    Parse one text line of the postal-code listing into a record.

    The line is split on whitespace and read as
    ``<postal code> <locality> [street/numbers ...] <gmina> <powiat> <voivodeship>``.
    The voivodeship is located by scanning from the end; the token before it
    is the powiat; the one or two tokens before that are the gmina (two when
    they form one of the known compound names); the first remaining token is
    the locality and the rest is split into street and numbers.

    Args:
        line: Raw text line; it is normalised with ``clean_text`` first.

    Returns:
        A dict with the keys ``PNA``, ``Miejscowość``, ``Ulica``, ``Numery``,
        ``Gmina``, ``Powiat`` and ``Województwo``, or ``None`` when the line
        is empty, is a header/footer, does not start with a postal code, has
        no recognised voivodeship, or has too few tokens.
    """
    line = clean_text(line)

    if not line:
        return None

    # Skip header/footer lines
    skip_patterns = (
        'Poczta Polska', 'Oficjalny Spis', 'Strona', 'Copyright',
        'PNA Miejscowość', 'Część 1', 'miejscowości i ulic'
    )
    if any(pattern in line for pattern in skip_patterns):
        return None

    parts = line.split()

    if len(parts) < 4:
        return None

    # First part must be postal code
    if not is_postal_code(parts[0]):
        return None

    postal_code = parts[0]

    # Known voivodeships
    voivodeships = {
        'mazowieckie', 'śląskie', 'wielkopolskie', 'małopolskie', 'lubelskie',
        'podkarpackie', 'dolnośląskie', 'kujawsko-pomorskie', 'pomorskie',
        'łódzkie', 'zachodniopomorskie', 'lubuskie', 'podlaskie', 'świętokrzyskie',
        'opolskie', 'warmińsko-mazurskie'
    }

    # Find voivodeship from the end
    voivodeship = ""
    voiv_idx = -1

    for idx in range(len(parts) - 1, -1, -1):
        # Check the two-token compound form first. Otherwise a split
        # "kujawsko pomorskie" is taken as "pomorskie", because its second
        # half is itself a valid voivodeship name.
        if idx > 0:
            compound = parts[idx - 1] + '-' + parts[idx]
            if compound in voivodeships:
                voivodeship = compound
                voiv_idx = idx - 1
                break
        if parts[idx] in voivodeships:
            voivodeship = parts[idx]
            voiv_idx = idx
            break

    # Postal code, locality and gmina and powiat must fit before the voivodeship.
    if not voivodeship or voiv_idx < 3:
        return None

    # Powiat is just before voivodeship (voiv_idx >= 3, so this index is >= 2)
    powiat_idx = voiv_idx - 1
    powiat = parts[powiat_idx]

    # Everything between postal code and powiat; never empty at this point.
    remaining_parts = parts[1:powiat_idx]

    # Common compound municipality names that we should keep together
    compound_gminas = (
        'Nowe Miasto', 'Stare Miasto', 'Biała Rawska', 'Góra Kalwaria',
        'Nowa Dęba', 'Stary Sącz', 'Nowy Dwór', 'Biała Podlaska',
        'Pruszcz Gdański', 'Nowy Tomyśl', 'Stary Dzierzgoń'
    )

    # Gmina is the last word, or the last two when they form a known compound
    gmina = remaining_parts[-1]
    gmina_word_count = 1
    if len(remaining_parts) >= 2:
        potential_compound = remaining_parts[-2] + ' ' + remaining_parts[-1]
        if potential_compound in compound_gminas:
            gmina = potential_compound
            gmina_word_count = 2

    # Everything before gmina is address info (locality, street, numbers)
    address_parts = remaining_parts[:-gmina_word_count]

    if len(address_parts) < 1:
        return None

    # First address part is always locality
    locality = address_parts[0]

    # Remaining parts are street and/or numbers. Walk backwards: trailing
    # non-numeric tokens form the street; from the last numeric-looking token
    # towards the front, every token is kept with the numbers.
    # NOTE(review): this puts street words that come BEFORE the numbers into
    # 'Numery' as well (kept as in the original "conservative" strategy) -
    # confirm that this is the desired output.
    street_reversed = []
    numbers_reversed = []
    collecting_numbers = False

    for part in reversed(address_parts[1:]):
        # Check if this looks like a number, range, or number modifier
        looks_numeric = (
            re.search(r'^\d|\d$|^\d+[-,]|\(.*\)|^DK$', part) is not None
            or part in (',', '-', 'n', 'p')
        )
        if looks_numeric:
            collecting_numbers = True
        if collecting_numbers:
            numbers_reversed.append(part)
        else:
            street_reversed.append(part)

    street = ' '.join(reversed(street_reversed))
    numbers = ' '.join(reversed(numbers_reversed))

    return {
        'PNA': postal_code,
        'Miejscowość': locality,
        'Ulica': street,
        'Numery': numbers,
        'Gmina': gmina,
        'Powiat': powiat,
        'Województwo': voivodeship
    }

# Test the fixed parser on sample rows, including the compound gmina "Nowe Miasto".
test_lines = [
    "05-192 Aleksandria 4-6, 7-9(n), 12-15 Nowe Miasto płoński mazowieckie",
    "09-120 Aleksandria 8-10(p), 11, 31 Nowe Miasto płoński mazowieckie",
    "83-440 Abisynia Karsin kościerski pomorskie",
    "20-388 Abramowice Kościelne Głusk lubelski lubelskie"
]

print("Testing FIXED advanced parser:")
for i, line in enumerate(test_lines):
    print(f"\n{i+1}. Input: {line}")
    result = fixed_advanced_parse_row(line)
    if result:
        print(f"   ✓ {result['PNA']} | {result['Miejscowość']:12} | {result['Ulica']:15} | {result['Numery']:20} | {result['Gmina']:15} | {result['Powiat']:12} | {result['Województwo']}")
    else:
        print("   ✗ Failed to parse")

In [None]:
# Update the main parsing function to use the advanced parser
def enhanced_parse_pdf_v2(pdf_path: str, start_page: int = 1, end_page: int = None,
                         max_unparsed: int = 10) -> tuple:
    """
    Parse postal-code rows from a range of PDF pages with ``advanced_parse_row``.

    Each non-empty, non-header line is parsed with ``advanced_parse_row``;
    when that fails, ``handle_multiline_records`` is tried from the same
    line.

    Args:
        pdf_path: Path of the PDF, opened with ``pdfplumber``.
        start_page: First page to process (1-based). Values below 1 are
            treated as 1.
        end_page: Last page to process (1-based, inclusive). ``None`` means
            the last page of the document.
        max_unparsed: Maximum number of unparsed lines to collect.

    Returns:
        ``(parsed_records, unparsed_lines)``. ``unparsed_lines`` contains
        dicts with the keys ``page``, ``line_number`` and ``content`` for
        lines that start with a postal code but could not be parsed.
    """
    results = []
    unparsed_lines = []

    # Lines containing any of these markers are headers/footers, not data.
    skip_keywords = ('Poczta Polska', 'Strona', 'Copyright', 'PNA Miejscowość Ulica', 'Część 1')

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)

        if end_page is None:
            end_page = total_pages

        print(f"Processing pages {start_page} to {end_page} of {total_pages} (using advanced parser v2)")

        # Clamp the lower bound: a start_page below 1 would otherwise produce
        # a negative index and read pages from the end of the document.
        first_index = max(start_page - 1, 0)

        for page_num in range(first_index, min(end_page, total_pages)):
            page = pdf.pages[page_num]
            current_page = page_num + 1

            print(f"Processing page {current_page}...", end=' ')

            # Best effort per page: a failure is reported and the next page
            # is still processed.
            try:
                text = page.extract_text()
                if not text:
                    print("(no text)")
                    continue

                lines = text.split('\n')
                i = 0
                page_records = 0

                while i < len(lines):
                    line = clean_text(lines[i])

                    if not line:
                        i += 1
                        continue

                    # Skip header/footer lines
                    if any(keyword in line for keyword in skip_keywords):
                        i += 1
                        continue

                    # Try advanced parsing first
                    parsed = advanced_parse_row(line)

                    if parsed:
                        results.append(parsed)
                        page_records += 1
                        i += 1
                        continue

                    # Try multiline parsing
                    parsed_multi, lines_consumed = handle_multiline_records(lines, i)

                    if parsed_multi:
                        results.append(parsed_multi)
                        page_records += 1
                        # Always advance by at least one line; a reported
                        # count of 0 would otherwise loop forever.
                        i += max(lines_consumed, 1)
                        continue

                    # This line couldn't be parsed; remember it only when it
                    # looks like a data row (starts with a postal code).
                    parts = line.split()
                    if parts and is_postal_code(parts[0]) and len(unparsed_lines) < max_unparsed:
                        unparsed_lines.append({
                            'page': current_page,
                            'line_number': i + 1,
                            'content': line
                        })
                    i += 1

                print(f"({page_records} records)")

            except Exception as e:
                print(f"Error on page {current_page}: {str(e)}")
                continue

    return results, unparsed_lines

# Re-run the parser with the improved version
print("Re-running with advanced parser v2...")
data_v2, unparsed_v2 = enhanced_parse_pdf_v2(pdf_path, start_page=1, end_page=5)

print(f"\n✓ Successfully parsed: {len(data_v2)} records")
print(f"✗ Could not parse: {len(unparsed_v2)} lines")

if data_v2:
    print("\n=== IMPROVED PARSING RESULTS ===")

    # Look specifically for the Aleksandria records
    aleksandria_records = [r for r in data_v2 if r['Miejscowość'] == 'Aleksandria' and r['Gmina'].startswith('Nowe')]

    if aleksandria_records:
        print("\nAleksandria records (fixed):")
        for i, record in enumerate(aleksandria_records):
            print(f"{i+1}. {record['PNA']} | {record['Miejscowość']:15} | {record['Ulica']:20} | {record['Numery']:20} | {record['Gmina']:15} | {record['Powiat']:12} | {record['Województwo']}")

    print(f"\nFirst 10 records overall:")
    for i, record in enumerate(data_v2[:10]):
        print(f"{i+1:2d}. {record['PNA']} | {record['Miejscowość'][:15]:15} | {record['Ulica'][:15]:15} | {record['Numery'][:15]:15} | {record['Gmina'][:12]:12} | {record['Powiat'][:12]:12} | {record['Województwo']}")

# Compare improvements against the earlier run (`data` / `unparsed`)
if len(data_v2) > len(data):
    print(f"\n🎉 Improvement: {len(data_v2) - len(data)} more records parsed successfully!")
elif len(unparsed_v2) < len(unparsed):
    print(f"\n🎉 Improvement: {len(unparsed) - len(unparsed_v2)} fewer unparsed lines!")

# Export the improved results
if data_v2:
    print("\nExporting improved results...")
    df_v2 = export_to_csv(data_v2, "postal_codes_sample_v2.csv")

if unparsed_v2:
    print("\n=== REMAINING UNPARSED LINES ===")
    for item in unparsed_v2:
        print(f"Page {item['page']}: {item['content'][:100]}...")

SyntaxError: unexpected character after line continuation character (1991665567.py, line 4)

In [None]:
# Improved parser to handle complex address ranges
def advanced_parse_row(line: str) -> Optional[Dict[str, str]]:
    """
    Parse one text line of the postal-code listing into a record.

    The line is split on whitespace and read as
    ``<postal code> <locality> [street ...] [numbers ...] <gmina> <powiat> <voivodeship>``.
    The voivodeship is located by scanning from the end; the token before it
    is the powiat; the one or two tokens before that are the gmina (two when
    they form one of the known compound names). Of the remaining tokens the
    first is the locality, the tokens up to the first number-like token are
    the street, and everything from that token on is the numbers field.

    Args:
        line: Raw text line; it is normalised with ``clean_text`` first.

    Returns:
        A dict with the keys ``PNA``, ``Miejscowość``, ``Ulica``, ``Numery``,
        ``Gmina``, ``Powiat`` and ``Województwo``, or ``None`` when the line
        is empty, is a header/footer, does not start with a postal code, has
        no recognised voivodeship, or has too few tokens.
    """
    line = clean_text(line)

    if not line:
        return None

    # Skip obvious non-data lines
    skip_patterns = (
        'Poczta Polska', 'Oficjalny Spis', 'Strona', 'Copyright',
        'PNA Miejscowość', 'Część 1', 'miejscowości i ulic'
    )
    if any(pattern in line for pattern in skip_patterns):
        return None

    parts = line.split()

    if len(parts) < 4:
        return None

    # First part must be postal code
    if not is_postal_code(parts[0]):
        return None

    postal_code = parts[0]

    # Known voivodeships (including compound ones)
    voivodeships = {
        'mazowieckie', 'śląskie', 'wielkopolskie', 'małopolskie', 'lubelskie',
        'podkarpackie', 'dolnośląskie', 'kujawsko-pomorskie', 'pomorskie',
        'łódzkie', 'zachodniopomorskie', 'lubuskie', 'podlaskie', 'świętokrzyskie',
        'opolskie', 'warmińsko-mazurskie'
    }

    # Find voivodeship, checking from the end backwards
    voivodeship = ""
    voiv_idx = -1

    for idx in range(len(parts) - 1, -1, -1):
        # Check the two-token compound form first. Otherwise a split
        # "kujawsko pomorskie" is taken as "pomorskie", because its second
        # half is itself a valid voivodeship name.
        if idx > 0:
            compound = parts[idx - 1] + '-' + parts[idx]
            if compound in voivodeships:
                voivodeship = compound
                voiv_idx = idx - 1
                break
        if parts[idx] in voivodeships:
            voivodeship = parts[idx]
            voiv_idx = idx
            break

    if not voivodeship or voiv_idx < 3:
        return None

    # Everything between postal code and voivodeship
    remaining_parts = parts[1:voiv_idx]

    if len(remaining_parts) < 2:
        return None

    # Powiat is the last part before voivodeship
    powiat = remaining_parts[-1]
    address_parts = remaining_parts[:-1]  # Everything except powiat

    # Common compound municipality names
    compound_municipalities = (
        "Nowe Miasto", "Stare Miasto", "Biała Rawska", "Góra Kalwaria",
        "Nowa Dęba", "Stary Sącz", "Nowy Dwór", "Biała Podlaska"
    )

    # Gmina is the last word, or the last two when they form a known compound
    gmina = address_parts[-1]
    gmina_start_idx = len(address_parts) - 1
    if len(address_parts) >= 2:
        potential_2word = address_parts[-2] + " " + address_parts[-1]
        if potential_2word in compound_municipalities:
            gmina = potential_2word
            gmina_start_idx = len(address_parts) - 2

    # Everything before gmina is locality, street, and numbers
    location_parts = address_parts[:gmina_start_idx]

    if not location_parts:
        return None

    # First part is always locality
    locality = location_parts[0]

    # A token starts the numbers field when it contains a digit or a
    # parenthesis, or when it is exactly one of the standalone markers.
    # The markers must match the whole token: an unanchored test for the
    # letters "n"/"p" would flag ordinary street words that merely contain
    # those letters.
    digit_or_paren = re.compile(r'[0-9()]')
    standalone_markers = {'n', 'p', ',', '-', 'DK'}

    street_parts = []
    number_parts = []
    collecting_numbers = False

    for part in location_parts[1:]:
        if digit_or_paren.search(part) or part in standalone_markers:
            collecting_numbers = True
        # Once the numbers have started, every later token stays with them.
        if collecting_numbers:
            number_parts.append(part)
        else:
            street_parts.append(part)

    return {
        'PNA': postal_code,
        'Miejscowość': locality,
        'Ulica': ' '.join(street_parts),
        'Numery': ' '.join(number_parts),
        'Gmina': gmina,
        'Powiat': powiat,
        'Województwo': voivodeship
    }

# Test the advanced parser on the problematic records
test_lines = [
    "05-192 Aleksandria 4-6, 7-9(n), 12-15 Nowe Miasto płoński mazowieckie",
    "09-120 Aleksandria 8-10(p), 11, 31 Nowe Miasto płoński mazowieckie"
]

print("Testing advanced parser on problematic records:")
for i, line in enumerate(test_lines):
    print(f"\n{i+1}. Input: {line}")
    result = advanced_parse_row(line)
    if result:
        print(f"   ✓ PNA: {result['PNA']} | Miejscowość: {result['Miejscowość']} | Ulica: '{result['Ulica']}' | Numery: '{result['Numery']}' | Gmina: {result['Gmina']} | Powiat: {result['Powiat']} | Woj: {result['Województwo']}")
    else:
        print("   ✗ Failed to parse")

In [None]:
# Process the full PDF (when ready)
def process_full_pdf(pdf_path: str, output_file: str = "complete_postal_codes.csv",
                    batch_size: int = 50):
    """
    Process the complete PDF in batches to avoid memory issues.

    Asks for confirmation on stdin, then calls ``enhanced_parse_pdf`` for
    consecutive page ranges of ``batch_size`` pages, accumulating the parsed
    records and unparsed lines. Intermediate results are exported after every
    third batch and after the last one.

    Args:
        pdf_path: Path of the PDF to process.
        output_file: CSV file that receives the final records.
        batch_size: Number of pages handed to the parser per batch.

    Returns:
        ``None`` when the user does not confirm with ``y``;
        ``(final_df, all_unparsed)`` when at least one record was parsed,
        where ``final_df`` is the value returned by ``export_to_csv``;
        ``(None, all_unparsed)`` otherwise.
    """
    print("=== PROCESSING COMPLETE PDF ===")
    print("This will process the entire PDF (could take several minutes)...")

    response = input("Continue? (y/n): ").strip().lower()
    if response != 'y':
        print("Cancelled.")
        return

    all_records = []
    all_unparsed = []

    # The handle is needed only for the page count; the parser is given the
    # path, so the file is closed again before the batches run.
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
    print(f"Total pages to process: {total_pages}")

    # Number of batches between intermediate exports.
    save_every = 3

    # Process in batches
    batch_starts = range(1, total_pages + 1, batch_size)
    for batch_number, start_page in enumerate(batch_starts, start=1):
        end_page = min(start_page + batch_size - 1, total_pages)

        print(f"\nProcessing batch: pages {start_page}-{end_page}")

        batch_records, batch_unparsed = enhanced_parse_pdf(
            pdf_path, start_page, end_page, max_unparsed=50
        )

        all_records.extend(batch_records)
        all_unparsed.extend(batch_unparsed)

        print(f"Batch results: {len(batch_records)} parsed, {len(batch_unparsed)} unparsed")
        print(f"Total so far: {len(all_records)} records")

        # Save intermediate results every few batches. The trigger is the
        # batch counter: the record count is not a reliable one, because it
        # is only by chance a multiple of any fixed number.
        if batch_number % save_every == 0 or end_page == total_pages:
            temp_file = f"temp_postal_codes_{len(all_records)}_records.csv"
            export_to_csv(all_records, temp_file)
            print(f"Saved intermediate results to {temp_file}")

    # Final export
    print(f"\n=== FINAL RESULTS ===")
    print(f"Total records parsed: {len(all_records)}")
    print(f"Total unparsed lines: {len(all_unparsed)}")

    if all_records:
        final_df = export_to_csv(all_records, output_file)

        # Save unparsed lines for manual review
        if all_unparsed:
            unparsed_df = pd.DataFrame(all_unparsed)
            unparsed_file = "unparsed_lines.csv"
            unparsed_df.to_csv(unparsed_file, index=False)
            print(f"Saved {len(all_unparsed)} unparsed lines to {unparsed_file} for manual review")

        return final_df, all_unparsed

    return None, all_unparsed

print("Full PDF processing function ready.")
print("\nTo process the complete PDF, run:")
print("process_full_pdf('oficjalny_spis_pna_2025.pdf')")
print("\nTo process just the sample (pages 3-22), run:")
print("enhanced_parse_pdf('pages_3_to_22.pdf', start_page=1, end_page=20)")

In [None]:
# Export to CSV
def export_to_csv(records: List[Dict[str, str]], output_file: str = "polish_postal_codes.csv"):
    """
    Export parsed records to a CSV file and print summary statistics.

    Columns listed in ``COLUMNS`` that are missing from the records are added
    as empty strings, and the output is restricted to ``COLUMNS`` in that
    order.

    Args:
        records: Parsed records, one dict per row.
        output_file: Path of the CSV file to write (UTF-8, no index column).

    Returns:
        The exported ``DataFrame``, or ``None`` when ``records`` is empty.
    """
    if not records:
        print("No records to export")
        return

    df = pd.DataFrame(records)

    # Ensure all expected columns are present
    for col in COLUMNS:
        if col not in df.columns:
            df[col] = ""

    # Reorder columns to match expected structure
    df = df[COLUMNS]

    # Save to CSV
    df.to_csv(output_file, index=False, encoding='utf-8')

    print(f"✓ Exported {len(records)} records to {output_file}")
    print(f"Columns: {', '.join(COLUMNS)}")

    # Show some statistics
    print(f"\n=== EXPORT STATISTICS ===")
    print(f"Total records: {len(df)}")
    print(f"Unique postal codes: {df['PNA'].nunique()}")
    print(f"Unique voivodeships: {df['Województwo'].nunique()}")

    # Show voivodeship distribution
    voiv_counts = df['Województwo'].value_counts()
    print(f"\nRecords per voivodeship:")
    for voiv, count in voiv_counts.items():
        print(f"  {voiv}: {count}")

    return df

# Export current data to CSV
if data:
    print("Exporting parsed data to CSV...")
    df_export = export_to_csv(data, "postal_codes_sample.csv")

    print(f"\n=== SAMPLE OF EXPORTED DATA ===")
    print(df_export.head(10).to_string(index=False))

else:
    print("No data to export yet.")

In [None]:
# Interactive helper for unparsed records
def help_parse_record(line: str) -> Optional[Dict[str, str]]:
    """Best-effort fallback parser that prints each step of its reasoning.

    Looks for the first postal-code token anywhere in the line, then searches
    the remaining tokens from the right for a known voivodeship and infers
    powiat and gmina as the two tokens preceding it.

    Args:
        line: Raw text line that the automatic parsers rejected.

    Returns:
        A dict with the ``COLUMNS`` keys (fields that could not be inferred
        are left as empty strings), or ``None`` when no postal code is found
        or an unexpected error occurs.
    """
    print(f"\nHelp parse this line: {line}")
    parts = line.strip().split()
    print(f"Parts: {parts}")

    try:
        postal_code = ""
        locality = ""
        street = ""
        numbers = ""
        gmina = ""
        powiat = ""
        voivodeship = ""

        # Find postal code; everything after it is candidate address data
        for i, part in enumerate(parts):
            if is_postal_code(part):
                postal_code = part
                remaining_parts = parts[i+1:]
                break

        if not postal_code:
            print("No valid postal code found")
            return None

        print(f"Postal code: {postal_code}")
        print(f"Remaining parts: {remaining_parts}")

        # Try to find voivodeship
        voivodeships = [
            'mazowieckie', 'śląskie', 'wielkopolskie', 'małopolskie', 'lubelskie',
            'podkarpackie', 'dolnośląskie', 'kujawsko-pomorskie', 'pomorskie',
            'łódzkie', 'zachodniopomorskie', 'lubuskie', 'podlaskie', 'świętokrzyskie',
            'opolskie', 'warmińsko-mazurskie'
        ]

        # Scan right-to-left so the last matching token wins
        voiv_idx = -1
        for i in range(len(remaining_parts) - 1, -1, -1):
            if remaining_parts[i] in voivodeships:
                voivodeship = remaining_parts[i]
                voiv_idx = i
                break
            # Check compound: a hyphenated name split into two tokens
            if i > 0:
                compound = remaining_parts[i-1] + '-' + remaining_parts[i]
                if compound in voivodeships:
                    voivodeship = compound
                    voiv_idx = i - 1
                    break

        if voivodeship:
            print(f"Found voivodeship: {voivodeship} at position {voiv_idx}")

            # Powiat and gmina should be before voivodeship
            if voiv_idx >= 2:
                powiat = remaining_parts[voiv_idx - 1]
                gmina = remaining_parts[voiv_idx - 2]
                print(f"Inferred powiat: {powiat}, gmina: {gmina}")

                # Everything before gmina is locality/street/numbers
                middle_parts = remaining_parts[:voiv_idx - 2]
                if middle_parts:
                    locality = middle_parts[0]
                    if len(middle_parts) > 1:
                        # Rest could be street and numbers
                        street_and_numbers = middle_parts[1:]
                        # Simple heuristic: if it contains digits, it's probably numbers
                        street_parts = []
                        number_parts = []
                        for part in street_and_numbers:
                            if re.search(r'\d', part):
                                number_parts.append(part)
                            elif not number_parts:
                                # Only add to street if we haven't started numbers;
                                # digit-free tokens after the first number are dropped
                                street_parts.append(part)
                        street = ' '.join(street_parts)
                        numbers = ' '.join(number_parts)

        result = {
            'PNA': postal_code,
            'Miejscowość': locality,
            'Ulica': street,
            'Numery': numbers,
            'Gmina': gmina,
            'Powiat': powiat,
            'Województwo': voivodeship
        }

        print(f"Suggested parsing: {result}")
        return result

    except Exception as e:
        print(f"Error in manual parsing: {e}")
        return None


# Test manual parsing on unparsed records
if unparsed:
    print("\n=== ATTEMPTING TO MANUALLY PARSE DIFFICULT RECORDS ===")
    manual_results = []

    for item in unparsed[:3]:  # Try first 3 unparsed records
        print(f"\nTrying to parse from page {item['page']}: {item['content']}")
        manual_parsed = help_parse_record(item['content'])
        if manual_parsed:
            manual_results.append(manual_parsed)
            print("✓ Successfully parsed manually")
        else:
            print("✗ Still couldn't parse - needs human help")

    if manual_results:
        print(f"\nManually parsed {len(manual_results)} additional records!")
        data.extend(manual_results)
        print(f"Total records now: {len(data)}")
else:
    print("\nNo unparsed records - great job!")

In [None]:
# Enhanced parser with multiline support
def enhanced_parse_pdf(pdf_path: str, start_page: int = 1, end_page: int = None,
                      max_unparsed: int = 10) -> tuple:
    """Parse a page range of the PDF, joining records split across lines.

    Each line is first tried with ``improved_parse_row``; on failure
    ``handle_multiline_records`` is asked to combine it with following lines.
    Lines that start with a postal code but still cannot be parsed are
    collected for manual review.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        start_page: First page to process (1-based).
        end_page: Last page to process (1-based, inclusive); ``None`` means
            the last page of the document.
        max_unparsed: Maximum number of unparsed lines to keep; further
            unparsed lines are skipped without being recorded.

    Returns:
        ``(parsed_records, unparsed_lines)`` where ``unparsed_lines`` is a
        list of dicts with keys ``page``, ``line_number`` and ``content``.
    """
    # Header/footer markers; any line containing one of these is ignored
    skip_keywords = ('Poczta Polska', 'Strona', 'Copyright', 'PNA Miejscowość Ulica', 'Część 1')

    results = []
    unparsed_lines = []

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)

        if end_page is None:
            end_page = total_pages

        print(f"Processing pages {start_page} to {end_page} of {total_pages}")

        for page_num in range(start_page - 1, min(end_page, total_pages)):
            page = pdf.pages[page_num]
            current_page = page_num + 1

            print(f"Processing page {current_page}...", end=' ')

            try:
                text = page.extract_text()
                if not text:
                    print("(no text)")
                    continue

                lines = text.split('\n')
                i = 0
                page_records = 0

                # Manual index because a multiline record consumes several lines
                while i < len(lines):
                    line = clean_text(lines[i])

                    if not line or any(keyword in line for keyword in skip_keywords):
                        i += 1
                        continue

                    # Try simple parsing first
                    parsed = improved_parse_row(line)
                    if parsed:
                        results.append(parsed)
                        page_records += 1
                        i += 1
                        continue

                    # Try multiline parsing
                    parsed_multi, lines_consumed = handle_multiline_records(lines, i)
                    if parsed_multi:
                        results.append(parsed_multi)
                        page_records += 1
                        i += lines_consumed
                        continue

                    # This line couldn't be parsed; only lines that look like
                    # data rows (leading postal code) are worth reporting
                    parts = line.split()
                    if parts and is_postal_code(parts[0]) and len(unparsed_lines) < max_unparsed:
                        unparsed_lines.append({
                            'page': current_page,
                            'line_number': i + 1,
                            'content': line
                        })
                    i += 1

                print(f"({page_records} records)")

            except Exception as e:
                # Best effort: report the failing page and carry on with the next
                print(f"Error on page {current_page}: {str(e)}")
                continue

    return results, unparsed_lines


# Test the enhanced parser
print("Testing enhanced parser...")
data, unparsed = enhanced_parse_pdf(pdf_path, start_page=1, end_page=5)

print(f"\n✓ Successfully parsed: {len(data)} records")
print(f"✗ Could not parse: {len(unparsed)} lines")

if data:
    print("\n=== SAMPLE PARSED RECORDS ===")
    for i, record in enumerate(data[:10]):
        print(f"{i+1:2d}. {record['PNA']} | {record['Miejscowość'][:15]:15} | {record['Ulica'][:15]:15} | {record['Gmina'][:12]:12} | {record['Powiat'][:12]:12} | {record['Województwo']}")

if unparsed:
    print("\n=== UNPARSED LINES (need manual help) ===")
    for item in unparsed:
        print(f"Page {item['page']}, Line {item['line_number']}: {item['content'][:80]}...")

In [None]:
# Manual parsing for problematic records
def manual_parse_request(line: str, line_number: int = None, page_number: int = None):
    """Print a diagnostic report for a line that could not be parsed.

    Shows the raw line, whether its first token looks like a postal code,
    and the whitespace-separated tokens, followed by the list of columns a
    human would need to fill in.

    Args:
        line: The raw text line that failed automatic parsing.
        line_number: Line number shown in the report header.
        page_number: Page number; the location header is only printed when
            this is truthy.

    Returns:
        Always ``None`` - no input is read; the report is informational only.
    """
    separator = '=' * 80

    print(f"\n{separator}")
    if page_number:
        print(f"PARSING ISSUE - Page {page_number}, Line {line_number}")
    print(f"Cannot automatically parse: {repr(line)}")
    print(separator)

    # Try to give some hints about what we found
    parts = line.strip().split()
    if parts:
        if is_postal_code(parts[0]):
            print(f"✓ Found postal code: {parts[0]}")
        else:
            print(f"✗ First part doesn't look like postal code: {parts[0]}")

        print(f"Total parts found: {len(parts)}")
        print(f"Parts: {parts}")

    print("\nPlease help identify the columns:")
    print("1. Postal Code (PNA): ")
    print("2. Locality (Miejscowość): ")
    print("3. Street (Ulica): ")
    print("4. Numbers (Numery): ")
    print("5. Municipality (Gmina): ")
    print("6. County (Powiat): ")
    print("7. Voivodeship (Województwo): ")

    # For now, return None - we'll handle this interactively
    return None

def handle_multiline_records(lines: List[str], start_idx: int) -> tuple:
    """Try to parse a record that wraps onto the following line(s).

    Starting at ``lines[start_idx]``, up to two continuation lines are
    appended while the combined text has fewer than six whitespace-separated
    tokens. A continuation line must be non-empty and must not itself start
    with a postal code.

    Args:
        lines: All text lines of the current page.
        start_idx: Index of the line where the record starts.

    Returns:
        ``(parsed_record, lines_consumed)``. ``(None, 0)`` when the start line
        is empty or does not begin with a postal code; otherwise the result of
        ``improved_parse_row`` on the combined text (possibly ``None``) and
        the number of lines that were combined (1 to 3).
    """
    min_parts = 6         # fewer tokens than this suggests a wrapped record
    max_continuations = 2  # never join more than three lines in total

    combined_line = lines[start_idx].strip()

    # Check if this line starts with a postal code
    parts = combined_line.split()
    if not parts or not is_postal_code(parts[0]):
        return None, 0

    lines_used = 1
    for offset in range(1, max_continuations + 1):
        next_idx = start_idx + offset
        if len(combined_line.split()) >= min_parts or next_idx >= len(lines):
            break

        continuation = lines[next_idx].strip()
        continuation_parts = continuation.split()
        # A blank line or a new postal code means the record ended here
        if not continuation_parts or is_postal_code(continuation_parts[0]):
            break

        combined_line += " " + continuation
        lines_used = offset + 1

    # Try to parse the combined line
    parsed = improved_parse_row(combined_line)

    return parsed, lines_used

# Confirmation banner for the cell that defines the advanced parsing helpers
print("Advanced parsing functions defined")

# Polish Postal Codes PDF Parser

This notebook parses the Polish postal codes PDF and converts it to CSV format.
The PDF contains postal codes with address mappings in a structured table format.


In [None]:
import pdfplumber
import pandas as pd
import re
from typing import List, Dict, Optional
import numpy as np

print("Libraries imported successfully")

In [None]:
# Define the structure of our data: the output column names, in order.
# These are also the dictionary keys produced by the row parsers below.
COLUMNS = ["PNA", "Miejscowość", "Ulica", "Numery", "Gmina", "Powiat", "Województwo"]


def clean_text(text: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    trimmed = text.strip() if text else ""
    return re.sub(r"\s+", " ", trimmed)


def is_postal_code(text: str) -> bool:
    """Return True when *text*, ignoring surrounding whitespace, is XX-XXX.

    The expected shape is two digits, a hyphen, then three digits (the Polish
    postal code format). Falsy input returns False.
    """
    if text:
        return re.match(r"^\d{2}-\d{3}$", text.strip()) is not None
    return False


print("Helper functions defined")

In [None]:
def extract_text_with_positions(page):
    """Group a page's characters into rows keyed by their rounded ``y0``.

    Args:
        page: Object exposing ``chars``, an iterable of dicts that each carry
            a numeric ``"y0"`` entry.

    Returns:
        A list of ``(y, chars)`` tuples ordered by descending ``y`` (treated
        here as top-to-bottom); characters within a row keep their original
        order.
    """
    by_baseline = {}
    for glyph in page.chars:
        # One decimal place absorbs tiny vertical jitter within a text row
        by_baseline.setdefault(round(glyph["y0"], 1), []).append(glyph)

    return sorted(by_baseline.items(), key=lambda entry: entry[0], reverse=True)


def parse_table_row(row_chars):
    """Rebuild the text of one table row and keep it only if it is a data row.

    Args:
        row_chars: Character dicts of a single row, each with ``"x0"`` and
            ``"text"`` entries. The list is not modified.

    Returns:
        The row's text (characters joined left to right) when its first
        whitespace-separated word is a postal code; ``None`` for empty rows,
        header rows (containing "PNA" or "Miejscowość") and anything else.
    """
    if not row_chars:
        return None

    # Sort a copy by x-position (left to right) so the caller's list keeps
    # its original order
    ordered = sorted(row_chars, key=lambda ch: ch["x0"])
    full_text = "".join(ch["text"] for ch in ordered)

    # Skip header rows and empty rows
    if not full_text.strip() or "PNA" in full_text or "Miejscowość" in full_text:
        return None

    # Only rows that start with a postal code are data rows; the text is
    # known to be non-blank here, so there is always a first word
    if not is_postal_code(full_text.split()[0]):
        return None

    return full_text


print("Text extraction functions defined")

In [None]:
def parse_row_columns(text_line: str) -> Optional[Dict[str, str]]:
    """Split one text line into the seven output columns (naive version).

    The line is tokenised on whitespace. The first token must be a postal
    code and the last three tokens are taken as gmina, powiat and
    voivodeship. Of the tokens in between, the first is the locality; the
    rest go to the street until the first token that contains a digit or is
    one of ``- , ( )``, after which everything goes to the numbers.

    Returns:
        A dict keyed by the output column names, or ``None`` when the line is
        blank, has fewer than four tokens, does not start with a postal code,
        or leaves no token for the locality.
    """
    if not text_line or not text_line.strip():
        return None

    tokens = text_line.strip().split()
    if len(tokens) < 4:
        return None
    if not is_postal_code(tokens[0]):
        return None

    # Peel the fixed-position fields off both ends of the token list
    postal_code, *body = tokens
    *middle, gmina, powiat, voivodeship = body
    if not middle:
        return None

    locality, *tail = middle

    street_tokens = []
    number_tokens = []
    for token in tail:
        looks_numeric = bool(re.search(r"\d", token)) or token in ["-", ",", "(", ")"]
        # Once numbers have started, trailing text is kept with them
        if looks_numeric or number_tokens:
            number_tokens.append(token)
        else:
            street_tokens.append(token)

    return {
        "PNA": postal_code,
        "Miejscowość": locality,
        "Ulica": " ".join(street_tokens),
        "Numery": " ".join(number_tokens),
        "Gmina": gmina,
        "Powiat": powiat,
        "Województwo": voivodeship,
    }


print("Row parsing function defined")

In [None]:
# Test with the first few pages
def parse_pdf_pages(
    pdf_path: str, start_page: int = 2, end_page: int = None
) -> List[Dict[str, str]]:
    """Extract postal-code rows from a range of PDF pages.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        start_page: First page to process (1-based).
        end_page: Last page to process (1-based, inclusive); ``None`` means
            the last page of the document.

    Returns:
        The dicts returned by ``parse_row_columns`` for every line it accepted.
        A page whose processing raises is reported and skipped.
    """
    # Lines containing any of these markers are page headers/footers
    skip_markers = (
        "Poczta Polska",
        "Strona",
        "Copyright",
        "PNA Miejscowość Ulica",
    )
    results = []

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        if end_page is None:
            end_page = total_pages

        print(f"Processing pages {start_page} to {end_page} of {total_pages}")

        # Page numbers are 1-based for the caller, 0-based for pdf.pages
        stop_index = min(end_page, total_pages)
        for page_index in range(start_page - 1, stop_index):
            page = pdf.pages[page_index]
            print(f"Processing page {page_index + 1}")

            try:
                text = page.extract_text()
                if not text:
                    continue

                for raw_line in text.split("\n"):
                    line = clean_text(raw_line)
                    if not line or any(marker in line for marker in skip_markers):
                        continue

                    row = parse_row_columns(line)
                    if row:
                        results.append(row)

            except Exception as e:
                print(f"Error processing page {page_index + 1}: {str(e)}")
                continue

    return results


print("PDF parsing function defined")

In [None]:
# Sample extract of the full document (its pages 3-22); reused by later cells
pdf_path = "pages_3_to_22.pdf"

# Smoke test: run the basic parser over the first 5 pages of the sample
print("Starting to parse PDF...")
data = parse_pdf_pages(pdf_path, start_page=1, end_page=5)

print(f"\nExtracted {len(data)} records")

# Show a preview, or flag that debugging is needed when nothing was parsed
if not data:
    print("No data extracted. Let's debug...")
else:
    print("\nFirst 10 records:")
    for i, record in enumerate(data[:10]):
        print(f"{i+1:2d}. {record}")

In [None]:
# If the above didn't work, let's try a different approach
# Let's examine the raw text structure first


def debug_page_structure(pdf_path: str, page_num: int = 1):
    """Print the first 20 text lines of one page, flagging postal-code rows.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        page_num: Page to inspect (1-based).

    Each printed line is cleaned with ``clean_text``, truncated to 100
    characters and prefixed with ``[POSTAL]`` when its first word is a postal
    code, ``[-----]`` otherwise. A page with no extractable text prints
    ``(no text)`` instead of failing.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_num - 1]  # Convert to 0-based

        print(f"=== PAGE {page_num} DEBUG ===")

        # Extract raw text; guard against pages without a text layer, as the
        # parsing functions in this notebook do
        text = page.extract_text()
        if not text:
            print("(no text)")
            return

        lines = text.split("\n")
        print(f"Total lines: {len(lines)}\n")

        # Show first 20 lines
        for i, line in enumerate(lines[:20]):
            line_clean = clean_text(line)
            words = line_clean.split()
            postal_match = bool(words) and is_postal_code(words[0])
            print(
                f"{i+1:2d}: {'[POSTAL]' if postal_match else '[-----]'} {repr(line_clean[:100])}"
            )


# Debug the first page
debug_page_structure(pdf_path, 1)

In [None]:
# Based on the debug output, let's refine our parsing approach


def improved_parse_row(line: str) -> Optional[Dict[str, str]]:
    """Parse one text line into the seven output columns.

    The line is anchored on a known voivodeship name, searched from the right.
    The single tokens before it are taken as powiat and gmina, the first token
    after the postal code as the locality, and whatever lies in between is
    split into street (leading tokens) and numbers (from the first token that
    starts with a digit or is one of ``- , ( ) n p`` to the end).

    Args:
        line: Raw text line; it is normalised with ``clean_text`` first.

    Returns:
        A dict keyed by the output column names, or ``None`` when the line is
        empty, is a header/footer, has fewer than four tokens, does not start
        with a postal code, contains no known voivodeship, or does not leave
        room for locality, gmina and powiat before the voivodeship.
    """
    line = clean_text(line)
    if not line:
        return None

    # Skip obvious non-data lines
    skip_patterns = (
        "Poczta Polska",
        "Oficjalny Spis",
        "Strona",
        "Copyright",
        "PNA Miejscowość",
        "Część 1",
        "miejscowości i ulic",
    )
    if any(pattern in line for pattern in skip_patterns):
        return None

    parts = line.split()
    if len(parts) < 4:
        return None

    # First part must be postal code
    if not is_postal_code(parts[0]):
        return None
    postal_code = parts[0]

    voivodeships = {
        "mazowieckie",
        "śląskie",
        "wielkopolskie",
        "małopolskie",
        "lubelskie",
        "podkarpackie",
        "dolnośląskie",
        "kujawsko-pomorskie",
        "pomorskie",
        "łódzkie",
        "zachodniopomorskie",
        "lubuskie",
        "podlaskie",
        "świętokrzyskie",
        "opolskie",
        "warmińsko-mazurskie",
    }

    # Find the voivodeship, scanning right-to-left (it should be last or near
    # last). Index 0 is the postal code, so the scan stops at index 1.
    voivodeship = ""
    voiv_idx = -1
    for i in range(len(parts) - 1, 0, -1):
        if parts[i] in voivodeships:
            voivodeship = parts[i]
            voiv_idx = i
            break
        # A hyphenated name may have been split into two tokens
        if i > 1:
            compound = parts[i - 1] + "-" + parts[i]
            if compound in voivodeships:
                voivodeship = compound
                voiv_idx = i - 1
                break

    if not voivodeship:
        return None

    # Powiat and gmina are the two tokens right before the voivodeship
    powiat_idx = voiv_idx - 1
    gmina_idx = powiat_idx - 1

    # Index 1 must remain for the locality, so gmina can start at 2 at the
    # earliest. Rejecting smaller values also prevents a negative index from
    # wrapping around in the slice below.
    if gmina_idx < 2:
        return None

    powiat = parts[powiat_idx]
    gmina = parts[gmina_idx]

    # Everything between postal code and gmina is locality/street/numbers
    middle_parts = parts[1:gmina_idx]
    locality = middle_parts[0]

    street_parts = []
    number_parts = []
    collecting_numbers = False
    for part in middle_parts[1:]:
        # Once numbers start, every following token belongs to the numbers
        if (
            collecting_numbers
            or re.search(r"^\d", part)
            or part in ["-", ",", "(", ")", "n", "p"]
        ):
            collecting_numbers = True
            number_parts.append(part)
        else:
            street_parts.append(part)

    return {
        "PNA": postal_code,
        "Miejscowość": locality,
        "Ulica": " ".join(street_parts),
        "Numery": " ".join(number_parts),
        "Gmina": gmina,
        "Powiat": powiat,
        "Województwo": voivodeship,
    }


print("Improved parsing function defined")

In [None]:
# Test the improved parser
def test_improved_parser(pdf_path: str, num_pages: int = 3):
    """Run ``improved_parse_row`` over the first pages of a PDF.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        num_pages: How many pages to read from the start of the document
            (capped at the document's page count).

    Returns:
        List of every record ``improved_parse_row`` produced, in page and
        line order.
    """
    collected = []

    with pdfplumber.open(pdf_path) as pdf:
        page_count = min(num_pages, len(pdf.pages))
        for index in range(page_count):
            text = pdf.pages[index].extract_text()
            if text:
                candidates = map(improved_parse_row, text.split("\n"))
                collected.extend(row for row in candidates if row)

    return collected


# Test the improved parser on the first 3 pages of the sample PDF
print("Testing improved parser...")
test_data = test_improved_parser(pdf_path, 3)

print(f"\nExtracted {len(test_data)} records")

if test_data:
    # Preview the first 15 records as a pipe-separated, fixed-width table
    print("\nFirst 15 records:")
    for i, record in enumerate(test_data[:15]):
        print(
            f"{i+1:2d}. {record['PNA']} | {record['Miejscowość']:20} | {record['Ulica']:20} | {record['Numery']:15} | {record['Gmina']:15} | {record['Powiat']:15} | {record['Województwo']}"
        )
else:
    print("Still no data. Let's examine specific lines...")