# In [2]:  (Jupyter notebook cell prompt)
from bs4 import BeautifulSoup
import re

def generate_index_based_php_extractor(html_file: str,
                                       output_path: str = 'title_deed_index_extractor.php') -> None:
    """Generate a PHP script that extracts title-deed fields by element index.

    The sample HTML document is scanned for known marker values (the property
    number, owner names, etc. of one specific sample deed, hard-coded below).
    For each field, the position of the first matching ``<div>`` in
    ``soup.find_all('div')`` is recorded, and a PHP function
    ``extractTitleDeedData($html)`` is written that reads the ``<div>`` /
    ``<table>`` at those same positions.

    Args:
        html_file: Path to the sample HTML file (read as UTF-8).
        output_path: Where the generated PHP file is written (UTF-8).

    Returns:
        None. The PHP source is written to ``output_path`` and a confirmation
        line is printed.

    Notes:
        * A field whose marker is not found is emitted as index ``-1``; the
          generated PHP treats negative indices as "missing" and yields ``''``
          (or skips the property table) instead of referencing Python's
          ``None``, which is not valid PHP.
        * NOTE(review): the generated PHP assumes DOMDocument enumerates
          ``<div>``/``<table>`` elements in the same order as BeautifulSoup's
          ``html.parser`` does here -- confirm on real documents.
        * NOTE(review): ``DOMDocument::loadHTML`` is called without an explicit
          encoding hint; presumably the input HTML declares its charset --
          verify that Amharic text survives.
    """
    with open(html_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # Candidate containers: every <div> (field values) and every <table>
    # (property details), in the order BeautifulSoup returns them.
    divs = soup.find_all('div')
    tables = soup.find_all('table')

    # Patterns are compiled once because they are applied to every <div>.
    whitespace_re = re.compile(r'\s+')
    date_re = re.compile(r'\d{1,2}/\d{1,2}/\d{4}')
    coordinate_re = re.compile(r'^\d{6}\.\d{3}$')

    def clean_text(text):
        """Collapse runs of whitespace to single spaces and trim the ends."""
        return whitespace_re.sub(' ', text).strip()

    def php_index(value):
        """Render an optional index as a PHP integer literal (-1 if missing)."""
        return -1 if value is None else value

    # Index (into `divs` / `tables`) of the first element matching each field.
    indices = {
        'property_number': None,
        'owner_name_amharic': None,
        'owner_name_english': None,
        'title_deed_number': None,
        'date_issued': None,
        'transfer_type': None,
        'possession_type': None,
        'sub_city': None,
        'folder_number': None,
        'coordinates_x': [],
        'coordinates_y': [],
        'property_table': None
    }

    # Scan through divs to find key elements
    for i, div in enumerate(divs):
        text = clean_text(div.get_text())
        if not text:
            continue

        # Property Number
        if 'AK1181145161502962' in text and indices['property_number'] is None:
            indices['property_number'] = i

        # Owner Name (Amharic)
        if 'ለይኩን ፍስሃ ዘጊዮርጊስ' in text and indices['owner_name_amharic'] is None:
            indices['owner_name_amharic'] = i

        # Owner Name (English)
        if 'LEYKUN FISEHA ZEGIORGIS' in text and indices['owner_name_english'] is None:
            indices['owner_name_english'] = i

        # Title Deed Number: the same marker value as the property number,
        # taken from the next div (after the property-number div) that
        # contains it.
        if 'AK1181145161502962' in text and i != indices['property_number'] and indices['title_deed_number'] is None:
            indices['title_deed_number'] = i

        # Date Issued (text must start with d/m/yyyy-like digits)
        if date_re.match(text) and indices['date_issued'] is None:
            indices['date_issued'] = i

        # Transfer Type
        if ('በውርስ' in text or 'Inheritance' in text) and indices['transfer_type'] is None:
            indices['transfer_type'] = i

        # Possession Type
        if ('ሊዝ' in text or 'Lease' in text) and indices['possession_type'] is None:
            indices['possession_type'] = i

        # Sub City
        if ('አቃቂ ቃሊቲ' in text or 'Akaki Kaliti' in text) and indices['sub_city'] is None:
            indices['sub_city'] = i

        # Folder Number
        if ('L78 R4' in text or 'F41M5' in text) and indices['folder_number'] is None:
            indices['folder_number'] = i

        # Coordinates: the first six matching divs are recorded as X values and
        # the following six as Y values.
        # NOTE(review): assumes all X values precede all Y values in document
        # order -- confirm against the sample layout.
        if coordinate_re.match(text):
            if len(indices['coordinates_x']) < 6:
                indices['coordinates_x'].append(i)
            elif len(indices['coordinates_y']) < 6:
                indices['coordinates_y'].append(i)

    # Find the property details table (first one mentioning both labels)
    for i, table in enumerate(tables):
        table_text = clean_text(table.get_text())
        if 'ወረዳ' in table_text and 'Woreda' in table_text:
            indices['property_table'] = i
            break

    # PHP-safe renderings of the scalar indices (None -> -1) and index lists.
    php = {
        key: php_index(value)
        for key, value in indices.items()
        if not isinstance(value, list)
    }
    x_indices = ','.join(map(str, indices['coordinates_x']))
    y_indices = ','.join(map(str, indices['coordinates_y']))

    # Generate the PHP code. Literal PHP braces are doubled for the f-string,
    # and the regex backslash is escaped so Python emits `\s` unchanged.
    php_code = f"""<?php
function extractTitleDeedData($html) {{
    $result = [];
    $dom = new DOMDocument();
    @$dom->loadHTML($html);
    $divs = $dom->getElementsByTagName('div');
    $tables = $dom->getElementsByTagName('table');
    
    // Helper function to clean and get text content safely
    // (a negative index means the field was not found in the sample)
    $getCleanText = function($index, $nodeList) {{
        if ($index < 0 || !isset($nodeList[$index])) return '';
        $text = trim(preg_replace('/\\s+/', ' ', $nodeList[$index]->nodeValue));
        return $text;
    }};
    
    // Property Number
    $result['property_number'] = $getCleanText({php['property_number']}, $divs);
    
    // Owner Names
    $result['owner_name_amharic'] = $getCleanText({php['owner_name_amharic']}, $divs);
    $result['owner_name_english'] = $getCleanText({php['owner_name_english']}, $divs);
    
    // Title Deed Number
    $result['title_deed_number'] = $getCleanText({php['title_deed_number']}, $divs);
    
    // Date Issued
    $result['date_issued'] = $getCleanText({php['date_issued']}, $divs);
    
    // Transfer Type (split Amharic/English)
    $transfer = $getCleanText({php['transfer_type']}, $divs);
    $transferParts = explode('/', $transfer);
    $result['transfer_type_amharic'] = trim($transferParts[0] ?? '');
    $result['transfer_type_english'] = trim($transferParts[1] ?? '');
    
    // Possession Type (split Amharic/English)
    $possession = $getCleanText({php['possession_type']}, $divs);
    $possessionParts = explode('/', $possession);
    $result['possession_type_amharic'] = trim($possessionParts[0] ?? '');
    $result['possession_type_english'] = trim($possessionParts[1] ?? '');
    
    // Sub City (split Amharic/English)
    $subCity = $getCleanText({php['sub_city']}, $divs);
    $subCityParts = explode('/', $subCity);
    $result['sub_city_amharic'] = trim($subCityParts[0] ?? '');
    $result['sub_city_english'] = trim($subCityParts[1] ?? '');
    
    // Folder Number
    $result['folder_number'] = $getCleanText({php['folder_number']}, $divs);
    
    // Coordinates
    $result['coordinates'] = [];
    $xIndices = [{x_indices}];
    $yIndices = [{y_indices}];
    for ($i = 0; $i < min(count($xIndices), count($yIndices)); $i++) {{
        $x = $getCleanText($xIndices[$i], $divs);
        $y = $getCleanText($yIndices[$i], $divs);
        if ($x && $y) {{
            $result['coordinates'][] = ['x' => $x, 'y' => $y];
        }}
    }}
    
    // Property Details Table
    $tableIndex = {php['property_table']};
    if ($tableIndex >= 0 && isset($tables[$tableIndex])) {{
        $rows = $tables[$tableIndex]->getElementsByTagName('tr');
        if ($rows->length > 1) {{
            $cells = $rows->item(1)->getElementsByTagName('td');
            
            $result['woreda'] = $getCleanText(0, $cells);
            $result['block_number'] = $getCleanText(1, $cells);
            $result['parcel_number'] = $getCleanText(2, $cells);
            $result['house_number'] = $getCleanText(3, $cells);
            $result['area_m2'] = $getCleanText(4, $cells);
            $result['built_up_area'] = $getCleanText(5, $cells);
            $result['floor_number'] = $getCleanText(6, $cells);
            $result['land_use'] = $getCleanText(7, $cells);
            $result['house_use'] = $getCleanText(8, $cells);
        }}
    }}
    
    return $result;
}}
?>"""

    # Save the PHP file
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(php_code)

    print(f"PHP index-based extractor script generated as '{output_path}'")

# Usage: generate the extractor from the sample deed. Guarded so the file I/O
# runs when this is executed as a script or notebook cell, but not as a side
# effect of importing the module.
if __name__ == "__main__":
    generate_index_based_php_extractor('AK1181145161502962_source_html_tables.html')

# Output: PHP index-based extractor script generated as 'title_deed_index_extractor.php'


# ?>"""  (stray notebook-export residue: duplicate of the PHP template terminator)
