# POI Enrichment for Properties

This notebook enriches property data with Points of Interest (POI) across several categories using the Geoapify Places API.

## Process:
1. Load properties with GPS coordinates
2. For each property, find nearby POI in multiple categories
3. Calculate distances and rankings
4. Store results in `realitky.cleaned.property_poi` table

## Categories:
- Transport: Bus, Tram, Metro, Train
- Amenities: Restaurants, Shops, Schools, Healthcare, Sports
- Infrastructure: Industry, Airport, Power Plants, Highways

In [None]:
# Import required libraries
import requests
import json
import time
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from typing import List, Dict, Optional
import concurrent.futures
from datetime import datetime

# Get (or create) the Spark session used by every cell below
spark = (
    SparkSession.builder
    .appName("POI_Enrichment")
    .getOrCreate()
)

print("Libraries imported successfully")

In [None]:
# Configuration widgets
dbutils.widgets.text("api_key", "your_geoapify_api_key", "Geoapify API Key")
dbutils.widgets.text("batch_size", "10", "Batch Size")
dbutils.widgets.text("process_id", "POI_001", "Process ID")
dbutils.widgets.text("radius_meters", "5000", "Search Radius in Meters")
dbutils.widgets.dropdown("test_mode", "true", ["true", "false"], "Test Mode (limit to 50 records)")

# Get widget values
api_key = dbutils.widgets.get("api_key")
batch_size = int(dbutils.widgets.get("batch_size"))
process_id = dbutils.widgets.get("process_id")
radius_meters = int(dbutils.widgets.get("radius_meters"))
test_mode = dbutils.widgets.get("test_mode").lower() == "true"

# Fail fast: a non-positive batch size or radius would otherwise only surface
# later (empty batch, division by zero in the timing report, rejected requests).
if batch_size <= 0:
    raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
if radius_meters <= 0:
    raise ValueError(f"radius_meters must be a positive integer, got {radius_meters}")

# The widget's placeholder default is not a real key, so report it as NOT_SET
# instead of printing it as if a key had been configured.
api_key_is_set = len(api_key) > 4 and api_key != "your_geoapify_api_key"
masked_api_key = '*' * (len(api_key) - 4) + api_key[-4:] if api_key_is_set else 'NOT_SET'

print("Configuration:")
print(f"- API Key: {masked_api_key}")
print(f"- Batch size: {batch_size}")
print(f"- Process ID: {process_id}")
print(f"- Search radius: {radius_meters}m")
print(f"- Test mode: {test_mode}")

In [None]:
# POI Category mapping to Geoapify categories.
# This table is only printed for reference; the requests themselves use the copy
# held by POIEnricher, so the two must list the same identifiers.
# 'transport_train' previously differed from the class copy and is aligned here.
# NOTE(review): verify every identifier against Geoapify's supported category
# list - a category the API rejects is only logged and skipped at request time.
POI_CATEGORY_MAPPING = {
    'transport_bus': ['public_transport.bus'],
    'transport_tram': ['public_transport.tram'],
    'transport_metro': ['public_transport.subway'],
    'transport_train': ['public_transport.train_station'],
    'restaurant': ['catering.restaurant', 'catering.fast_food', 'catering.cafe'],
    'shop': ['commercial.shopping_mall', 'commercial.supermarket', 'commercial.marketplace'],
    'school': ['education.school', 'education.university', 'education.nursery'],
    'healthcare': ['healthcare.hospital', 'healthcare.clinic', 'healthcare.pharmacy'],
    'sports': ['sport.fitness', 'sport.stadium', 'activity.sport_club'],
    'industry': ['industrial.factory', 'industrial.warehouse'],
    'airport': ['public_transport.airport'],
    'power_plant': ['industrial.power'],
    'highway': ['highway.motorway', 'highway.trunk']
}

print("POI Category Mapping:")
for category, mappings in POI_CATEGORY_MAPPING.items():
    print(f"  {category}: {mappings}")

In [None]:
# POI Enricher Class Definition
class POIEnricher:
    """Look up Points of Interest around a property via the Geoapify Places API.

    Each logical category (e.g. ``restaurant``) maps to one or more Geoapify
    categories and one request is sent per mapped category. The combined
    results are de-duplicated, ordered by distance and ranked 1..N, so
    ``rank_in_category`` is unique per property and logical category.

    NOTE(review): the Geoapify category identifiers are kept exactly as they
    were; verify them against Geoapify's supported category list, because a
    category the API rejects is only logged and skipped.
    """

    def __init__(self, api_key: str, radius_meters: int = 5000,
                 process_id: Optional[str] = None):
        """Store the request configuration.

        Args:
            api_key: Geoapify API key sent with every request.
            radius_meters: Radius of the search circle around the property.
            process_id: Identifier written to ``ins_process_id`` /
                ``upd_process_id``. When omitted, the notebook-level
                ``process_id`` variable is used (previous behaviour).
        """
        self.api_key = api_key
        self.radius_meters = radius_meters
        self.process_id = process_id
        self.base_url = 'https://api.geoapify.com/v2/places'

        # POI Category mapping to Geoapify categories - moved inside class
        self.poi_category_mapping = {
            'transport_bus': ['public_transport.bus'],
            'transport_tram': ['public_transport.tram'],
            'transport_metro': ['public_transport.subway'],
            'transport_train': ['public_transport.train_station'],
            'restaurant': ['catering.restaurant', 'catering.fast_food', 'catering.cafe'],
            'shop': ['commercial.shopping_mall', 'commercial.supermarket', 'commercial.marketplace'],
            'school': ['education.school', 'education.university', 'education.nursery'],
            'healthcare': ['healthcare.hospital', 'healthcare.clinic', 'healthcare.pharmacy'],
            'sports': ['sport.fitness', 'sport.stadium', 'activity.sport_club'],
            'industry': ['industrial.factory', 'industrial.warehouse'],
            'airport': ['public_transport.airport'],
            'power_plant': ['industrial.power'],
            'highway': ['highway.motorway', 'highway.trunk']
        }

    def get_poi_for_property(self, property_id: str, lat: float, lng: float, category_id: str) -> List[Dict]:
        """Get the ranked POI for a single property and logical category.

        Args:
            property_id: Identifier of the property being enriched.
            lat: Property latitude.
            lng: Property longitude.
            category_id: Logical category; an unmapped value is sent to the
                API as-is.

        Returns:
            At most ``_get_max_results_for_category(category_id)`` records,
            nearest first, with ``rank_in_category`` running 1..N. Returns an
            empty list when nothing was found or every request failed.
        """
        try:
            # Get category mapping
            categories = self.poi_category_mapping.get(category_id, [category_id])

            # Get max results for this category
            max_results = self._get_max_results_for_category(category_id)

            candidates = []

            for category in categories:
                params = {
                    'categories': category,
                    'filter': f'circle:{lng},{lat},{self.radius_meters}',
                    # Ask for the nearest places first, so the limited result
                    # set holds the closest POI rather than an arbitrary subset.
                    'bias': f'proximity:{lng},{lat}',
                    'limit': max_results,
                    'apiKey': self.api_key
                }

                try:
                    response = requests.get(self.base_url, params=params, timeout=30)
                    response.raise_for_status()

                    data = response.json()

                    for feature in data.get('features') or []:
                        # The rank passed here is provisional; the final rank is
                        # assigned once all sub-categories have been collected.
                        poi_record = self._parse_poi_feature(
                            feature, property_id, category_id, lat, lng, len(candidates) + 1
                        )
                        if poi_record:
                            candidates.append(poi_record)

                    # Rate limiting
                    time.sleep(0.1)

                except requests.exceptions.HTTPError as e:
                    print(f"HTTP Error for category {category}: {e}")
                    if e.response is not None and e.response.status_code == 400:
                        print(f"Bad request - possibly invalid category: {category}")
                    continue
                except Exception as e:
                    print(f"Unexpected error for category {category}: {e}")
                    continue

            return self._rank_pois(candidates, max_results)

        except Exception as e:
            print(f"Error fetching POI for property {property_id}, category {category_id}: {e}")
            return []

    def _rank_pois(self, pois: List[Dict], max_results: int) -> List[Dict]:
        """De-duplicate, sort by distance and assign final ranks and POI ids.

        Ranking per sub-category request would restart at 1 for each request and
        produce duplicate (property, category, rank) keys, so it is done once
        over the combined candidates. Input records are not modified.
        """
        seen = set()
        ranked = []
        for candidate in sorted(pois, key=lambda record: record['distance_km']):
            if len(ranked) >= max_results:
                break
            # The same place can be returned by more than one sub-category query
            identity = (candidate['poi_name'], candidate['poi_lat'], candidate['poi_lng'])
            if identity in seen:
                continue
            seen.add(identity)

            final_rank = len(ranked) + 1
            record = dict(candidate)
            record['rank_in_category'] = final_rank
            record['poi_id'] = self._make_poi_id(
                record['property_id'], record['category_id'], final_rank
            )
            ranked.append(record)
        return ranked

    def _make_poi_id(self, property_id: str, category_id: str, rank: int) -> str:
        """Build the POI id from property, category, rank and the current epoch second."""
        return f"{property_id}_{category_id}_{rank}_{int(time.time())}"

    def _resolve_process_id(self):
        """Return the process id given at construction, else the notebook-level one."""
        explicit = getattr(self, 'process_id', None)
        if explicit is not None:
            return explicit
        # Backward compatibility: instances created without process_id keep
        # reading the notebook's module-level `process_id` variable.
        return globals().get('process_id')

    def _get_max_results_for_category(self, category_id: str) -> int:
        """Get maximum results based on category (5 for unlisted categories)."""
        limits = {
            'transport_bus': 10,
            'transport_tram': 10,
            'transport_metro': 5,
            'transport_train': 3,
            'restaurant': 5,
            'shop': 5,
            'school': 5,
            'healthcare': 5,
            'sports': 10,
            'industry': 5,
            'airport': 1,
            'power_plant': 1,
            'highway': 2
        }
        return limits.get(category_id, 5)

    def _parse_poi_feature(self, feature: Dict, property_id: str, category_id: str,
                          property_lat: float, property_lng: float, rank: int) -> Optional[Dict]:
        """Parse one feature of the API response into a POI record.

        Args:
            feature: Feature dict with ``properties`` and ``geometry``.
            property_id: Identifier of the property being enriched.
            category_id: Logical category the feature was requested for.
            property_lat: Property latitude.
            property_lng: Property longitude.
            rank: Rank stored in ``rank_in_category`` and embedded in ``poi_id``.

        Returns:
            The POI record, or ``None`` when the feature has no usable
            coordinates or cannot be parsed.
        """
        # The notebook star-imports pyspark.sql.functions, which replaces the
        # builtin round() with a Column-based one; use the builtin explicitly.
        import builtins

        try:
            props = feature.get('properties', {})
            geometry = feature.get('geometry', {})
            coordinates = geometry.get('coordinates', [])

            if len(coordinates) < 2:
                print(f"Invalid coordinates for property {property_id}, category {category_id}")
                return None

            # Coordinates are read as [longitude, latitude]; convert to float
            # to handle Decimal types
            poi_lng = float(coordinates[0])
            poi_lat = float(coordinates[1])
            property_lat = float(property_lat)
            property_lng = float(property_lng)

            # Calculate distance
            try:
                distance_km = self._calculate_distance(
                    property_lat, property_lng, poi_lat, poi_lng
                )
            except Exception as e:
                print(f"Error calculating distance for property {property_id}: {str(e)}")
                return None

            # Extract attributes
            attributes = {}

            # Common attributes
            if 'datasource' in props:
                attributes['data_source'] = props['datasource'].get('sourcename', 'unknown')

            if 'contact' in props:
                contact = props['contact']
                if 'phone' in contact:
                    attributes['phone'] = contact['phone']
                if 'email' in contact:
                    attributes['email'] = contact['email']

            if 'website' in props:
                attributes['website'] = props['website']

            # Category specific attributes
            if category_id.startswith('transport_'):
                if 'public_transport' in props:
                    attributes.update(props['public_transport'])
            elif category_id == 'restaurant':
                if 'cuisine' in props:
                    attributes['cuisine'] = props['cuisine']
                if 'rating' in props:
                    attributes['rating'] = props['rating']

            # Keep the attribute map homogeneous (str -> str) so that every
            # record yields the same map type when turned into a DataFrame.
            attributes = {
                str(key): str(value)
                for key, value in attributes.items()
                if value is not None
            }

            run_id = self._resolve_process_id()
            # One timestamp for both audit columns of a freshly built record
            now = datetime.now()

            poi_record = {
                'poi_id': self._make_poi_id(property_id, category_id, rank),
                'property_id': property_id,
                'category_id': category_id,
                'poi_name': props.get('name', 'Unknown'),
                'poi_address': props.get('formatted', ''),
                'poi_lat': poi_lat,
                'poi_lng': poi_lng,
                'distance_km': builtins.round(distance_km, 3),
                'distance_walking_min': self._estimate_walking_time(distance_km),
                'distance_driving_min': self._estimate_driving_time(distance_km),
                'rank_in_category': rank,
                'poi_attributes': attributes,
                'data_source': 'geoapify',
                'data_quality_score': 1.0,
                'ins_dt': now,
                'ins_process_id': run_id,
                'upd_dt': now,
                'upd_process_id': run_id,
                'del_flag': False
            }

            return poi_record

        except Exception as e:
            print(f"Error parsing POI feature for property {property_id}, category {category_id}: {str(e)}")
            return None

    def _calculate_distance(self, lat1: float, lng1: float, lat2: float, lng2: float) -> float:
        """Calculate the distance in kilometers between two points (Haversine formula)."""
        import math

        # Convert all inputs to float to handle Decimal types from Spark
        lat1 = float(lat1)
        lng1 = float(lng1)
        lat2 = float(lat2)
        lng2 = float(lng2)

        R = 6371  # Earth's radius in kilometers

        lat1_rad = math.radians(lat1)
        lat2_rad = math.radians(lat2)
        delta_lat = math.radians(lat2 - lat1)
        delta_lng = math.radians(lng2 - lng1)

        a = (math.sin(delta_lat / 2) * math.sin(delta_lat / 2) +
             math.cos(lat1_rad) * math.cos(lat2_rad) *
             math.sin(delta_lng / 2) * math.sin(delta_lng / 2))

        # Floating-point rounding can push `a` marginally above 1 for nearly
        # antipodal points, which would make sqrt(1 - a) fail.
        if a > 1.0:
            a = 1.0

        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
        distance = R * c

        return distance

    def _estimate_walking_time(self, distance_km: float) -> int:
        """Estimate walking time in whole minutes, truncated (assuming 5 km/h)."""
        return int(distance_km * 12)  # 60 minutes / 5 km/h = 12 minutes per km

    def _estimate_driving_time(self, distance_km: float) -> int:
        """Estimate driving time in whole minutes, truncated (assuming 30 km/h in city)."""
        return int(distance_km * 2)  # 60 minutes / 30 km/h = 2 minutes per km

# Build the enricher instance shared by the processing cells
enricher = POIEnricher(api_key=api_key, radius_meters=radius_meters)
print("POI Enricher initialized successfully")

In [None]:
# Read the properties that have usable GPS coordinates
print("Loading properties with GPS coordinates...")

# Query selecting non-deleted properties whose coordinates fall inside the bounding box
base_query = """
SELECT 
    property_id,
    address_latitude,
    address_longitude
FROM realitky.cleaned.properties
WHERE address_latitude IS NOT NULL 
AND address_longitude IS NOT NULL
AND address_latitude BETWEEN 48.0 AND 51.0  -- Czech Republic bounds
AND address_longitude BETWEEN 12.0 AND 19.0
AND del_flag = FALSE
"""

# Test mode caps the query at 50 rows; otherwise the query runs unchanged
properties_query = (base_query + " LIMIT 50") if test_mode else base_query
print(
    "🧪 TEST MODE: Limited to 50 properties"
    if test_mode
    else "🚀 PRODUCTION MODE: Processing all properties"
)

df_properties = spark.sql(properties_query)
total_properties = df_properties.count()
print(f"Found {total_properties} properties with valid GPS coordinates")

# Preview the first rows
print("Sample properties:")
df_properties.show(n=5)

In [None]:
# Get POI categories from database
print("Loading POI categories...")
categories_df = spark.sql("SELECT category_code FROM realitky.cleaned.poi_categories ORDER BY category_id")
categories = [row.category_code for row in categories_df.collect()]

print(f"POI Categories to process ({len(categories)}):")
for i, category in enumerate(categories, 1):
    print(f"  {i}. {category}")

# Calculate estimated API calls.
# Only the first `batch_size` properties are enriched in a run, and every logical
# category triggers one request per Geoapify category it maps to, so the estimate
# must account for both (builtins such as min/sum are avoided on purpose: the
# star import of pyspark.sql.functions replaces them).
properties_in_run = batch_size if total_properties > batch_size else total_properties
requests_per_property = len([
    geoapify_category
    for category_code in categories
    for geoapify_category in enricher.poi_category_mapping.get(category_code, [category_code])
])
estimated_calls = properties_in_run * requests_per_property
print(f"\n📊 Estimated API calls: {estimated_calls:,}")

if estimated_calls > 3000:
    print("⚠️  WARNING: This exceeds Geoapify free tier limit (3,000 calls/day)")
    print("   Consider reducing batch_size or enabling test_mode")

In [None]:
# Process properties in batches
def process_property_batch(property_batch: List, categories: List[str]) -> List[Dict]:
    """Process a batch of properties for all POI categories.

    Args:
        property_batch: Property dicts with ``property_id``,
            ``address_latitude`` and ``address_longitude`` keys.
        categories: Logical POI category codes to query for every property.

    Returns:
        All POI records found for the batch, in processing order. A failure
        for one property/category pair is printed and skipped.
    """
    all_pois = []
    total_batch_size = len(property_batch)
    total_categories = len(categories)

    for idx, property_data in enumerate(property_batch, 1):
        property_id = property_data['property_id']
        lat = property_data['address_latitude']
        lng = property_data['address_longitude']

        print(f"Processing property {idx}/{total_batch_size}: {property_id}")

        # Running count for this property; avoids rescanning all_pois afterwards
        property_poi_count = 0

        for category_idx, category_id in enumerate(categories, 1):
            try:
                print(f"  - Category {category_idx}/{total_categories}: {category_id}")
                pois = enricher.get_poi_for_property(property_id, lat, lng, category_id)
                all_pois.extend(pois)
                property_poi_count += len(pois)

                print(f"    Found {len(pois)} POI")

                # Rate limiting between categories
                time.sleep(0.2)

            except Exception as e:
                print(f"    ❌ Error processing {property_id} - {category_id}: {e}")
                continue

        # Rate limiting between properties
        print(f"  ✅ Completed property {property_id} ({property_poi_count} total POI)")
        time.sleep(1)

    return all_pois

print("Processing function defined")

In [None]:
# Execute POI enrichment
print("=" * 60)
print("STARTING POI ENRICHMENT")
print("=" * 60)

# Convert to list for processing (only the first `batch_size` rows are taken)
properties_list = [row.asDict() for row in df_properties.limit(batch_size).collect()]

print(f"Processing {len(properties_list)} properties in current batch...")
print(f"Each property will be processed for {len(categories)} categories")

# Process batch with timing
start_time = time.time()
poi_results = process_property_batch(properties_list, categories)
end_time = time.time()

processing_time = end_time - start_time
print(f"\n🎯 BATCH PROCESSING COMPLETED")
print(f"- Properties processed: {len(properties_list)}")
print(f"- POI records found: {len(poi_results)}")
print(f"- Processing time: {processing_time:.2f} seconds")
# An empty batch (no matching properties) must not abort the run with a
# division by zero; report that no average is available instead.
if properties_list:
    print(f"- Average time per property: {processing_time/len(properties_list):.2f} seconds")
else:
    print("- Average time per property: n/a (no properties in batch)")

In [None]:
# Save results to database: upsert the collected POI records into
# realitky.cleaned.property_poi; nothing is written when no POI were found.
if poi_results:
    print("💾 Saving results to database...")
    
    # Create DataFrame from results (no explicit schema is passed, so column
    # types come from the record dicts)
    df_poi_results = spark.createDataFrame(poi_results)
    
    print("Sample POI results:")
    df_poi_results.select("property_id", "category_id", "poi_name", "distance_km", "rank_in_category").show(10, truncate=False)
    
    # Insert into POI table using MERGE for upsert.
    # The temp view exposes the DataFrame to the SQL statement below.
    df_poi_results.createOrReplaceTempView("temp_poi_results")
    
    # Merge key: (property_id, category_id, rank_in_category).
    # On a match the descriptive columns and upd_* audit columns are overwritten,
    # while poi_id, rank_in_category, data_source, ins_dt and ins_process_id
    # keep their existing values. Unmatched source rows are inserted whole.
    # NOTE(review): the key presumably has to be unique within the source rows
    # for the MERGE to succeed - confirm that ranks never repeat per
    # property/category.
    merge_sql = f"""
    MERGE INTO realitky.cleaned.property_poi AS target
    USING temp_poi_results AS source
    ON target.property_id = source.property_id 
       AND target.category_id = source.category_id 
       AND target.rank_in_category = source.rank_in_category
    
    WHEN MATCHED THEN
        UPDATE SET
            poi_name = source.poi_name,
            poi_address = source.poi_address,
            poi_lat = source.poi_lat,
            poi_lng = source.poi_lng,
            distance_km = source.distance_km,
            distance_walking_min = source.distance_walking_min,
            distance_driving_min = source.distance_driving_min,
            poi_attributes = source.poi_attributes,
            data_quality_score = source.data_quality_score,
            upd_dt = source.upd_dt,
            upd_process_id = source.upd_process_id,
            del_flag = source.del_flag
    
    WHEN NOT MATCHED THEN
        INSERT (
            poi_id, property_id, category_id, poi_name, poi_address,
            poi_lat, poi_lng, distance_km, distance_walking_min, distance_driving_min,
            rank_in_category, poi_attributes, data_source, data_quality_score,
            ins_dt, ins_process_id, upd_dt, upd_process_id, del_flag
        )
        VALUES (
            source.poi_id, source.property_id, source.category_id, source.poi_name, source.poi_address,
            source.poi_lat, source.poi_lng, source.distance_km, source.distance_walking_min, source.distance_driving_min,
            source.rank_in_category, source.poi_attributes, source.data_source, source.data_quality_score,
            source.ins_dt, source.ins_process_id, source.upd_dt, source.upd_process_id, source.del_flag
        )
    """
    
    spark.sql(merge_sql)
    
    # The count printed is the number of source records, not rows changed
    print(f"✅ Successfully processed {len(poi_results)} POI records using MERGE")
    
else:
    print("❌ No POI data found")

In [None]:
# Generate detailed statistics
print("=" * 60)
print("PROCESSING STATISTICS")
print("=" * 60)

# All queries filter on upd_process_id rather than ins_process_id: the MERGE
# sets upd_process_id on both inserted and updated rows, whereas ins_process_id
# keeps the id of the run that first created the row, so rows refreshed by this
# run would otherwise be missing from its statistics.

# POI Statistics by category
stats_query = f"""
SELECT 
    category_id,
    COUNT(*) as poi_count,
    COUNT(DISTINCT property_id) as properties_count,
    ROUND(AVG(distance_km), 3) as avg_distance_km,
    ROUND(MIN(distance_km), 3) as min_distance_km,
    ROUND(MAX(distance_km), 3) as max_distance_km
FROM realitky.cleaned.property_poi
WHERE upd_process_id = '{process_id}'
GROUP BY category_id
ORDER BY poi_count DESC
"""

print("📊 POI Statistics by Category:")
spark.sql(stats_query).show()

# Property coverage statistics: per property, how many categories and POI
# this run produced
coverage_query = f"""
SELECT 
    p.property_id,
    COUNT(DISTINCT poi.category_id) as categories_covered,
    COUNT(poi.poi_id) as total_poi_found,
    ROUND(AVG(poi.distance_km), 3) as avg_distance_km
FROM (
    SELECT DISTINCT property_id 
    FROM realitky.cleaned.property_poi 
    WHERE upd_process_id = '{process_id}'
) p
LEFT JOIN realitky.cleaned.property_poi poi ON p.property_id = poi.property_id AND poi.upd_process_id = '{process_id}'
GROUP BY p.property_id
ORDER BY total_poi_found DESC
"""

print("🏠 Property Coverage (Top 10):")
spark.sql(coverage_query).limit(10).show()

# Overall summary
summary_query = f"""
SELECT 
    COUNT(DISTINCT property_id) as properties_processed,
    COUNT(*) as total_poi_found,
    COUNT(DISTINCT category_id) as categories_found,
    ROUND(AVG(distance_km), 3) as avg_distance_km,
    ROUND(AVG(data_quality_score), 3) as avg_quality_score
FROM realitky.cleaned.property_poi
WHERE upd_process_id = '{process_id}'
"""

print("📈 Overall Summary:")
spark.sql(summary_query).show()

In [None]:
# Data quality and validation checks
print("=" * 60)
print("DATA QUALITY VALIDATION")
print("=" * 60)

# Both queries filter on upd_process_id rather than ins_process_id: the MERGE
# sets upd_process_id on inserted and updated rows alike, while ins_process_id
# stays with the run that first created the row, so rows refreshed by this run
# would otherwise escape validation.

# Check for missing data
missing_data_query = f"""
SELECT 
    SUM(CASE WHEN poi_name IS NULL OR poi_name = '' THEN 1 ELSE 0 END) as missing_names,
    SUM(CASE WHEN poi_address IS NULL OR poi_address = '' THEN 1 ELSE 0 END) as missing_addresses,
    SUM(CASE WHEN poi_lat IS NULL THEN 1 ELSE 0 END) as missing_lat,
    SUM(CASE WHEN poi_lng IS NULL THEN 1 ELSE 0 END) as missing_lng,
    SUM(CASE WHEN distance_km IS NULL OR distance_km < 0 THEN 1 ELSE 0 END) as invalid_distances,
    COUNT(*) as total_records
FROM realitky.cleaned.property_poi
WHERE upd_process_id = '{process_id}'
"""

print("🔍 Data Quality Checks:")
spark.sql(missing_data_query).show()

# Distance distribution: POI counts and share per distance bucket,
# ordered from the nearest bucket to the farthest
distance_distribution_query = f"""
SELECT 
    CASE 
        WHEN distance_km <= 0.5 THEN '0-0.5km'
        WHEN distance_km <= 1.0 THEN '0.5-1km'
        WHEN distance_km <= 2.0 THEN '1-2km'
        WHEN distance_km <= 5.0 THEN '2-5km'
        ELSE '5km+'
    END as distance_range,
    COUNT(*) as poi_count,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
FROM realitky.cleaned.property_poi
WHERE upd_process_id = '{process_id}'
GROUP BY 
    CASE 
        WHEN distance_km <= 0.5 THEN '0-0.5km'
        WHEN distance_km <= 1.0 THEN '0.5-1km'
        WHEN distance_km <= 2.0 THEN '1-2km'
        WHEN distance_km <= 5.0 THEN '2-5km'
        ELSE '5km+'
    END
ORDER BY 
    CASE 
        WHEN distance_range = '0-0.5km' THEN 1
        WHEN distance_range = '0.5-1km' THEN 2
        WHEN distance_range = '1-2km' THEN 3
        WHEN distance_range = '2-5km' THEN 4
        ELSE 5
    END
"""

print("📏 Distance Distribution:")
spark.sql(distance_distribution_query).show()

In [None]:
# Final summary and next steps
processed_count = len(properties_list)
poi_count = len(poi_results) if poi_results else 0
elapsed_seconds = end_time - start_time

# An empty batch must not crash the report with a division by zero
avg_seconds_per_property = elapsed_seconds / processed_count if processed_count else 0.0

# Requests are sent whether or not any POI came back, and each logical category
# fans out into one request per mapped Geoapify category. The same figure is
# used in the printed summary and in the log record.
requests_per_property = len([
    geoapify_category
    for category_code in categories
    for geoapify_category in enricher.poi_category_mapping.get(category_code, [category_code])
])
api_calls_estimated = processed_count * requests_per_property

processing_summary = f"""
=== POI ENRICHMENT COMPLETED ===

✅ **Results:**
- Properties processed: {processed_count}
- POI records created: {poi_count}
- Process ID: {process_id}
- Test mode: {test_mode}

🔄 **Next Steps:**
1. Update property_poi_summary table
2. Calculate location scores
3. Verify data quality
4. Scale to full dataset (if test_mode was used)

⚠️ **Important Notes:**
- This {'was a limited test run' if test_mode else 'processed the current batch'}
- For full dataset, implement proper batching
- Monitor API rate limits
- Consider caching results for nearby properties

💰 **API Usage:**
- Estimated calls made: {api_calls_estimated}
- Geoapify free tier: 3,000 requests/day
- Consider upgrading for production use

🎯 **Performance:**
- Average processing time per property: {avg_seconds_per_property:.2f} seconds
- Total batch processing time: {elapsed_seconds:.2f} seconds

📋 **Data Quality:**
- All POI records include distance calculations
- Coordinates validated within Czech Republic bounds
- API responses parsed with error handling
"""

print(processing_summary)

# Processing log record for this run.
# NOTE(review): the record is only assembled here and is not written anywhere;
# persist it if a log table exists.
log_data = {
    'process_id': process_id,
    'processing_date': datetime.now(),
    'properties_processed': processed_count,
    'poi_records_created': poi_count,
    'categories_processed': len(categories),
    'test_mode': test_mode,
    'processing_time_seconds': elapsed_seconds,
    'api_calls_estimated': api_calls_estimated,
    'batch_size': batch_size,
    'radius_meters': radius_meters
}

print("📝 Processing completed successfully!")