In [None]:
!pip install pandas
!pip install geopy

In [None]:
import pandas as pd
import ssl
import certifi
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# TLS context that verifies certificates against certifi's CA bundle; it is
# passed to the geocoder as `ssl_context` further down.
ctx = ssl.create_default_context(cafile=certifi.where())

print("--- Step 1: Loading Data ---")
df = pd.read_csv('muaban_net.csv')

# Drop unnecessary fields and rename. errors='ignore' tolerates input files in
# which some of these columns are already absent.
fields_to_drop = [
    'area_unit', 'dimension_2', 'raw_n_bedrooms', 'raw_n_bathrooms',
    'scraper', 'raw_price', 'balcony_direction', 'facing_direction',
]
df = df.drop(columns=fields_to_drop, errors='ignore')
df = df.rename(columns={'legal_docs': 'legal'})

# Convert float columns to integer. Values that cannot be parsed as numbers
# (and missing values) become 0, so a 0 in the output may mean "unknown".
for col in ['n_bedrooms', 'n_bathrooms', 'n_floors']:
    if col not in df.columns:
        # Same tolerance as the drop above: a missing column is reported, not fatal.
        print(f"Warning: column '{col}' not found; skipping integer conversion")
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

print("--- Step 2: Geocoding (1 request per second) ---")
geolocator = Nominatim(user_agent="vn_real_estate_project", ssl_context=ctx, timeout=10)
# Wait at least 1.1 s between calls so we stay within the 1 request/second
# budget announced above.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.1)

# Placeholder stored for addresses that could not be resolved.
NO_COORDS = (None, None)


def _candidate_queries(address):
    """Return the geocoding queries to try for *address*, most specific first.

    The first query is the address itself. When the address has more than two
    comma-separated parts, a second, coarser query made of the last two parts
    (expected to be district and city) is added. With two parts or fewer the
    coarse query would just repeat the first one, so it is left out to avoid
    spending a rate-limited request on it. A blank address yields no queries.
    """
    parts = [part.strip() for part in address.split(',')]
    parts = [part for part in parts if part]
    if not parts:
        return []
    queries = [address]
    if len(parts) > 2:
        queries.append(", ".join(parts[-2:]))
    return queries


def _lookup_coords(address):
    """Return (latitude, longitude) for *address*, or NO_COORDS if nothing matches."""
    for query in _candidate_queries(address):
        location = geocode(query)
        if location:
            return (location.latitude, location.longitude)
    return NO_COORDS


# Rows without an address are excluded: a NaN would otherwise be handed to the
# geocoder as if it were a real query. Those rows end up with empty coordinates.
unique_addresses = df['address'].dropna().unique()
total_addr = len(unique_addresses)
address_coords_map = {}

print(f"Total unique addresses to process: {total_addr}")

for index, addr in enumerate(unique_addresses):
    # Print progress every 5 addresses
    if index % 5 == 0:
        print(f"Progress: {index}/{total_addr} addresses searched...")

    try:
        address_coords_map[addr] = _lookup_coords(addr)
    except Exception as exc:
        # Best effort: one bad address must not abort a run that takes a long
        # time, but the failure is reported instead of being dropped silently.
        print(f"Warning: could not geocode {addr!r}: {exc}")
        address_coords_map[addr] = NO_COORDS

# Map coordinates back to the original dataframe. `.get` with a default keeps
# rows whose address was never looked up (e.g. missing values) from raising.
df['latitude'] = df['address'].map(lambda a: address_coords_map.get(a, NO_COORDS)[0])
df['longitude'] = df['address'].map(lambda a: address_coords_map.get(a, NO_COORDS)[1])

# Write the cleaned, geocoded table next to the input file. The 'utf-8-sig'
# codec prepends a byte-order mark (presumably so spreadsheet tools detect
# UTF-8 — confirm with whoever consumes the file); the row index is not written.
print("--- Step 3: Saving Result ---")
output_path = 'muaban_net_preprocessed.csv'
df.to_csv(output_path, encoding='utf-8-sig', index=False)
print(f"Done! Processed file: {output_path}")

--- Step 1: Loading Data ---
--- Step 2: Geocoding (1 request per second) ---
Total unique addresses to process: 1620
Progress: 0/1620 addresses searched...
Progress: 5/1620 addresses searched...
Progress: 10/1620 addresses searched...
Progress: 15/1620 addresses searched...
Progress: 20/1620 addresses searched...
Progress: 25/1620 addresses searched...
Progress: 30/1620 addresses searched...
Progress: 35/1620 addresses searched...
Progress: 40/1620 addresses searched...
Progress: 45/1620 addresses searched...
Progress: 50/1620 addresses searched...
Progress: 55/1620 addresses searched...
Progress: 60/1620 addresses searched...
Progress: 65/1620 addresses searched...
Progress: 70/1620 addresses searched...
Progress: 75/1620 addresses searched...
Progress: 80/1620 addresses searched...
Progress: 85/1620 addresses searched...
Progress: 90/1620 addresses searched...
Progress: 95/1620 addresses searched...
Progress: 100/1620 addresses searched...
Progress: 105/1620 addresses searched...
Pr