In [1]:
!pip3 install pandas
!pip3 install geopy

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [None]:
import pandas as pd
import ssl
import certifi
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

import os

# TLS context that trusts certifi's CA bundle; it is handed to the geocoder later on.
ctx = ssl.create_default_context(cafile=certifi.where())

# Data folders are resolved relative to the parent of the current working directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
RAW_DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'raw')
PREPROCESSED_DIR = os.path.join(PROJECT_ROOT, 'data', 'preprocessed')

print("Loading Data...")
raw_csv_path = os.path.join(RAW_DATA_DIR, 'muaban_net.csv')
df = pd.read_csv(raw_csv_path)

print("Data Loaded!")

# Columns that are not carried into the preprocessed output.
fields_to_drop = [
    'area_unit', 'dimension_2', 'raw_n_bedrooms', 'raw_n_bathrooms',
    'scraper', 'raw_price', 'balcony_direction', 'facing_direction', 'raw_area',
]
# Only drop the ones actually present so a missing column does not raise.
present_fields = [name for name in fields_to_drop if name in df.columns]
df.drop(columns=present_fields, inplace=True)

# Rename both columns in a single pass.
df.rename(
    columns={'legal_docs': 'legal', 'front_width': 'front_road_width'},
    inplace=True,
)


# Room/floor counts: unparseable or missing values become 0, then cast to int.
count_columns = ('n_bedrooms', 'n_bathrooms', 'n_floors')
for count_col in count_columns:
    as_number = pd.to_numeric(df[count_col], errors='coerce')
    df[count_col] = as_number.fillna(0).astype(int)

print("Geocoding (1 request per second)...")
# The rate limiter spaces calls 1.1 s apart, i.e. slightly under one request per second.
geolocator = Nominatim(user_agent="vn_real_estate_project", ssl_context=ctx, timeout=10)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.1)

NO_COORDS = (None, None)
PROGRESS_EVERY = 50  # print a progress line every N unique addresses

unique_addresses = df['address'].unique()
total_addr = len(unique_addresses)
address_coords_map = {}
# District/city fallbacks are shared by many street addresses; remember each answer
# so the same fallback query is sent to the service only once.
fallback_coords_cache = {}
failed_addresses = []


def _lookup_coords(query):
    """Geocode `query` and return (latitude, longitude), or (None, None) if no match."""
    location = geocode(query)
    if location:
        return (location.latitude, location.longitude)
    return NO_COORDS


def _coords_for_address(addr):
    """Resolve one address, falling back to its last two comma-separated parts.

    Returns (None, None) when neither the full address nor the fallback matches.
    """
    coords = _lookup_coords(addr)
    if coords != NO_COORDS:
        return coords

    # Fallback: if the street address fails, try geocoding just the district and city.
    parts = [part.strip() for part in addr.split(',')]
    if len(parts) <= 2:
        # The fallback would be the same query that just failed; skip the extra request.
        return NO_COORDS
    short_addr = ", ".join(parts[-2:])
    if short_addr not in fallback_coords_cache:
        fallback_coords_cache[short_addr] = _lookup_coords(short_addr)
    return fallback_coords_cache[short_addr]


for index, addr in enumerate(unique_addresses, start=1):
    if not isinstance(addr, str) or not addr.strip():
        # Missing (NaN) or blank addresses are not sent to the geocoder: a NaN would
        # most likely be submitted as the text "nan" and could match a real place.
        address_coords_map[addr] = NO_COORDS
    else:
        try:
            address_coords_map[addr] = _coords_for_address(addr)
        except Exception as exc:
            # Best effort: keep going, but remember what failed instead of hiding it.
            failed_addresses.append((addr, repr(exc)))
            address_coords_map[addr] = NO_COORDS

    if index % PROGRESS_EVERY == 0 or index == total_addr:
        print(f"  processed {index}/{total_addr} unique addresses")

if failed_addresses:
    first_addr, first_error = failed_addresses[0]
    print(
        f"Warning: geocoding raised an error for {len(failed_addresses)} address(es); "
        f"first: {first_addr!r} -> {first_error}"
    )

# Map coordinates back to the original dataframe.
# .get() with a default is used because a NaN address never compares equal to itself,
# so a plain dict[...] lookup on it can raise KeyError.
missing_coords = (None, None)
df['latitude'] = df['address'].map(lambda x: address_coords_map.get(x, missing_coords)[0])
df['longitude'] = df['address'].map(lambda x: address_coords_map.get(x, missing_coords)[1])

print("--- Step 3: Saving Result ---")
# Write under the project's preprocessed-data folder (the input is read relative to
# PROJECT_ROOT too) rather than a path relative to the current working directory,
# and create the folder first so the save cannot fail after the long geocoding run.
os.makedirs(PREPROCESSED_DIR, exist_ok=True)
output_path = os.path.join(PREPROCESSED_DIR, 'muabannet_preprocessed.csv')
df.to_csv(output_path, index=False, encoding='utf-8-sig')
print("Done!")

Loading Data...
Data Loaded
Geocoding (1 request per second)...
--- Step 3: Saving Result ---
Done!
