### Import Libraries

In [None]:
import datetime as dt
import logging
import os
import re
import time
from datetime import datetime

import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim, GoogleV3

### Read & Load Data

In [None]:
# Load the raw rental listings into the working DataFrame
rental_csv_path = "Rental_data.csv"
df = pd.read_csv(rental_csv_path)

### Extracting Full Address, Latitude, Longitude

In [None]:
# Expand abbreviated street-name words in "address" before geocoding.
# The \b anchors restrict each replacement to whole-word matches.
abbreviation_expansions = {
    r'\bLor\b': 'Lorong',
    r'\bNth\b': 'North',
    r'\bBt\b': 'Bukit',
    r'\bUpp\b': 'Upper',
    r'\bTg\b': 'Tanjong',
}
for pattern, expansion in abbreviation_expansions.items():
    df['address'] = df['address'].str.replace(pattern, expansion, regex=True)

In [None]:
# Record the wall-clock start so the geocoding pass can be timed
start_time = time.time()

# Nominatim client used for the first geocoding pass
nominatim_options = {"user_agent": "james", "timeout": 10}
geolocator = Nominatim(**nominatim_options)

# Function to get full address, latitude, longitude
def geocode_and_get_details(address):
    """Geocode one address with the module-level Nominatim ``geolocator``.

    ",Singapore" is appended to the query before it is sent.

    Args:
        address: Street address text. Non-string values (None, NaN) are
            treated as "not found" instead of raising on concatenation.

    Returns:
        A ``(full_address, latitude, longitude)`` tuple, or
        ``(None, None, None)`` when the address is missing, the lookup
        raises, or the geocoder returns no match.
    """
    not_found = (None, None, None)

    # Missing values reach this function from the fallback pass; "+" on
    # None/NaN would otherwise raise TypeError and abort the whole apply.
    if not isinstance(address, str):
        logging.warning(f"Failed to geocode: {address}")
        return not_found

    try:
        location = geolocator.geocode(address + ",Singapore")
    except Exception as exc:
        # Best-effort, matching geocode_with_google: one failed request
        # must not discard the results already collected for other rows.
        logging.warning(f"Failed to geocode: {address} ({exc})")
        return not_found

    if not location:
        logging.warning(f"Failed to geocode: {address}")
        return not_found

    lat, lon = location.latitude, location.longitude
    print(f"Geocoded: {address} -> Latitude: {lat}, Longitude: {lon}")
    return location.address, lat, lon

# Geocode every listing address and spread the result tuples over three
# columns. The tuples are collected in a list first because unpacking
# zip(*results) raises ValueError when the DataFrame has no rows.
geocoded_rows = [geocode_and_get_details(addr) for addr in df['address']]
df['full_address'] = [result[0] for result in geocoded_rows]
df['latitude'] = [result[1] for result in geocoded_rows]
df['longitude'] = [result[2] for result in geocoded_rows]

# End the timer
end_time = time.time()
elapsed_time = end_time - start_time

In [None]:
# Report how long the first geocoding pass took
timing_message = f"Geocoding process took {elapsed_time:.2f} seconds"
print(timing_message)

In [None]:
# Initialize Google Geocoder.
# The API key is read from the environment so that no credential is stored
# in the notebook. NOTE(review): a key was previously committed on this
# line; treat it as compromised and rotate it.
api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running "
        "the Google geocoding step."
    )
google_geolocator = GoogleV3(api_key=api_key)

# Geocode a single address through the Google Geocoding API
def geocode_with_google(address):
    """Look up ``address`` with the module-level ``google_geolocator``.

    ", Singapore" is appended to the query. Any exception raised during the
    lookup is printed and swallowed.

    Returns:
        ``(full_address, latitude, longitude)`` on success, otherwise
        ``(None, None, None)``.
    """
    no_result = (None, None, None)
    try:
        location_g = google_geolocator.geocode(address + ", Singapore")
        if not location_g:
            print(f"Google Geocoding API couldn't find a result for: {address}")
            return no_result
        lat = location_g.latitude
        lon = location_g.longitude
        full_address = location_g.address
        print(f"Geocoded (Google): {address} -> Latitude: {lat}, Longitude: {lon}")
        return full_address, lat, lon
    except Exception as e:
        print(f"Error during Google Geocoding: {str(e)}")
        return no_result

# Rows the first pass could not resolve (no full_address yet)
condition = df['full_address'].isnull()

# Work on an explicit copy: adding columns to a slice of df triggers
# SettingWithCopyWarning and may not behave as intended.
remaining_df = df.loc[condition].copy()

# Apply the Google Geocoding function to the remaining rows. Results are
# collected in a list because unpacking zip(*results) raises ValueError
# when there are no remaining rows.
google_results = [geocode_with_google(addr) for addr in remaining_df['address']]
remaining_df['full_address_google'] = pd.Series(
    [result[0] for result in google_results], index=remaining_df.index, dtype=object
)
remaining_df['latitude_google'] = pd.Series(
    [result[1] for result in google_results], index=remaining_df.index, dtype=float
)
remaining_df['longitude_google'] = pd.Series(
    [result[2] for result in google_results], index=remaining_df.index, dtype=float
)

# Copy the Google coordinates back into df for exactly those rows
if condition.any():
    df.loc[condition, 'latitude'] = remaining_df['latitude_google'].values
    df.loc[condition, 'longitude'] = remaining_df['longitude_google'].values

In [None]:
# Keep only the first comma-separated component of the Google address;
# everything after the first comma (postal code included) is dropped.
remaining_df['full_address_google'] = remaining_df['full_address_google'].str.split(',').str[0].str.strip()

# Re-geocode the shortened name with geocode_and_get_details. Rows where
# Google found nothing hold a missing value rather than a string; they are
# skipped here instead of being sent to the geocoder.
geocoded_results = remaining_df['full_address_google'].apply(
    lambda name: geocode_and_get_details(name) if isinstance(name, str) else (None, None, None)
)

# Extract the full address part from the result tuples
geocoded_addresses = geocoded_results.apply(lambda result: result[0])

# Update df for the unresolved rows (the assignment aligns on the index)
df.loc[condition, 'full_address'] = geocoded_addresses

In [None]:
# Lookup table of geocoded addresses for listings still unresolved
address_list_path = 'address_list.csv'
property_full_address = pd.read_csv(address_list_path)

# Define a custom function to replace missing & placeholder addresses with values from another DataFrame
def replace_full_address(row, lookup=None):
    """Return a better ``full_address`` for ``row`` when the current one is unusable.

    The current value is unusable when it is missing (None or NaN) or when it
    equals, case-insensitively, one of the two placeholder results listed
    below.

    Args:
        row: Mapping/Series with ``'address'`` and ``'full_address'``.
        lookup: Optional DataFrame with ``'address'`` and
            ``'geocoded_address'`` columns. Defaults to the module-level
            ``property_full_address`` table.

    Returns:
        The ``geocoded_address`` of the first lookup row whose ``address``
        matches, or the original ``full_address`` when the value is usable or
        no match exists.
    """
    placeholders = ('singapore', 'singapore, central, 178957, singapore')
    current = row['full_address']

    # pd.isna covers both None and NaN; the missing value can be either
    # depending on how the column was last assigned, and NaN has no .lower().
    needs_fallback = pd.isna(current) or str(current).lower() in placeholders
    if needs_fallback:
        table = property_full_address if lookup is None else lookup
        matching_row = table[table['address'] == row['address']]
        if not matching_row.empty:
            return matching_row.iloc[0]['geocoded_address']
    return current

# Run the row-wise fallback lookup and store the result in 'full_address'
resolved_addresses = df.apply(replace_full_address, axis=1)
df['full_address'] = resolved_addresses

In [None]:
# Geocoding for address with 'Woodlands Drive'

# Define a function to update full_address
def update_full_address(row):
    """Synthesize a ``full_address`` for unresolved 'Woodlands Drive' rows.

    Args:
        row: Mapping/Series with ``'address'`` and ``'full_address'``.

    Returns:
        ``address`` plus a fixed Woodlands suffix when the address contains
        'Woodlands Drive' and ``full_address`` is missing (None or NaN);
        otherwise the existing ``full_address`` unchanged.
    """
    address = row['address']
    # "== None" is False for NaN, so NaN rows were silently skipped; the
    # isinstance guard keeps a non-string address from raising on "in".
    if isinstance(address, str) and 'Woodlands Drive' in address and pd.isna(row['full_address']):
        # NOTE(review): the same postal code is used for every matching row - confirm.
        return address + ', Woodlands, Northwest, 730888, Singapore'
    return row['full_address']

# Fill in the 'Woodlands Drive' rows that are still unresolved
woodlands_filled = df.apply(update_full_address, axis=1)
df['full_address'] = woodlands_filled

In [None]:
# Normalise the country suffix: drop every ', '-separated part equal to
# 'Singapore', then append a single ', Singapore' to every row.
# Values are passed through str(), so a missing address becomes its text form.
def _without_country(value):
    pieces = str(value).split(', ')
    kept = [str(piece).strip() for piece in pieces if piece != 'Singapore']
    return ', '.join(kept)

df['full_address'] = df['full_address'].apply(_without_country) + ', Singapore'

### Extracting Planning Areas

In [None]:
# Reference table used to derive a planning area from a subzone name
subzone_planning_area = pd.read_csv('planning_area_subzone_list.csv')

# Lookup: subzone name -> planning area (a later duplicate subzone wins)
subzone_to_planning_area = {
    subzone: area
    for subzone, area in zip(
        subzone_planning_area['Subzone'], subzone_planning_area['Planning_area']
    )
}

In [None]:
# List of planning-area names. Address parts are compared against these
# names by exact match when the planning area is extracted.
planning_areas = [
    "Ang Mo Kio","Bedok","Bishan","Boon Lay","Bukit Batok","Bukit Merah","Bukit Panjang","Bukit Timah","Central Water Catchment",
    "Changi","Changi Bay","Choa Chu Kang","Clementi","Downtown Core","Geylang","Hougang","Jurong East","Jurong West","Kallang",
    "Lim Chu Kang","Mandai","Marina East","Marina South","Marine Parade","Museum","Newton","North-Eastern Islands","Novena",
    "Orchard","Outram","Pasir Ris","Paya Lebar","Pioneer","Punggol","Queenstown","River Valley","Rochor","Seletar","Sembawang",
    "Seng Kang","Serangoon","Simpang","Singapore River","Southern Islands","Straits View","Sungei Kadut","Tampines","Tanglin",
    "Tengah","Toa Payoh","Tuas","Western Islands","Western Water Catchment","Woodlands","Yishun"
]


# Initialise the planning_area column with empty strings
df['planning_area'] = ""

# Derive the planning area from a comma-separated full address
def extract_planning_area_and_region(address):
    """Return the planning area named in ``address``, or "" if none is found.

    Despite its name, this function returns only the planning area. The
    address is split on "," and each part is stripped. The first part that
    appears in ``planning_areas`` wins. If no part matches, the first part
    that is a key of ``subzone_to_planning_area`` is translated to its
    planning area instead.
    """
    parts = [piece.strip() for piece in address.split(',')]

    # First preference: a part that is itself a planning-area name
    direct_match = next((piece for piece in parts if piece in planning_areas), "")
    if direct_match:
        return direct_match

    # Fallback: a part that is a known subzone
    return next(
        (subzone_to_planning_area[piece] for piece in parts if piece in subzone_to_planning_area),
        "",
    )

# Populate 'planning_area' from each row's full address
planning_area_values = df['full_address'].apply(extract_planning_area_and_region)
df['planning_area'] = planning_area_values

### Extracting Region

In [None]:
# Planning areas grouped by region; inverted below into the
# planning area -> region lookup.
_areas_by_region = {
    'Central': [
        'Bishan', 'Bukit Merah', 'Bukit Timah', 'Downtown Core', 'Geylang',
        'Kallang', 'Marina East', 'Marina South', 'Marine Parade', 'Museum',
        'Newton', 'Novena', 'Orchard', 'Outram', 'Queenstown', 'River Valley',
        'Rochor', 'Singapore River', 'Southern Islands', 'Straits View',
        'Tanglin', 'Toa Payoh',
    ],
    'East': [
        'Bedok', 'Changi', 'Changi Bay', 'Pasir Ris', 'Paya Lebar', 'Tampines',
    ],
    'North': [
        'Central Water Catchment', 'Lim Chu Kang', 'Mandai', 'Sembawang',
        'Simpang', 'Sungei Kadut', 'Woodlands', 'Yishun',
    ],
    'North-East': [
        'Ang Mo Kio', 'Hougang', 'North-Eastern Islands', 'Punggol', 'Seletar',
        'Seng Kang', 'Serangoon',
    ],
    'West': [
        'Boon Lay', 'Bukit Batok', 'Bukit Panjang', 'Choa Chu Kang', 'Clementi',
        'Jurong East', 'Jurong West', 'Pioneer', 'Tengah', 'Tuas',
        'Western Islands', 'Western Water Catchment',
    ],
}

# Dictionary mapping planning areas to regions
planning_area_to_region = {
    area: region
    for region, areas in _areas_by_region.items()
    for area in areas
}
# Look up each row's region from its planning area (unmapped values become NaN)
region_values = df['planning_area'].map(planning_area_to_region)
df['region'] = region_values

### Extracting Nearest MRT Station, Distance & Time to the nearest station

In [None]:
# Station table; later cells read its STN_NAME, latitude and longitude columns
mrt_csv_path = 'MRT Stations.csv'
mrt = pd.read_csv(mrt_csv_path)

In [None]:
# Great-circle (Haversine) distance between two coordinate pairs
def haversine(lat1, lon1, lat2, lon2):
    """Return the Haversine distance in kilometres between two points.

    Inputs are latitudes/longitudes in degrees and are converted with
    ``np.radians``. An Earth radius of 6371.0 km is used.
    """
    earth_radius_km = 6371.0

    # Degrees -> radians for all four inputs
    phi1, lam1, phi2, lam2 = (np.radians(value) for value in (lat1, lon1, lat2, lon2))

    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1

    # Haversine formula
    half_chord_sq = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam / 2)**2
    central_angle = 2 * np.arctan2(np.sqrt(half_chord_sq), np.sqrt(1 - half_chord_sq))

    return earth_radius_km * central_angle

# Function to find the nearest MRT station and its Haversine distance
def find_nearest_mrt(row):
    """Return ``(station_name, distance_km)`` for the station closest to ``row``.

    Reads ``row['latitude']`` / ``row['longitude']`` and the module-level
    ``mrt`` table (columns ``STN_NAME``, ``latitude``, ``longitude``).
    Distances come from ``haversine``, evaluated once against all stations
    as arrays instead of once per station in a Python loop.

    Returns:
        The name and distance of the first station with the smallest
        distance. ``('', inf)`` is returned when no distance can be computed,
        i.e. the row's coordinates are NaN or ``mrt`` has no usable rows.
    """
    station_lats = mrt['latitude'].to_numpy(dtype=float)
    station_lons = mrt['longitude'].to_numpy(dtype=float)

    distances = np.atleast_1d(
        haversine(row['latitude'], row['longitude'], station_lats, station_lons)
    )

    # NaN distances never win the comparison; with none left, report "no station".
    if distances.size == 0 or np.isnan(distances).all():
        return '', float('inf')

    # nanargmin ignores NaN entries and returns the first minimum on ties.
    nearest_idx = int(np.nanargmin(distances))
    return mrt['STN_NAME'].iloc[nearest_idx], distances[nearest_idx]

# Attach the closest station and its Haversine distance to every listing
nearest_mrt_info = df.apply(find_nearest_mrt, axis=1, result_type='expand')
df[['Nearest MRT Station', 'Distance to Nearest MRT']] = nearest_mrt_info

In [None]:
# Walking pace in km/h used for the time estimate
walking_speed_kmph = 5

# Estimate minutes on foot from a listing to its nearest MRT station
def calculate_walking_time(row):
    """Return the walking time in minutes for ``row['Distance to Nearest MRT']``.

    The distance (kilometres) is divided by ``walking_speed_kmph`` to get
    hours, then multiplied by 60.
    """
    minutes_per_hour = 60
    return row['Distance to Nearest MRT'] / walking_speed_kmph * minutes_per_hour

# Add the walking-time column, computed row by row
walking_minutes = df.apply(calculate_walking_time, axis=1)
df['Walking Time to Nearest MRT (minutes)'] = walking_minutes

In [None]:
# Station coordinates to attach, reduced to one row per station name.
# If STN_NAME repeats in the stations table, the left join would emit one
# output row per match and silently duplicate listings.
# NOTE(review): the first occurrence is kept - confirm this is the intended row.
mrt_coords = mrt[['STN_NAME', 'latitude', 'longitude']].drop_duplicates(subset='STN_NAME')

# Join on the "Nearest MRT Station" column. validate asserts that the right
# side is unique per key. Both frames have 'latitude'/'longitude', so pandas
# suffixes them _x (listing) and _y (station).
df = df.merge(
    mrt_coords,
    left_on='Nearest MRT Station',
    right_on='STN_NAME',
    how='left',
    validate='many_to_one',
)

# Drop the redundant join key and the two columns not kept in the output
df.drop(columns=['STN_NAME','nearest_mrt','time_to_nearest_mrt'], inplace=True)

### Final Cleanup 

In [None]:
# Old column name -> final column name
column_mapping = {
    'latitude_x': 'latitude',
    'longitude_x': 'longitude',
    'latitude_y': 'latitude_mrt',
    'longitude_y': 'longitude_mrt',
    'Distance to Nearest MRT': 'Distance_to_Nearest_MRT_km',
    'Walking Time to Nearest MRT (minutes)': 'Walking_Time_to_Nearest_MRT_min',
    'room_size': 'room_size_sqft',
    'Nearest MRT Station': 'Nearest_MRT_Station',
}

# Columns of the final dataset, in output order
desired_order = [
    'address', 'full_address', 'latitude', 'longitude',
    'planning_area', 'region',
    'Nearest_MRT_Station', 'latitude_mrt', 'longitude_mrt',
    'Distance_to_Nearest_MRT_km', 'Walking_Time_to_Nearest_MRT_min',
    'price', 'unit_type', 'room_type', 'room_size_sqft',
    'status', 'updated_date', 'link',
]

# Rename, then keep only the listed columns in that order
df = df.rename(columns=column_mapping)[desired_order]

In [None]:
# Write the final dataset to CSV without the index column
output_csv_path = 'Rental_data_final.csv'
df.to_csv(output_csv_path, index=False)