# HDB Rental Data Preprocessing

This notebook handles the preprocessing of HDB rental data for our prediction model.  
It cleans the data, handles missing values, performs feature engineering, and prepares  
the data for modeling.

In [46]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import os
import pandas as pd
import pickle

In [12]:
# Basic settings
plt.style.use('seaborn-v0_8-whitegrid')
sns.set(font_scale=1.2)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# 1. Load Data

In [13]:
# Assuming the raw data is stored in the data/raw directory
data_path = '../data/raw/RentingOutofFlats2025.csv'

# Check if the file exists
if not os.path.exists(data_path):
    print(f"Error: {data_path} does not exist.")
    print("Please ensure you have downloaded the HDB rental data and placed it in the correct directory.")
else:
    # Load the data
    df = pd.read_csv(data_path)
    
    # Display basic information about the dataset
    print("Dataset loaded successfully.")
    print(f"Shape: {df.shape}")
    print("\nFirst 5 rows:")
    display(df.head())
    
    print("\nData types:")
    display(df.dtypes)
    
    print("\nSummary statistics:")
    display(df.describe(include='all').T)

Dataset loaded successfully.
Shape: (155464, 6)

First 5 rows:


Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,monthly_rent
0,2021-01,ANG MO KIO,105,ANG MO KIO AVE 4,4-ROOM,2000
1,2021-01,ANG MO KIO,107,ANG MO KIO AVE 4,3-ROOM,1750
2,2021-01,ANG MO KIO,108,ANG MO KIO AVE 4,3-ROOM,1750
3,2021-01,ANG MO KIO,111,ANG MO KIO AVE 4,5-ROOM,2230
4,2021-01,ANG MO KIO,111,ANG MO KIO AVE 4,5-ROOM,2450



Data types:


rent_approval_date    object
town                  object
block                 object
street_name           object
flat_type             object
monthly_rent           int64
dtype: object


Summary statistics:


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
rent_approval_date,155464.0,51.0,2021-03,3826.0,,,,,,,
town,155464.0,27.0,JURONG WEST,10829.0,,,,,,,
block,155464.0,2769.0,2,941.0,,,,,,,
street_name,155464.0,595.0,ANG MO KIO AVE 3,2071.0,,,,,,,
flat_type,155464.0,6.0,4-ROOM,56053.0,,,,,,,
monthly_rent,155464.0,,,,2721.46,750.98,300.0,2100.0,2700.0,3250.0,7600.0


# 2. Data Cleaning

In [37]:
df_clean = df.copy()

# 2.1 Convert 'rent_approval_date' to datetime
if 'rent_approval_date' in df_clean.columns:
    df_clean['rent_approval_date'] = pd.to_datetime(df_clean['rent_approval_date'])
    
    # Extract year and month as separate features
    df_clean['approval_year'] = df_clean['rent_approval_date'].dt.year
    df_clean['approval_month'] = df_clean['rent_approval_date'].dt.month
    
    print("\nDate conversion complete. New date-related columns:")
    display(df_clean[['rent_approval_date', 'approval_year', 'approval_month']].head())

# 2.2 Check for missing values
print("\nMissing values in each column:")
display(df_clean.isnull().sum())

# Handle missing values if any
if df_clean.isnull().sum().sum() > 0:
    # For categorical columns
    cat_cols = df_clean.select_dtypes(include=['object']).columns
    for col in cat_cols:
        if df_clean[col].isnull().sum() > 0:
            df_clean[col] = df_clean[col].fillna('Unknown')
    
    # For numerical columns
    num_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns
    imputer = SimpleImputer(strategy='median')
    df_clean[num_cols] = imputer.fit_transform(df_clean[num_cols])
    
    print("\nAfter handling missing values:")
    display(df_clean.isnull().sum())

# 2.3 Combine 'block' and 'street_name' into 'address'
if 'block' in df_clean.columns and 'street_name' in df_clean.columns:
    df_clean['address'] = df_clean['block'].astype(str) + ' ' + df_clean['street_name']
    print("\nAddress column created:")
    display(df_clean[['block', 'street_name', 'address']].head())
else:
    print("\n'block' or 'street_name' column not found. Skipping address creation.")


Date conversion complete. New date-related columns:


Unnamed: 0,rent_approval_date,approval_year,approval_month
0,2021-01-01,2021,1
1,2021-01-01,2021,1
2,2021-01-01,2021,1
3,2021-01-01,2021,1
4,2021-01-01,2021,1



Missing values in each column:


rent_approval_date    0
town                  0
block                 0
street_name           0
flat_type             0
monthly_rent          0
approval_year         0
approval_month        0
dtype: int64


Address column created:


Unnamed: 0,block,street_name,address
0,105,ANG MO KIO AVE 4,105 ANG MO KIO AVE 4
1,107,ANG MO KIO AVE 4,107 ANG MO KIO AVE 4
2,108,ANG MO KIO AVE 4,108 ANG MO KIO AVE 4
3,111,ANG MO KIO AVE 4,111 ANG MO KIO AVE 4
4,111,ANG MO KIO AVE 4,111 ANG MO KIO AVE 4


# 3. Feature Engineering

In [45]:
df_features = df_clean.copy()
# 3.1 Create features for time trend
current_date = datetime.now()
df_features['months_since_approval'] = ((current_date.year - df_features['approval_year']) * 12 + 
                              (current_date.month - df_features['approval_month']))

# 3.2 Remove unnecessary columns
if 'rent_approval_date' in df_features.columns:
    df_features.drop(columns=['rent_approval_date', 'approval_year', 'approval_month', 'block', 'street_name'], inplace=True)
    print("\nRemoved 'rent_approval_date' column.")
else:
    print("\n'rent_approval_date' column not found, nothing to remove.")

# 3.3 Process categorical features
# Convert categorical features to numerical using label encoding and keep the mapping
cat_cols = df_features.select_dtypes(include=['object']).columns
label_encoders = {}
for col in cat_cols:
    if col != 'rent_approval_date':
        df_features[col], le = pd.factorize(df_features[col])
        label_encoders[col] = le

# 3.4 Normalize numerical features but exclude the target variable "monthly_rent"
num_cols = df_features.select_dtypes(include=['int64', 'float64']).columns
num_cols = num_cols[num_cols != 'monthly_rent']  # Exclude target variable
scaler = StandardScaler()
df_features[num_cols] = scaler.fit_transform(df_features[num_cols])

print(label_encoders)
display(df_features.head())



Removed 'rent_approval_date' column.
{'town': Index(['ANG MO KIO', 'BEDOK', 'BISHAN', 'BUKIT BATOK', 'BUKIT MERAH',
       'BUKIT PANJANG', 'BUKIT TIMAH', 'CENTRAL', 'CHOA CHU KANG', 'CLEMENTI',
       'GEYLANG', 'HOUGANG', 'JURONG EAST', 'JURONG WEST', 'KALLANG/WHAMPOA',
       'MARINE PARADE', 'PASIR RIS', 'PUNGGOL', 'QUEENSTOWN', 'SEMBAWANG',
       'SENGKANG', 'SERANGOON', 'TAMPINES', 'TOA PAYOH', 'WOODLANDS', 'YISHUN',
       'TENGAH'],
      dtype='object'), 'flat_type': Index(['4-ROOM', '3-ROOM', '5-ROOM', '2-ROOM', 'EXECUTIVE', '1-ROOM'], dtype='object'), 'address': Index(['105 ANG MO KIO AVE 4', '107 ANG MO KIO AVE 4', '108 ANG MO KIO AVE 4',
       '111 ANG MO KIO AVE 4', '114 ANG MO KIO AVE 4', '117 ANG MO KIO AVE 4',
       '118 ANG MO KIO AVE 4', '119 ANG MO KIO AVE 3', '120 ANG MO KIO AVE 3',
       '121 ANG MO KIO AVE 3',
       ...
       '435B NORTHSHORE DR', '144 POTONG PASIR AVE 2', '604B TAMPINES AVE 9',
       '41 STIRLING RD', '479C YISHUN ST 42', '463B BT BATOK 

Unnamed: 0,town,flat_type,monthly_rent,address,months_since_approval
0,-1.64,-1.0,2000,-1.47,51
1,-1.64,-0.08,1750,-1.47,51
2,-1.64,-0.08,1750,-1.47,51
3,-1.64,0.84,2230,-1.47,51
4,-1.64,0.84,2450,-1.47,51


In [None]:
# TODO: Add MRT distance features using OneMap API if feasible

# TODO: Add simulated features based on survey results - get from external sources if unrestricted

#  4. Save Basic Processed Data

In [47]:
df_save = df_features.copy()
# 4.1 Save the processed data
output_path = '../data/processed/RentingOutofFlats2025_processed.csv'
df_save.to_csv(output_path, index=False)
print(f"\nProcessed data saved to {output_path}")

# 4.2 Save the label encoders
with open('../data/processed/label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)
print("Label encoders saved.")

# 4.3 Save the scaler
with open('../data/processed/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("Scaler saved.")


Processed data saved to ../data/processed/RentingOutofFlats2025_processed.csv
Label encoders saved.
Scaler saved.
