# BTO Cost Estimator Agent Data Cleaning

# Data Preparation
Parse BTO text copied from **HDB Annexes** on **BTO Flat Supply and Pricing Details**

In [1]:
import re
import warnings

import pandas as pd
# Empty accumulator; a later cell concatenates each parsed table onto it.
bto_df = pd.DataFrame()

In [2]:
# "$a - $b" where each amount is comma-grouped digits, e.g. "$106,000 - $132,000".
_PRICE_RANGE_RE = re.compile(r'\$(\d{1,3}(?:,\d{3})*)\s*-\s*\$(\d{1,3}(?:,\d{3})*)')
# Project-type headings in singular or plural form, any letter case.
_PROJECT_TYPE_RE = re.compile(r'(?:Standard|Plus|Prime)\s+Projects?', re.IGNORECASE)
# A single area ("38") or a collapsed area range ("93-95").
_AREA_TOKEN_RE = re.compile(r'\d+(?:\.\d+)?(?:-\d+(?:\.\d+)?)?')
# En dash, em dash and minus sign are treated as a plain hyphen in flat rows.
_UNICODE_DASHES = str.maketrans({"\u2013": "-", "\u2014": "-", "\u2212": "-"})


def _split_flat_type(tokens):
    """Split non-empty row tokens into ``(flat_type, remaining_fields)``.

    The flat type is every leading token before the first area-like token, so
    multi-word types are kept whole. When that would leave fewer than two
    fields (or the row starts with a number), fall back to the simple rule:
    "<x> Flexi" is a two-token flat type, otherwise the first token is.
    """
    first_area = next(
        (i for i, tok in enumerate(tokens) if _AREA_TOKEN_RE.fullmatch(tok)),
        None,
    )
    if first_area and len(tokens) - first_area >= 2:
        return " ".join(tokens[:first_area]), tokens[first_area:]
    if len(tokens) > 1 and tokens[1].lower() == "flexi":
        return " ".join(tokens[:2]), tokens[2:]
    return tokens[0], tokens[1:]


def _parse_flat_row(line):
    """Parse one price-bearing line into the flat-level columns.

    Returns a dict with the keys "Flat Type", "Estimated Floor Area",
    "Estimated Internal Floor Area", "Units", "Min Price" and "Max Price", or
    ``None`` when the line has no "$min - $max" range (silently) or has a price
    but too few / non-numeric fields before it (with a ``UserWarning``).
    """
    normalised = line.translate(_UNICODE_DASHES)
    price_match = _PRICE_RANGE_RE.search(normalised)
    if not price_match:
        return None
    min_price = int(price_match.group(1).replace(",", ""))
    max_price = int(price_match.group(2).replace(",", ""))

    # Collapse spacing around hyphens so every range ("93 - 95", "93 -95")
    # becomes exactly one token; "2-room" has no such spacing and is untouched.
    row_text = re.sub(r"\s*-\s*", "-", normalised[:price_match.start()].strip())
    tokens = row_text.split()

    flat_type, fields = _split_flat_type(tokens) if tokens else ("", [])

    # Reading from the right: units, internal floor area, then floor area.
    units_text = fields[-1].replace(",", "") if len(fields) >= 2 else ""
    if not units_text.isdecimal():
        warnings.warn(
            f"Skipping flat row with missing or non-numeric fields: {line!r}",
            stacklevel=3,
        )
        return None

    est_internal_area = fields[-2]
    est_floor_area = " ".join(fields[:-2])

    # Normalize ranges to the "a - b" display form.
    est_floor_area = re.sub(r"\s*-\s*", " - ", est_floor_area)
    est_internal_area = re.sub(r"\s*-\s*", " - ", est_internal_area)

    return {
        "Flat Type": flat_type,
        "Estimated Floor Area": est_floor_area,
        "Estimated Internal Floor Area": est_internal_area,
        "Units": int(units_text),
        "Min Price": min_price,
        "Max Price": max_price,
    }


def parse_bto_text(table_text):
    """Parse pasted BTO pricing-table text into a DataFrame, one row per flat line.

    Lines are processed top to bottom. Heading lines update the current
    context; every line containing a price range becomes one output row tagged
    with that context. Line shapes, checked in this order:

    * exercise heading: a 3-9 letter word and a 4-digit year, e.g. "Oct 2023";
    * project-type heading: "Standard", "Plus" or "Prime" followed by
      "Project" or "Projects" (any case), stored as written;
    * project heading: no "$" and ends in a digit; the last
      whitespace-separated token is the waiting time, the rest the project
      name. If the line is a single token, it becomes the project name and the
      waiting time is cleared rather than inherited from the previous project;
    * flat row: contains "$"; expected layout is
      ``<flat type> <floor area> <internal floor area> <units> $min - $max``.
      Areas may be ranges, units may contain thousands separators, and
      en/em dashes are accepted in place of hyphens.

    Any other line is ignored.

    Parameters
    ----------
    table_text : str
        Raw multi-line text; blank lines and surrounding whitespace are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns "Exercise", "Project Type", "Project", "Waiting Time",
        "Flat Type", "Estimated Floor Area", "Estimated Internal Floor Area",
        "Units", "Min Price", "Max Price". Areas stay strings (ranges are
        rendered "a - b"); units and prices are ints. The frame is empty, with
        no columns, when no flat row is found.

    Warns
    -----
    UserWarning
        For a line that has a price range but too few or non-numeric fields
        before it; that line is skipped instead of raising.
    """
    lines = [line.strip() for line in table_text.strip().split('\n') if line.strip()]
    data = []

    current_exercise = None
    current_project_type = None
    current_project = None
    current_wait_time = None

    for line in lines:
        # Exercise name (e.g., "Feb 2025")
        if re.match(r'^[A-Za-z]{3,9}\s+\d{4}$', line):
            current_exercise = line
            continue

        # Project type heading
        if _PROJECT_TYPE_RE.fullmatch(line):
            current_project_type = line
            continue

        # Project + waiting time
        if "$" not in line and re.search(r'\d+(/?\d*)$', line):
            # Split on any whitespace run so tab- or multi-space-separated
            # lines do not leave the waiting time glued to the project name.
            parts = line.rsplit(None, 1)
            if len(parts) == 2:
                current_project, current_wait_time = parts
            else:
                current_project = line
                # No separable waiting time: do not reuse the previous project's.
                current_wait_time = None
            continue

        # Flat row (contains price)
        if "$" in line:
            row = _parse_flat_row(line)
            if row is None:
                continue
            data.append({
                "Exercise": current_exercise,
                "Project Type": current_project_type,
                "Project": current_project,
                "Waiting Time": current_wait_time,
                **row,
            })

    return pd.DataFrame(data)

In [25]:
# Raw text for one BTO exercise, pasted as-is: an exercise heading, then
# project-type headings, project lines ending in the waiting time, and one
# line per flat type ending in its price range.
# NOTE(review): the combined table shown further down also contains other
# exercises (e.g. Feb 2025), so this cell appears to have been re-run with
# different pasted text -- confirm, and consider keeping every exercise's text
# in the notebook so a fresh run reproduces the full table.
table_text = """
Oct 2023
Standard Projects
Rail Green I & II @ CCK 39/48
2-room Flexi 38 36 114 $106,000 - $132,000
2-room Flexi 48 46 283 $134,000 - $168,000
3-room 69 66 152 $216,000 - $272,000
4-room 93 - 95 90 - 92 802 $319,000 - $437,000
5-room 113 - 115 110 - 112 518 $463,000 - $595,000
3Gen 120 115 26 $471,000 - $530,000
Plantation Edge I & II 36/40
2-room Flexi 38 36 52 $114,000 - $133,000
2-room Flexi 48 46 211 $142,000 - $183,000
3-room 69 66 81 $232,000 - $279,000
4-room 93 90 353 $353,000 - $418,000
5-room 113 110 313 $460,000 - $568,000
Plus Project
Rajah Residences 52
2-room Flexi 38 36 31 $176,000 - $216,000
2-room Flexi 48 46 101 $214,000 - $294,000
4-room 93 90 607 $480,000 - $675,000
Tenteram Vantage 45
3-room 69 66 88 $339,000 - $420,000
4-room 93 90 952 $488,000 - $628,000
Prime Project
Verandah @ Kallang 42
2-room Flexi 39 36 28 $193,000 - $237,000
2-room Flexi 49 46 112 $240,000 - $303,000
3-room 69 66 110 $368,000 - $475,000
4-room 93 90 893 $535,000 - $675,000
Tanglin Halt Cascadia 54
3-room 66 63 155 $364,000 - $509,000
4-room 89 86 818 $537,000 - $702,000
"""

# Parse the pasted text into one row per flat line and display the result.
df = parse_bto_text(table_text)
df


Unnamed: 0,Exercise,Project Type,Project,Waiting Time,Flat Type,Estimated Floor Area,Estimated Internal Floor Area,Units,Min Price,Max Price
0,Oct 2023,Standard Projects,Rail Green I & II @ CCK,39/48,2-room Flexi,38,36,114,106000,132000
1,Oct 2023,Standard Projects,Rail Green I & II @ CCK,39/48,2-room Flexi,48,46,283,134000,168000
2,Oct 2023,Standard Projects,Rail Green I & II @ CCK,39/48,3-room,69,66,152,216000,272000
3,Oct 2023,Standard Projects,Rail Green I & II @ CCK,39/48,4-room,93 - 95,90 - 92,802,319000,437000
4,Oct 2023,Standard Projects,Rail Green I & II @ CCK,39/48,5-room,113 - 115,110 - 112,518,463000,595000
5,Oct 2023,Standard Projects,Rail Green I & II @ CCK,39/48,3Gen,120,115,26,471000,530000
6,Oct 2023,Standard Projects,Plantation Edge I & II,36/40,2-room Flexi,38,36,52,114000,133000
7,Oct 2023,Standard Projects,Plantation Edge I & II,36/40,2-room Flexi,48,46,211,142000,183000
8,Oct 2023,Standard Projects,Plantation Edge I & II,36/40,3-room,69,66,81,232000,279000
9,Oct 2023,Standard Projects,Plantation Edge I & II,36/40,4-room,93,90,353,353000,418000


In [26]:
# Append the freshly parsed exercise to the accumulated table. Exact duplicate
# rows are dropped so that re-running this cell (or re-parsing the same text)
# does not add the same exercise twice.
bto_df = (
    pd.concat([bto_df, df], ignore_index=True)
    .drop_duplicates(ignore_index=True)
)
bto_df

Unnamed: 0,Exercise,Project Type,Project,Waiting Time,Flat Type,Estimated Floor Area,Estimated Internal Floor Area,Units,Min Price,Max Price
0,Feb 2025,Standard Projects,Woodlands North Verge,47/48,2-room Flexi,40,38,67,140000,204000
1,Feb 2025,Standard Projects,Woodlands North Verge,47/48,2-room Flexi,48,46,268,166000,251000
2,Feb 2025,Standard Projects,Woodlands North Verge,47/48,3-room,69,66,85,275000,363000
3,Feb 2025,Standard Projects,Woodlands North Verge,47/48,4-room,93 - 96,90 - 93,608,365000,528000
4,Feb 2025,Standard Projects,Woodlands North Verge,47/48,5-room,113 - 116,110 - 113,535,486000,661000
...,...,...,...,...,...,...,...,...,...,...
196,Oct 2023,Prime Project,Verandah @ Kallang,42,2-room Flexi,49,46,112,240000,303000
197,Oct 2023,Prime Project,Verandah @ Kallang,42,3-room,69,66,110,368000,475000
198,Oct 2023,Prime Project,Verandah @ Kallang,42,4-room,93,90,893,535000,675000
199,Oct 2023,Prime Project,Tanglin Halt Cascadia,54,3-room,66,63,155,364000,509000


In [27]:
# Write the accumulated table to CSV without the index column, then load the
# file back so `bto_df` holds what was stored on disk.
bto_df.to_csv(path_or_buf="BTO_Pricing.csv", index=False)
bto_df = pd.read_csv(filepath_or_buffer="BTO_Pricing.csv")