In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Load Data

In [2]:
clean_hdb_df1 = pd.read_csv('Processed_Data/hdb_data_with_coords_mrt_dist_id.csv',
    parse_dates=['month', 'lease_commence_date'])

df = clean_hdb_df1.copy()

In [3]:
df.columns

Index(['remaining_lease', 'floor_area_sqm', 'storey_range', 'street_name',
       'lease_commence_date', 'month', 'flat_type', 'resale_price', 'town',
       'block', 'flat_model', 'storey_range_min', 'storey_range_max',
       'full_address', 'Latitude', 'Longitude', 'nearest_mrt_id', 'mrt_name',
       'mrt_latitude', 'mrt_longitude', 'mrt_road', 'Distance_from_MRT_km',
       'transaction_id'],
      dtype='object')

# Define DiD model components

In [4]:
set(df['mrt_name'])

{'ADMIRALTY MRT STATION',
 'ALJUNIED MRT STATION',
 'ANG MO KIO MRT STATION',
 'BAKAU LRT STATION',
 'BANGKIT LRT STATION',
 'BARTLEY MRT STATION',
 'BAYSHORE MRT STATION',
 'BEAUTY WORLD MRT STATION',
 'BEDOK MRT STATION',
 'BEDOK NORTH MRT STATION',
 'BEDOK RESERVOIR MRT STATION',
 'BEDOK SOUTH MRT STATION',
 'BENCOOLEN MRT STATION',
 'BENDEMEER MRT STATION',
 'BISHAN MRT STATION',
 'BOON KENG MRT STATION',
 'BOON LAY MRT STATION',
 'BOTANIC GARDENS MRT STATION',
 'BRADDELL MRT STATION',
 'BRAS BASAH MRT STATION',
 'BRIGHT HILL MRT STATION',
 'BUANGKOK MRT STATION',
 'BUGIS MRT STATION',
 'BUKIT BATOK MRT STATION',
 'BUKIT GOMBAK MRT STATION',
 'BUKIT PANJANG LRT STATION',
 'BUKIT PANJANG MRT STATION',
 'BUONA VISTA MRT STATION',
 'CALDECOTT MRT STATION',
 'CANBERRA MRT STATION',
 'CANTONMENT MRT STATION',
 'CHANGI AIRPORT MRT STATION',
 'CHENG LIM LRT STATION',
 'CHINATOWN MRT STATION',
 'CHINESE GARDEN MRT STATION',
 'CHOA CHU KANG LRT STATION',
 'CHOA CHU KANG MRT STATION',
 'CLEM

In [5]:
set(df['nearest_mrt_id'])

{'BP1',
 'BP10',
 'BP11',
 'BP12',
 'BP13',
 'BP2',
 'BP3',
 'BP4',
 'BP5',
 'BP6',
 'BP7',
 'BP8',
 'BP9',
 'CC10',
 'CC11',
 'CC12',
 'CC13',
 'CC14',
 'CC15',
 'CC16',
 'CC19',
 'CC2',
 'CC20',
 'CC21',
 'CC22',
 'CC23',
 'CC26',
 'CC27',
 'CC28',
 'CC3',
 'CC30',
 'CC31',
 'CC5',
 'CC7',
 'CC8',
 'CC9',
 'CG2',
 'DT1',
 'DT13',
 'DT19',
 'DT21',
 'DT22',
 'DT23',
 'DT24',
 'DT25',
 'DT26',
 'DT27',
 'DT28',
 'DT29',
 'DT30',
 'DT31',
 'DT32',
 'DT33',
 'DT34',
 'DT5',
 'DT6',
 'EW1',
 'EW10',
 'EW11',
 'EW12',
 'EW15',
 'EW16',
 'EW17',
 'EW18',
 'EW19',
 'EW2',
 'EW20',
 'EW21',
 'EW22',
 'EW23',
 'EW24 / NS1',
 'EW25',
 'EW26',
 'EW27',
 'EW28',
 'EW3',
 'EW4',
 'EW5',
 'EW6',
 'EW7',
 'EW8',
 'EW9',
 'NE1 / CC29',
 'NE10',
 'NE11',
 'NE12',
 'NE13',
 'NE14',
 'NE15',
 'NE16',
 'NE17',
 'NE3',
 'NE4',
 'NE7',
 'NE8',
 'NE9',
 'NS10',
 'NS11',
 'NS12',
 'NS13',
 'NS14',
 'NS15',
 'NS16',
 'NS17',
 'NS18',
 'NS19',
 'NS2',
 'NS20',
 'NS3',
 'NS4',
 'NS5',
 'NS8',
 'NS9',
 'PE1',
 '

### DT line stage 2 MRTs

In [6]:
dtl2_ids = [
    "DT1", "DT2", "DT3", "DT5", "DT6", "DT7",
    "DT8", "DT9", "DT10", "DT11", "DT12", "DT13"
]

### Treated group: Towns near DT line stage2

In [7]:
df["treated"] = (
    df["nearest_mrt_id"].isin(dtl2_ids) &
    (df["Distance_from_MRT_km"] <= 1)
).astype(int)

In [8]:
df["treated"]
print(np.sum(df["treated"]))

2686


### Define the post-opening period

In [9]:
cutoff = pd.Timestamp("2015-12-27") # DT line stage2 opening date
df["post"] = (df["month"] >= cutoff).astype(int)

In [10]:
df["post"] 

0         1
1         1
2         1
3         1
4         1
         ..
962941    0
962942    0
962943    0
962944    0
962945    0
Name: post, Length: 962946, dtype: int64

### Interaction term between treated group (across all time) and after DT stage 2 opening

In [11]:
df["treated_post"] = df["treated"] * df["post"]

# Modeling


## Model1: baseline model
OLS regression for Difference in Differences model, applied for DT line 2

note rsale price is natural logged

In [12]:
df["log_price"] = np.log(df["resale_price"])

model = smf.ols(
    "log_price ~ treated + post + treated_post + remaining_lease + floor_area_sqm",
    data=df
).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.566
Model:                            OLS   Adj. R-squared:                  0.566
Method:                 Least Squares   F-statistic:                 2.511e+05
Date:                Thu, 30 Oct 2025   Prob (F-statistic):               0.00
Time:                        11:14:09   Log-Likelihood:            -5.0747e+05
No. Observations:              962946   AIC:                         1.015e+06
Df Residuals:                  962940   BIC:                         1.015e+06
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          11.1124      0.002   68

## Model1 Takeways

In [13]:
# Print main takeaways
print("=== Main Takeaways from Difference-in-Differences Model ===\n")

print(f"R-squared: {model.rsquared:.3f}")
print(f"Adjusted R-squared: {model.rsquared_adj:.3f}")
print(f"Number of observations: {int(model.nobs)}\n")

# Extract key coefficients
coef = model.params
pvalues = model.pvalues

# Interpret key variables
effect = coef["treated_post"]
effect_pct = (np.exp(effect) - 1) * 100

print(f"Effect of DTL2 opening (treated_post): {effect:.4f} ({effect_pct:.1f}% increase in prices, caused by DTL2)")
print(f"P-value for treated_post: {pvalues['treated_post']:.4e}")

print(f"\nEffect of being near DTL2 stations before opening (treated): {coef['treated']:.4f} (18% higher than control flats)")
print(f"Effect of post period (overall market trend): {coef['post']:.4f} (20% higher than general market trend)")

print(f"\nLease effect (per additional year): {coef['remaining_lease']:.4f} (0.7% increase per additional lease year)")
print(f"Floor area effect (per sqm): {coef['floor_area_sqm']:.4f} (1.3% increase per additional sqm floor area)")

# Optional: basic interpretation line
if pvalues["treated_post"] < 0.05:
    print("\n The DiD interaction (treated_post) is statistically significant.")
    print(f"   This suggests that, after DTL2 opening, resale prices near DTL2 stations")
    print(f"   increased by approximately {effect_pct:.1f}% relative to other areas.")
else:
    print("\n The DiD interaction (treated_post) is not statistically significant.")
    print("   No clear evidence of a causal price increase after DTL2 opening.")


=== Main Takeaways from Difference-in-Differences Model ===

R-squared: 0.566
Adjusted R-squared: 0.566
Number of observations: 962946

Effect of DTL2 opening (treated_post): 0.0954 (10.0% increase in prices, caused by DTL2)
P-value for treated_post: 3.9502e-07

Effect of being near DTL2 stations before opening (treated): 0.1833 (18% higher than control flats)
Effect of post period (overall market trend): 0.2003 (20% higher than general market trend)

Lease effect (per additional year): 0.0069 (0.7% increase per additional lease year)
Floor area effect (per sqm): 0.0131 (1.3% increase per additional sqm floor area)

 The DiD interaction (treated_post) is statistically significant.
   This suggests that, after DTL2 opening, resale prices near DTL2 stations
   increased by approximately 10.0% relative to other areas.


# Model 2, 
## A Difference-in-Differences (DiD) model with town and flat-type fixed effects.

C(town) adds town fixed effects: one dummy per town.
- Controls for persistent price differences between towns (e.g., proximity to CBD, amenities).

C(flat_type) adds flat-type fixed effects: one dummy per flat type.
- Controls for systematic differences between flat types (e.g., 3-room vs 5-room).

So now, the model is comparing similar types of flats across towns, not across the whole island.

This removes bias from:
- High baseline prices in central towns (which may also happen to have DTL2 stations).
- Differences in composition of flat types before/after the DTL2 period.

In [14]:
model2 = smf.ols(
    "log_price ~ treated + post + treated_post + C(town) + C(flat_type)",
    data=df
).fit()

In [15]:
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.581
Model:                            OLS   Adj. R-squared:                  0.581
Method:                 Least Squares   F-statistic:                 3.808e+04
Date:                Thu, 30 Oct 2025   Prob (F-statistic):               0.00
Time:                        11:14:11   Log-Likelihood:            -4.9094e+05
No. Observations:              962946   AIC:                         9.820e+05
Df Residuals:                  962910   BIC:                         9.824e+05
Df Model:                          35                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

# Main Final Takeaway

In [16]:
# Extract key results
coef = model2.params
pvalues = model2.pvalues

treated_post = coef["treated_post"]
treated = coef["treated"]
post = coef["post"]

treated_post_pct = (np.exp(treated_post) - 1) * 100
treated_pct = (np.exp(treated) - 1) * 100
post_pct = (np.exp(post) - 1) * 100

print("\n============================")
print(" Main Takeaways for Policymakers ")
print("============================\n")

#Model performance
print(f"Model explains {model2.rsquared*100:.1f}% of variation in resale prices.")
print(f"Sample size: {int(model2.nobs):,} transactions.\n")

#Policy-Relevant Effects
print("1. **Impact of Downtown Line Stage 2 (DTL2)**")
print(f"   • After DTL2 opened, resale prices of flats near DTL2 stations rose by "
      f"approximately {treated_post_pct:.1f}% relative to other areas.")
if pvalues["treated_post"] < 0.05:
    print("   • This uplift is statistically significant, suggesting a clear accessibility-driven price premium.\n")
else:
    print("   • This uplift is not statistically significant, suggesting no robust price effect.\n")

print("2. **Baseline differences (Before DTL2)**")
if pvalues["treated"] < 0.05:
    direction = "higher" if treated > 0 else "lower"
    print(f"   • Before DTL2, flats near DTL2 stations were about {abs(treated_pct):.1f}% {direction} "
          f"in price than others (holding town and flat type constant).")
else:
    print("   • Before DTL2, no significant difference in prices between treated and control areas.")
print()

print("3. **Overall Market Trend**")
print(f"   • Across Singapore, resale prices rose by roughly {post_pct:.1f}% in the post-DTL2 period, "
      f"reflecting general housing market growth.\n")

#Spatial context (optional summary)
high_price_towns = [t for t in model2.params.index if "C(town)" in t and model2.params[t] > 0.15]
low_price_towns = [t for t in model2.params.index if "C(town)" in t and model2.params[t] < -0.1]

print("4. **Town-level Variation**")
print(f"   • High-value towns include: {', '.join([t.split('[')[1].strip(']T.') for t in high_price_towns])}")
print(f"   • Lower-value towns include: {', '.join([t.split('[')[1].strip(']T.') for t in low_price_towns])}\n")

#Policy insights
print("5. **Policy Insights**")
print("   • The opening of DTL2 appears to have generated a measurable uplift in housing value.")
print("   • Infrastructure investments can lead to localized wealth gains; policymakers may wish to consider:")
print("       - Equitable access to new MRT lines.")
print("       - Balancing redevelopment or affordability near new stations.")
print("       - Monitoring spillover effects on nearby towns.\n")

print("============================")
print("End of Summary")
print("============================\n")



 Main Takeaways for Policymakers 

Model explains 58.1% of variation in resale prices.
Sample size: 962,946 transactions.

1. **Impact of Downtown Line Stage 2 (DTL2)**
   • After DTL2 opened, resale prices of flats near DTL2 stations rose by approximately 10.7% relative to other areas.
   • This uplift is statistically significant, suggesting a clear accessibility-driven price premium.

2. **Baseline differences (Before DTL2)**
   • Before DTL2, flats near DTL2 stations were about 2.5% lower in price than others (holding town and flat type constant).

3. **Overall Market Trend**
   • Across Singapore, resale prices rose by roughly 92.3% in the post-DTL2 period, reflecting general housing market growth.

4. **Town-level Variation**
   • High-value towns include: BISHAN, BUKIT MERAH, BUKIT TIMAH, CENTRAL AREA, MARINE PARADE
   • Lower-value towns include: JURONG WES, LIM CHU KANG, WOODLANDS, YISHUN

5. **Policy Insights**
   • The opening of DTL2 appears to have generated a measurable 