In [2]:
import pandas as pd

# Load the processed train/test tables.
train_data = pd.read_csv('../../data/processed/train.csv')
test_data = pd.read_csv('../../data/processed/test.csv')

# One-hot encode `flat_model`; every category becomes a `model_<category>` column.
train_data = pd.get_dummies(train_data, columns=['flat_model'], prefix='model')
test_data = pd.get_dummies(test_data, columns=['flat_model'], prefix='model')

# The training frame defines which indicator columns the model sees.
model_columns = [col for col in train_data.columns if col.startswith('model_')]

# Align the test indicators with the training ones in a single reindex:
#   * indicators present only in train are added to test, filled with 0;
#   * indicators present only in test are dropped;
#   * every non-indicator test column is kept untouched.
# This replaces the set-difference loop (whose iteration order is arbitrary),
# the column-by-column inserts, and the in-place drop.
non_model_test_columns = [col for col in test_data.columns if not col.startswith('model_')]
test_data = test_data.reindex(columns=non_model_test_columns + model_columns, fill_value=0)

# Define the feature columns and target column.
numeric_features = ['time', 'storey_avg', 'floor_area_sqm', 'flat_type_encoded', 'remaining_lease_months']
feature_columns = numeric_features + model_columns
target_column = 'resale_price'

# Selecting by `feature_columns` gives train and test the same column order.
X_train = train_data[feature_columns]
y_train = train_data[target_column]
X_test = test_data[feature_columns]
y_test = test_data[target_column]

print("Feature Columns Used:")
print(feature_columns)


Feature Columns Used:
['time', 'storey_avg', 'floor_area_sqm', 'flat_type_encoded', 'remaining_lease_months', 'model_2-room', 'model_3Gen', 'model_Adjoined flat', 'model_Apartment', 'model_DBSS', 'model_Improved', 'model_Improved-Maisonette', 'model_Maisonette', 'model_Model A', 'model_Model A-Maisonette', 'model_Model A2', 'model_Multi Generation', 'model_New Generation', 'model_Premium Apartment', 'model_Premium Apartment Loft', 'model_Premium Maisonette', 'model_Simplified', 'model_Standard', 'model_Terrace', 'model_Type S1', 'model_Type S2']


In [3]:
# 2. Pipeline + Hyperparameter Search
# Two-stage pipeline: standardise the features, then fit a ridge regression.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('ridge', Ridge(random_state=42)),
])

# Candidate regularisation strengths: 30 values log-spaced between 1e-3 and 1e3.
param_dist = {'ridge__alpha': np.logspace(-3, 3, 30)}

# Randomly try 20 of the candidates with 3-fold cross-validation, scoring by
# (negated) mean absolute error; the seed keeps the sampled candidates fixed.
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='neg_mean_absolute_error',
    random_state=42,
    n_jobs=-1,
)
search.fit(X_train, y_train)
print("Best α:", search.best_params_)


Best α: {'ridge__alpha': np.float64(621.0169418915616)}


In [4]:

# 3. Evaluate
# Score the best pipeline found by the search on the held-out test set.
best = search.best_estimator_
pred = best.predict(X_test)

mae = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mean_squared_error(y_test, pred))  # RMSE = sqrt(MSE)

print("MAE:", mae)
print("RMSE:", rmse)


MAE: 73845.25950691383
RMSE: 99482.11200274542
