### Import libraries

In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from scipy.stats import pearsonr


%matplotlib inline

___

### Function to check for high collinearity

In [2]:
def check_collinearity(df, threshold=0.9):
    """Return the columns that are highly correlated with an earlier column.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; the pairwise correlation is computed with ``df.corr()``.
    threshold : float, default 0.9
        A column is flagged when its absolute correlation with any column
        that precedes it in ``df`` is strictly greater than this value.

    Returns
    -------
    list
        Column labels to consider dropping, in the order they appear in ``df``.
        Empty when no pair exceeds ``threshold``.
    """
    # absolute value: strong negative correlation is just as redundant as positive
    corr_matrix = df.corr().abs()

    # keep only the strict upper triangle (k=1 skips the diagonal) so every pair
    # is examined once and a column is never compared with itself.
    # The builtin `bool` replaces the `np.bool` alias, which was deprecated in
    # NumPy 1.20 and later removed.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # masked-out cells are NaN, and NaN > threshold is False, so they never flag a column
    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]

    return to_drop

___

### Load dataset

In [3]:
# Lift pandas' display limits so every row and column is shown when a frame is rendered.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Read the train and test CSV files; the first CSV column is used as the index.
df_train = pd.read_csv("../datasets/ML_train.csv", index_col=0, low_memory=False)
df_test = pd.read_csv("../datasets/ML_test.csv", index_col=0, low_memory=False)

In [4]:
df_test.head(1)

Unnamed: 0,town,flat_type,Tranc_Year,Tranc_Month,floor_area_sqft,max_floor_lvl,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,exec_sold,multigen_sold,studio_apartment_sold,Latitude,Longitude,Mall_Nearest_Distance,Hawker_Nearest_Distance,mrt_nearest_distance,bus_stop_nearest_distance,pri_sch_nearest_distance,sec_sch_nearest_dist,remaining_lease
0,YISHUN,4 ROOM,2012,11,904,12,0,0,0,92,40,0,0,0,1.437066,103.831121,877.431572,687.576779,686.660434,75.683952,426.46791,156.322353,65


### Trial #1 - Baseline

Baseline for feature selection and manipulation. </br>

Numeric Features:
- floor_area_sqft
- remaining_lease
- mrt_nearest_distance
- max_floor_lvl

Categorical Features:
- town
- flat_type

In [5]:
# numeric features to be used as part of prediction
numeric_features = df_train[['floor_area_sqft', 'remaining_lease', 'mrt_nearest_distance', 'max_floor_lvl']]

In [6]:
numeric_features.head(2)

Unnamed: 0,floor_area_sqft,remaining_lease,mrt_nearest_distance,max_floor_lvl
0,968,84,330.083069,25
1,1399,65,903.659703,9


In [7]:
# categorical features to be used as part of prediction
cat_features = df_train[['town', 'flat_type']]

In [8]:
df_dummies = pd.get_dummies(cat_features, drop_first=True)

In [9]:
df_dummies.head()

Unnamed: 0,town_BEDOK,town_BISHAN,town_BUKIT BATOK,town_BUKIT MERAH,town_BUKIT PANJANG,town_BUKIT TIMAH,town_CENTRAL AREA,town_CHOA CHU KANG,town_CLEMENTI,town_GEYLANG,town_HOUGANG,town_JURONG EAST,town_JURONG WEST,town_KALLANG/WHAMPOA,town_MARINE PARADE,town_PASIR RIS,town_PUNGGOL,town_QUEENSTOWN,town_SEMBAWANG,town_SENGKANG,town_SERANGOON,town_TAMPINES,town_TOA PAYOH,town_WOODLANDS,town_YISHUN,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI-GENERATION
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0


In [10]:
# merge both categorical and numeric features
df_merged = numeric_features.merge(df_dummies, left_index=True, right_index=True)

In [11]:
# Screen the Trial #1 feature matrix for columns whose absolute correlation
# with another column exceeds the cutoff used by check_collinearity.
highly_correlated = check_collinearity(df_merged)

# An empty list is falsy, so the "nothing flagged" case is handled first.
if not highly_correlated:
    print("No high collinearity detected in the dataframe.")
else:
    print("The following columns are highly correlated and need to be dropped: ", highly_correlated)

No high collinearity detected in the dataframe.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [12]:
df_merged.head(2)

Unnamed: 0,floor_area_sqft,remaining_lease,mrt_nearest_distance,max_floor_lvl,town_BEDOK,town_BISHAN,town_BUKIT BATOK,town_BUKIT MERAH,town_BUKIT PANJANG,town_BUKIT TIMAH,town_CENTRAL AREA,town_CHOA CHU KANG,town_CLEMENTI,town_GEYLANG,town_HOUGANG,town_JURONG EAST,town_JURONG WEST,town_KALLANG/WHAMPOA,town_MARINE PARADE,town_PASIR RIS,town_PUNGGOL,town_QUEENSTOWN,town_SEMBAWANG,town_SENGKANG,town_SERANGOON,town_TAMPINES,town_TOA PAYOH,town_WOODLANDS,town_YISHUN,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI-GENERATION
0,968,84,330.083069,25,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1399,65,903.659703,9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [13]:
# assign target and predictor features
X = df_merged
y = df_train['resale_price']

In [14]:
# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# scale the training and test data using the Standard Scaler
scaler1 = StandardScaler().fit(X_train)
X_train_scaled = scaler1.transform(X_train)
X_test_scaled = scaler1.transform(X_test)

In [16]:
# fit the linear, Ridge, Lasso, and Elastic Net regression models to the training data
# All four estimators are constructed with their default hyperparameters;
# no regularisation strength is tuned in this cell.
# NOTE(review): the output of this cell shows a warning raised inside
# cd_fast.enet_coordinate_descent - presumably a convergence warning from the
# coordinate-descent solver behind Lasso/ElasticNet. Confirm which model emits
# it and consider raising max_iter before relying on that model's scores.
reg_linear1 = LinearRegression().fit(X_train_scaled, y_train)
reg_ridge1 = Ridge().fit(X_train_scaled, y_train)
reg_lasso1 = Lasso().fit(X_train_scaled, y_train)
reg_elasticnet1 = ElasticNet().fit(X_train_scaled, y_train)

  model = cd_fast.enet_coordinate_descent(


In [17]:
# Predict the target values for the test data for each model
y_pred_linear1 = reg_linear1.predict(X_test_scaled)
y_pred_ridge1 = reg_ridge1.predict(X_test_scaled)
y_pred_lasso1 = reg_lasso1.predict(X_test_scaled)
y_pred_elasticnet1 = reg_elasticnet1.predict(X_test_scaled)

In [18]:
# Pair each model label with its predictions on the hold-out split.
models = list(zip(
    ("linear", "ridge", "lasso", "elasticnet"),
    (y_pred_linear1, y_pred_ridge1, y_pred_lasso1, y_pred_elasticnet1),
))

# Report RMSE (same units as the target) and R2 for every model.
for name, y_pred in models:
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print(f"RMSE for {name}: {rmse}", f"R2 for {name}: {r2}", sep="\n")

RMSE for linear: 58110.52588670875
R2 for linear: 0.8343924943254446
RMSE for ridge: 58110.555928419446
R2 for ridge: 0.8343923230953969
RMSE for lasso: 58112.39289845401
R2 for lasso: 0.8343818526690348
RMSE for elasticnet: 69766.25305271908
R2 for elasticnet: 0.7612951723919021


***Key Findings***
- The best-performing model, Linear Regression, achieves a lower (better) RMSE than the baseline submitted on Kaggle (RMSE 58110 vs. RMSE 142970).
- Best model is one that has the lowest RMSE and highest R2 score.
- Based on the scores above, the better performing model would be the Linear regression model.

In [19]:
# 5-fold cross-validation of each Trial #1 model on the scaled training split.
# No `scoring` is passed, so each estimator's own score method is used.
# Note: scaler1 was fitted on all of X_train before the folds are cut, so the
# scaling statistics are shared across folds.
cv_scores_linear1 = cross_val_score(reg_linear1, X_train_scaled, y_train, cv=5)
cv_scores_ridge1 = cross_val_score(reg_ridge1, X_train_scaled, y_train, cv=5)
cv_scores_lasso1 = cross_val_score(reg_lasso1, X_train_scaled, y_train, cv=5)
cv_scores_elasticnet1 = cross_val_score(reg_elasticnet1, X_train_scaled, y_train, cv=5)

cv_report = [
    ("Linear Regression", cv_scores_linear1),
    ("Ridge Regression", cv_scores_ridge1),
    ("Lasso Regression", cv_scores_lasso1),
    ("Elastic Net", cv_scores_elasticnet1),
]

# Print mean and spread of the fold scores; a blank line separates the sections.
for position, (label, scores) in enumerate(cv_report):
    prefix = "\n" if position else ""
    print(f"{prefix}{label} Cross Validation Results:")
    print(f"Mean: {np.mean(scores):.2f}")
    print(f"Standard Deviation: {np.std(scores):.2f}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Linear Regression Cross Validation Results:
Mean: 0.84
Standard Deviation: 0.00

Ridge Regression Cross Validation Results:
Mean: 0.84
Standard Deviation: 0.00

Lasso Regression Cross Validation Results:
Mean: 0.84
Standard Deviation: 0.00

Elastic Net Cross Validation Results:
Mean: 0.76
Standard Deviation: 0.00


***Key Findings***
- The mean cross-validation score (R2, the default score for these regressors) of the ElasticNet model is 0.76, which is the lowest of all 4 models.
- Linear, Ridge and Lasso perform similarly well, each with a mean cross-validation score (R2) of 0.84.

### Trial #2 - Swap Features

Swap mrt_nearest_distance with Hawker_Nearest_Distance to identify impact on model. </br>

Numeric Features:
- floor_area_sqft
- remaining_lease
- Hawker_Nearest_Distance (previously mrt_nearest_distance in Trial #1)
- max_floor_lvl

Categorical Features:
- town
- flat_type

In [20]:
# numeric features to be used as part of prediction
numeric_features = df_train[['floor_area_sqft', 'remaining_lease', 'Hawker_Nearest_Distance', 'max_floor_lvl']]

In [21]:
numeric_features.head(2)

Unnamed: 0,floor_area_sqft,remaining_lease,Hawker_Nearest_Distance,max_floor_lvl
0,968,84,154.753357,25
1,1399,65,640.151925,9


In [22]:
# categorical features to be used as part of prediction
cat_features = df_train[['town', 'flat_type']]

In [23]:
df_dummies = pd.get_dummies(cat_features, drop_first=True)

In [24]:
df_dummies.head(2)

Unnamed: 0,town_BEDOK,town_BISHAN,town_BUKIT BATOK,town_BUKIT MERAH,town_BUKIT PANJANG,town_BUKIT TIMAH,town_CENTRAL AREA,town_CHOA CHU KANG,town_CLEMENTI,town_GEYLANG,town_HOUGANG,town_JURONG EAST,town_JURONG WEST,town_KALLANG/WHAMPOA,town_MARINE PARADE,town_PASIR RIS,town_PUNGGOL,town_QUEENSTOWN,town_SEMBAWANG,town_SENGKANG,town_SERANGOON,town_TAMPINES,town_TOA PAYOH,town_WOODLANDS,town_YISHUN,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI-GENERATION
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [25]:
# merge both
df_merged2 = numeric_features.merge(df_dummies, left_index=True, right_index=True)

In [26]:
# Screen the Trial #2 feature matrix for columns whose absolute correlation
# with another column exceeds the cutoff used by check_collinearity.
highly_correlated = check_collinearity(df_merged2)

# An empty list is falsy, so the "nothing flagged" case is handled first.
if not highly_correlated:
    print("No high collinearity detected in the dataframe.")
else:
    print("The following columns are highly correlated and need to be dropped: ", highly_correlated)

No high collinearity detected in the dataframe.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [27]:
df_merged2.head(2)

Unnamed: 0,floor_area_sqft,remaining_lease,Hawker_Nearest_Distance,max_floor_lvl,town_BEDOK,town_BISHAN,town_BUKIT BATOK,town_BUKIT MERAH,town_BUKIT PANJANG,town_BUKIT TIMAH,town_CENTRAL AREA,town_CHOA CHU KANG,town_CLEMENTI,town_GEYLANG,town_HOUGANG,town_JURONG EAST,town_JURONG WEST,town_KALLANG/WHAMPOA,town_MARINE PARADE,town_PASIR RIS,town_PUNGGOL,town_QUEENSTOWN,town_SEMBAWANG,town_SENGKANG,town_SERANGOON,town_TAMPINES,town_TOA PAYOH,town_WOODLANDS,town_YISHUN,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI-GENERATION
0,968,84,154.753357,25,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1399,65,640.151925,9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [28]:
# assign target and predictor features
X = df_merged2
y = df_train['resale_price']

In [29]:
# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# scale the training and test data using the Standard Scaler
scaler2 = StandardScaler().fit(X_train)
X_train_scaled = scaler2.transform(X_train)
X_test_scaled = scaler2.transform(X_test)

In [31]:
# fit the linear, Ridge, Lasso, and Elastic Net regression models to the training data
# All four estimators are constructed with their default hyperparameters;
# no regularisation strength is tuned in this cell.
# NOTE(review): the output of this cell shows a warning raised inside
# cd_fast.enet_coordinate_descent - presumably a convergence warning from the
# coordinate-descent solver behind Lasso/ElasticNet. Confirm which model emits
# it and consider raising max_iter before relying on that model's scores.
reg_linear2 = LinearRegression().fit(X_train_scaled, y_train)
reg_ridge2 = Ridge().fit(X_train_scaled, y_train)
reg_lasso2 = Lasso().fit(X_train_scaled, y_train)
reg_elasticnet2 = ElasticNet().fit(X_train_scaled, y_train)

  model = cd_fast.enet_coordinate_descent(


In [32]:
# Predict the target values for the test data for each model
y_pred_linear2 = reg_linear2.predict(X_test_scaled)
y_pred_ridge2 = reg_ridge2.predict(X_test_scaled)
y_pred_lasso2 = reg_lasso2.predict(X_test_scaled)
y_pred_elasticnet2 = reg_elasticnet2.predict(X_test_scaled)

In [33]:
# Pair each model label with its predictions on the hold-out split.
models = list(zip(
    ("linear", "ridge", "lasso", "elasticnet"),
    (y_pred_linear2, y_pred_ridge2, y_pred_lasso2, y_pred_elasticnet2),
))

# Report RMSE (same units as the target) and R2 for every model.
for name, y_pred in models:
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print(f"RMSE for {name}: {rmse}", f"R2 for {name}: {r2}", sep="\n")

RMSE for linear: 61489.89423787021
R2 for linear: 0.8145708973183514
RMSE for ridge: 61489.9566888585
R2 for ridge: 0.8145705206633987
RMSE for lasso: 61494.734178316256
R2 for lasso: 0.8145417054927563
RMSE for elasticnet: 71718.4847624646
R2 for elasticnet: 0.7477491633367318


---Trial #1--- </br>
RMSE for linear: 58110.52588670875 </br>
R2 for linear: 0.8343924943254446 </br>
RMSE for ridge: 58110.555928419446 </br>
R2 for ridge: 0.8343923230953969 </br>
RMSE for lasso: 58112.39289845401 </br>
R2 for lasso: 0.8343818526690348 </br>
RMSE for elasticnet: 69766.25305271908 </br>
R2 for elasticnet: 0.7612951723919021 </br>

---Trial #2--- </br>
RMSE for linear: 61489.89423787021 </br>
R2 for linear: 0.8145708973183514 </br>
RMSE for ridge: 61489.9566888585 </br>
R2 for ridge: 0.8145705206633987 </br>
RMSE for lasso: 61494.734178316256 </br>
R2 for lasso: 0.8145417054927563 </br>
RMSE for elasticnet: 71718.4847624646 </br>
R2 for elasticnet: 0.7477491633367318

***Key Findings***
- RMSE score for Trial #2 models is worse than those in Trial #1
- R2 score is lower for all models as compared to performance in Trial #1
- It can be concluded that mrt_nearest_distance is a better predictor than Hawker_Nearest_Distance.

In [34]:
# 5-fold cross-validation of each Trial #2 model on the scaled training split.
# No `scoring` is passed, so each estimator's own score method is used.
# Note: scaler2 was fitted on all of X_train before the folds are cut, so the
# scaling statistics are shared across folds.
cv_scores_linear = cross_val_score(reg_linear2, X_train_scaled, y_train, cv=5)
cv_scores_ridge = cross_val_score(reg_ridge2, X_train_scaled, y_train, cv=5)
cv_scores_lasso = cross_val_score(reg_lasso2, X_train_scaled, y_train, cv=5)
cv_scores_elasticnet = cross_val_score(reg_elasticnet2, X_train_scaled, y_train, cv=5)

cv_report = [
    ("Linear Regression", cv_scores_linear),
    ("Ridge Regression", cv_scores_ridge),
    ("Lasso Regression", cv_scores_lasso),
    ("Elastic Net", cv_scores_elasticnet),
]

# Print mean and spread of the fold scores; a blank line separates the sections.
for position, (label, scores) in enumerate(cv_report):
    prefix = "\n" if position else ""
    print(f"{prefix}{label} Cross Validation Results:")
    print(f"Mean: {np.mean(scores):.2f}")
    print(f"Standard Deviation: {np.std(scores):.2f}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Linear Regression Cross Validation Results:
Mean: 0.82
Standard Deviation: 0.00

Ridge Regression Cross Validation Results:
Mean: 0.82
Standard Deviation: 0.00

Lasso Regression Cross Validation Results:
Mean: 0.82
Standard Deviation: 0.00

Elastic Net Cross Validation Results:
Mean: 0.75
Standard Deviation: 0.00


***Key Findings***
- The mean cross-validation score (R2) of all models is lower than in Trial #1.
- Linear, Ridge and Lasso perform similarly well with a mean score of 0.82, but not as high as in Trial #1.
- Models in Trial #1 perform better than those in Trial #2.

### Trial #3 - Adding Features

Include both mrt_nearest_distance and bus_stop_nearest_distance to determine impact on model. </br>

Numeric Features:
- floor_area_sqft
- remaining_lease
- mrt_nearest_distance
- bus_stop_nearest_distance
- max_floor_lvl

Categorical Features:
- town
- flat_type

In [35]:
# numeric features to be used as part of prediction
numeric_features = df_train[['floor_area_sqft', 'remaining_lease', 'mrt_nearest_distance','bus_stop_nearest_distance', 'max_floor_lvl']]

In [36]:
numeric_features.head(2)

Unnamed: 0,floor_area_sqft,remaining_lease,mrt_nearest_distance,bus_stop_nearest_distance,max_floor_lvl
0,968,84,330.083069,29.427395,25
1,1399,65,903.659703,58.207761,9


In [37]:
# categorical features to be used as part of prediction
cat_features = df_train[['town', 'flat_type']]

In [38]:
df_dummies = pd.get_dummies(cat_features, drop_first=True)

In [39]:
df_dummies.head(2)

Unnamed: 0,town_BEDOK,town_BISHAN,town_BUKIT BATOK,town_BUKIT MERAH,town_BUKIT PANJANG,town_BUKIT TIMAH,town_CENTRAL AREA,town_CHOA CHU KANG,town_CLEMENTI,town_GEYLANG,town_HOUGANG,town_JURONG EAST,town_JURONG WEST,town_KALLANG/WHAMPOA,town_MARINE PARADE,town_PASIR RIS,town_PUNGGOL,town_QUEENSTOWN,town_SEMBAWANG,town_SENGKANG,town_SERANGOON,town_TAMPINES,town_TOA PAYOH,town_WOODLANDS,town_YISHUN,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI-GENERATION
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [40]:
# merge both
df_merged3 = numeric_features.merge(df_dummies, left_index=True, right_index=True)

In [41]:
# Screen the Trial #3 feature matrix for columns whose absolute correlation
# with another column exceeds the cutoff used by check_collinearity.
highly_correlated = check_collinearity(df_merged3)

# An empty list is falsy, so the "nothing flagged" case is handled first.
if not highly_correlated:
    print("No high collinearity detected in the dataframe.")
else:
    print("The following columns are highly correlated and need to be dropped: ", highly_correlated)

No high collinearity detected in the dataframe.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [42]:
df_merged3.head(2)

Unnamed: 0,floor_area_sqft,remaining_lease,mrt_nearest_distance,bus_stop_nearest_distance,max_floor_lvl,town_BEDOK,town_BISHAN,town_BUKIT BATOK,town_BUKIT MERAH,town_BUKIT PANJANG,town_BUKIT TIMAH,town_CENTRAL AREA,town_CHOA CHU KANG,town_CLEMENTI,town_GEYLANG,town_HOUGANG,town_JURONG EAST,town_JURONG WEST,town_KALLANG/WHAMPOA,town_MARINE PARADE,town_PASIR RIS,town_PUNGGOL,town_QUEENSTOWN,town_SEMBAWANG,town_SENGKANG,town_SERANGOON,town_TAMPINES,town_TOA PAYOH,town_WOODLANDS,town_YISHUN,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI-GENERATION
0,968,84,330.083069,29.427395,25,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1399,65,903.659703,58.207761,9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [43]:
# assign target and predictor features
X = df_merged3
y = df_train['resale_price']

In [44]:
# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
# Scale the training and test data using the Standard Scaler
scaler3 = StandardScaler().fit(X_train)
X_train_scaled = scaler3.transform(X_train)
X_test_scaled = scaler3.transform(X_test)

In [46]:
# Fit the linear, Ridge, Lasso, and Elastic Net regression models to the training data
# All four estimators are constructed with their default hyperparameters;
# no regularisation strength is tuned in this cell.
# NOTE(review): the output of this cell shows a warning raised inside
# cd_fast.enet_coordinate_descent - presumably a convergence warning from the
# coordinate-descent solver behind Lasso/ElasticNet. Confirm which model emits
# it and consider raising max_iter before relying on that model's scores.
reg_linear3 = LinearRegression().fit(X_train_scaled, y_train)
reg_ridge3 = Ridge().fit(X_train_scaled, y_train)
reg_lasso3 = Lasso().fit(X_train_scaled, y_train)
reg_elasticnet3 = ElasticNet().fit(X_train_scaled, y_train)

  model = cd_fast.enet_coordinate_descent(


In [47]:
# Predict the target values for the test data for each model
y_pred_linear3 = reg_linear3.predict(X_test_scaled)
y_pred_ridge3 = reg_ridge3.predict(X_test_scaled)
y_pred_lasso3 = reg_lasso3.predict(X_test_scaled)
y_pred_elasticnet3 = reg_elasticnet3.predict(X_test_scaled)

In [48]:
# Pair each model label with its predictions on the hold-out split.
models = list(zip(
    ("linear", "ridge", "lasso", "elasticnet"),
    (y_pred_linear3, y_pred_ridge3, y_pred_lasso3, y_pred_elasticnet3),
))

# Report RMSE (same units as the target) and R2 for every model.
for name, y_pred in models:
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print(f"RMSE for {name}: {rmse}", f"R2 for {name}: {r2}", sep="\n")

RMSE for linear: 58116.3770114255
R2 for linear: 0.83435914274452
RMSE for ridge: 58116.404290411265
R2 for ridge: 0.8343589872456442
RMSE for lasso: 58118.213729167524
R2 for lasso: 0.8343486727075177
RMSE for elasticnet: 69747.09680607652
R2 for elasticnet: 0.7614262403674225


---Trial #1--- </br>
RMSE for linear: 58110.52588670875 </br>
R2 for linear: 0.8343924943254446 </br>
RMSE for ridge: 58110.555928419446 </br>
R2 for ridge: 0.8343923230953969 </br>
RMSE for lasso: 58112.39289845401 </br>
R2 for lasso: 0.8343818526690348 </br>
RMSE for elasticnet: 69766.25305271908 </br>
R2 for elasticnet: 0.7612951723919021 </br>

---Trial #2--- </br>
RMSE for linear: 61489.89423787021 </br>
R2 for linear: 0.8145708973183514 </br>
RMSE for ridge: 61489.9566888585 </br>
R2 for ridge: 0.8145705206633987 </br>
RMSE for lasso: 61494.734178316256 </br>
R2 for lasso: 0.8145417054927563 </br>
RMSE for elasticnet: 71718.4847624646 </br>
R2 for elasticnet: 0.7477491633367318 </br>

---Trial #3--- </br>
RMSE for linear: 58116.3770114255 </br>
R2 for linear: 0.83435914274452 </br>
RMSE for ridge: 58116.404290411265 </br>
R2 for ridge: 0.8343589872456442 </br>
RMSE for lasso: 58118.213729167524 </br>
R2 for lasso: 0.8343486727075177 </br>
RMSE for elasticnet: 69747.09680607652 </br>
R2 for elasticnet: 0.7614262403674225

***Key Findings***
- Previously in Trial #2, it was concluded that models in Trial #1 performed better.
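- The RMSE scores for the Linear, Ridge and Lasso models in Trial #3 are marginally worse than in Trial #1 (e.g. 58116 vs. 58110 for Linear); only ElasticNet improves slightly (69747 vs. 69766).
- Similarly, the R2 scores for Linear, Ridge and Lasso are marginally lower than in Trial #1, while ElasticNet is marginally higher.
- The best models in Trial #1 (Linear, Ridge, Lasso) still perform better than their counterparts in Trial #2 and #3.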

In [49]:
# 5-fold cross-validation of each Trial #3 model on the scaled training split.
# No `scoring` is passed, so each estimator's own score method is used.
# Note: scaler3 was fitted on all of X_train before the folds are cut, so the
# scaling statistics are shared across folds.
cv_scores_linear = cross_val_score(reg_linear3, X_train_scaled, y_train, cv=5)
cv_scores_ridge = cross_val_score(reg_ridge3, X_train_scaled, y_train, cv=5)
cv_scores_lasso = cross_val_score(reg_lasso3, X_train_scaled, y_train, cv=5)
cv_scores_elasticnet = cross_val_score(reg_elasticnet3, X_train_scaled, y_train, cv=5)

cv_report = [
    ("Linear Regression", cv_scores_linear),
    ("Ridge Regression", cv_scores_ridge),
    ("Lasso Regression", cv_scores_lasso),
    ("Elastic Net", cv_scores_elasticnet),
]

# Print mean and spread of the fold scores; a blank line separates the sections.
for position, (label, scores) in enumerate(cv_report):
    prefix = "\n" if position else ""
    print(f"{prefix}{label} Cross Validation Results:")
    print(f"Mean: {np.mean(scores):.2f}")
    print(f"Standard Deviation: {np.std(scores):.2f}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Linear Regression Cross Validation Results:
Mean: 0.84
Standard Deviation: 0.00

Ridge Regression Cross Validation Results:
Mean: 0.84
Standard Deviation: 0.00

Lasso Regression Cross Validation Results:
Mean: 0.84
Standard Deviation: 0.00

Elastic Net Cross Validation Results:
Mean: 0.76
Standard Deviation: 0.00


***Key Findings***
- The mean cross-validation score (R2) is the same as in Trial #1 to two decimal places.

### Trial #4 - Adding a Relationship

Add a relationship feature for bus_stop_nearest_distance and mrt_nearest_distance to determine impact on model. </br>

Numeric Features:
- floor_area_sqft
- remaining_lease
- mrt_nearest_distance
- bus_stop_nearest_distance
- mrt_bus_stop_interaction (interaction feature: the product of mrt_nearest_distance and bus_stop_nearest_distance)
- max_floor_lvl

Categorical Features:
- town
- flat_type

In [50]:
# numeric features to be used as part of prediction
numeric_features = df_train[['floor_area_sqft', 'remaining_lease', 'bus_stop_nearest_distance', 'mrt_nearest_distance', 'max_floor_lvl']]

In [51]:
numeric_features.head(2)

Unnamed: 0,floor_area_sqft,remaining_lease,bus_stop_nearest_distance,mrt_nearest_distance,max_floor_lvl
0,968,84,29.427395,330.083069,25
1,1399,65,58.207761,903.659703,9


In [52]:
# categorical features to be used as part of prediction
cat_features = df_train[['town', 'flat_type']]

In [53]:
df_dummies = pd.get_dummies(cat_features, drop_first=True)

In [54]:
df_dummies.head(2)

Unnamed: 0,town_BEDOK,town_BISHAN,town_BUKIT BATOK,town_BUKIT MERAH,town_BUKIT PANJANG,town_BUKIT TIMAH,town_CENTRAL AREA,town_CHOA CHU KANG,town_CLEMENTI,town_GEYLANG,town_HOUGANG,town_JURONG EAST,town_JURONG WEST,town_KALLANG/WHAMPOA,town_MARINE PARADE,town_PASIR RIS,town_PUNGGOL,town_QUEENSTOWN,town_SEMBAWANG,town_SENGKANG,town_SERANGOON,town_TAMPINES,town_TOA PAYOH,town_WOODLANDS,town_YISHUN,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI-GENERATION
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [55]:
# merge both
df_merged4 = numeric_features.merge(df_dummies, left_index=True, right_index=True)

In [56]:
df_merged4.head(2)

Unnamed: 0,floor_area_sqft,remaining_lease,bus_stop_nearest_distance,mrt_nearest_distance,max_floor_lvl,town_BEDOK,town_BISHAN,town_BUKIT BATOK,town_BUKIT MERAH,town_BUKIT PANJANG,town_BUKIT TIMAH,town_CENTRAL AREA,town_CHOA CHU KANG,town_CLEMENTI,town_GEYLANG,town_HOUGANG,town_JURONG EAST,town_JURONG WEST,town_KALLANG/WHAMPOA,town_MARINE PARADE,town_PASIR RIS,town_PUNGGOL,town_QUEENSTOWN,town_SEMBAWANG,town_SENGKANG,town_SERANGOON,town_TAMPINES,town_TOA PAYOH,town_WOODLANDS,town_YISHUN,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI-GENERATION
0,968,84,29.427395,330.083069,25,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1399,65,58.207761,903.659703,9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [57]:
# create polynomial features
# degree=2 + interaction_only=True + include_bias=False on two inputs yields
# exactly three output columns: [x1, x2, x1*x2].
poly_features = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

mrt_bus_stop = df_merged4[['mrt_nearest_distance', 'bus_stop_nearest_distance']]
poly_mrt_bus_stop = poly_features.fit_transform(mrt_bus_stop)

# add the interaction term (last output column) back to the dataframe.
# Assigning the NumPy column positionally keeps each value on the row it was
# computed from whatever df_merged4's index looks like; concatenating a new
# DataFrame with a default 0..n-1 index would align on labels instead and
# introduce NaN rows if the indexes differ. assign() also overwrites the
# column on a re-run instead of appending a duplicate.
df_merged4 = df_merged4.assign(mrt_bus_stop_interaction=poly_mrt_bus_stop[:, -1])

In [58]:
# check for high collinearity in the Trial #4 feature matrix.
# This must inspect df_merged4 (which carries the new interaction column);
# checking df_merged would only re-test the Trial #1 features.
highly_correlated = check_collinearity(df_merged4)
if highly_correlated:
    print("The following columns are highly correlated and need to be dropped: ", highly_correlated)
else:
    print("No high collinearity detected in the dataframe.")

No high collinearity detected in the dataframe.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [59]:
df_merged4.head(2)

Unnamed: 0,floor_area_sqft,remaining_lease,bus_stop_nearest_distance,mrt_nearest_distance,max_floor_lvl,town_BEDOK,town_BISHAN,town_BUKIT BATOK,town_BUKIT MERAH,town_BUKIT PANJANG,town_BUKIT TIMAH,town_CENTRAL AREA,town_CHOA CHU KANG,town_CLEMENTI,town_GEYLANG,town_HOUGANG,town_JURONG EAST,town_JURONG WEST,town_KALLANG/WHAMPOA,town_MARINE PARADE,town_PASIR RIS,town_PUNGGOL,town_QUEENSTOWN,town_SEMBAWANG,town_SENGKANG,town_SERANGOON,town_TAMPINES,town_TOA PAYOH,town_WOODLANDS,town_YISHUN,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI-GENERATION,mrt_bus_stop_interaction
0,968,84,29.427395,330.083069,25,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,9713.484742
1,1399,65,58.207761,903.659703,9,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,52600.007804


In [60]:
X = df_merged4
y = df_train['resale_price']

In [61]:
# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [62]:
# scale the training and test data using the Standard Scaler
scaler4 = StandardScaler().fit(X_train)
X_train_scaled = scaler4.transform(X_train)
X_test_scaled = scaler4.transform(X_test)

In [63]:
# fit the linear, Ridge, Lasso, and Elastic Net regression models to the training data
# All four estimators are constructed with their default hyperparameters;
# no regularisation strength is tuned in this cell.
# NOTE(review): the output of this cell shows a warning raised inside
# cd_fast.enet_coordinate_descent - presumably a convergence warning from the
# coordinate-descent solver behind Lasso/ElasticNet. Confirm which model emits
# it and consider raising max_iter before relying on that model's scores.
reg_linear4 = LinearRegression().fit(X_train_scaled, y_train)
reg_ridge4 = Ridge().fit(X_train_scaled, y_train)
reg_lasso4 = Lasso().fit(X_train_scaled, y_train)
reg_elasticnet4 = ElasticNet().fit(X_train_scaled, y_train)

  model = cd_fast.enet_coordinate_descent(


In [64]:
# Predict the target values for the test data for each model
y_pred_linear4 = reg_linear4.predict(X_test_scaled)
y_pred_ridge4 = reg_ridge4.predict(X_test_scaled)
y_pred_lasso4 = reg_lasso4.predict(X_test_scaled)
y_pred_elasticnet4 = reg_elasticnet4.predict(X_test_scaled)

In [65]:
# Pair each model label with its predictions on the hold-out split.
models = list(zip(
    ("linear", "ridge", "lasso", "elasticnet"),
    (y_pred_linear4, y_pred_ridge4, y_pred_lasso4, y_pred_elasticnet4),
))

# Report RMSE (same units as the target) and R2 for every model.
for name, y_pred in models:
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print(f"RMSE for {name}: {rmse}", f"R2 for {name}: {r2}", sep="\n")

RMSE for linear: 58116.398184351594
R2 for linear: 0.8343590220521284
RMSE for ridge: 58116.42388180355
R2 for ridge: 0.8343588755684457
RMSE for lasso: 58118.214205502336
R2 for lasso: 0.8343486699921729
RMSE for elasticnet: 69619.3374204709
R2 for elasticnet: 0.762299455810419


---Trial #1--- </br>
RMSE for linear: 58110.52588670875 </br>
R2 for linear: 0.8343924943254446 </br>
RMSE for ridge: 58110.555928419446 </br>
R2 for ridge: 0.8343923230953969 </br>
RMSE for lasso: 58112.39289845401 </br>
R2 for lasso: 0.8343818526690348 </br>
RMSE for elasticnet: 69766.25305271908 </br>
R2 for elasticnet: 0.7612951723919021 </br>

---Trial #2--- </br>
RMSE for linear: 61489.89423787021 </br>
R2 for linear: 0.8145708973183514 </br>
RMSE for ridge: 61489.9566888585 </br>
R2 for ridge: 0.8145705206633987 </br>
RMSE for lasso: 61494.734178316256 </br>
R2 for lasso: 0.8145417054927563 </br>
RMSE for elasticnet: 71718.4847624646 </br>
R2 for elasticnet: 0.7477491633367318 </br>

---Trial #3--- </br>
RMSE for linear: 58116.3770114255 </br>
R2 for linear: 0.83435914274452 </br>
RMSE for ridge: 58116.404290411265 </br>
R2 for ridge: 0.8343589872456442 </br>
RMSE for lasso: 58118.213729167524 </br>
R2 for lasso: 0.8343486727075177 </br>
RMSE for elasticnet: 69747.09680607652 </br>
R2 for elasticnet: 0.7614262403674225 </br>

---Trial #4--- </br>
RMSE for linear: 58116.398184351594 </br>
R2 for linear: 0.8343590220521284 </br>
RMSE for ridge: 58116.42388180355 </br>
R2 for ridge: 0.8343588755684457 </br>
RMSE for lasso: 58118.214205502336 </br>
R2 for lasso: 0.8343486699921729 </br>
RMSE for elasticnet: 69619.3374204709 </br>
R2 for elasticnet: 0.762299455810419

***Key Findings***
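- The RMSE scores for the Linear, Ridge and Lasso models in Trial #4 are marginally worse than in Trial #1 (e.g. 58116 vs. 58110 for Linear); only ElasticNet improves slightly (69619 vs. 69766).
- Similarly, the R2 scores for Linear, Ridge and Lasso are marginally lower than in Trial #1, while ElasticNet is marginally higher.
- The best models in Trial #1 (Linear, Ridge, Lasso) still perform better than their counterparts in Trial #2, #3 and #4.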

In [66]:
# 5-fold cross-validation of each Trial #4 model on the scaled training split.
# No `scoring` is passed, so each estimator's own score method is used.
# Note: scaler4 was fitted on all of X_train before the folds are cut, so the
# scaling statistics are shared across folds.
cv_scores_linear = cross_val_score(reg_linear4, X_train_scaled, y_train, cv=5)
cv_scores_ridge = cross_val_score(reg_ridge4, X_train_scaled, y_train, cv=5)
cv_scores_lasso = cross_val_score(reg_lasso4, X_train_scaled, y_train, cv=5)
cv_scores_elasticnet = cross_val_score(reg_elasticnet4, X_train_scaled, y_train, cv=5)

cv_report = [
    ("Linear Regression", cv_scores_linear),
    ("Ridge Regression", cv_scores_ridge),
    ("Lasso Regression", cv_scores_lasso),
    ("Elastic Net", cv_scores_elasticnet),
]

# Print mean and spread of the fold scores; a blank line separates the sections.
for position, (label, scores) in enumerate(cv_report):
    prefix = "\n" if position else ""
    print(f"{prefix}{label} Cross Validation Results:")
    print(f"Mean: {np.mean(scores):.2f}")
    print(f"Standard Deviation: {np.std(scores):.2f}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Linear Regression Cross Validation Results:
Mean: 0.84
Standard Deviation: 0.00

Ridge Regression Cross Validation Results:
Mean: 0.84
Standard Deviation: 0.00

Lasso Regression Cross Validation Results:
Mean: 0.84
Standard Deviation: 0.00

Elastic Net Cross Validation Results:
Mean: 0.76
Standard Deviation: 0.00


***Key Findings***
- The mean cross-validation score (R2) is the same as in Trial #1 to two decimal places.

___

## Summary

- Trial #1 produced the best model
- Linear Regression within Trial #1 was the best performing model

___

## Predictions

In [67]:
# prepare unseen data
df_test.head(2)

Unnamed: 0,town,flat_type,Tranc_Year,Tranc_Month,floor_area_sqft,max_floor_lvl,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,exec_sold,multigen_sold,studio_apartment_sold,Latitude,Longitude,Mall_Nearest_Distance,Hawker_Nearest_Distance,mrt_nearest_distance,bus_stop_nearest_distance,pri_sch_nearest_distance,sec_sch_nearest_dist,remaining_lease
0,YISHUN,4 ROOM,2012,11,904,12,0,0,0,92,40,0,0,0,1.437066,103.831121,877.431572,687.576779,686.660434,75.683952,426.46791,156.322353,65
1,JURONG WEST,5 ROOM,2019,8,1205,14,0,0,0,28,25,0,0,0,1.336957,103.695668,534.037705,2122.346226,169.478175,88.993058,439.756851,739.371688,86


In [68]:
# numeric features to be used as part of prediction
numeric_features_2 = df_test[['floor_area_sqft', 'remaining_lease', 'mrt_nearest_distance', 'max_floor_lvl']]

In [69]:
cat_features_2 = df_test[['town', 'flat_type']]

In [70]:
df_dummies_2 = pd.get_dummies(cat_features_2, drop_first=True)

In [71]:
# Combine numeric and dummy-encoded features, then align the columns with the
# Trial #1 training matrix (df_merged) that scaler1 and reg_linear1 were fitted on.
# get_dummies only creates columns for categories present in the frame it is
# given, so a training column missing here is added as all zeros, a column not
# seen in training is dropped, and the column order is made identical.
# NOTE: this rebinds df_test from the raw test frame to the model-ready matrix.
df_test = (
    numeric_features_2
    .merge(df_dummies_2, left_index=True, right_index=True)
    .reindex(columns=df_merged.columns, fill_value=0)
)

In [72]:
X_pred_ss = scaler1.transform(df_test)

In [73]:
y_pred_final = pd.Series(reg_linear1.predict(X_pred_ss), name='predicted')

In [74]:
df_ori = pd.read_csv('../datasets/test.csv', low_memory=False)

In [75]:
y_result = pd.concat([df_ori['id'], y_pred_final], axis=1)

In [76]:
# Renumber the rows from 0, discarding the existing index labels.
y_result = y_result.reset_index(drop=True)

In [77]:
y_result.shape

(16737, 2)

In [78]:
y_result.head(2)

Unnamed: 0,id,predicted
0,114982,338862.731019
1,95653,534563.808552


In [79]:
y_result.to_csv('../datasets/result.csv', index=False)