In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression  
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [7]:
# 1. Read the cleaned housing data from CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="clean_house_l5_dataset.csv")
df.head(n=5)

Unnamed: 0,Size_sqft,Bedrooms,Bathrooms,YearBuilt,Price,Location_City,Location_Rural,Location_Suburb,HouseAge,Rooms_per_1000sqft,Size_per_Bedroom,Is_City,LogPrice
0,1.030281,-1.463643,0.088986,-1.279342,812100.0,1,0,0,1.279342,-1.061465,3.123085,1,13.60738
1,-0.482463,-1.463643,1.347506,1.326476,547000.0,1,0,0,-1.326476,-0.265637,1.30952,1,13.212206
2,0.468877,0.00743,-1.169534,-1.339942,693700.0,1,0,0,1.339942,-0.689547,-0.16397,1,13.449796
3,1.079817,0.742966,1.347506,-0.91574,848300.0,1,0,0,0.91574,-0.199111,-0.307614,1,13.650991
4,0.788954,1.478502,-1.169534,0.962873,806000.0,0,0,1,-0.962873,-0.311002,-0.610027,0,13.59984


In [8]:
# 2. Split the frame into the target (y) and the feature matrix (x).
# Both price columns (Price and LogPrice) are removed from the features so
# that no form of the target remains among the model inputs.
y = df.loc[:, "Price"]
x = df.drop(["Price", "LogPrice"], axis="columns")

In [9]:
# 3. Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# 4. Linear Regression: train on the training split, then predict prices
# for the held-out test split.
lr = LinearRegression().fit(x_train, y_train)  # fit() hands back the fitted estimator
y_pred_lr = lr.predict(x_test)

In [11]:
# Wrap the Linear Regression predictions in a one-column DataFrame and
# preview the first five rows.
y_pred_lr_df = pd.DataFrame({'L_r_Predicted_Price': y_pred_lr})
y_pred_lr_df.head(n=5)

Unnamed: 0,L_r_Predicted_Price
0,656754.667208
1,822634.717344
2,188637.494731
3,594040.937865
4,609615.224851


In [12]:
# 5. Random Forest Regressor: 100 trees with a fixed seed, trained on the
# training split and then used to predict prices for the test split.
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(x_train, y_train)  # fit() hands back the fitted estimator
y_pred_rf = rf.predict(x_test)


In [13]:
# Wrap the Random Forest predictions in a one-column DataFrame and preview
# the first five rows.
y_pred_rf_df = pd.DataFrame({'R_f_Predicted_Price': y_pred_rf})
y_pred_rf_df.head(n=5)

Unnamed: 0,R_f_Predicted_Price
0,789031.0
1,821977.0
2,290899.0
3,557028.0
4,538756.0


In [14]:
# 6. Model evaluation helper
def display_evaluation_metrics(model_name, y_actual, y_pred):
    """Print MAE, MSE, RMSE and R2 for one model's predictions.

    Parameters
    ----------
    model_name : label placed in the header line of the printed report.
    y_actual   : true target values.
    y_pred     : predicted target values, compared against ``y_actual``.

    Returns
    -------
    None. The report is written to standard output only.
    """
    squared_error = mean_squared_error(y_actual, y_pred)
    absolute_error = mean_absolute_error(y_actual, y_pred)
    # RMSE is the square root of the MSE, i.e. the error in the target's own units.
    root_squared_error = np.sqrt(squared_error)
    determination = r2_score(y_actual, y_pred)

    # Emit the whole report in one call, one metric per line.
    print(
        f"\n{model_name} Evaluation Metrics:",
        f"MAE:      {absolute_error:.0f}",
        f"MSE:      {squared_error:.0f}",
        f"RMSE:     {root_squared_error:.0f}",
        f"R2 Score: {determination:.2f}",
        sep="\n",
    )
# Report the test-set metrics for each model, Linear Regression first.
display_evaluation_metrics(
    model_name="Linear Regression",
    y_actual=y_test,
    y_pred=y_pred_lr,
)
display_evaluation_metrics(
    model_name="Random Forest Regressor",
    y_actual=y_test,
    y_pred=y_pred_rf,
)


Linear Regression Evaluation Metrics:
MAE:      63086
MSE:      5718940941
RMSE:     75624
R2 Score: 0.85

Random Forest Regressor Evaluation Metrics:
MAE:      52524
MSE:      5283317455
RMSE:     72686
R2 Score: 0.86


In [16]:
# 7. Sanity check: compare both models against the actual price of one test row.
i = 3  # positional index of the test row to inspect
y_actual = y_test.iloc[i]
# A list indexer keeps the row as a one-row DataFrame, matching the tabular
# input the models were fitted on.
single_row = x_test.iloc[[i]]

# Each predict() call yields a one-element array; .item() extracts it as a plain float.
pred_lr = lr.predict(single_row).item()
pred_rf = rf.predict(single_row).item()

print(
    f"\nActual Price: ${y_actual:,.0f}",
    f"Linear Regression Predicted Price: ${pred_lr:,.0f}",
    f"Random Forest Predicted Price: ${pred_rf:,.0f}",
    sep="\n",
)


Actual Price: $554,800
Linear Regression Predicted Price: $594,041
Random Forest Predicted Price: $557,028
