In [70]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Read the pre-cleaned housing data (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="clean_house_dataset.csv")


In [71]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Size_sqft,Bedrooms,Bathrooms,YearBuilt,Price,Location_City,Location_Rural,Location_Suburb,HouseAge,Rooms_per_1000sqft,Size_per_Bedroom,Is_City,LogPrice
0,1.030281,-1.463643,0.088986,-1.279342,812100.0,1,0,0,1.279342,-1.061465,3.123085,1,13.60738
1,-0.482463,-1.463643,1.347506,1.326476,547000.0,1,0,0,-1.326476,-0.265637,1.30952,1,13.212206
2,0.468877,0.00743,-1.169534,-1.339942,693700.0,1,0,0,1.339942,-0.689547,-0.16397,1,13.449796
3,1.079817,0.742966,1.347506,-0.91574,848300.0,1,0,0,0.91574,-0.199111,-0.307614,1,13.650991
4,0.788954,1.478502,-1.169534,0.962873,806000.0,0,0,1,-0.962873,-0.311002,-0.610027,0,13.59984


In [72]:
# Summarize column names, dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Size_sqft           99 non-null     float64
 1   Bedrooms            99 non-null     float64
 2   Bathrooms           99 non-null     float64
 3   YearBuilt           99 non-null     float64
 4   Price               99 non-null     float64
 5   Location_City       99 non-null     int64  
 6   Location_Rural      99 non-null     int64  
 7   Location_Suburb     99 non-null     int64  
 8   HouseAge            99 non-null     float64
 9   Rooms_per_1000sqft  99 non-null     float64
 10  Size_per_Bedroom    99 non-null     float64
 11  Is_City             99 non-null     int64  
 12  LogPrice            99 non-null     float64
dtypes: float64(9), int64(4)
memory usage: 10.2 KB


In [73]:
# (number of rows, number of columns)
df.shape

(99, 13)

In [74]:
# Count missing values per column (isnull is an alias of isna).
df.isnull().sum()

Size_sqft             0
Bedrooms              0
Bathrooms             0
YearBuilt             0
Price                 0
Location_City         0
Location_Rural        0
Location_Suburb       0
HouseAge              0
Rooms_per_1000sqft    0
Size_per_Bedroom      0
Is_City               0
LogPrice              0
dtype: int64

In [75]:
# Separate the target variable (y) from the features (x).
# "Price" is the target; "LogPrice" is removed from the features as well because,
# in the df.head() preview, it equals the natural log of Price and would leak the target.
# NOTE(review): in that same preview HouseAge is the exact negation of YearBuilt and
# Is_City matches Location_City -- confirm whether keeping these redundant columns is intended.
y = df["Price"]
x = df.drop(["Price", "LogPrice"], axis=1)

In [76]:
# Split the data into training and testing sets: 20% of the rows are held out,
# and the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [83]:
# Create the models to train.
# Two regressors are compared: a linear regression and a random forest with
# 100 trees (random_state is fixed so the forest is reproducible).
rf = RandomForestRegressor(random_state=42, n_estimators=100)
lr = LinearRegression()


In [84]:
# Fit both models on the training split.
lr.fit(X=x_train, y=y_train)

rf.fit(X=x_train, y=y_train)

In [85]:
# Evaluation of the models:
# predict on the held-out test features with each fitted model.
rf_pred = rf.predict(X=x_test)
lr_pred = lr.predict(X=x_test)

In [97]:
def print_metrics(name, y, y_predict):
    """Print R2, MAE, MSE and RMSE for one set of predictions.

    Args:
        name: Label shown in the "Prediction of ..." header line.
        y: True target values.
        y_predict: Predicted values, compared against ``y``.

    Returns:
        None. The report is written to stdout, one metric per line.
    """
    mse = mean_squared_error(y, y_predict)
    rmse = np.sqrt(mse)  # RMSE is derived from MSE rather than recomputed
    mae = mean_absolute_error(y, y_predict)
    r2 = r2_score(y, y_predict)
    print(
        f"Prediction of {name}",
        f"R2:{r2:.3f}",
        f"MAE:{mae:,.0f}",
        f"MSE:{mse:,.0f}",
        f"RMSE:{rmse:,.0f}",
        sep="\n",
    )
# Report test-set metrics for each model, linear regression first.
for model_label, model_pred in (("LR", lr_pred), ("RF", rf_pred)):
    print_metrics(model_label, y_test, model_pred)

Prediction of LR
R2:0.848
MAE:63,086
MSE:5,718,940,941
RMSE:75,624
Prediction of RF
R2:0.859
MAE:52,524
MSE:5,283,317,455
RMSE:72,686


In [106]:
def _single_row_check(check_no, row_pos):
    """Print actual vs. predicted price for one test-set row.

    Args:
        check_no: Number shown in the "Single Row Sanity Check N" header.
        row_pos: Position of the row within x_test / y_test (used with
            ``.iloc``, so it is a position, not a DataFrame index label).

    Returns:
        Tuple ``(row_df, actual, lr_price, rf_price)``: the one-row feature
        frame, the true price, and the two model predictions as floats.

    Raises:
        IndexError: If ``row_pos`` is outside the test set.
    """
    n_test = len(x_test)
    if not -n_test <= row_pos < n_test:
        raise IndexError(
            f"row position {row_pos} is out of range for a test set of {n_test} rows"
        )

    # The list selector [[row_pos]] keeps a one-row DataFrame, so predict()
    # receives 2-D input with the same columns the models were fitted on.
    row_df = x_test.iloc[[row_pos]]
    actual = y_test.iloc[row_pos]
    lr_price = float(lr.predict(row_df)[0])
    rf_price = float(rf.predict(row_df)[0])

    print(f"Single Row Sanity Check {check_no}")
    print(f"Actual Price: ${actual:,.0f}")
    print(f"LR Prediction: ${lr_price:,.0f}")
    print(f"RF Prediction: ${rf_price:,.0f}")
    print("-"*40)
    return row_df, actual, lr_price, rf_price


# Positions of the test-set rows to spot-check; each must be < len(x_test).
i = 0
i2 = 5
i3 = 10

# The original variable names are kept because later cells read them
# (e.g. y_true, p_lr_one and p_rf_one are printed again further down).
x_one_df, y_true, p_lr_one, p_rf_one = _single_row_check(1, i)
x_one_df2, y_true2, p_lr_one2, p_rf_one2 = _single_row_check(2, i2)
x_one_df3, y_true3, p_lr_one3, p_rf_one3 = _single_row_check(3, i3)








Single Row Sanity Check 1
Actual Price: $642,500
LR Prediction: $656,755
RF Prediction: $789,031
----------------------------------------
Single Row Sanity Check 2
Actual Price: $419,200
LR Prediction: $411,139
RF Prediction: $297,368
----------------------------------------
Single Row Sanity Check 3
Actual Price: $366,000
LR Prediction: $299,971
RF Prediction: $311,571
----------------------------------------


In [117]:
# Show the first sanity-check row again: actual price next to both predictions.
print(
    f"Actual Price: ${y_true:,.0f}",
    f"LR Prediction: ${p_lr_one:,.0f}",
    f"RF Prediction: ${p_rf_one:,.0f}",
    sep="\n",
)


Actual Price: $642,500
LR Prediction: $656,755
RF Prediction: $789,031
