In [1]:
#Importing the necessary libraries, numpy, pandas, and necessary sklearn libraries to build the regression models

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score

In [78]:
#Loading the property listings from the CSV file into a dataframe
#(note: the name `property` shadows Python's built-in of the same name from here on)

property = pd.read_csv(
    'property.csv',
    encoding='unicode_escape',
)

In [79]:
#Coercing the area and price columns to numbers; entries that cannot be parsed (such as '-') become NaN

for numeric_column in ('Area in m²', 'Price'):
    property[numeric_column] = pd.to_numeric(property[numeric_column], errors='coerce')

In [80]:
#Filling the missing area/price values with a KNN imputer (n_neighbors=7) and wrapping the result in a new dataframe

data_for_imputation = property[['Area in m²', 'Price']]
imputer = KNNImputer(n_neighbors=7)
imputed_data = imputer.fit_transform(data_for_imputation)

#fit_transform hands back a bare array, so the source index is reattached here. Without it the new frame gets a
#fresh 0..n-1 index, and assigning its columns back to `property` (which aligns on index labels) would put values
#on the wrong rows or leave NaN whenever `property` does not carry the default index.
imputed_df = pd.DataFrame(
    imputed_data,
    columns=['Area in m²', 'Price'],
    index=data_for_imputation.index,
)

In [81]:
#Writing the imputed area and price columns back over the originals in the main dataframe

for imputed_column in ('Area in m²', 'Price'):
    property[imputed_column] = imputed_df[imputed_column]

In [82]:
#Counting the remaining null values per column to confirm that none are left

missing_per_column = property.isna().sum()
missing_per_column

Address          0
Bedrooms         0
Bathrooms        0
Parking Spots    0
Study            0
Area in m²       0
Price            0
dtype: int64

In [83]:
#Separating the target (Price) from the features (everything except Address and Price),
#then holding out 45% of the rows as the test partition with a fixed seed

y = property['Price']
x = property.drop(columns=['Address', 'Price'])

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.45,
    random_state=24,
)

In [84]:
#Linear regression model: fit on the training partition, then predict the test partition

linear_model = LinearRegression().fit(x_train, y_train)
y_pred_linear = linear_model.predict(x_test)

In [85]:
#Gradient boosting model: fit on the training partition, then predict the test partition

#random_state is fixed (same seed as the train/test split) so that repeated runs build the same model;
#without it the fitted model, and every metric reported for it, can change from run to run
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=24)
gbr_model.fit(x_train, y_train)
y_pred_gb = gbr_model.predict(x_test)

In [86]:
#Random forest model: fit on the training partition, then predict the test partition

#random_state is fixed (same seed as the train/test split) so that repeated runs build the same forest;
#without it the fitted model, and every metric reported for it, changes from run to run
rf_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=24)
rf_model.fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)

#These are random forest predictions, not KNN ones; the old name is kept as an alias for any cell that still reads it
y_pred_knn = y_pred_rf

In [87]:
#Describing a made-up apartment as a one-row dataframe so its price can be predicted
#(the columns are listed in the same order as the feature columns used for training)

feature_names = ['Bedrooms', 'Bathrooms', 'Parking Spots', 'Study', 'Area in m²']
feature_values = [4, 3, 2, 1, 200]

#Each value is wrapped in a single-element list so the dataframe gets exactly one row
testapart = {feature: [value] for feature, value in zip(feature_names, feature_values)}

test = pd.DataFrame(testapart)

In [88]:
#Running the imaginary apartment through each fitted model and printing the rounded price estimates
#(predicted_price_knn holds the random forest prediction, despite its name)

predicted_price_linear, predicted_price_gb, predicted_price_knn = (
    fitted_model.predict(test) for fitted_model in (linear_model, gbr_model, rf_model)
)

for label, prediction in (
    ('Linear Regression:', predicted_price_linear),
    ('Gradient Boosting:', predicted_price_gb),
    ('Random Forest:', predicted_price_knn),
):
    print(label, round(prediction[0]))

Linear Regression: 1410216
Gradient Boosting: 1499871
Random Forest: 1604838


In [89]:
#Evaluating the three models: 10-fold cross-validation on the training partition,
#followed by a refit on the training partition and a score on the held-out test partition

models = {
    'Linear Regression': linear_model,
    'Gradient Boosting Regression': gbr_model,
    'Random Forest Regression': rf_model
}

for name, model in models.items():

    #The built-in RMSE scorer yields negated per-fold RMSE (scorers follow "higher is better"),
    #so only the sign needs flipping; no manual square root of the MSE is required
    scores = cross_val_score(model, x_train, y_train, cv=10, scoring='neg_root_mean_squared_error')
    rmse_scores = -scores

    #Refit on the whole training partition before scoring on the test partition
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    rmse = round(root_mean_squared_error(y_test, y_pred),2)
    r2 = round(r2_score(y_test, y_pred),3)

    print(f"Model: {name}")
    print(f"Cross-Validation RMSE: {round(rmse_scores.mean(),2)}")
    print(f"Test RMSE: {rmse}")
    print(f"R-squared: {r2}")
    print("="*30)

#We have concluded that the best model to predict apartment prices in Sydney Olympic Park is the
#Gradient Boosting Regression Model, with an RMSE value of 70463.59 and an R-squared value of 0.845.
#NOTE(review): those figures are the test-partition metrics; in the captured output the cross-validation RMSE
#ranks the models differently (Random Forest lowest) - confirm which criterion should pick the winner.

Model: Linear Regression
Cross-Validation RMSE: 127168.68
Test RMSE: 77612.81
R-squared: 0.812
Model: Gradient Boosting Regression
Cross-Validation RMSE: 117681.02
Test RMSE: 70463.59
R-squared: 0.845
Model: Random Forest Regression
Cross-Validation RMSE: 106885.98
Test RMSE: 73915.64
R-squared: 0.829
