In [1]:
import pandas as pd

In [2]:
# Load the housing prices data
df = pd.read_csv('Real estate.csv')

In [3]:
# Display the first few rows of the dataset
print(df.head())

   No  X1 transaction date  X2 house age  \
0   1             2012.917          32.0   
1   2             2012.917          19.5   
2   3             2013.583          13.3   
3   4             2013.500          13.3   
4   5             2012.833           5.0   

   X3 distance to the nearest MRT station  X4 number of convenience stores  \
0                                84.87882                               10   
1                               306.59470                                9   
2                               561.98450                                5   
3                               561.98450                                5   
4                               390.56840                                5   

   X5 latitude  X6 longitude  Y house price of unit area  
0     24.98298     121.54024                        37.9  
1     24.98034     121.53951                        42.2  
2     24.98746     121.54391                        47.3  
3     24.98746     121.54391  

In [4]:
# Perform exploratory data analysis (EDA)
print(df.describe())

               No  X1 transaction date  X2 house age  \
count  414.000000           414.000000    414.000000   
mean   207.500000          2013.148971     17.712560   
std    119.655756             0.281967     11.392485   
min      1.000000          2012.667000      0.000000   
25%    104.250000          2012.917000      9.025000   
50%    207.500000          2013.167000     16.100000   
75%    310.750000          2013.417000     28.150000   
max    414.000000          2013.583000     43.800000   

       X3 distance to the nearest MRT station  \
count                              414.000000   
mean                              1083.885689   
std                               1262.109595   
min                                 23.382840   
25%                                289.324800   
50%                                492.231300   
75%                               1454.279000   
max                               6488.021000   

       X4 number of convenience stores  X5 latitude  

In [5]:
# Check for missing values
print(df.isnull().sum())

No                                        0
X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
Y house price of unit area                0
dtype: int64


In [6]:
# Check for any categorical variables
print(df.dtypes)

No                                          int64
X1 transaction date                       float64
X2 house age                              float64
X3 distance to the nearest MRT station    float64
X4 number of convenience stores             int64
X5 latitude                               float64
X6 longitude                              float64
Y house price of unit area                float64
dtype: object


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

In [8]:
# Split data into features (X) and target variable (y)
X = df.drop(columns=['Y house price of unit area'])
y = df['Y house price of unit area']

In [9]:
# Step 2: Splitting Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Step 3: Train Decision Trees Model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)


In [11]:
# Step 4: Evaluate Decision Trees Model
dt_predictions = dt_model.predict(X_test)
dt_mse = mean_squared_error(y_test, dt_predictions)
dt_mae = mean_absolute_error(y_test, dt_predictions)
dt_rmse = np.sqrt(dt_mse)

In [12]:
print("Decision Trees Model Performance:")
print("Mean Squared Error (MSE):", dt_mse)
print("Mean Absolute Error (MAE):", dt_mae)
print("Root Mean Squared Error (RMSE):", dt_rmse)

Decision Trees Model Performance:
Mean Squared Error (MSE): 53.89337349397591
Mean Absolute Error (MAE): 5.360240963855421
Root Mean Squared Error (RMSE): 7.341210628634483


In [13]:
# Step 5: Train Random Forests Model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

In [14]:
# Step 6: Evaluate Random Forests Model
rf_predictions = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_rmse = np.sqrt(rf_mse)

In [15]:
print("\nRandom Forests Model Performance:")
print("Mean Squared Error (MSE):", rf_mse)
print("Mean Absolute Error (MAE):", rf_mae)
print("Root Mean Squared Error (RMSE):", rf_rmse)


Random Forests Model Performance:
Mean Squared Error (MSE): 31.859230951807227
Mean Absolute Error (MAE): 3.8668554216867457
Root Mean Squared Error (RMSE): 5.644398192173123
