In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
# Load the modelling dataset and list its columns as a quick schema check.
DATA_FILE = 'immo-ml-data.csv'
df = pd.read_csv(DATA_FILE)
df.columns

Index(['Type', 'Price', 'Bedrooms', 'Is_Equiped_Kitchen', 'State', 'Facades',
       'Swim_pool', 'Region', 'Bedroom_Bin_Code', 'Log_Living_Area',
       'Sqrt_Total_Outdoor_Area'],
      dtype='object')

In [None]:
# Group the raw bedroom counts into five ordered size bands.
# pd.cut uses right-closed intervals by default:
# (-1, 1], (1, 3], (3, 5], (5, 10], (10, inf).
bins = [-1, 1, 3, 5, 10, float('inf')]
labels = ['Studio/Small', 'Small Family', 'Medium Family', 'Large', 'Luxury']

df = df.assign(Bedroom_Bin=pd.cut(df['Bedrooms'], bins=bins, labels=labels))

# Show the first rows to sanity-check the count -> band assignment.
print(df.loc[:, ['Bedrooms', 'Bedroom_Bin']].head())


In [None]:
# Translate each bedroom band into its ordinal code (1 = first band ... 5 = last band).
bedroom_bin_mapping = {
    band: code
    for code, band in enumerate(
        ['Studio/Small', 'Small Family', 'Medium Family', 'Large', 'Luxury'],
        start=1,
    )
}

# Store the numeric code and discard the label column in one chained step.
df = (
    df
    .assign(Bedroom_Bin_Code=df['Bedroom_Bin'].map(bedroom_bin_mapping))
    .drop(columns=['Bedroom_Bin'])
)

In [None]:
# Columns with less than 0.3% feature importance.
# When the loaded CSV has already been saved without these columns, a plain
# drop raises KeyError on a fresh "Run All"; errors='ignore' turns the cell
# into a no-op in that case and keeps it safe to re-run.
low_importance_columns = ['Garden', 'Terrace', 'Is_Open_Fire']
df = df.drop(columns=low_importance_columns, errors='ignore')

In [None]:
# Aggregation of outdoor areas: Total_Outdoor_Area = Terrace_Area + Garden_Area.
# Guarded so the cell is skipped (instead of raising KeyError) when the loaded
# CSV no longer carries the two raw area columns, i.e. it is already processed.
outdoor_area_columns = ['Terrace_Area', 'Garden_Area']
if set(outdoor_area_columns).issubset(df.columns):
    df['Total_Outdoor_Area'] = df['Terrace_Area'] + df['Garden_Area']
    # The two source columns are replaced by their sum.
    df = df.drop(columns=outdoor_area_columns)

In [None]:
# Replace Living_Area with log1p(Living_Area), i.e. log(1 + x).
# Guarded so the cell is skipped (instead of raising KeyError) when the loaded
# CSV already contains only the transformed column.
if 'Living_Area' in df.columns:
    df['Log_Living_Area'] = np.log1p(df['Living_Area'])
    df = df.drop(columns=['Living_Area'])

In [None]:
# Replace Total_Outdoor_Area with its square root.
# Guarded so the cell is skipped (instead of raising KeyError) when the loaded
# CSV already contains only the transformed column.
if 'Total_Outdoor_Area' in df.columns:
    df['Sqrt_Total_Outdoor_Area'] = np.sqrt(df['Total_Outdoor_Area'])
    df = df.drop(columns=['Total_Outdoor_Area'])

# Model

In [15]:
# Separate the target (Price) from the predictor columns.
feature_columns = [column for column in df.columns if column != 'Price']
X = df[feature_columns]  # Features
y = df['Price']  # Target

In [16]:
# Expand X to the original features plus their squares and pairwise products
# (degree 2); include_bias=False leaves out the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X)
X_poly = poly.transform(X)

In [17]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# (Pass X instead of X_poly to train on the un-expanded features.)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shapes of the training features and training target.
print(X_train.shape, y_train.shape, sep='\n')

(8388, 65)
(8388,)


In [18]:
# Display the dtype of every column in df (quick check that all columns are numeric).
df.dtypes

Type                         int64
Price                        int64
Bedrooms                     int64
Is_Equiped_Kitchen           int64
State                        int64
Facades                      int64
Swim_pool                    int64
Region                       int64
Bedroom_Bin_Code             int64
Log_Living_Area            float64
Sqrt_Total_Outdoor_Area    float64
dtype: object

In [19]:
# Random-forest hyperparameters gathered in one place; random_state fixes the
# seed so repeated fits give the same forest.
rf_params = dict(
    n_estimators=300,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42,
)
model = RandomForestRegressor(**rf_params)

# Train on the polynomial-expanded training split.
model.fit(X_train, y_train)

In [20]:
# Score the held-out split: RMSE (square root of the MSE) and R².
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse:.2f}")
print(f'R² Score: {r2:.2f}')

Root Mean Squared Error: 117981.10
R² Score: 0.58


In [21]:
# Pair every polynomial feature name with its importance (scaled to percent)
# and list them from most to least important.
feature_names = poly.get_feature_names_out(input_features=X.columns)
importances = model.feature_importances_
feature_importances = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances * 100})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importances)

                             Feature  Importance
62                 Log_Living_Area^2   14.330076
8                    Log_Living_Area   12.956227
57            Region Log_Living_Area   11.440919
27          Bedrooms Log_Living_Area    7.233011
60  Bedroom_Bin_Code Log_Living_Area    7.172968
..                               ...         ...
29              Is_Equiped_Kitchen^2    0.023007
5                          Swim_pool    0.020513
50                       Swim_pool^2    0.015615
7                   Bedroom_Bin_Code    0.013301
59                Bedroom_Bin_Code^2    0.012689

[65 rows x 2 columns]


In [28]:
# Count how often each distinct price occurs in the training target.
print(y_train.value_counts())


Price
299000    161
249000    122
295000    111
395000    108
399000    105
         ... 
279600      1
339552      1
352060      1
380522      1
143500      1
Name: count, Length: 1527, dtype: int64


**Baseline Scores:**
Root Mean Squared Error: 119698.57
R² Score: 0.57

**After removing features with less than 0.3% importance: (Better)**
Root Mean Squared Error: 119681.10
R² Score: 0.57

vs

**After removing features with less than 1% importance:**
Root Mean Squared Error: 119843.01
R² Score: 0.57

**After log transforming price:**
Root Mean Squared Error: 122009.71
R² Score: 0.55

**After aggregating Garden_Area and Terrace_Area = Total_Outdoor_Area:**
Root Mean Squared Error: 119334.62
R² Score: 0.57

**After log transforming Living_Area:**
Root Mean Squared Error: 119267.08
R² Score: 0.57

**After Sqrt Transforming Total_Outdoor_Area: (Better)**
Root Mean Squared Error: 119183.91
R² Score: 0.57

vs

**After log transforming Total_Outdoor_Area:**
Root Mean Squared Error: 119187.34
R² Score: 0.57


**After creating Price_per_Sqm:**
Root Mean Squared Error: 8473.28
R² Score: 1.00
Causes data leakage: Price_per_Sqm is computed from Price (the target), so the model effectively sees the answer.

**After adding Region_Avg_Living_Area:**
Root Mean Squared Error: 119234.81
R² Score: 0.57

**After adding Outdoor_to_Indoor_Ratio:**
Root Mean Squared Error: 119589.11
R² Score: 0.57

**After Binning Bedrooms:**
Root Mean Squared Error: 119178.53
R² Score: 0.57

**After PolynomialFeatures: (Better)**
Root Mean Squared Error: 117981.10
MAE: 83525.19
R² Score: 0.58

vs

**After PolynomialFeatures and removing Binned Bedrooms:**
Root Mean Squared Error: 118007.19
R² Score: 0.58


**After Gaussian noise augmentation**
RMSE: 119116.71
MAE: 85081.56
R² Score: 0.57


**After Weighted Loss**
RMSE: 117884.64
MAE: 84652.49
R² Score: 0.58

**After log transforming price (with weighted loss)**
RMSE: 118399.11
MAE: 82595.23
R² Score: 0.58

**After adding Municipality back in**
RMSE: 116049.13
MAE: 80424.82
R² Score: 0.59

**After adding Average_Income per municipality**
RMSE: 110102.78
MAE: 75670.42
R² Score: 0.63


# Visuals

## Tree plot of one of the trees

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Take the first fitted tree of the forest as an example.
tree = model.estimators_[0]

# Draw only the top levels on a large canvas so the node text stays legible.
fig, ax = plt.subplots(figsize=(60, 30))
plot_tree(
    tree,
    max_depth=4,
    feature_names=poly.get_feature_names_out(X.columns),
    filled=True,
    rounded=True,
    fontsize=10,
    ax=ax,
)
ax.set_title("Simplified Decision Tree (Top 4 Levels)")
plt.show()

## Scatterplot of actual vs predicted prices

In [None]:
# Csv file created by model
# NOTE: this rebinds `df`; from here on `df` holds the predictions table,
# not the training data loaded from 'immo-ml-data.csv' earlier in the notebook.
df = pd.read_csv("model_predictions.csv")

In [None]:
# Actual vs predicted scatter, with a dashed y = x line marking perfect predictions.
actual = df["Actual Price"]
predicted = df["Predicted Price"]
price_range = [actual.min(), actual.max()]

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(actual, predicted, alpha=0.6, color="blue")
ax.plot(price_range, price_range, '--r', linewidth=2)
ax.set_title("Actual vs Predicted Prices")
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.grid()
plt.show()

Most predictions are close to the diagonal which is good!

But there is more spread at higher prices, so the model is less accurate there.

## Residual plot

In [None]:
# Residual = actual - predicted, so a positive value means the model under-predicted.
df["Residuals"] = df["Actual Price"].sub(df["Predicted Price"])

# Residuals against predicted price, with a dashed zero-error reference line.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df["Predicted Price"], df["Residuals"], alpha=0.6, color="purple")
ax.axhline(y=0, color='r', linestyle='--', linewidth=2)
ax.set_title("Residuals Plot")
ax.set_xlabel("Predicted Price")
ax.set_ylabel("Residuals (Actual - Predicted)")
ax.grid()
plt.show()

The residuals spread out as the predicted price increases, which indicates that the model struggles with higher prices.

In [None]:
# Histogram of residuals, with a dashed reference line at zero error.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df["Residuals"], bins=20, alpha=0.7, color="green")
ax.axvline(x=0, color='r', linestyle='--', linewidth=2)  # Zero line
ax.set_title("Histogram of Residuals")
ax.set_xlabel("Residuals")
ax.set_ylabel("Frequency")
ax.grid()
plt.show()

Mostly centered on 0 (good) and roughly symmetrical.

I think the main issue stems from a lack of data at higher price points.

# Dealing with higher price outliers

I don't want to make my dataset any smaller, so I am going to look into different strategies to artificially inflate the dataset.

1) **SMOTE (Synthetic Minority Oversampling Technique) for Regression**
Generates synthetic data points by interpolating between existing high-price data points.


Not worth the hassle: SMOTE works best with classification models, and too many price values in my data occur in only one record.

2) **Gaussian Noise Augmentation**
Add small random noise to the features of high-price properties to create similar but slightly varied data points. This avoids duplicating data and increases model robustness.

RMSE: 119116.71
MAE: 85081.56
R² Score: 0.57
Had an adverse effect on results.


3) **Weighted Loss Function**
Assign higher weights to errors on high-price examples to prioritize these during model training.

RMSE: 117884.64
MAE: 84652.49
R² Score: 0.58

4) **Logarithmic Transformation**
Compressing the price range can reduce the impact of outliers and improve the model's ability to generalize.
RMSE: 118399.11
MAE: 82595.23
R² Score: 0.58

### SHAP
Takes 30 min to run so removing from main code and storing the snippet here.

In [None]:
import shap # Needs shap_env

# Put in rf_pipeline.py
def visualize_shap(self) -> None:
    """
    Build a SHAP summary plot for the test set, save it to disk and display it.

    Reads ``self.model``, ``self.X_test`` and ``self.feature_names``.
    Writes the figure to 'shap_summary_plot.png' at 300 dpi.
    """
    shap_values = shap.TreeExplainer(self.model).shap_values(self.X_test)

    print("Generating SHAP summary plot...")
    # show=False defers display so the figure can be saved before plt.show().
    shap.summary_plot(
        shap_values,
        self.X_test,
        feature_names=self.feature_names,
        show=False,
    )
    plt.savefig("shap_summary_plot.png", dpi=300)
    print("SHAP summary plot saved as 'shap_summary_plot.png'.")
    plt.show()


# Put in main.py
# NOTE(review): `time` and `pipeline` are not defined in the notebook cells shown
# here; this snippet presumably relies on main.py importing `time` and building
# `pipeline` — confirm before running it from the notebook.
# Times the SHAP step and reports the elapsed wall-clock seconds.
print("\nGenerating SHAP visualizations...")
start_time = time.time()
pipeline.visualize_shap()
elapsed_time = time.time() - start_time
print(f"SHAP visualization completed in {elapsed_time:.2f} seconds.")

Approximate time for model to run:

**1 min 11 sec**

## Final Metrics:

Training - RMSE: 79468.51, MAE: 54756.00, R²: 0.81, MAPE: 15.86%, sMAPE: 15.06%

Test - RMSE: 110102.78, MAE: 75670.42, R²: 0.63, MAPE: 22.02%, sMAPE: 20.38%


SHAP visualization completed in 1977.21 seconds (~32min).
