# **08 Predictive model**

## Objectives

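* Fit a baseline linear regression model that predicts the log of the sold price from the cleaned housing data, evaluate it on a chronological hold-out (the most recent 20% of sales), and export the predictions for analysis in Tableau.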

## Inputs

* housing_cleaned.csv

## Outputs

* test_set.csv


---

# Baseline model

Import packages

In [114]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error 


In [126]:
# Load the cleaned housing dataset produced by the earlier cleaning step
cleaned_csv_path = '../data/clean/housing_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)

---

In [127]:
# Parse the transfer dates so the .dt accessor can be used on them
df['transfer_date'] = pd.to_datetime(df['transfer_date'])
# Restrict the data to sales transferred in 2024 or later
is_2024_onwards = df['transfer_date'].dt.year >= 2024
df = df[is_2024_onwards]

Split index: 5667, Total records: 7084
Date at split index: 2025-08-01 00:00:00


"\ntrain_df = df.iloc[:split_index]\ntest_df = df.iloc[split_index:]\n\n# separate target variable\ny_train = train_df['sold_price']\ny_test = test_df['sold_price']\nX_train = train_df.drop(columns=['sold_price'])\nX_test = test_df.drop(columns=['sold_price'])\n"

In [154]:
# set model features
num_feats   = ['log_total_floor_area','IMD_Rank','energy_band_num']
bin_feats   = ['new_build','is_leasehold']  # pass through as 0/1
cat_feats   = ['property_type','county']

# Control category order to set baselines via drop='first'
ptype_order = ['D','S','T','F']  # baseline becomes 'D' (Detached)
county_order = [
    'WEST MIDLANDS','WARWICKSHIRE','WORCESTERSHIRE','LEICESTERSHIRE',
    'LEICESTER','STAFFORDSHIRE','DERBYSHIRE','CITY OF DERBY','STOKE-ON-TRENT'
]  # baseline becomes WEST MIDLANDS

# Build the ColumnTransformer (code suggested by Copilot)
# - numeric features are standardised
# - categoricals are one-hot encoded with the first listed category dropped,
#   so coefficients are read relative to that baseline
# - binary flags are passed through unchanged
# remainder='drop' means any other column present in X_train/X_test
# is ignored by the model.
pre = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feats),
        ('cat', OneHotEncoder(drop='first',
                              categories=[ptype_order, county_order],
                              handle_unknown='ignore'),
         cat_feats),
        ('bin', 'passthrough', bin_feats),
    ],
    remainder='drop'
)

# End-to-end pipeline
pipeline = Pipeline([
    ('pre', pre),
    ('model', LinearRegression()) # or Ridge(alpha=1.0)
])

# Test using the most recent 20% of the data as the test set
# Use a sorted split based on date. Sort BEFORE deriving X / y so that they
# share the row order and index of the train/test frames built below.
df = df.sort_values(by='transfer_date').reset_index(drop=True)

X = df[num_feats + cat_feats + bin_feats]
y = df['log_price']  # log1p(price)

# split index at 80%
split_index = int(len(df) * 0.8)
print(f"Split index: {split_index}, Total records: {len(df)}")
print(f"Date at split index: {df.iloc[split_index]['transfer_date']}")

train_df = df.iloc[:split_index]
test_df = df.iloc[split_index:]

# separate target variable
# X_train / X_test keep every non-target column (they are exported later);
# the ColumnTransformer selects only the model features from them.
y_train = train_df['log_price']
y_test = test_df['log_price']
X_train = train_df.drop(columns=['log_price'])
X_test = test_df.drop(columns=['log_price'])

# fit the pipeline
pipeline.fit(X_train, y_train)


Split index: 5667, Total records: 7084
Date at split index: 2025-08-01 00:00:00


### Note: the following functions are custom functions to evaluate regression models provided by the Code Institute for their Data Analysis with AI course.

In [155]:
def regression_performance(X_train, y_train, X_test, y_test, pipeline):
    """
    Print the pipeline's regression metrics for the train and test sets.

    Each split is passed in turn to regression_evaluation(), which prints
    the metrics for that split.

    Parameters:
    - X_train: The input features of the training set.
    - y_train: The target values of the training set.
    - X_test: The input features of the test set.
    - y_test: The target values of the test set.
    - pipeline: The fitted regression pipeline to evaluate.

    Returns:
    None
    """
    splits = (
        ("* Train Set", X_train, y_train),
        ("* Test Set", X_test, y_test),
    )

    print("Model Evaluation \n")
    for heading, features, target in splits:
        print(heading)
        regression_evaluation(features, target, pipeline)


def regression_evaluation(X, y, pipeline):
    """
    Predict with the pipeline on X and print regression metrics against y.

    Args:
        X (array-like): The input features.
        y (array-like): The target values.
        pipeline (object): The trained pipeline for regression.

    Returns:
        None

    Prints the following performance metrics comparing the prediction to the target
    (each rounded to 3 decimal places):
    - R2 Score
    - Mean Absolute Error
    - Mean Squared Error
    - Root Mean Squared Error
    """
    prediction = pipeline.predict(X)
    # Compute MSE once; RMSE is derived from it.
    mse = mean_squared_error(y, prediction)

    # Use the built-in round() rather than the .round() method: it works
    # whether the metric comes back as a numpy scalar or as a plain Python
    # float (which has no .round attribute).
    print("R2 Score:", round(r2_score(y, prediction), 3))
    print("Mean Absolute Error:", round(mean_absolute_error(y, prediction), 3))
    print("Mean Squared Error:", round(mse, 3))
    print("Root Mean Squared Error:", round(np.sqrt(mse), 3))
    print("\n")


In [None]:
print("Regression Model Performance \nNote the MAE and MSE are in log(GBP) not GBP")
regression_performance(X_train, y_train, X_test, y_test, pipeline)

# Change the actuals and predictions back to GBP. The target is log1p(price),
# so np.expm1 is its inverse. This must run before the GBP metrics and the
# plots below, which all read these four variables.
y_train_gbp = np.expm1(y_train)
y_test_gbp = np.expm1(y_test)
y_pred_train = np.expm1(pipeline.predict(X_train))
y_pred_gbp = np.expm1(pipeline.predict(X_test))

# MAE reported in GBP rather than log(GBP)
mae_train = mean_absolute_error(y_train_gbp, y_pred_train)
print(f"The train set Mean Absolute Error in GBP: {mae_train:,.0f}")

mae_test = mean_absolute_error(y_test_gbp, y_pred_gbp)
print(f"The test  set Mean Absolute Error in GBP: {mae_test:,.0f}")

# Actual vs predicted scatter for each split, with a red y = x reference line
alpha_scatter = 0.5
panels = [
    ("Train Set", y_train_gbp, y_pred_train),
    ("Test Set", y_test_gbp, y_pred_gbp),
]

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
for ax, (title, actual, predicted) in zip(axes, panels):
    sns.scatterplot(x=actual, y=predicted, alpha=alpha_scatter, ax=ax)
    sns.lineplot(x=actual, y=actual, color="red", ax=ax)
    ax.set_xlabel("Actual")
    ax.set_ylabel("Predictions")
    ax.set_title(title)

plt.show()


In [174]:
# Quick look at the training target (log_price, i.e. on the log scale, not GBP)
y_train.head()

0    12.083911
1    11.736077
2    11.982935
3    11.790565
4    12.100718
Name: log_price, dtype: float64

In [None]:
# Save in a csv for analysis in Tableau
# Stack the train and test feature frames (train rows first) and attach the
# GBP predictions in the same order. Note that, despite the file name, the
# export contains both the train and the test rows.
to_save = pd.concat([X_train, X_test], axis=0).reset_index(drop=True)
to_save['predicted_price'] = np.concatenate([y_pred_train, y_pred_gbp])

to_save.to_csv('../data/processed/test_set.csv', index=False)

# Preview what was written
to_save.head()


Unnamed: 0,transaction,price,transfer_date,postcode,property_type,new_build,tenure,PAON,SAON,Street,...,is_new_build_energy_band_num,outcode,log_total_floor_area,is_flat,is_leasehold,is_flat_leasehold,new_build_energy_band_num,property_type_order,property_type_order_legend,predicted_price
0,{3DCCB7CA-86B0-5B9D-E063-4704A8C0331E},177000,2024-01-04,CV34 5GQ,F,0,L,28,,BIRCH END,...,0,CV34,4.204693,False,False,False,0,1,1 - F,180939.733761
1,{3DCCB7CA-1F01-5B9D-E063-4704A8C0331E},125000,2024-01-12,LE18 1JW,F,0,F,289B,,LEICESTER ROAD,...,0,LE18,4.483229,False,False,False,0,1,1 - F,195928.883235
2,{3DCCB7CA-83D3-5B9D-E063-4704A8C0331E},160000,2024-01-15,B42 2RL,S,0,F,31,,CADDICK ROAD,...,0,B42,4.290459,False,False,False,0,3,3 - S,218048.398551
3,{3DCCB7CA-4BDA-5B9D-E063-4704A8C0331E},132000,2024-01-16,ST3 2AE,S,0,F,25,,ASH GROVE,...,0,ST3,4.406719,False,False,False,0,3,3 - S,158507.245106
4,{3DCCB7CA-8767-5B9D-E063-4704A8C0331E},180000,2024-01-26,CV37 8ZB,F,1,L,7,,MALLARD COURT,...,6,CV37,4.077537,False,False,False,6,1,1 - F,206096.496773


### Conclusions 
- The performance on the train and test set is similar, indicating that the model is not overfitting.
- The model achieves an R2 score of 0.74 on the test set.
- This is a good R2 value for a baseline model.
- We also note in the plots of Prediction x Actual, the predictions tend to follow the actual value. 
- It is evident, however, that properties with higher actual sold prices are being undervalued. This may be because these properties are in desirable areas and command a premium price.
- In addition, very low actual prices are being overvalued. This may be due to factors such as the condition of these properties. This needs further investigation.