# 4. Model Building — UK Housing
**Worked on by:** Marin Janushaj  
**Goal:** Train and compare machine learning models to predict housing prices.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

df = pd.read_parquet("../data/clean/uk_housing_clean.parquet")
df.head()

Unnamed: 0,transaction_id,price,date,type,is_new,duration,town,district,county,category,record_status
0,{81B82214-7FBC-4129-9F6B-4956B4A663AD},25000,1995-08-18,T,N,F,OLDHAM,OLDHAM,GREATER MANCHESTER,A,A
1,{8046EC72-1466-42D6-A753-4956BF7CD8A2},42500,1995-08-09,S,N,F,GRAYS,THURROCK,THURROCK,A,A
2,{278D581A-5BF3-4FCE-AF62-4956D87691E6},45000,1995-06-30,T,N,F,HIGHBRIDGE,SEDGEMOOR,SOMERSET,A,A
3,{1D861C06-A416-4865-973C-4956DB12CD12},43150,1995-11-24,T,N,F,BEDFORD,NORTH BEDFORDSHIRE,BEDFORDSHIRE,A,A
4,{DD8645FD-A815-43A6-A7BA-4956E58F1874},18899,1995-06-23,S,N,F,WAKEFIELD,LEEDS,WEST YORKSHIRE,A,A


In [4]:
df["year"] = pd.to_datetime(df["date"]).dt.year
df = df[["price", "type", "is_new", "duration", "county", "year"]]
df.head()

Unnamed: 0,price,type,is_new,duration,county,year
0,25000,T,N,F,GREATER MANCHESTER,1995
1,42500,S,N,F,THURROCK,1995
2,45000,T,N,F,SOMERSET,1995
3,43150,T,N,F,BEDFORDSHIRE,1995
4,18899,S,N,F,WEST YORKSHIRE,1995


In [6]:
df_encoded = pd.get_dummies(df, drop_first=True)
df_encoded.head()

Unnamed: 0,price,year,type_F,type_O,type_S,type_T,is_new_Y,duration_L,duration_U,county_BATH AND NORTH EAST SOMERSET,...,county_WEST MIDLANDS,county_WEST SUSSEX,county_WEST YORKSHIRE,county_WILTSHIRE,county_WINDSOR AND MAIDENHEAD,county_WOKINGHAM,county_WORCESTERSHIRE,county_WREKIN,county_WREXHAM,county_YORK
0,25000,1995,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,42500,1995,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,45000,1995,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,43150,1995,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,18899,1995,False,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


In [7]:
X = df_encoded.drop("price", axis=1)
y = df_encoded["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

: 

In [2]:
import pandas as pd
import numpy as np

df = pd.read_parquet("../data/clean/uk_housing_clean.parquet")
df["year"] = pd.to_datetime(df["date"]).dt.year

# keep only columns we’ll use
df = df[["price", "type", "is_new", "duration", "county", "year"]]

In [3]:
# remove extreme price tails for stability (optional)
df = df[df["price"].between(1_000, 2_000_000)]

# take a random sample that fits your RAM (adjust n if needed)
df_small = df.sample(n=300_000, random_state=42)   # try 200_000 if RAM is tight
df_small.shape

(300000, 6)

In [4]:
!pip install category-encoders --quiet
from category_encoders import TargetEncoder

# target encode county against price
te = TargetEncoder(cols=["county"])
df_small["county_te"] = te.fit_transform(df_small["county"], df_small["price"])

In [5]:
X = df_small.drop(columns=["price", "county"])  # drop raw county (we have county_te)
X = pd.get_dummies(X, columns=["type", "is_new", "duration"], drop_first=True)
y = np.log1p(df_small["price"])  # log target for stability
X.shape, y.shape

((300000, 9), (300000,))

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression(n_jobs=None)  # n_jobs not used here; just explicit
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(np.expm1(y_test), np.expm1(y_pred))
rmse = mean_squared_error(np.expm1(y_test), np.expm1(y_pred), squared=False)
r2 = r2_score(y_test, y_pred)

print(f"MAE:  £{mae:,.0f}")
print(f"RMSE: £{rmse:,.0f}")
print(f"R² (on log target): {r2:.3f}")

MAE:  £63,914
RMSE: £124,693
R² (on log target): 0.591




In [7]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)
y_pred2 = gbr.predict(X_test)

mae2 = mean_absolute_error(np.expm1(y_test), np.expm1(y_pred2))
rmse2 = mean_squared_error(np.expm1(y_test), np.expm1(y_pred2), squared=False)
r22 = r2_score(y_test, y_pred2)
print(f"[GBR] MAE: £{mae2:,.0f} | RMSE: £{rmse2:,.0f} | R²(log): {r22:.3f}")

[GBR] MAE: £57,443 | RMSE: £120,127 | R²(log): 0.667




In [11]:
import joblib
from lightgbm import LGBMModel
joblib.dump({"model": LGBMModel, "target_encoder": te, "columns": X.columns.tolist()}, "../data/clean/price_model_lgbm.pkl")
print("✅ saved model to ../data/clean/price_model_lgbm.pkl")

✅ saved model to ../data/clean/price_model_lgbm.pkl


In [4]:
import pandas as pd
import numpy as np
from pathlib import Path

# modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import lightgbm as lgb
import joblib

# target encoder (if missing: pip install category-encoders)
from category_encoders import TargetEncoder

In [5]:
# load cleaned data
df = pd.read_parquet("../data/clean/uk_housing_clean.parquet")

# add year from date
df["year"] = pd.to_datetime(df["date"]).dt.year

# keep the columns we use
df = df[["price", "type", "is_new", "duration", "county", "year"]]

# filter unrealistic prices (optional) and sample to fit in RAM
df = df[df["price"].between(1_000, 2_000_000)]
df_small = df.sample(n=200_000, random_state=42)  # if RAM tight, drop to 100_000

# log target for stability
df_small["log_price"] = np.log1p(df_small["price"])

In [6]:
X = df_small.drop(columns=["price", "log_price"])
y = df_small["log_price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
te = TargetEncoder(cols=["county"])
X_train = X_train.copy()
X_test  = X_test.copy()

X_train["county_te"] = te.fit_transform(X_train["county"], y_train)
X_test["county_te"]  = te.transform(X_test["county"])

In [8]:
def one_hot_and_align(df_in, columns_reference=None):
    df_out = pd.get_dummies(df_in, columns=["type", "is_new", "duration"], drop_first=True)
    if "county" in df_out.columns:
        df_out = df_out.drop(columns=["county"])
    if columns_reference is not None:
        # add missing cols with 0 and keep order
        for c in columns_reference:
            if c not in df_out.columns:
                df_out[c] = 0
        df_out = df_out[columns_reference]
    return df_out

X_train_proc = one_hot_and_align(X_train)
train_cols = X_train_proc.columns.tolist()
X_test_proc  = one_hot_and_align(X_test, columns_reference=train_cols)

X_train_proc.shape, X_test_proc.shape

((160000, 9), (40000, 9))

In [9]:
lgbm = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

lgbm.fit(X_train_proc, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001572 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 164
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 8
[LightGBM] [Info] Start training from score 11.739035


In [10]:
pred_log = lgbm.predict(X_test_proc)
mae  = mean_absolute_error(np.expm1(y_test), np.expm1(pred_log))
rmse = mean_squared_error(np.expm1(y_test), np.expm1(pred_log), squared=False)
r2   = r2_score(y_test, pred_log)

print(f"MAE:  £{mae:,.0f}")
print(f"RMSE: £{rmse:,.0f}")
print(f"R² (on log target): {r2:.3f}")

MAE:  £56,620
RMSE: £116,576
R² (on log target): 0.684




In [11]:
bundle = {
    "model": lgbm,                # <class 'lightgbm.sklearn.LGBMRegressor'> instance
    "target_encoder": te,         # fitted TargetEncoder instance
    "columns": train_cols,        # exact training feature order
}

joblib.dump(bundle, "../data/clean/price_model_lgbm.pkl")
print("✅ Saved to data/clean/price_model_lgbm.pkl")

✅ Saved to data/clean/price_model_lgbm.pkl


In [13]:
loaded = joblib.load(Path("../data/clean/price_model_lgbm.pkl").open("rb"))
mdl = loaded["model"]; enc = loaded["target_encoder"]; cols = loaded["columns"]

row = pd.DataFrame([{
    "type": "T", "is_new": "N", "duration": "F",
    "county": "GREATER LONDON", "year": 2015
}])

row["county_te"] = enc.transform(row[["county"]])["county"]
row = pd.get_dummies(row, columns=["type", "is_new", "duration"], drop_first=True)
row = row.drop(columns=["county"])
for c in cols:
    if c not in row.columns:
        row[c] = 0
row = row[cols]

y_log = mdl.predict(row)[0]
price = float(np.expm1(y_log))
print(f"One-row test prediction: £{price:,.0f}")

One-row test prediction: £663,430
