In [1]:
%%bash
# Remove every '__pycache__' directory under the project root, then print the
# project's file structure.

# move up one level to the project's root directory; abort if that fails so the
# deletion below never runs from an unexpected location
cd .. || exit 1

# '-prune' stops find from descending into a directory that is about to be
# removed; '-exec ... {} +' hands the matched paths directly to rm, so names
# containing whitespace are handled safely and rm is not invoked when nothing
# matches
find . -type d -name __pycache__ -prune -exec rm -rf {} +

# output the file structure from the project's root directory
tree

[01;34m.[0m
├── [00mMakefile[0m
├── [00mREADME.md[0m
├── [01;34martifacts[0m
│   └── [00mmodel.pkl[0m
├── [00mconfig.yaml[0m
├── [01;34mdata[0m
│   └── [00mraw.parquet[0m
├── [01;34mlogs[0m
│   ├── [00m10_11_2024_04_20_54.log[0m
│   └── [00m10_11_2024_04_43_40.log[0m
├── [01;34mnotebooks[0m
│   └── [00mrental-prediction-service.ipynb[0m
├── [00mpoetry.lock[0m
├── [00mpyproject.toml[0m
├── [01;34msrc[0m
│   ├── [00m__init__.py[0m
│   ├── [00mconfig.py[0m
│   ├── [00mdata.py[0m
│   ├── [00mdatabase.py[0m
│   ├── [00minference.py[0m
│   ├── [00mlogger.py[0m
│   ├── [00mmodel.py[0m
│   └── [00mmodel_service.py[0m
└── [01;34mtests[0m
    └── [00m__init__.py[0m

7 directories, 19 files


#### **`Dependencies`**

In [2]:
# enable IPython's autoreload extension in mode 2 so that edits made to the
# imported project modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [3]:
import warnings

import pandas as pd

from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# rental-prediction-service modules
from src.data import encode_neighborhood_ids, preprocess_data
from src.database import read_table
from src.model import compute_rsquared, split_data
from src.model_inference import ModelInferenceService, Record

# silence all warnings for the rest of the session
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised by the imported libraries — consider restricting it to a
# specific category or module
warnings.filterwarnings("ignore")

In [4]:
# configure how pandas renders DataFrames and Series in cell outputs
display_options = {
    "display.max_rows": 100,       # show at most 100 rows
    "display.max_columns": None,   # no limit on the number of columns shown
    "display.max_colwidth": None,  # no limit on the width of a column's text
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

#### **`Data ingestion and pre-processing`**

In [5]:
# fetch the raw table and preview its first three rows
raw_rentals = read_table()
raw_rentals.head(3)

[32m2024-10-11 04:59:41.158[0m | [1mINFO    [0m | [36msrc.database[0m:[36mread_table[0m:[36m167[0m - [1mFetching raw data from the 'postgres' database's 'rentals.raw' table.[0m


Unnamed: 0,address,zip,neighborhood,neighborhood_id,year_built,area,rooms,bedrooms,bathrooms,balcony,storage,parking,furnished,garage,garden,energy,facilities,rent
0,1071 HN Amsterdam (Cornelis Schuytbuurt),1071 HN,Cornelis Schuytbuurt,48,1870,167.0,3,2,2,yes,no,no,yes,no,Not present,D,Roof terrace,4500
1,1071 HK Amsterdam (Concertgebouwbuurt),1071 HK,Concertgebouwbuurt,47,1890,150.0,3,2,2,yes,no,yes,yes,no,Not present,A,"Cable TV, Internet connection, Fireplace, Bath, Toilet",3450
2,1071 HK Amsterdam (Concertgebouwbuurt),1071 HK,Concertgebouwbuurt,47,1890,150.0,3,2,2,yes,no,yes,yes,no,Not present,A,"Cable TV, Internet connection, Fireplace, Bath, Toilet",3450


In [6]:
# fetch the raw table, pre-process it, and preview the first three rows
preprocessed_rentals = preprocess_data(read_table())
preprocessed_rentals.head(3)

[32m2024-10-11 05:00:01.592[0m | [1mINFO    [0m | [36msrc.database[0m:[36mread_table[0m:[36m167[0m - [1mFetching raw data from the 'postgres' database's 'rentals.raw' table.[0m
[32m2024-10-11 05:00:01.613[0m | [1mINFO    [0m | [36msrc.data[0m:[36mpreprocess_data[0m:[36m79[0m - [1mValidating, pre-processing, and transforming the raw data into ML-ready features and targets.[0m


Unnamed: 0,year_built,area,bedrooms,bathrooms,garden_size,balcony,parking,furnished,garage,storage,neighborhood_id,rent
0,1870,167.0,2,2,0,True,False,True,False,False,48,4500
1,1890,150.0,2,2,0,True,True,True,False,False,47,3450
2,1923,90.0,2,1,0,True,False,True,False,False,106,2000


In [7]:
# preview the first three rows of the machine learning-ready data
# NOTE: encoding replaces the 'neighborhood_id' column with the
# 'neighborhood_mean_*' columns visible in the output below
ml_ready_rentals = encode_neighborhood_ids(preprocess_data(read_table()))
ml_ready_rentals.head(3)

[32m2024-10-11 05:00:23.041[0m | [1mINFO    [0m | [36msrc.database[0m:[36mread_table[0m:[36m167[0m - [1mFetching raw data from the 'postgres' database's 'rentals.raw' table.[0m
[32m2024-10-11 05:00:23.060[0m | [1mINFO    [0m | [36msrc.data[0m:[36mpreprocess_data[0m:[36m79[0m - [1mValidating, pre-processing, and transforming the raw data into ML-ready features and targets.[0m


Unnamed: 0,year_built,area,bedrooms,bathrooms,garden_size,balcony,parking,furnished,garage,storage,neighborhood_mean_area,neighborhood_mean_bedrooms,neighborhood_mean_bathrooms,neighborhood_mean_garden_size,rent
0,1870,167.0,2,2,0,True,False,True,False,False,127.5,2.5,1.4,5.766667,4500
1,1890,150.0,2,2,0,True,True,True,False,False,107.25,2.166667,1.166667,0.0,3450
2,1923,90.0,2,1,0,True,False,True,False,False,122.0,2.111111,1.222222,10.444444,2000


#### **`Data splitting`**

In [14]:
# build the machine learning-ready data, then partition it into six pieces:
# train/validation/test features followed by train/validation/test targets
ml_ready = encode_neighborhood_ids(preprocess_data(read_table()))
Xtrain, Xval, Xtest, ytrain, yval, ytest = split_data(ml_ready)

[32m2024-10-11 05:04:48.644[0m | [1mINFO    [0m | [36msrc.database[0m:[36mread_table[0m:[36m167[0m - [1mFetching raw data from the 'postgres' database's 'rentals.raw' table.[0m
[32m2024-10-11 05:04:48.677[0m | [1mINFO    [0m | [36msrc.data[0m:[36mpreprocess_data[0m:[36m79[0m - [1mValidating, pre-processing, and transforming the raw data into ML-ready features and targets.[0m
[32m2024-10-11 05:04:48.714[0m | [1mINFO    [0m | [36msrc.model[0m:[36msplit_data[0m:[36m38[0m - [1mSplitting the ML-ready features and targets into train, validation, and test sets.[0m


#### **`Model building`**

In [15]:
# constructor arguments for the base (untuned) model
base_params = {"base_score": 0.5, "n_jobs": -1}

# create the base model and train it on the train set
model: XGBRegressor = XGBRegressor(**base_params)
model.fit(Xtrain, ytrain)

#### **`Model evaluation`**

In [16]:
# predict on the test set, then score those predictions with R²
ytest_pred = model.predict(Xtest)
compute_rsquared(ytest, ytest_pred)

0.82

#### **`Hyperparameter tuning`**

In [17]:
# parameter search space
# NOTE: hyperparameters are tuned one at a time rather than jointly, i.e.,
# 'n_estimators' first, then 'max_depth', then 'learning_rate', and so on
# NOTE: every entry below is currently disabled, so the grid is empty; the
# search then only cross-validates the fixed estimator and reports an empty
# 'best_params_'
search_space: dict[str, list[float | int]] = {
    # "n_estimators": [100, 200, 500], # R²:
    # "max_depth": [3, 6, 10], # R²:
    # "learning_rate": [0.05, 0.1, 0.2, 0.3], # R²:
    # "gamma": [0.01, 0.1, 1], # R²:
    # "min_child_weight": [0, 5, 20, 50] # R²:
}

# estimator whose hyperparameters stay fixed unless overridden by the grid
fixed_estimator = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.3,
    gamma=0.1,
    base_score=0.5,
    n_jobs=-1,
)

# 5-fold cross-validated grid search scored by R²
gscv: GridSearchCV = GridSearchCV(
    estimator=fixed_estimator,
    param_grid=search_space,
    scoring="r2",
    refit="r2",
    cv=5,
    verbose=False,
)

# run the search on the train set
gscv.fit(Xtrain, ytrain)

# show the refit model's validation set R², followed by the selected parameters
val_rsquared = compute_rsquared(yval, gscv.best_estimator_.predict(Xval))
display(val_rsquared, gscv.best_params_)

0.63

{}

In [18]:
# take the model that the grid search refit with its selected parameters
best_model: XGBRegressor = gscv.best_estimator_

# retrain it on the train set while monitoring the validation set, with early
# stopping configured for 20 rounds
best_model.early_stopping_rounds = 20
monitored_sets = [(Xval, yval)]
best_model.fit(Xtrain, ytrain, eval_set=monitored_sets, verbose=False)

# score the retrained model on the test set with R²
# NOTE: the result, 0.82, equals the base model's test set R², i.e.,
# hyperparameter tuning made no difference
# NOTE(review): the recorded grid search ran with an empty search space
# ('best_params_' was {}), so this comparison does not exercise any of the
# candidate values listed in the search space
ytest_pred_best = best_model.predict(Xtest)
compute_rsquared(ytest, ytest_pred_best)

0.82

#### **`Inference`**

In [32]:
# field values for the single record to run inference on
record_fields = {
    "year_built": 2016,
    "area": 105.0,
    "bedrooms": 3,
    "bathrooms": 2,
    "garden_size": 10.0,
    "balcony": False,
    "parking": True,
    "furnished": False,
    "garage": True,
    "storage": False,
    "neighborhood_id": 10,
}

# build the 'Record' object from those values
record: Record = Record(**record_fields)

# create the 'ModelInferenceService' object and load the trained model
service: ModelInferenceService = ModelInferenceService()
service.load_model()

# output the prediction for the record
service.predict(record)

[32m2024-10-13 07:08:45.417[0m | [1mINFO    [0m | [36msrc.model_inference[0m:[36mload_model[0m:[36m59[0m - [1mChecking if '/Users/ncheymbamalu/Desktop/PersonalProjects/rental-prediction-service/artifacts/model.pkl' exists.[0m
[32m2024-10-13 07:08:45.419[0m | [1mINFO    [0m | [36msrc.model_inference[0m:[36mload_model[0m:[36m65[0m - [1m'/Users/ncheymbamalu/Desktop/PersonalProjects/rental-prediction-service/artifacts/model.pkl' found. Loading the trained ML model.[0m
[32m2024-10-13 07:08:45.422[0m | [1mINFO    [0m | [36msrc.model_inference[0m:[36mpredict[0m:[36m78[0m - [1mGenerating the prediction...[0m


3113