In [1]:
%%bash
# move to the project's root directory; stop here if that fails so that the
# recursive delete below can never run from an unexpected location
cd .. || exit 1

# delete all '__pycache__' directories throughout the project
# NOTE: 'find ... -exec rm -rf {} +' hands each path to 'rm' as a single argument, so
# paths containing whitespace are handled correctly (backtick substitution word-splits
# them); '-prune' stops 'find' from descending into a directory that is being deleted
find . -type d -name __pycache__ -prune -exec rm -rf {} +

# output the file structure from the project's root directory
# NOTE: '-n' turns colorization off, which keeps ANSI escape codes out of the cell output
tree -n

.
├── LICENSE
├── Makefile
├── README.md
├── artifacts
│   └── model.pkl
├── config.yaml
├── data
│   └── raw.parquet
├── logs
│   └── 10_14_2024_15_52_14.log
├── notebooks
│   └── rental-prediction-service.ipynb
├── poetry.lock
├── pyproject.toml
├── src
│   ├── __init__.py
│   ├── app.py
│   ├── config.py
│   ├── data.py
│   ├── database.py
│   ├── logger.py
│   ├── model.py
│   ├── model_builder.py
│   ├── model_inference.py
│   ├── run_model_builder.py
│   └── run_model_inference.py
└── tests
    └── __init__.py

7 directories, 22 files


#### **`Dependencies`**

In [2]:
# load the IPython 'autoreload' extension
%load_ext autoreload
# mode 2: pick up edits to the imported modules live, without restarting the kernel
%autoreload 2

In [3]:
import warnings

import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# rental-prediction-service modules
from src.data import encode_neighborhood_ids, preprocess_data
from src.database import read_table
from src.model import compute_rsquared, split_data
from src.model_inference import ModelInferenceService

# suppress all warnings raised from this point on
# NOTE(review): a blanket "ignore" also hides deprecation warnings from the libraries
# imported above; consider narrowing the filter to a specific category or module
warnings.filterwarnings("ignore")

In [4]:
# set the pd.DataFrame and pd.Series display options
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

#### **`Data ingestion and pre-processing`**

In [None]:
# raw data: the first 3 rows exactly as returned by 'read_table'
(
    read_table()
    .head(3)
)

In [None]:
# pre-processed data: the first 3 rows after the raw table is piped through 'preprocess_data'
(
    read_table()
    .pipe(preprocess_data)
    .head(3)
)

In [None]:
# machine learning-ready data: the first 3 rows after pre-processing and encoding
# NOTE: the 'neighborhood_id' feature has been encoded
(
    read_table()
    .pipe(preprocess_data)
    .pipe(encode_neighborhood_ids)
    .head(3)
)

#### **`Data splitting`**

In [None]:
# build the machine learning-ready data: raw table -> pre-processing -> neighborhood encoding
ml_ready_data = read_table().pipe(preprocess_data).pipe(encode_neighborhood_ids)

# split it into train, validation, and test sets (features first, then targets)
Xtrain, Xval, Xtest, ytrain, yval, ytest = split_data(ml_ready_data)

#### **`Model building`**

In [None]:
# base model: an 'XGBRegressor' with only 'base_score' and 'n_jobs' set explicitly
base_model_params: dict[str, float | int] = {"base_score": 0.5, "n_jobs": -1}
model: XGBRegressor = XGBRegressor(**base_model_params)

# fit the base model to the train set
model.fit(Xtrain, ytrain)

#### **`Model evaluation`**

In [None]:
# predict the test set with the base model
ytest_pred = model.predict(Xtest)

# compute the base model's test set R²
compute_rsquared(ytest, ytest_pred)

#### **`Hyperparameter tuning`**

In [None]:
# candidate values for the grid search
# NOTE: hyperparameters are tuned one at a time rather than jointly, i.e.,
# 'n_estimators' first, then 'max_depth', then 'learning_rate', and so on
# NOTE(review): every entry is commented out, so the grid is empty; presumably the
# search then evaluates only the fixed estimator settings below (TODO: confirm)
search_space: dict[str, list[float | int]] = {
    # "n_estimators": [100, 200, 500], # R²: 
    # "max_depth": [3, 6, 10], # R²: 
    # "learning_rate": [0.05, 0.1, 0.2, 0.3], # R²: 
    # "gamma": [0.01, 0.1, 1], # R²: 
    # "min_child_weight": [0, 5, 20, 50] # R²: 
}

# estimator handed to the grid search, with its fixed hyperparameter values
tuned_estimator: XGBRegressor = XGBRegressor(
    base_score=0.5,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.3,
    gamma=0.1,
    n_jobs=-1,
)

# 5-fold cross-validated grid search, scored on R²
gscv: GridSearchCV = GridSearchCV(
    estimator=tuned_estimator,
    param_grid=search_space,
    scoring="r2",
    refit="r2",
    cv=5,
    verbose=False,
)

# run the search on the train set
gscv.fit(Xtrain, ytrain)

# output the validation set R² of the 'best' model, followed by the 'best' parameters
val_rsquared = compute_rsquared(yval, gscv.best_estimator_.predict(Xval))
display(val_rsquared, gscv.best_params_)

In [None]:
# take an unfitted copy of the 'gscv' object's 'best' model and enable early stopping
# NOTE: 'clone' copies the hyperparameters only, so the re-fit below no longer
# overwrites 'gscv.best_estimator_' in place and the grid search results stay intact
best_model: XGBRegressor = clone(gscv.best_estimator_).set_params(
    early_stopping_rounds=20
)

# fit the 'best' model to the train set and evaluate it on the validation set
# NOTE: the validation set is the 'eval_set' monitored by early stopping
best_model.fit(Xtrain, ytrain, eval_set=[(Xval, yval)], verbose=False)

# output the 'best' model's test set R²
# NOTE: hyperparameter tuning didn't make a difference, that is, ...
# the 'best' model's test set R², 0.82, is identical to the base model's test set R²
compute_rsquared(ytest, best_model.predict(Xtest))

#### **`Inference`**

In [None]:
# input record: one feature name -> value mapping to be scored
record: dict[str, float | int | str] = dict(
    year_built=2016,
    area=105.0,
    bedrooms=3,
    bathrooms=2.0,
    furnished="no",
    storage="no",
    garage="yes",
    parking="yes",
    balcony="yes",
    garden_size=10.0,
    neighborhood_id=10,
)

# create the inference service and have it load the trained model
service: ModelInferenceService = ModelInferenceService()
service.load_model()

# output the prediction for the input record
service.predict(record)