# Housing value estimation model training

Let's train a simple regressor using Scikit-Learn, and convert the pipeline to ONNX format.

In [None]:
from pathlib import Path

import numpy as np
import onnxruntime as ort
import pandas as pd
import skl2onnx
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

Load the French housing dataset for the Isère department in 2022:

In [None]:
# Gzipped CSV of the 2022 records for department 38 (Isère).
DVF_38_URL = "https://files.data.gouv.fr/geo-dvf/latest/csv/2022/departements/38.csv.gz"

dvf_38 = pd.read_csv(DVF_38_URL)
# Summarize the columns, their dtypes and non-null counts.
dvf_38.info()

Prepare the dataset to keep only sales of apartments in Grenoble:

In [None]:
# Rows to keep: sales ("Vente") of apartments ("Appartement") in Grenoble.
is_grenoble_apartment_sale = (
    (dvf_38["nature_mutation"] == "Vente")
    & (dvf_38["type_local"] == "Appartement")
    & (dvf_38["nom_commune"] == "Grenoble")
)

# Build the modelling table in one chain from the raw frame, so this cell is
# idempotent and `dvf_38` itself is never modified.
dataset = (
    dvf_38.loc[
        is_grenoble_apartment_sale,
        [
            "surface_reelle_bati",
            "nombre_pieces_principales",
            "latitude",
            "longitude",
            "valeur_fonciere",
        ],
    ]
    .rename(
        columns={
            "surface_reelle_bati": "area",
            "nombre_pieces_principales": "rooms",
            "valeur_fonciere": "value",
        }
    )
    # Discard rows with any missing feature or target value.
    .dropna()
    # drop=True throws away the old row labels; without it they would be
    # inserted into the frame as a spurious "index" column.
    .reset_index(drop=True)
)
dataset

Split the dataset into training and test sets:

In [None]:
# Feature matrix (four numeric columns) and regression target (sale value).
X = dataset[["area", "rooms", "latitude", "longitude"]]
y = dataset["value"]

# Hold out 25% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the fitted model and its score, reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

Train a Scikit-Learn pipeline, including the normalization step and a regression model:

In [None]:
# Two stages: scale the features, then fit a linear regression on them.
steps = [("scaler", StandardScaler()), ("regressor", LinearRegression())]
pipeline = Pipeline(steps=steps)
pipeline.fit(X_train, y_train)

Score the model (RMSE) on the test set:

In [None]:
# Predict on the held-out rows, then compare against the true values.
y_pred = pipeline.predict(X_test)
root_mean_squared_error(y_test, y_pred)

Try to predict the value of an apartment (50 m², 3 rooms, Place Victor Hugo in Grenoble):

In [None]:
# The pipeline was fitted on a DataFrame, so predict on a DataFrame with the
# same column names: this avoids scikit-learn's "X does not have valid feature
# names" warning and makes the meaning of each value explicit.
sample = pd.DataFrame(
    [[50, 3, 45.1893525, 5.7216074]],
    columns=["area", "rooms", "latitude", "longitude"],
)
pipeline.predict(sample)

Export the model to ONNX format using `skl2onnx`:

In [None]:
# A single training row cast to float32 serves as the example input handed to
# the converter.
sample_row = X_train.iloc[:1].astype(np.float32)
onnx_model = skl2onnx.to_onnx(pipeline, sample_row)

# Serialize the converted model next to the notebook (current directory).
onnx_model_path = Path("model.onnx")
onnx_model_path.write_bytes(onnx_model.SerializeToString())

Load the ONNX model and run an inference on the sample data:

In [None]:
# One single-value, 2-D input per feature, keyed by the feature's column name.
input_feed = {
    "area": [[50.0]],
    "rooms": [[3.0]],
    "latitude": [[45.1893525]],
    "longitude": [[5.7216074]],
}

# Let onnxruntime use every execution provider available on this machine.
available_providers = ort.get_available_providers()
session = ort.InferenceSession(onnx_model_path, providers=available_providers)

# Passing None as the output names requests all model outputs.
session.run(None, input_feed)