# Housing value estimation model training

Let's train a simple linear regressor with scikit-learn to estimate apartment sale prices in Grenoble from the 2022 DVF open data, then convert the fitted pipeline to ONNX format.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import skl2onnx
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# 2022 DVF extract for the 38 departement, served as a gzipped CSV.
DVF_38_URL = "https://files.data.gouv.fr/geo-dvf/latest/csv/2022/departements/38.csv.gz"

# Load the remote file straight into a DataFrame, then print a summary of its
# columns, non-null counts and dtypes.
dvf_38 = pd.read_csv(DVF_38_URL)
dvf_38.info()

  dvf_38 = pd.read_csv("https://files.data.gouv.fr/geo-dvf/latest/csv/2022/departements/38.csv.gz")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75871 entries, 0 to 75870
Data columns (total 40 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id_mutation                   75871 non-null  object 
 1   date_mutation                 75871 non-null  object 
 2   numero_disposition            75871 non-null  int64  
 3   nature_mutation               75871 non-null  object 
 4   valeur_fonciere               75504 non-null  float64
 5   adresse_numero                48243 non-null  float64
 6   adresse_suffixe               2482 non-null   object 
 7   adresse_nom_voie              74526 non-null  object 
 8   adresse_code_voie             74529 non-null  object 
 9   code_postal                   74528 non-null  float64
 10  code_commune                  75871 non-null  int64  
 11  nom_commune                   75871 non-null  object 
 12  code_departement              75871 non-null  int64  
 13  a

In [3]:
# Rows to keep: sales ("Vente") of apartments ("Appartement") in Grenoble.
grenoble_flat_sales = (
    (dvf_38["nature_mutation"] == "Vente")
    & (dvf_38["type_local"] == "Appartement")
    & (dvf_38["nom_commune"] == "Grenoble")
)

# Columns to keep: the four model features followed by the target price.
model_columns = [
    "surface_reelle_bati",
    "nombre_pieces_principales",
    "latitude",
    "longitude",
    "valeur_fonciere",
]

# Select rows and columns in one step, discard incomplete rows, and renumber
# the rows. `reset_index()` is called without `drop=True`, so the previous row
# labels are preserved in an `index` column.
dataset = dvf_38.loc[grenoble_flat_sales, model_columns].dropna().reset_index()
dataset

Unnamed: 0,index,surface_reelle_bati,nombre_pieces_principales,latitude,longitude,valeur_fonciere
0,1,70.0,3.0,45.176163,5.719166,225000.0
1,6,109.0,4.0,45.187065,5.718309,257900.0
2,15,54.0,2.0,45.181912,5.711105,151500.0
3,26,97.0,5.0,45.173124,5.708733,160000.0
4,31,31.0,1.0,45.182767,5.743471,87000.0
...,...,...,...,...,...,...
3523,44672,54.0,3.0,45.179669,5.717220,165500.0
3524,44679,74.0,5.0,45.180877,5.711429,127000.0
3525,44688,61.0,3.0,45.166853,5.726352,110000.0
3526,44691,73.0,3.0,45.181464,5.720759,192000.0


In [4]:
# Feature matrix: the four explanatory columns, in the order the model expects.
X = dataset[
    ["surface_reelle_bati", "nombre_pieces_principales", "latitude", "longitude"]
]
# Target: the recorded sale price.
y = dataset["valeur_fonciere"]

# Hold out 25% of the rows for evaluation. The seed is fixed so that the split,
# and therefore the score and the exported model, are identical on every
# "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [5]:
# Two-step pipeline: standardise the features, then fit a linear regression.
# `fit` returns the fitted estimator, so it can be chained on construction.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("regressor", LinearRegression()),
    ]
).fit(X_train, y_train)
pipeline

In [6]:
# Evaluate the fitted pipeline on the held-out test split.
test_score = pipeline.score(X_test, y_test)
test_score

0.08183264859891926

In [7]:
# Predict the price of a single example apartment.
# The pipeline was fitted on a DataFrame, so the sample is given the same
# column names (and order) as the training features. Passing a bare nested
# list instead makes scikit-learn warn about missing feature names and leaves
# the feature order implicit.
sample = pd.DataFrame(
    [[50, 3, 45.1893525, 5.7216074]],
    columns=X_train.columns,
)
pipeline.predict(sample)



array([272106.51504335])

In [8]:
# One training row, cast to float32, is handed to the converter as the
# example input for the exported model.
onnx_sample_input = X_train[:1].astype(np.float32)
onnx_model = skl2onnx.to_onnx(pipeline, onnx_sample_input)

# Serialise the converted model next to the notebook; the cell displays the
# number of bytes written.
Path("model.onnx").write_bytes(onnx_model.SerializeToString())

650