In [1]:
import os
import time
import datetime
import pandas as pd
import sklearn.metrics as sm
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
import pickle
import joblib


In [2]:
test_data = {
  "property_type": "HOUSE",
  "property_subtype": "VILLA",
  "kitchen": "SEMI_EQUIPPED",
  "building_state": "TO RENOVATE",
  "region": "Flanders",
  "province": "West Flanders",
  "number_rooms": 4,
  "living_area": 150,
  "surface_land": 200,
  "number_facades": 2,
  "latitude": 51.208887,
  "longitude": 3.445221
}


In [3]:
df = pd.read_csv('data/cleaned.csv').drop('Unnamed: 0', axis=1)

In [4]:
df.drop_duplicates()

Unnamed: 0,locality,property_type,property_subtype,price,number_rooms,living_area,kitchen,furnished,fireplace,terrace,...,garden,garden_area,surface_land,number_facades,swimming_pool,building_state,region,province,latitude,longitude
0,Ham,HOUSE,HOUSE,399500.0,3.0,197.0,HYPER_EQUIPPED,0,0.0,0,...,0,0.0,461.0,3.0,0,AS_NEW,Flanders,Limburg,49.746322,3.073303
1,Ham,HOUSE,HOUSE,381000.0,3.0,197.0,HYPER_EQUIPPED,0,0.0,0,...,0,0.0,343.0,3.0,0,AS_NEW,Flanders,Limburg,49.746322,3.073303
2,Ham,HOUSE,HOUSE,393500.0,3.0,192.0,HYPER_EQUIPPED,0,0.0,0,...,0,0.0,392.0,3.0,0,AS_NEW,Flanders,Limburg,49.746322,3.073303
3,Ham,HOUSE,HOUSE,315000.0,3.0,197.0,NOT_INSTALLED,0,0.0,0,...,0,0.0,509.0,3.0,0,AS_NEW,Flanders,Limburg,49.746322,3.073303
4,Ham,HOUSE,HOUSE,305000.0,3.0,195.0,NOT_INSTALLED,0,0.0,0,...,0,0.0,453.0,3.0,0,AS_NEW,Flanders,Limburg,49.746322,3.073303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9899,Geraardsbergen Ophasselt,HOUSE,HOUSE,449000.0,3.0,168.0,HYPER_EQUIPPED,0,0.0,1,...,1,150.0,480.0,3.0,0,AS_NEW,Flanders,East Flanders,50.821858,3.897035
9900,Geraardsbergen Ophasselt,HOUSE,HOUSE,549000.0,3.0,180.0,HYPER_EQUIPPED,0,0.0,1,...,1,200.0,705.0,4.0,0,AS_NEW,Flanders,East Flanders,50.821858,3.897035
9901,Geraardsbergen Ophasselt,HOUSE,HOUSE,449000.0,3.0,168.0,HYPER_EQUIPPED,0,0.0,1,...,1,200.0,517.0,3.0,0,AS_NEW,Flanders,East Flanders,50.821858,3.897035
9902,Sint-Martens-Latem,HOUSE,HOUSE,599000.0,4.0,199.0,HYPER_EQUIPPED,0,0.0,1,...,0,0.0,1104.0,4.0,0,TO_RENOVATE,Flanders,East Flanders,51.020971,3.639755


In [5]:
cat_cols = ['property_type','property_subtype','kitchen','building_state','region','province']
numerical_cols = ['price','number_rooms', 'living_area', 'surface_land', 'number_facades','latitude','longitude']
use_cols = ['number_rooms', 'living_area', 'surface_land', 'number_facades','latitude','longitude']

In [6]:
X = df.drop(columns=['price'], axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=22)

encoder = OneHotEncoder(handle_unknown='ignore')
X_train_enc = encoder.fit_transform(X_train[cat_cols])
X_test_enc = encoder.transform(X_test[cat_cols])

joblib.dump(encoder, 'models/encoder.joblib')

scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train[use_cols])
X_test_scale = scaler.transform(X_test[use_cols])

joblib.dump(scaler, 'models/scaler.joblib')

encoded_columns = encoder.get_feature_names_out(input_features=cat_cols)
X_train_enc_df = pd.DataFrame(X_train_enc.toarray(), columns=encoded_columns)
X_test_enc_df = pd.DataFrame(X_test_enc.toarray(), columns=encoded_columns)

X_train_merged = pd.concat([pd.DataFrame(X_train_scale, columns=use_cols), X_train_enc_df], axis=1)
X_test_merged = pd.concat([pd.DataFrame(X_test_scale, columns=use_cols), X_train_enc_df], axis=1)

X_test_merged = X_test_merged.dropna()
X_train_merged = X_train_merged.dropna()


In [7]:
grid_results = {'colsample_bytree': 0.3, 'gamma': 0.0, 'learning_rate': 0.15, 'max_depth': 8, 'min_child_weight': 1}
model = XGBRegressor(**grid_results)

model.fit(X_train_merged, y_train)
y_preds = model.predict(X_test_merged)

In [8]:
y_preds

array([201332.19, 304624.97, 571249.75, ..., 330403.12, 207846.73,
       776954.25], dtype=float32)

In [9]:
with open('models/xgb_regression_model.pkl', 'wb') as f:
        pickle.dump(model, f)

In [10]:
allcolumns = cat_cols + numerical_cols
test_df = df[allcolumns]

test_df

Unnamed: 0,property_type,property_subtype,kitchen,building_state,region,province,price,number_rooms,living_area,surface_land,number_facades,latitude,longitude
0,HOUSE,HOUSE,HYPER_EQUIPPED,AS_NEW,Flanders,Limburg,399500.0,3.0,197.0,461.0,3.0,49.746322,3.073303
1,HOUSE,HOUSE,HYPER_EQUIPPED,AS_NEW,Flanders,Limburg,381000.0,3.0,197.0,343.0,3.0,49.746322,3.073303
2,HOUSE,HOUSE,HYPER_EQUIPPED,AS_NEW,Flanders,Limburg,393500.0,3.0,192.0,392.0,3.0,49.746322,3.073303
3,HOUSE,HOUSE,NOT_INSTALLED,AS_NEW,Flanders,Limburg,315000.0,3.0,197.0,509.0,3.0,49.746322,3.073303
4,HOUSE,HOUSE,NOT_INSTALLED,AS_NEW,Flanders,Limburg,305000.0,3.0,195.0,453.0,3.0,49.746322,3.073303
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9899,HOUSE,HOUSE,HYPER_EQUIPPED,AS_NEW,Flanders,East Flanders,449000.0,3.0,168.0,480.0,3.0,50.821858,3.897035
9900,HOUSE,HOUSE,HYPER_EQUIPPED,AS_NEW,Flanders,East Flanders,549000.0,3.0,180.0,705.0,4.0,50.821858,3.897035
9901,HOUSE,HOUSE,HYPER_EQUIPPED,AS_NEW,Flanders,East Flanders,449000.0,3.0,168.0,517.0,3.0,50.821858,3.897035
9902,HOUSE,HOUSE,HYPER_EQUIPPED,TO_RENOVATE,Flanders,East Flanders,599000.0,4.0,199.0,1104.0,4.0,51.020971,3.639755


In [11]:
# load encoder and scaler from original training

encoder = joblib.load('models/encoder.joblib')
scaler = joblib.load('models/scaler.joblib')

encoded_data = encoder.transform(test_df[cat_cols])
scaled_data = scaler.transform(test_df[use_cols])

scaled_data

array([[0.42857143, 0.53142857, 0.31749311, 0.66666667, 0.87206752,
        0.39674502],
       [0.42857143, 0.53142857, 0.2362259 , 0.66666667, 0.87206752,
        0.39674502],
       [0.42857143, 0.51714286, 0.26997245, 0.66666667, 0.87206752,
        0.39674502],
       ...,
       [0.42857143, 0.44857143, 0.35606061, 0.66666667, 0.88362702,
        0.40010814],
       [0.57142857, 0.53714286, 0.76033058, 1.        , 0.88576702,
        0.39905772],
       [0.28571429, 0.64285714, 0.2768595 , 0.66666667, 0.88576702,
        0.39905772]])

In [15]:
import requests

data = {
  "property_type": "HOUSE",
  "property_subtype": "VILLA",
  "kitchen": "SEMI_EQUIPPED",
  "building_state": "TO RENOVATE",
  "region": "Flanders",
  "province": "West Flanders",
  "number_rooms": 4,
  "living_area": 150,
  "surface_land": 200,
  "number_facades": 2,
  "latitude": 51.208887,
  "longitude": 3.445221
}

response = requests.post('https://house-prediction-model-api.onrender.com', data)
print(X)

<Response [405]>
