In [82]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from joblib import dump

In [84]:
# Location of the raw listings file, resolved relative to the notebook's working directory.
DATA_PATH = r"property_data.csv"

# Read the raw property listings and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,zip,province,building_type,bedroom,construction_Year,price,heating_type,land_surface
0,2940,Antwerp,HOUSE,3,1955,250000,ELECTRIC,340
1,2070,Antwerp,HOUSE,3,1955,389000,GAS,256
2,2910,Antwerp,HOUSE,3,1947,285000,GAS,427
3,2950,Antwerp,HOUSE,5,1950,299000,GAS,140
4,2300,Antwerp,HOUSE,3,2022,421400,GAS,324


In [85]:
# Features: every column except the target. `df` itself is left untouched.
x = df.drop(columns=["price"])
# Target: the listing price.
y = df["price"]

In [86]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [88]:
# Numeric columns: fill missing values with the column median, then standardize.
numeric_features = ['land_surface', 'bedroom', 'construction_Year']
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# Categorical columns: one-hot encode into a dense array. With
# handle_unknown="ignore", a category that was not seen during fit is encoded
# as all zeros at predict time instead of raising an error.
categorical_features = ['zip', 'building_type', 'province', 'heating_type']
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the older `sparse` keyword was deprecated in 1.2 and removed in 1.4.
    categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    # Older scikit-learn releases only accept the `sparse` keyword.
    categorical_transformer = OneHotEncoder(sparse=False, handle_unknown="ignore")

# Route each column group through its own transformer. Columns that appear in
# neither list are not forwarded to the model (ColumnTransformer's default
# remainder is "drop").
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [89]:
# Full model: preprocessing followed by a random forest regressor.
# random_state is pinned (same seed as the train/test split) so that re-running
# the notebook rebuilds the same forest, the same saved artifact and the same
# metrics; an unseeded forest gives a different score on every run.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("module", RandomForestRegressor(random_state=0)),
    ]
)

pipe.fit(X_train, y_train)

# Persist the fitted pipeline (preprocessing + model) as a single artifact.
dump(pipe, 'pipe.joblib')

# Pipeline.score delegates to the final estimator's score on the held-out split
# (R^2 for a regressor).
print("model score: %.3f" % pipe.score(X_test, y_test))

model score: 0.696


In [91]:
# Predict on the held-out rows and report the mean absolute error,
# expressed in the same units as the target.
preds = pipe.predict(X_test)
mae = mean_absolute_error(y_test, preds)

print('MAE:', mae)


MAE: 155972.5649865857
