In [1]:
import pandas as pd

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [4]:
df = pd.read_csv("modeling_data.csv")

In [5]:
df.shape

(4335, 11)

In [6]:
df.head()

Unnamed: 0,efficiency_category,has_balcony,has_elevator,has_garage,property_state,building_type,area,prague_district,rooms,floor,price
0,Třída G,True,True,False,Velmi dobrý,Cihlová,200,5005.0,6 a vice,5,75000.0
1,Třída G,True,True,False,Velmi dobrý,Cihlová,105,5002.0,3+1,3,43000.0
2,Třída G,False,True,False,Velmi dobrý,Cihlová,106,5001.0,3+1,2,42000.0
3,Třída G,False,True,True,Velmi dobrý,Cihlová,100,5005.0,4+kk,5,39000.0
4,Třída G,True,True,False,Velmi dobrý,Smíšená,174,5001.0,4+1,2,76000.0


In [7]:
df.isna().sum()

efficiency_category    0
has_balcony            0
has_elevator           0
has_garage             0
property_state         0
building_type          0
area                   0
prague_district        0
rooms                  0
floor                  0
price                  0
dtype: int64

In [8]:
X = df.loc[:, [col for col in df.columns if col!='price']]

In [9]:
y = df.price

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [19]:
categorical_cols = ['efficiency_category', 'property_state', 'building_type', 'prague_district', 'rooms']
numerical_cols = ['floor', 'area']

In [20]:
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
numerical_transformer = MinMaxScaler()

In [21]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [22]:
model = RandomForestRegressor(n_estimators=100, random_state=0)

In [23]:
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])

In [24]:
# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_test)

In [25]:
# Evaluate the model
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 5679.345170595458


In [18]:
y.mean()

28320.799307958478