In [1]:
import pandas as pd


In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [3]:
df = pd.read_csv("modeling_data.csv")

In [25]:
df.shape

(4314, 10)

In [5]:
df.head()

Unnamed: 0,efficiency_category,floor,has_balcony,has_elevator,has_garage,property_state,building_type,area,prague_district,price
0,Třída G,5,True,True,False,Velmi dobrý,Cihlová,200,5005.0,75000.0
1,Třída G,3,True,True,False,Velmi dobrý,Cihlová,105,5002.0,43000.0
2,Třída G,2,False,True,False,Velmi dobrý,Cihlová,106,5001.0,42000.0
3,Třída G,5,False,True,True,Velmi dobrý,Cihlová,100,5005.0,39000.0
4,Třída G,2,True,True,False,Velmi dobrý,Smíšená,174,5001.0,76000.0


In [26]:
df.building_type.value_counts()

Cihlová      3425
Smíšená       386
Panelová      358
Skeletová     134
Other          11
Name: building_type, dtype: int64

In [4]:
df.isna().sum()

efficiency_category    0
floor                  0
has_balcony            0
has_elevator           0
has_garage             0
property_state         0
building_type          0
area                   0
prague_district        0
price                  0
dtype: int64

In [16]:
X = df.loc[:, [col for col in df.columns if col!='price']]

In [18]:
y = df.price

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [6]:
categorical_cols = ['efficiency_category', 'property_state', 'building_type', 'prague_district']
numerical_cols = ['floor', 'area']

In [10]:
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
numerical_transformer = MinMaxScaler()

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [9]:
model = RandomForestRegressor(n_estimators=100, random_state=0)

In [12]:
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])

In [22]:
# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_test)

MSE: 5363.793625207229


In [24]:
# Evaluate the model
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 5363.793625207229


In [27]:
X_train.shape

(2890, 9)