In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Switch the kernel's working directory to src/data/gold — presumably so that the
# `rentals` module imported in the next cell can be resolved; confirm.
# NOTE(review): absolute, machine-specific path; this cell fails on any other machine.
cd /Users/martin/Git/estates/src/data/gold

/Users/martin/Git/estates/src/data/gold


In [76]:
from rentals import load_rentals

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, Binarizer, FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import make_column_transformer

In [4]:
# Switch the kernel's working directory to src/models — presumably so that the
# `utils` module imported in the next cell can be resolved; confirm.
# NOTE(review): absolute, machine-specific path; this cell fails on any other machine.
cd /Users/martin/Git/estates/src/models

/Users/martin/Git/estates/src/models


In [5]:
from utils import GroupImputer, DataFrameTransformer

In [6]:
# Directory handed to `load_rentals` (the "silver" data layer).
# NOTE(review): absolute, machine-specific path.
SILVER_DIR = '/Users/martin/Git/estates/data/silver'

# Load the rentals dataset used by every cell below.
rentals = load_rentals(SILVER_DIR)

In [7]:
# Preview the first five rows to sanity-check the loaded frame.
rentals.head()

Unnamed: 0,area_m2,district,disposition,category,furnishing,efficiency,floor,building_type,building_state,ownership,...,bus,doctors,school,kindergarten,pub,post_office,restaurant,seven_eleven,playground,price
1,30.0,brno-mesto,1+kk,flat,,,3.0,Cihlová,Velmi dobrý,Osobní,...,,,,,,,,,,11200
4,55.0,ostrava-mesto,2+1,flat,,5.0,3.0,Cihlová,Po rekonstrukci,Osobní,...,232.0,254.0,192.0,266.0,175.0,490.0,241.0,280.0,317.0,7950
5,28.0,karvina,1+1,flat,,5.0,2.0,Cihlová,Dobrý,Osobní,...,112.0,272.0,133.0,293.0,144.0,430.0,538.0,174.0,1246.0,3450
6,56.0,karvina,2+1,flat,,5.0,4.0,Cihlová,Dobrý,Osobní,...,294.0,193.0,640.0,114.0,199.0,426.0,231.0,292.0,284.0,6000
8,350.0,,rodinny,house,0.0,7.0,,Cihlová,Novostavba,,...,602.0,1111.0,1043.0,577.0,1427.0,1039.0,577.0,1039.0,1286.0,90000


In [72]:
# List every column; the names are referenced by the imputers and binarizer below.
rentals.columns

Index(['area_m2', 'district', 'disposition', 'category', 'furnishing',
       'efficiency', 'floor', 'building_type', 'building_state', 'ownership',
       'tram', 'elevator', 'theatre', 'cinema', 'groceries', 'candy_shop',
       'veterinary', 'train', 'pharmacist', 'atm', 'sports', 'bus', 'doctors',
       'school', 'kindergarten', 'pub', 'post_office', 'restaurant',
       'seven_eleven', 'playground', 'price'],
      dtype='object')

In [62]:
# Replace missing 'district' / 'disposition' values with the placeholder
# category 'ostatni' (Czech for "other"); all other columns pass through.
constant_imputer = DataFrameTransformer(
    transformers=[
        (
            'constant_imputer',
            SimpleImputer(strategy='constant', fill_value='ostatni'),
            ['district', 'disposition'],
        ),
    ],
    remainder='passthrough',
)

# Replace missing 'furnishing' / 'elevator' values with 0; all other columns
# pass through.
zero_imputer = DataFrameTransformer(
    transformers=[
        (
            'zero_imputer',
            SimpleImputer(strategy='constant', fill_value=0),
            ['furnishing', 'elevator'],
        ),
    ],
    remainder='passthrough',
)

# Replace missing values in the listed columns with each column's most frequent
# value; all other columns pass through.
mode_imputer = DataFrameTransformer(
    transformers=[
        (
            'mode_imputer',
            SimpleImputer(strategy='most_frequent'),
            [
                'category',
                'efficiency',
                'floor',
                'building_type',
                'building_state',
                'ownership',
            ],
        ),
    ],
    remainder='passthrough',
)

# Impute 'area_m2' with the project's GroupImputer, configured with
# group_cols=['disposition'] and metric='median' — presumably this fills a missing
# area with the median area of rows sharing the same disposition; confirm against
# utils.GroupImputer. All other columns pass through.
#
# The transformer is labelled 'group_imputer' (it was previously labelled
# 'mode_imputer', a copy-paste leftover) so that its name matches its variable,
# like every other imputer in this notebook.
group_imputer = DataFrameTransformer(transformers = [(
    'group_imputer',
    GroupImputer(group_cols=['disposition'], target='area_m2', metric='median'),
    ['disposition', 'area_m2'])], remainder='passthrough')

# One-hot encode the categorical columns; all other columns pass through.
#
# handle_unknown='ignore' makes a category that was not seen during fit encode
# as an all-zero row at transform time instead of raising. With a random
# train/test split, a rare category (e.g. a small district) can end up only in
# the test set, which would otherwise make transform fail.
encoder = DataFrameTransformer([(
    'encoder',
    OneHotEncoder(handle_unknown='ignore'),
    ['district', 'disposition', 'category', 'building_type', 'building_state', 'ownership'])],
    remainder='passthrough'
)

In [69]:
# Imputation stages, applied in list order. constant_imputer fills 'disposition'
# first; group_imputer, which groups on 'disposition', runs last.
imputation_steps = [
    ('constant_imputer', constant_imputer),
    ('mode_imputer', mode_imputer),
    ('zero_imputer', zero_imputer),
    ('group_imputer', group_imputer),
]

imputer = Pipeline(steps=imputation_steps)

In [77]:
# Cut-off below which an amenity counts as "nearby".
# NOTE(review): the unit is not visible in this notebook (presumably metres) — confirm.
NEARBY_THRESHOLD = 500

# Amenity-distance columns that are converted into nearby / not-nearby flags.
# NOTE(review): 'tram' appears in `rentals.columns` alongside these columns but is
# not listed here, so it passes through as a raw value (including missing
# entries) — confirm this is intentional.
AMENITY_COLUMNS = [
    'theatre', 'cinema', 'groceries', 'candy_shop',
    'veterinary', 'train', 'pharmacist', 'atm', 'sports', 'bus', 'doctors',
    'school', 'kindergarten', 'pub', 'post_office', 'restaurant',
    'seven_eleven', 'playground',
]


def is_nearby(distance):
    """Return an element-wise mask that is True where `distance` < NEARBY_THRESHOLD.

    A missing distance (NaN) compares as False, i.e. it is treated as "not nearby".

    This is a named function rather than a lambda so that the standard `pickle`
    module can serialize the transformer that wraps it.
    """
    return distance < NEARBY_THRESHOLD


# Flag each amenity column as nearby / not nearby; all other columns pass through.
binarizer = DataFrameTransformer(transformers = [(
    'binarizer',
    FunctionTransformer(is_nearby),
    AMENITY_COLUMNS
)], remainder='passthrough')

In [78]:
# Full preprocessing chain: fill missing values first, then binarize the
# amenity-distance columns.
preprocessing_steps = [
    ('imputer', imputer),
    ('binarizer', binarizer),
]

preprocessor = Pipeline(steps=preprocessing_steps)

In [46]:
# Separate the target column ('price') from the features.
X = rentals.drop(columns='price')
y = rentals['price']

# Hold out 20 % of the rows for testing; random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [79]:
# Fit the preprocessing pipeline on the training features only and transform them.
preprocessed = preprocessor.fit_transform(X_train)

In [80]:
# Display the transformed training frame to inspect the pipeline's result.
# NOTE(review): the rendered output still shows blanks in the passthrough 'tram'
# column — confirm downstream steps can handle missing values there.
preprocessed

Unnamed: 0,theatre,cinema,groceries,candy_shop,veterinary,train,pharmacist,atm,sports,bus,...,furnishing,elevator,category,efficiency,floor,building_type,building_state,ownership,district,tram
11055,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,1.0,flat,3.0,3.0,Cihlová,Po rekonstrukci,Osobní,praha 5,148.0
2538,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,flat,2.0,2.0,Smíšená,Novostavba,Osobní,praha 9,116.0
14578,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,flat,7.0,1.0,Cihlová,Dobrý,Státní/obecní,praha 1,321.0
69819,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,flat,3.0,7.0,Panelová,Velmi dobrý,Osobní,ostrava-mesto,322.0
27198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.5,0.0,house,3.0,2.0,Cihlová,Novostavba,Osobní,praha 10,1998.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18457,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,flat,7.0,3.0,Cihlová,Dobrý,Osobní,ostatni,
11158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.5,0.0,flat,7.0,2.0,Cihlová,Novostavba,Osobní,ostatni,
49506,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,flat,7.0,3.0,Cihlová,Dobrý,Osobní,plzen-mesto,115.0
56835,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.5,0.0,flat,7.0,1.0,Cihlová,Velmi dobrý,Osobní,brno-mesto,454.0
