In [1]:
import pandas as pd
import numpy as np

import sklearn
from sklearn import tree

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df = pd.read_csv('../data/processed/all_rooms_combined.csv', index_col=0)

print(f'Наблюдений в датасете: {df.shape[0]}')

Наблюдений в датасете: 19144


In [3]:
df.head()

Unnamed: 0,rooms,subway,admin_okrug,district,street,home_number,price,year_of_construction,total_meters,kitchen_meters,flat_type,house_type,"dist_to_subway, min",way_to_subway,is_euro,is_skyscraper,floor_type,wc_count,wc_type,class_real
0,1,Спартак,СЗАО,р-н Покровское-Стрешнево,Алиа ЖК,к7,16781328.0,2023.0,37.0,11.0,Новостройка,Монолитный,5.0,пешком,False,False,usual,1,совмещенный,премиум
1,1,Шелепиха,СЗАО,р-н Хорошево-Мневники,Шелепихинская набережная,34к7,16500000.0,2024.0,42.0,12.0,Новостройка,Монолитный,14.0,пешком,False,False,usual,1,совмещенный,бизнес
2,1,Стрешнево,СЗАО,р-н Щукино,Щукинская улица,7/9С7,20540352.0,2023.0,47.0,16.0,Новостройка,Монолитный,10.0,пешком,False,False,usual,2,совмещенный,премиум
3,1,Шелепиха,СЗАО,р-н Хорошево-Мневники,Шелепихинская набережная,34к3,20800000.0,2020.0,32.0,11.0,Вторичка,Монолитный,14.0,пешком,False,False,view,1,совмещенный,премиум
4,1,Спартак,СЗАО,р-н Покровское-Стрешнево,Северо-Западный ао,Клубный Город на Реке Примавера ЖК,30460120.0,2024.0,52.0,11.0,Новостройка,Монолитный,14.0,пешком,False,False,usual,1,раздельный,премиум


In [6]:
df.class_real.unique()

array(['премиум', 'бизнес', 'комфорт', 'эконом', 'элитный'], dtype=object)

In [4]:
df.dtypes

rooms                     int64
subway                   object
admin_okrug              object
district                 object
street                   object
home_number              object
price                   float64
year_of_construction    float64
total_meters            float64
kitchen_meters          float64
flat_type                object
house_type               object
dist_to_subway, min     float64
way_to_subway            object
is_euro                    bool
is_skyscraper              bool
floor_type               object
wc_count                  int64
wc_type                  object
class_real               object
dtype: object

In [5]:
log = ['kitchen_meters', 'total_meters']
categorical = ['way_to_subway', 'subway', 'floor_type', 'wc_type', 'admin_okrug', 'is_skyscraper', 'house_type', 'flat_type','district', 'class_real',]
num = ['dist_to_subway, min', 'wc_count']
ordinal = ['rooms', 'year_of_construction']
boolean = ['is_euro', 'is_skyscraper']

### Разбиваем на X и y

In [6]:
log = ['total_meters', 'kitchen_meters']
categorical = ['admin_okrug', 'subway', 'is_skyscraper', 'class_real', 'way_to_subway']
ordinal = ['rooms']

In [7]:
X = df[log + categorical + ordinal]
y = df['price']/df['total_meters']

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, shuffle=True)

In [10]:
X_train.head(2)

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms
15780,21.0,5.0,СВАО,Отрадное,False,бизнес,на транспорте,9
1177,33.0,15.0,НАО (Новомосковский),Филатов Луг,False,комфорт,пешком,1


In [11]:
def log_transform(x):
    print(x)
    return np.log(x + 1)

In [12]:
log_transformer = FunctionTransformer(log_transform)

In [13]:
col_transformer = ColumnTransformer([("Log transform", log_transformer, log),
                                ("Scale", StandardScaler(), ordinal),
                                ("One hot", OneHotEncoder(sparse=False, handle_unknown='ignore'),categorical)],
                                remainder="passthrough")
X_train_transformed = col_transformer.fit_transform(X_train)

       total_meters  kitchen_meters
15780          21.0             5.0
1177           33.0            15.0
1112           43.0            14.0
941            58.0            10.0
8980           71.0            28.0
...             ...             ...
17960          29.0            10.0
6932           65.0            12.0
13996         147.0            18.0
237            35.0            11.0
15173         201.0            16.0

[12966 rows x 2 columns]


In [14]:
X_test_transformed = col_transformer.transform(X_test)

       total_meters  kitchen_meters
16769           7.0             2.0
2532           37.0            10.0
11934         169.0            25.0
7643           42.0            10.0
17832          21.0             5.0
...             ...             ...
17326          24.0             2.0
2939           24.0            10.0
12999          94.0            22.0
6745           58.0            21.0
5552           68.0            20.0

[3242 rows x 2 columns]


In [15]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import catboost as ctb

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [16]:
df_test = pd.merge(X_test, pd.DataFrame(y_test, columns=['real']), how = 'left', left_index=True, right_index=True)

### Простая линейная регрессия ###

In [168]:
linreg = LinearRegression().fit(X_train_transformed, y_train)
MAE_lr = round(mean_absolute_error(y_test, linreg.predict(X_test_transformed)), 3)
MAE_lr

4510970674390.974

In [169]:
df_test['preds'] = linreg.predict(X_test_transformed)
df_test.head()

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms,real,preds
16769,7.0,2.0,ЗАО,Солнцево,False,бизнес,пешком,9,321428.57,308440.0
2532,37.0,10.0,ЗАО,Раменки,False,бизнес,пешком,1,327027.03,376688.0
11934,169.0,25.0,ЦАО,Выставочная,True,элитный,пешком,5,1202130.18,1030120.0
7643,42.0,10.0,ЦАО,Электрозаводская,False,премиум,пешком,2,442837.14,469888.0
17832,21.0,5.0,ЮАО,ЗИЛ,False,премиум,пешком,9,677945.52,469576.0


In [173]:
df_test['diff'] = df_test['real']-df_test['preds']

In [174]:
df_test.sort_values(by='diff')

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms,real,preds,diff
9886,89.00,12.00,ЦАО,Добрынинская,False,премиум,пешком,3,421348.31,520058304129296.00,-520058303707947.69
17197,18.00,3.00,СВАО,Лось,False,комфорт,на транспорте,9,272222.22,520058303865032.00,-520058303592809.75
9662,50.00,7.00,СВАО,Лось,False,эконом,на транспорте,3,230000.00,520058303816536.00,-520058303586536.00
3048,39.00,9.00,ЮЗАО,Битца,False,комфорт,на транспорте,1,266666.67,520058303816520.00,-520058303549853.31
1910,43.00,10.00,ЮАО,Бирюлёво-Пассажирская,False,комфорт,пешком,1,259302.33,520058303808936.00,-520058303549633.69
...,...,...,...,...,...,...,...,...,...,...,...
9604,202.00,12.00,ЦАО,Парк Культуры,False,элитный,пешком,3,2289287.87,1224552.00,1064735.87
10444,120.00,12.00,ЦАО,Лубянка,False,элитный,пешком,3,2337411.54,1060088.00,1277323.54
13558,302.00,10.00,ЦАО,Пушкинская,False,элитный,пешком,5,2423663.58,1112264.00,1311399.58
11954,459.00,30.00,ЦАО,Третьяковская,False,элитный,пешком,5,4476318.08,1738440.00,2737878.08


### Ridge-регрессия ###

In [176]:
ridge = Ridge(alpha=1000).fit(X_train_transformed, y_train)
MAE_r = round(mean_absolute_error(y_test, ridge.predict(X_test_transformed)), 3)
MAE_r

74763.86

In [177]:
df_test['preds'] = ridge.predict(X_test_transformed)
df_test.head()

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms,real,preds,diff
16769,7.0,2.0,ЗАО,Солнцево,False,бизнес,пешком,9,321428.57,304507.11,12988.57
2532,37.0,10.0,ЗАО,Раменки,False,бизнес,пешком,1,327027.03,367926.27,-49660.97
11934,169.0,25.0,ЦАО,Выставочная,True,элитный,пешком,5,1202130.18,926667.24,172010.18
7643,42.0,10.0,ЦАО,Электрозаводская,False,премиум,пешком,2,442837.14,583372.46,-27050.86
17832,21.0,5.0,ЮАО,ЗИЛ,False,премиум,пешком,9,677945.52,420030.39,208369.52


In [178]:
df_test['diff'] = df_test['real']-df_test['preds']
df_test.sort_values(by='diff')

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms,real,preds,diff
14116,388.00,30.00,ЦАО,Шаболовская,False,комфорт,пешком,5,283247.42,589839.51,-306592.09
12084,225.00,33.00,ЦАО,Октябрьская,False,бизнес,пешком,5,311111.11,609548.70,-298437.59
14385,260.00,20.00,ЦАО,Шелепиха,False,комфорт,пешком,5,288461.54,566721.32,-278259.78
13581,190.00,20.00,ЦАО,Чистые пруды,False,премиум,пешком,5,421052.63,694409.26,-273356.63
14909,224.00,25.00,ЦАО,Тверская,False,премиум,пешком,5,436160.71,700756.57,-264595.85
...,...,...,...,...,...,...,...,...,...,...,...
4571,234.00,60.00,ЦАО,Киевская,False,элитный,на транспорте,2,2390384.62,951428.15,1438956.47
10461,82.00,6.00,ЦАО,Киевская,False,элитный,пешком,3,2344408.59,892161.67,1452246.92
10444,120.00,12.00,ЦАО,Лубянка,False,элитный,пешком,3,2337411.54,882613.66,1454797.88
13558,302.00,10.00,ЦАО,Пушкинская,False,элитный,пешком,5,2423663.58,933470.65,1490192.93


### Lasso регрессия ###

In [179]:
lasso_reg = Lasso(alpha = 2.65e-05).fit(X_train_transformed, y_train)
MAE_lasso = round(mean_absolute_error(y_test, lasso_reg.predict(X_test_transformed)), 3)
MAE_lasso

  model = cd_fast.enet_coordinate_descent(


51763.021

In [180]:
df_test['preds'] = lasso_reg.predict(X_test_transformed)
df_test.head()

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms,real,preds,diff
16769,7.0,2.0,ЗАО,Солнцево,False,бизнес,пешком,9,321428.57,308156.38,16921.46
2532,37.0,10.0,ЗАО,Раменки,False,бизнес,пешком,1,327027.03,376440.05,-40899.24
11934,169.0,25.0,ЦАО,Выставочная,True,элитный,пешком,5,1202130.18,1030115.95,275462.94
7643,42.0,10.0,ЦАО,Электрозаводская,False,премиум,пешком,2,442837.14,469820.13,-140535.32
17832,21.0,5.0,ЮАО,ЗИЛ,False,премиум,пешком,9,677945.52,469348.33,257915.14


In [181]:
df_test['diff'] = df_test['real']-df_test['preds']
df_test.sort_values(by='diff')

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms,real,preds,diff
12215,210.00,20.00,ЦАО,Третьяковская,False,элитный,пешком,5,952380.95,1726073.73,-773692.78
14907,103.00,13.00,ЦАО,Киевская,False,элитный,пешком,5,718446.60,1453890.12,-735443.52
11676,186.00,17.00,ЦАО,Третьяковская,False,премиум,пешком,5,586005.38,1310369.83,-724364.45
13581,190.00,20.00,ЦАО,Чистые пруды,False,премиум,пешком,5,421052.63,1107672.43,-686619.80
14299,164.00,18.00,ЗАО,Киевская,False,элитный,на транспорте,5,701219.51,1324256.05,-623036.53
...,...,...,...,...,...,...,...,...,...,...,...
4571,234.00,60.00,ЦАО,Киевская,False,элитный,на транспорте,2,2390384.62,1479645.08,910739.54
9604,202.00,12.00,ЦАО,Парк Культуры,False,элитный,пешком,3,2289287.87,1224596.32,1064691.55
10444,120.00,12.00,ЦАО,Лубянка,False,элитный,пешком,3,2337411.54,1060104.98,1277306.56
13558,302.00,10.00,ЦАО,Пушкинская,False,элитный,пешком,5,2423663.58,1112358.47,1311305.10


### Decision Tree ###

In [185]:
dt_reg = DecisionTreeRegressor(random_state = 0, max_depth=15).fit(X_train_transformed, y_train)
MAE_dt = round(mean_absolute_error(y_test, dt_reg.predict(X_test_transformed)), 3)
MAE_dt

41546.134

In [186]:
df_test['preds'] = dt_reg.predict(X_test_transformed)
df_test.head()

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms,real,preds
16769,7.0,2.0,ЗАО,Солнцево,False,бизнес,пешком,9,321428.57,340428.81
2532,37.0,10.0,ЗАО,Раменки,False,бизнес,пешком,1,327027.03,340428.81
11934,169.0,25.0,ЦАО,Выставочная,True,элитный,пешком,5,1202130.18,1039768.76
7643,42.0,10.0,ЦАО,Электрозаводская,False,премиум,пешком,2,442837.14,466528.78
17832,21.0,5.0,ЮАО,ЗИЛ,False,премиум,пешком,9,677945.52,474703.2


In [187]:
df_test['diff'] = df_test['real']-df_test['preds']
df_test.sort_values(by='diff')

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms,real,preds,diff
15170,125.00,7.00,ЦАО,Тверская,False,элитный,пешком,5,768000.00,3000944.88,-2232944.88
12668,419.00,18.00,ЗАО,Ломоносовский проспект,False,элитный,на транспорте,5,921412.89,1999632.55,-1078219.66
11206,105.00,10.00,ЗАО,Ломоносовский проспект,False,элитный,пешком,3,700000.00,1545000.00,-845000.00
12495,206.00,25.00,ЦАО,Киевская,False,элитный,на транспорте,5,1224455.83,1971323.53,-746867.70
11673,239.00,82.00,ЦАО,Спортивная,False,элитный,пешком,5,1614965.25,2265392.47,-650427.22
...,...,...,...,...,...,...,...,...,...,...,...
9604,202.00,12.00,ЦАО,Парк Культуры,False,элитный,пешком,3,2289287.87,1039768.76,1249519.11
10444,120.00,12.00,ЦАО,Лубянка,False,элитный,пешком,3,2337411.54,936300.46,1401111.08
10461,82.00,6.00,ЦАО,Киевская,False,элитный,пешком,3,2344408.59,900139.24,1444269.34
13558,302.00,10.00,ЦАО,Пушкинская,False,элитный,пешком,5,2423663.58,940484.69,1483178.89


**Среди базовых моделей лучший результат без настройки гиперпараметров по метрике MAE показывают решающие деревья. Проведем настройку гиперпараметров по кросс-валидации**

In [189]:
from sklearn.model_selection import GridSearchCV

In [190]:
criterion = ['squared_error', 'absolute_error']
splitter = ['best', 'random']
max_depth = [5, 10, 20, 30, 40, 50]

In [191]:
grid = [{'criterion':criterion,
        'splitter':splitter,
        'max_depth':max_depth}]

gs = GridSearchCV(estimator=DecisionTreeRegressor(),
                  param_grid = grid,
                  scoring='neg_mean_absolute_error',
                  cv = 10,
                  n_jobs = -1)

In [192]:
gs = gs.fit(X_train_transformed, y_train)

In [193]:
-1*gs.best_score_

42125.03892028832

In [194]:
gs.best_params_

{'criterion': 'absolute_error', 'max_depth': 40, 'splitter': 'random'}

In [195]:
gs.score(X_test_transformed, y_test)

-38589.45594222635

In [196]:
gs.best_estimator_

In [197]:
df_test['preds'] = gs.best_estimator_.predict(X_test_transformed)
df_test.head()

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms,real,preds,diff
16769,7.0,2.0,ЗАО,Солнцево,False,бизнес,пешком,9,321428.57,292307.69,-19000.24
2532,37.0,10.0,ЗАО,Раменки,False,бизнес,пешком,1,327027.03,335000.0,-13401.79
11934,169.0,25.0,ЦАО,Выставочная,True,элитный,пешком,5,1202130.18,1103905.33,162361.42
7643,42.0,10.0,ЦАО,Электрозаводская,False,премиум,пешком,2,442837.14,466528.78,-23691.64
17832,21.0,5.0,ЮАО,ЗИЛ,False,премиум,пешком,9,677945.52,447761.19,203242.32


In [198]:
df_test['diff'] = df_test['real']-df_test['preds']
df_test.sort_values(by='diff')

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms,real,preds,diff
7386,119.00,31.00,ЦАО,Киевская,False,элитный,на транспорте,2,966386.55,2550487.80,-1584101.25
12495,206.00,25.00,ЦАО,Киевская,False,элитный,на транспорте,5,1224455.83,2477672.81,-1253216.99
9060,150.00,15.00,ЦАО,Третьяковская,False,элитный,пешком,3,1193333.33,2356765.15,-1163431.82
12668,419.00,18.00,ЗАО,Ломоносовский проспект,False,элитный,на транспорте,5,921412.89,1999632.55,-1078219.66
16443,232.00,2.00,ЮАО,Технопарк,False,элитный,пешком,9,862197.17,1888412.02,-1026214.84
...,...,...,...,...,...,...,...,...,...,...,...
11679,433.00,37.00,ЦАО,Улица 1905 года,False,элитный,пешком,5,1653391.45,720000.00,933391.45
10459,110.00,4.00,ЦАО,Киевская,False,элитный,пешком,3,2329340.05,1106557.38,1222782.67
10461,82.00,6.00,ЦАО,Киевская,False,элитный,пешком,3,2344408.59,1106557.38,1237851.21
10444,120.00,12.00,ЦАО,Лубянка,False,элитный,пешком,3,2337411.54,886363.64,1451047.91


In [199]:
df_test['price_real'] = df_test['total_meters'] * df_test['real']
df_test['price_predicted'] = df_test['total_meters'] * df_test['preds']
df_test['diff'] = df_test['price_real'] - df_test['price_predicted']

In [200]:
pd.options.display.float_format = '{:,.2f}'.format

In [201]:
df_test

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms,real,preds,diff,price_real,price_predicted
16769,7.00,2.00,ЗАО,Солнцево,False,бизнес,пешком,9,321428.57,292307.69,203846.15,2250000.00,2046153.85
2532,37.00,10.00,ЗАО,Раменки,False,бизнес,пешком,1,327027.03,335000.00,-295000.00,12100000.00,12395000.00
11934,169.00,25.00,ЦАО,Выставочная,True,элитный,пешком,5,1202130.18,1103905.33,16600000.00,203160000.00,186560000.00
7643,42.00,10.00,ЦАО,Электрозаводская,False,премиум,пешком,2,442837.14,466528.78,-995048.78,18599160.00,19594208.78
17832,21.00,5.00,ЮАО,ЗИЛ,False,премиум,пешком,9,677945.52,447761.19,4833870.93,14236856.00,9402985.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17326,24.00,2.00,ЗАО,Строгино,False,бизнес,пешком,9,337585.00,349922.02,-296088.46,8102040.00,8398128.46
2939,24.00,10.00,СВАО,Алексеевская,False,премиум,пешком,1,528937.50,514534.66,345668.25,12694500.00,12348831.75
12999,94.00,22.00,НАО (Новомосковский),Коммунарка,False,эконом,пешком,5,196808.51,196808.51,0.00,18500000.00,18500000.00
6745,58.00,21.00,СЗАО,Пятницкое шоссе,False,эконом,на транспорте,2,213516.19,214177.47,-38354.00,12383939.00,12422293.00


### CatBoost ###

In [354]:
X_train_ctb = X_train.copy()
X_test_ctb = X_test.copy()

In [355]:
X_train_ctb.dtypes

total_meters      float64
kitchen_meters    float64
admin_okrug        object
subway             object
is_skyscraper        bool
class_real         object
way_to_subway      object
rooms               int64
dtype: object

In [357]:
X_test_ctb.dtypes

total_meters      float64
kitchen_meters    float64
admin_okrug        object
subway             object
is_skyscraper        bool
class_real         object
way_to_subway      object
rooms               int64
dtype: object

In [358]:
X_train_ctb.drop('is_skyscraper', axis=1, inplace=True)
X_test_ctb.drop('is_skyscraper', axis=1, inplace=True)


In [359]:
X_train_ctb

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,class_real,way_to_subway,rooms
15780,21.00,5.00,СВАО,Отрадное,бизнес,на транспорте,9
1177,33.00,15.00,НАО (Новомосковский),Филатов Луг,комфорт,пешком,1
1112,43.00,14.00,ЮЗАО,Калужская,бизнес,пешком,1
941,58.00,10.00,ЦАО,Павелецкая,премиум,пешком,1
8980,71.00,28.00,ЗАО,Фили,премиум,пешком,3
...,...,...,...,...,...,...,...
17960,29.00,10.00,ЦАО,Красносельская,премиум,пешком,9
6932,65.00,12.00,ЮВАО,Некрасовка,эконом,пешком,2
13996,147.00,18.00,ЮЗАО,Нагорная,премиум,пешком,5
237,35.00,11.00,ЮЗАО,Ясенево,бизнес,на транспорте,1


In [360]:
X_train_ctb.dtypes == 'O'

total_meters      False
kitchen_meters    False
admin_okrug        True
subway             True
class_real         True
way_to_subway      True
rooms             False
dtype: bool

In [361]:
ctbst = ctb.CatBoostRegressor(cat_features = list(X_train_ctb.columns[X_train_ctb.dtypes == 'O']),
                                 random_state = 42)
ctbst.fit(X_train_ctb, y_train, verbose = False)
#preds = ctbst.predict(X_test_ctb)

<catboost.core.CatBoostRegressor at 0x2a763aa10>

In [362]:
X_test_ctb[X_test_ctb.way_to_subway.isna()].index

Int64Index([5516], dtype='int64')

In [364]:
X_test_ctb.loc[X_test_ctb[X_test_ctb.way_to_subway.isna()].index, 'way_to_subway'] = 'пешком'

In [365]:
(X_test_ctb['total_meters']=='пешком').sum()

0

In [366]:
X_test_ctb.dtypes

total_meters      float64
kitchen_meters    float64
admin_okrug        object
subway             object
class_real         object
way_to_subway      object
rooms               int64
dtype: object

In [367]:
ctbst_preds = ctbst.predict(X_test_ctb)

In [368]:
len(ctbst_preds)

3242

In [370]:
df_test['preds'] = ctbst_preds
df_test.head()

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms,real,preds,diff,price_real,price_predicted
16769,7.0,2.0,ЗАО,Солнцево,False,бизнес,пешком,9,321428.57,335580.69,203846.15,2250000.0,2046153.85
2532,37.0,10.0,ЗАО,Раменки,False,бизнес,пешком,1,327027.03,357228.16,-295000.0,12100000.0,12395000.0
11934,169.0,25.0,ЦАО,Выставочная,True,элитный,пешком,5,1202130.18,1085436.03,16600000.0,203160000.0,186560000.0
7643,42.0,10.0,ЦАО,Электрозаводская,False,премиум,пешком,2,442837.14,466360.82,-995048.78,18599160.0,19594208.78
17832,21.0,5.0,ЮАО,ЗИЛ,False,премиум,пешком,9,677945.52,482958.07,4833870.93,14236856.0,9402985.07


In [371]:
MAE_ctbst = round(mean_absolute_error(y_test, ctbst_preds), 3)
MAE_ctbst

43013.282

In [379]:
ctbst = ctb.CatBoostRegressor(iterations = 1000, 
                             depth = 15, 
                             learning_rate = 0.04746000096201897,
                             random_strength = 1,
                             border_count = 254,
                             l2_leaf_reg = 3, 
                             grow_policy = 'SymmetricTree',
                             cat_features = list(X_train_ctb.columns[X_train_ctb.dtypes == 'O']),
                             random_state = 42)
ctbst.fit(X_train_ctb, y_train, verbose = False)

<catboost.core.CatBoostRegressor at 0x168439990>

In [380]:
ctbst_preds = ctbst.predict(X_test_ctb)
MAE_ctbst = round(mean_absolute_error(y_test, ctbst_preds), 3)
MAE_ctbst

37538.141

In [381]:
df_test['preds'] = ctbst_preds
df_test.head()

Unnamed: 0,total_meters,kitchen_meters,admin_okrug,subway,is_skyscraper,class_real,way_to_subway,rooms,real,preds,diff,price_real,price_predicted
16769,7.0,2.0,ЗАО,Солнцево,False,бизнес,пешком,9,321428.57,328386.85,203846.15,2250000.0,2046153.85
2532,37.0,10.0,ЗАО,Раменки,False,бизнес,пешком,1,327027.03,358067.89,-295000.0,12100000.0,12395000.0
11934,169.0,25.0,ЦАО,Выставочная,True,элитный,пешком,5,1202130.18,1163386.42,16600000.0,203160000.0,186560000.0
7643,42.0,10.0,ЦАО,Электрозаводская,False,премиум,пешком,2,442837.14,435793.31,-995048.78,18599160.0,19594208.78
17832,21.0,5.0,ЮАО,ЗИЛ,False,премиум,пешком,9,677945.52,492418.53,4833870.93,14236856.0,9402985.07


### RandomForest Regressor ###

In [17]:
rf = RandomForestRegressor(criterion='squared_error', n_estimators = 300)

In [18]:
rf.fit(X_train_transformed, y_train)

In [19]:
rf_preds = rf.predict(X_test_transformed)
MAE_rf = round(mean_absolute_error(y_test, rf_preds), 3)
MAE_rf

32505.754

In [None]:
rf2 = RandomForestRegressor(criterion='absolute_error', n_estimators = 100)
rf2.fit(X_train_transformed, y_train)

In [None]:
rf2_preds = rf2.predict(X_test_transformed)
MAE_rf2 = round(mean_absolute_error(y_test, rf2_preds), 3)
MAE_rf2

In [203]:
# ### Гиперпараметры леса
# choices = pd.DataFrame(index = range(0, 1000, 1), columns = ['n_estimators',
#                                                             'max_depth',
#                                                             'min_samples_split',
#                                                             'min_samples_leaf',
#                                                             'max_features',
#                                                             'max_leaf_nodes',
#                                                             'bootstrap',
#                                                             'max_samples',
#                                                             'threshold'])
# # 'special' for None
# choices['n_estimators'] = np.random.RandomState(42).choice([100, 300, 500, 700, 1000]*200, 1000, replace = False)
# choices['max_depth'] = np.random.RandomState(43).choice([2, 4, 6, 8, 10]*200, 1000, replace = False)
# choices['min_samples_split'] = np.random.RandomState(44).choice([2, 5, 10, 20, 40]*200, 1000, replace = False)
# choices['min_samples_leaf'] = np.random.RandomState(45).choice([1, 2, 5, 10, 20]*200, 1000, replace = False)
# choices['max_features'] = np.random.RandomState(46).choice([0.7, 'log2', 'sqrt', 'auto']*250, 1000, replace = False)
# choices['max_leaf_nodes'] = np.random.RandomState(47).choice([5, 10, 20, 'special']*250, 1000, replace = False)
# choices['bootstrap'] = np.random.RandomState(48).choice([True]*800 + [False]*200, 1000, replace = False)
# choices['max_samples'] = np.random.RandomState(49).choice([0.5, 0.7, 0.9, 'special']*250, 1000, replace = False)
# choices['threshold'] = np.random.RandomState(50).choice([0.05, 0.01, 0.005, 0.001]*250, 1000, replace = False)

# choices.loc[~choices.max_features.str.contains('log2|sqrt|auto'), 'max_features'] = \
# choices.loc[~choices.max_features.str.contains('log2|sqrt|auto'), 'max_features'].astype(np.float64)
# choices.loc[choices.max_leaf_nodes != 'special', 'max_leaf_nodes'] = \
# choices.loc[choices.max_leaf_nodes != 'special', 'max_leaf_nodes'].astype(np.int64)
# choices.loc[choices.max_samples != 'special', 'max_samples'] = \
# choices.loc[choices.max_samples != 'special', 'max_samples'].astype(np.float64)

# ### Гиперпараметры бустинга
# choices_bstr = pd.DataFrame(index = range(0, 500, 1), columns = ['iterations', 
#                                                                  'depth',  
#                                                                  'learning_rate', 
#                                                                  'random_strength',  
#                                                                  'bagging_temperature', 
#                                                                  'border_count',  
#                                                                  'l2_leaf_reg',  
#                                                                  'grow_policy',  
#                                                                  'threshold'])
# choices_bstr['iterations'] = np.random.RandomState(42).choice([250, 500, 1000, 1500]*125, 500, replace = False)
# choices_bstr['depth'] = np.random.RandomState(43).choice([2, 4, 6, 8]*125, 500, replace = False)
# choices_bstr['learning_rate'] = np.random.RandomState(44).choice([0.001, 0.01, 0.1, 0.2, 0.3]*100, 500, replace = False)
# choices_bstr['random_strength'] = np.random.RandomState(45).choice([0, 0.3, 0.6, 1]*125, 500, replace = False)
# choices_bstr['bagging_temperature'] = np.random.RandomState(46).choice([0, 0.7, 0.9, 1]*125, 500, replace = False)
# choices_bstr['border_count'] = np.random.RandomState(47).choice([60, 128, 200, 254]*125, 500, replace = False)
# choices_bstr['l2_leaf_reg'] = np.random.RandomState(48).choice([1, 3, 5, 10]*125, 500, replace = False)
# choices_bstr['grow_policy'] = np.random.RandomState(49).choice(['SymmetricTree', 'Depthwise']*250, 500, replace = False)
# choices_bstr['threshold'] = np.random.RandomState(50).choice([0.01, 0.005, 0.001, 0.0005]*125, 500, replace = False)

**Можно сказать, что наибольшей важностью для правильности предсказания уровня арендной платы являются:**
- площадь квартиры
- административный район Москвы (расположение в ЦАО существеннее всего влияет на арендную плату)
- этаж и количество этажей в доме
- наличие посудомоечной машины