# Курсовая работа студента факультета искусственного интеллекта онлайн-университета [Geek University](https://geekbrains.ru/geek_university)

## В рамках курса "Python для Data Science"  

**Автор**: Кабанов Сергей  
**Преподаватель**: Ширкин Сергей

**Материалы к проекту (файлы)**:  
train.csv  
test.csv

**Задание**:  
Используя данные из train.csv, построить модель для предсказания цен на недвижимость (квартиры). С помощью полученной модели предсказать цены для квартир из файла test.csv.

**Целевая переменная**:  
Price

**Основная метрика**:  
R2 - коэффициент детерминации (sklearn.metrics.r2_score)

**Вспомогательная метрика**:  
MSE - средняя квадратичная ошибка (sklearn.metrics.mean_squared_error)

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score as r2, mean_squared_error as mse

In [2]:
RANDOM_STATE = 42

In [3]:
TRAIN_DATA_FILE = 'train.csv'
TEST_DATA_FILE = 'test.csv'

### Data load

In [4]:
data_train = pd.read_csv(TRAIN_DATA_FILE)
data_test = pd.read_csv(TEST_DATA_FILE)

In [5]:
data_train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [6]:
data_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,10000.0,8383.4077,4859.01902,0.0,4169.5,8394.5,12592.5,16798.0
DistrictId,10000.0,50.4008,43.587592,0.0,20.0,36.0,75.0,209.0
Rooms,10000.0,1.8905,0.839512,0.0,1.0,2.0,2.0,19.0
Square,10000.0,56.315775,21.058732,1.136859,41.774881,52.51331,65.900625,641.0652
LifeSquare,7887.0,37.199645,86.241209,0.370619,22.769832,32.78126,45.128803,7480.592
KitchenSquare,10000.0,6.2733,28.560917,0.0,1.0,6.0,9.0,2014.0
Floor,10000.0,8.5267,5.241148,1.0,4.0,7.0,12.0,42.0
HouseFloor,10000.0,12.6094,6.775974,0.0,9.0,13.0,17.0,117.0
HouseYear,10000.0,3990.1663,200500.261427,1910.0,1974.0,1977.0,2001.0,20052010.0
Ecology_1,10000.0,0.118858,0.119025,0.0,0.017647,0.075424,0.195781,0.5218671


### Init data prepare

In [7]:
# посмотрим на слишком большие знаечения поля HouseYear
data_train.loc[data_train['HouseYear'] > 2020]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
1497,10814,109,1.0,37.26507,20.239714,9.0,9,12.0,20052011,0.13633,B,B,30,6141,10,262.0,3,6,B,254084.534396
4189,11607,147,2.0,44.791836,28.360393,5.0,4,9.0,4968,0.319809,B,B,25,4756,16,2857.0,5,8,B,243028.603096


In [8]:
# заменим 20052011 на 2005
data_train.loc[data_train['HouseYear'] == 20052011, 'HouseYear'] = 2005

# заменим 4968 на 1968
data_train.loc[data_train['HouseYear'] == 4968, 'HouseYear'] = 1968

In [9]:
# посмотрим на большие значения поля HouseFloor
data_train.loc[data_train['HouseFloor'] > 50]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
6131,10806,5,1.0,51.944587,48.709601,1.0,6,99.0,1977,0.150818,B,B,16,3433,4,2643.0,4,5,B,296127.115515
8599,9300,74,2.0,71.747869,74.579809,9.0,5,99.0,1977,0.075779,B,B,6,1437,3,,0,2,B,243329.912579
8854,78,30,2.0,65.773749,66.811789,1.0,8,117.0,1977,7.8e-05,B,B,22,6398,141,1046.0,3,23,B,207248.37052


In [10]:
# заменим 99 на 9
data_train.loc[data_train['HouseFloor'] == 99, 'HouseFloor'] = 9

# заменим 117 на 17
data_train.loc[data_train['HouseFloor'] == 117, 'HouseFloor'] = 17

In [11]:
# посмотрим на квартиры с очень большой площадью, оставим только понятные столбцы
data_train.loc[data_train['Square'] > 200, ['Id', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
                                            'Floor', 'HouseFloor', 'Price']]

Unnamed: 0,Id,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,Price
1981,7917,0.0,212.932361,211.231125,0.0,2,3.0,302211.260887
1982,5548,5.0,275.645284,233.949309,26.0,12,37.0,455264.882666
4262,28,2.0,604.705972,,1.0,17,18.0,187717.242538
4690,2307,1.0,409.425181,410.639749,10.0,4,4.0,90470.43083
6977,11602,2.0,641.065193,638.163193,10.0,20,19.0,133529.681562
9910,16568,4.0,200.334539,201.627361,25.0,1,2.0,528560.506016


In [12]:
# Похоже, что у квартиры с Id 28, 2307, 11602 судя по их стоимости ошибка в полях LifeSquare и Square. 
# Значения завышены в 10 раз. Для остальных квартир площадь выглядит нормально.
# уменьшим площади в 10 раз для данных квартир
data_train.loc[data_train['Id'].isin([28, 2307, 11602]), ['Square', 'LifeSquare']] = \
                                        data_train.loc[data_train['Id'].isin([28, 2307, 11602]), ['Square', 'LifeSquare']] / 10

In [13]:
# посмотрим на квартиры с большим количеством комнат
data_train.loc[data_train['Rooms'] > 5]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
377,5927,57,10.0,59.056975,36.223072,10.0,22,22.0,2002,0.090799,B,B,74,19083,2,,5,15,B,317265.323792
1454,8491,1,19.0,42.006046,21.779288,7.0,17,17.0,2014,0.007122,B,B,1,264,0,,0,1,B,78364.616704
2170,14003,99,6.0,59.414334,38.702244,6.0,7,9.0,1969,0.033494,B,B,66,10573,1,1322.0,3,8,B,229661.964416
8849,14865,9,10.0,60.871266,38.420681,10.0,3,2.0,1994,0.161532,B,B,25,5648,1,30.0,2,4,B,172329.270863


In [14]:
# посмотрим медианные значения площади для 1, 2 и 3 комнатных квартир
data_train.loc[data_train['Rooms'].isin([1,2,3]), ['Rooms', 'Square']].groupby(by='Rooms').median()

Unnamed: 0_level_0,Square
Rooms,Unnamed: 1_level_1
1.0,40.40659
2.0,55.841812
3.0,77.413643


In [15]:
# заменим значения для квартир с аномально большим количеством комнат на медианные
data_train.loc[data_train['Id'].isin([5927, 14003, 14865]), 'Rooms'] = 2
data_train.loc[data_train['Id'] == 8491, 'Rooms'] = 1

### Train test split

In [16]:
train, valid = train_test_split(data_train, test_size=0.25, random_state=RANDOM_STATE)

### Data prepare

In [17]:
# посчитаем различные статистики по train
# количество квартир в районе:
stats_distr_1 = train['DistrictId'].value_counts(normalize=True).reset_index().\
                                                                 rename(columns={'index':'DistrictId', 
                                                                                 'DistrictId':'flat_qty_distr'})

# средняя цена за квартиру по районам и кол-ву комнат
stats_distr_rooms_1 = train.groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']].mean().\
                                                                            rename(columns={'Price':'mean_price_distr_rooms'})

# средняя цена за квартиру по кол-ву комнат
stats_rooms_1 = train.groupby(['Rooms'], as_index=False)[['Price']].mean().rename(columns={'Price':'mean_price_rooms'})

# медианная площадь по району и кол-ву комнат
stats_distr_rooms_2 = train.groupby(['DistrictId', 'Rooms'], as_index=False)[['Square']].median().\
                                                                            rename(columns={'Square':'mean_square_distr_rooms'})

# медианная площадь по кол-ву комнат
stats_rooms_2 = train.groupby(['Rooms'], as_index=False)[['Square']].median().\
                                                                            rename(columns={'Square':'mean_square_rooms'})

# средняя цена за квадратный метр
stats_sq_price = train.groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']].mean().\
                                                                            rename(columns={'Price':'mean_price_distr_rooms'})

In [18]:
# Объявим необходимые функции
def prepare_category(df, cat_feats=['Ecology_2', 'Ecology_3', 'Shops_2']):
    ''' Приводит категориальные признаки к числовым
    '''
    df_copy = df.copy()
    for f in cat_feats:
        df_copy.loc[:, f] = df[f].map({'A': 1, 'B': 0})
    return df_copy


def add_stat_distr_1(df, stats=stats_distr_1):
    ''' Добавляет статистику "количество квартир в районе"
    '''
    df_copy = df.copy()
    df_copy = pd.merge(df_copy, stats, on='DistrictId', how='left')
    df_copy['flat_qty_distr'] = df_copy['flat_qty_distr'].fillna(stats['flat_qty_distr'].min())
    return df_copy


def add_stat_distr_rooms(df, stats_1=stats_distr_rooms_1, stats_2=stats_rooms_1):
    ''' Добавляет статистики "средняя цена за квартиру по районам и кол-ву комнат" и
        "средняя цена за квартиру по кол-ву комнат"
    '''
    df_copy = df.copy()
    df_copy = pd.merge(df_copy, stats_1, on=['DistrictId', 'Rooms'], how='left')
    df_copy = pd.merge(df_copy, stats_2, on='Rooms', how='left')
    
    df_copy['mean_price_rooms'] = df_copy['mean_price_rooms'].fillna(stats_2['mean_price_rooms'].mean())
    df_copy['mean_price_distr_rooms'] = df_copy['mean_price_distr_rooms'].fillna(df_copy['mean_price_rooms'])
    return df_copy


def add_stat_distr_rooms_2(df, stats_1=stats_distr_rooms_2, stats_2=stats_rooms_2):
    ''' Добавляет статистики "медианная площадь по району и кол-ву комнат" и
        "медианная площадь по кол-ву комнат"
    '''
    df_copy = df.copy()
    df_copy = pd.merge(df_copy, stats_1, on=['DistrictId', 'Rooms'], how='left')
    df_copy = pd.merge(df_copy, stats_2, on='Rooms', how='left')
    
    df_copy['mean_square_rooms'] = df_copy['mean_square_rooms'].fillna(stats_2['mean_square_rooms'].mean())
    df_copy['mean_square_distr_rooms'] = df_copy['mean_square_distr_rooms'].fillna(df_copy['mean_square_rooms'])
    return df_copy


def add_stat_square_price(df, stats):
    ''' Добавляет статистику "средняя цена за квадратный метр"
    '''
    df_copy = df.copy()
    df_copy = pd.merge(df_copy, stats, on='Rooms', how='left')
    df_copy['mean_square_price'] = df_copy['mean_square_price'].fillna(stats['mean_square_price'].mean())
    return df_copy


def fillna_Healthcare_1(df):
    ''' Заменяет nan в поле Healthcare_1 на 0
    '''
    df_copy = df.copy()
    df_copy['Healthcare_1'] = df_copy['Healthcare_1'].fillna(0)
    return df_copy


def fillna_LifeSquare(df, source_df):
    ''' Заполняет LifeSquare средним значением
    '''
    df_copy = df.copy()
    df_copy['LifeSquare'] = df_copy['LifeSquare'].fillna(source_df['LifeSquare'].mean())
    return df_copy


def add_square_price(df):
    ''' Добавить признак цена за квадратный метр (только для train)
    '''
    df_copy = df.copy()
    df_copy['square_price'] = df_copy['Price'] / df_copy['Square']
    return df_copy


def prepare_LifeSquare(df, source_df):
    ''' Заменяем все аномально большие значения на среднее
    '''
    df_copy = df.copy()
    df_copy.loc[df_copy['LifeSquare'] > 250, 'LifeSquare'] = source_df['LifeSquare'].mean()
    return df_copy


def prepare_KitchenSquare(df, source_df):
    ''' Заменяем все аномально большие значения на среднее
    '''
    df_copy = df.copy()
    df_copy.loc[df_copy['KitchenSquare'] > 250, 'KitchenSquare'] = source_df['KitchenSquare'].mean()
    return df_copy

In [19]:
train = add_square_price(train)

# средняя цена за квадратный метр
stats_square_price_mean = train.groupby(['Rooms'], as_index=False)[['square_price']]\
    .mean().rename(columns={'square_price':'mean_square_price'})

train = add_stat_distr_1(train)
valid = add_stat_distr_1(valid)

train = add_stat_distr_rooms(train)
valid = add_stat_distr_rooms(valid)

train = add_stat_distr_rooms_2(train)
valid = add_stat_distr_rooms_2(valid)

train = add_stat_square_price(train, stats_square_price_mean)
valid = add_stat_square_price(valid, stats_square_price_mean)

train = fillna_Healthcare_1(train)
valid = fillna_Healthcare_1(valid)

train = fillna_LifeSquare(train, train)
valid = fillna_LifeSquare(valid, train)

train = prepare_category(train)
valid = prepare_category(valid)

train = prepare_LifeSquare(train, train)
valid = prepare_LifeSquare(valid, train)

train = prepare_KitchenSquare(train, train)
valid = prepare_KitchenSquare(valid, train)

### Model

In [20]:
def fit_and_score(model, X_train, X_valid, y_train, y_valid):
    model.fit(X_train, y_train.values.ravel())
    y_train_pred = model.predict(X_train)
    y_valid_pred = model.predict(X_valid)
    r2_train = r2(y_train, y_train_pred)
    r2_valid = r2(y_valid, y_valid_pred)
    return r2_train, r2_valid

In [21]:
rf = RF(random_state=RANDOM_STATE,
        n_estimators=1000,
        max_depth=17,
        max_features=6,
        n_jobs=-1)

In [22]:
feats = [
    'DistrictId',
    'Rooms', 
    'Square', 
    'LifeSquare', 
    'KitchenSquare',
    'Floor',
    'HouseFloor',
    'HouseYear',
    'Ecology_1',
    'Ecology_2',
    'Ecology_3', 
    'Social_1', 
    'Social_2',
    'Social_3', 
    'Healthcare_1',
    'Helthcare_2',
    'Shops_1',
    'Shops_2', 
    'flat_qty_distr',
#     'mean_price_distr_rooms',
    'mean_price_rooms', 
#     'mean_square_distr_rooms',
#     'mean_square_rooms',
    'mean_square_price'
]

target = ['Price']

In [23]:
fit_and_score(rf, train[feats], valid[feats], train[target], valid[target])

(0.9536981479524923, 0.7349390532041377)

### Подбор параметров модели

In [144]:
parameters = [{'n_estimators': [150, 200, 250], 
               'max_features': np.arange(5, 9),
               'max_depth': np.arange(5, 15)}]

clf = GridSearchCV(estimator=RF(random_state=RANDOM_STATE, n_jobs=-1), 
                   param_grid=parameters,
                   scoring='r2',
                   cv=5)

clf.fit(train[feats], train[target].values.ravel())

In [149]:
clf.best_params_

{'max_depth': 14, 'max_features': 6, 'n_estimators': 250}

In [150]:
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

0.641 (+/-0.027) for {'max_depth': 5, 'max_features': 5, 'n_estimators': 150}
0.640 (+/-0.027) for {'max_depth': 5, 'max_features': 5, 'n_estimators': 200}
0.641 (+/-0.028) for {'max_depth': 5, 'max_features': 5, 'n_estimators': 250}
0.646 (+/-0.032) for {'max_depth': 5, 'max_features': 6, 'n_estimators': 150}
0.646 (+/-0.031) for {'max_depth': 5, 'max_features': 6, 'n_estimators': 200}
0.647 (+/-0.030) for {'max_depth': 5, 'max_features': 6, 'n_estimators': 250}
0.648 (+/-0.029) for {'max_depth': 5, 'max_features': 7, 'n_estimators': 150}
0.648 (+/-0.029) for {'max_depth': 5, 'max_features': 7, 'n_estimators': 200}
0.650 (+/-0.030) for {'max_depth': 5, 'max_features': 7, 'n_estimators': 250}
0.654 (+/-0.032) for {'max_depth': 5, 'max_features': 8, 'n_estimators': 150}
0.654 (+/-0.032) for {'max_depth': 5, 'max_features': 8, 'n_estimators': 200}
0.654 (+/-0.031) for {'max_depth': 5, 'max_features': 8, 'n_estimators': 250}
0.670 (+/-0.032) for {'max_depth': 6, 'max_features': 5, 'n_esti

## Подбор max_depth

In [156]:
parameters_2 = [{'n_estimators': [250, 500, 750, 1000], 
               'max_features': [6],
               'max_depth': np.arange(13, 19)}]

clf_2 = GridSearchCV(estimator=RF(random_state=RANDOM_STATE, n_jobs=-1), 
                   param_grid=parameters_2,
                   scoring='r2',
                   cv=5)

clf_2.fit(train[feats], train[target].values.ravel())

In [158]:
clf_2.best_params_

{'max_depth': 17, 'max_features': 6, 'n_estimators': 250}

In [159]:
means = clf_2.cv_results_['mean_test_score']
stds = clf_2.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, clf_2.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

0.740 (+/-0.038) for {'max_depth': 13, 'max_features': 6, 'n_estimators': 250}
0.739 (+/-0.038) for {'max_depth': 13, 'max_features': 6, 'n_estimators': 500}
0.739 (+/-0.037) for {'max_depth': 13, 'max_features': 6, 'n_estimators': 750}
0.740 (+/-0.037) for {'max_depth': 13, 'max_features': 6, 'n_estimators': 1000}
0.742 (+/-0.036) for {'max_depth': 14, 'max_features': 6, 'n_estimators': 250}
0.741 (+/-0.036) for {'max_depth': 14, 'max_features': 6, 'n_estimators': 500}
0.741 (+/-0.037) for {'max_depth': 14, 'max_features': 6, 'n_estimators': 750}
0.741 (+/-0.037) for {'max_depth': 14, 'max_features': 6, 'n_estimators': 1000}
0.740 (+/-0.038) for {'max_depth': 15, 'max_features': 6, 'n_estimators': 250}
0.740 (+/-0.036) for {'max_depth': 15, 'max_features': 6, 'n_estimators': 500}
0.741 (+/-0.036) for {'max_depth': 15, 'max_features': 6, 'n_estimators': 750}
0.741 (+/-0.036) for {'max_depth': 15, 'max_features': 6, 'n_estimators': 1000}
0.741 (+/-0.036) for {'max_depth': 16, 'max_featu

### Кросс-валидация на всем train

In [24]:
data_all = data_train.copy()

In [25]:
data_all = add_square_price(data_all)

# средняя цена за квадратный метр
data_all_stats_square_price_mean = data_all.groupby(['Rooms'], as_index=False)[['square_price']]\
    .mean().rename(columns={'square_price':'mean_square_price'})

data_all = add_stat_distr_1(data_all)
data_all = add_stat_distr_rooms(data_all)
data_all = add_stat_distr_rooms_2(data_all)
data_all = add_stat_square_price(data_all, data_all_stats_square_price_mean)
data_all = fillna_Healthcare_1(data_all)
data_all = fillna_LifeSquare(data_all, data_all)
data_all = prepare_category(data_all)
data_all = prepare_LifeSquare(data_all, data_all)
data_all = prepare_KitchenSquare(data_all, data_all)

In [27]:
from sklearn.model_selection import cross_val_score

clf_cv = RF(random_state=RANDOM_STATE,
            n_estimators=1000,
            max_depth=17,
            max_features=6,
            n_jobs=-1)

scores = cross_val_score(clf_cv, data_all[feats], data_all[target].values.ravel(), cv=5, scoring='r2')

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### Init data test prepare

In [32]:
data_test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A


In [33]:
# посмотрим на большие значения поля HouseFloor
data_test.loc[data_test['HouseFloor'] > 50]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
651,15864,27,3.0,47.722835,47.098813,9.0,18,99.0,1977,0.072158,B,B,2,629,1,,0,0,A


In [34]:
# заменим 99 на 9
data_test.loc[data_test['HouseFloor'] == 99, 'HouseFloor'] = 9

In [35]:
# посмотрим на большие значения поля Rooms
data_test.loc[data_test['Rooms'] > 6]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
3398,1435,111,17.0,52.866107,32.528342,8.0,15,17.0,1987,0.093443,B,B,23,4635,5,3300.0,2,4,B


In [36]:
# заменим на адекватное значение
data_test.loc[data_test['Rooms'] == 17, 'Rooms'] = 2

In [37]:
# посмотрим на большие значения поля Floor
data_test.loc[data_test['Floor'] > 40]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
3711,414,76,2.0,67.609808,41.175948,5.0,46,48.0,2002,0.0,B,B,7,1660,39,1786.0,1,5,B
4698,15759,17,2.0,57.60187,37.744743,10.0,78,22.0,1989,0.0,B,B,25,5027,4,46.0,1,1,B


In [38]:
# заменим на адекватное значение
data_test.loc[data_test['Floor'] == 78, 'Floor'] = 22

### Data test prepare

In [39]:
data_test = add_stat_distr_1(data_test)
data_test = add_stat_distr_rooms(data_test)
data_test = add_stat_distr_rooms_2(data_test)
data_test = add_stat_square_price(data_test, data_all_stats_square_price_mean)
data_test = fillna_Healthcare_1(data_test)
data_test = fillna_LifeSquare(data_test, data_all)
data_test = prepare_category(data_test)
data_test = prepare_LifeSquare(data_test, data_all)
data_test = prepare_KitchenSquare(data_test, data_all)

### Data test prediction

In [41]:
clf_cv.fit(data_all[feats], data_all[target].values.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=17,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=-1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [45]:
data_test['Price'] = clf_cv.predict(data_test[feats])

In [47]:
data_test.loc[:, ['Id', 'Price']].to_csv('SKabanov_predictions.csv', index=None)