In [1]:
import os
# перейдем в родительскую директорию чтобы использовать пакет raiflib без установки
notebook_path = os.getcwd()
os.chdir(os.path.dirname(notebook_path))

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from IPython.display import display
from catboost import CatBoostRegressor
from raiflib.metrics import deviation_metric, deviation_metric_one_sample

from tqdm import tqdm
from tqdm.auto import tqdm


tqdm.pandas()
pd.options.mode.chained_assignment = None

In [3]:
df = pd.read_csv('../data/2.ipynb_EDA_and_Feature_engineering.csv', index_col=[0])

In [4]:
# Заметим, что целевая переменная так же имеет распределение, близкое к лог-нормальному
# TODO: график
df['per_square_meter_price'] = df.apply(lambda t: np.log(t['per_square_meter_price']), axis=1)

# Валидация

In [5]:
def cross_validate(df, iterations, cat_features, verbose, unused_features, n_splits=5, isLog=True):
    data = df.copy()
    data.drop(unused_features, axis=1, inplace=True)
    diff = set(unused_features)
    cat_features.difference_update(diff)
    
    catboost_regressor = CatBoostRegressor(
        iterations=iterations,
        cat_features=[data.columns.get_loc(c) for c in cat_features], 
        random_seed=17, 
        task_type=task_type
    )
    
    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)
    X = data.drop('per_square_meter_price', axis=1)
    y = data.per_square_meter_price

    scores = []
    catboost_features = []
    metric_classes = []
    iteration = 0
    for train_index, test_index in kf.split(X):
        iteration += 1
        print(f"Iteraition: {iteration}")

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index].tolist(), y.iloc[test_index].tolist()

        catboost_regressor.fit(X_train, y_train, verbose=verbose, plot=False)
        y_pred = catboost_regressor.predict(X_test)
        
        if isLog:
            y_pred = np.exp(y_pred)
            y_test = np.exp(y_test)

        score = deviation_metric(y_test, y_pred, isLog=False)
        scores.append(score)
        print(f"Score: {score}\n") 
        for n in range(len(y_test)):
            metric_classes.append(deviation_metric_one_sample(y_test[n], y_pred[n], metric_value=False))
        catboost_features.append(catboost_regressor.feature_importances_)
  
    print(f'Validation score: {sum(scores)/len(scores)}\n')
    features_df = pd.DataFrame(catboost_features, columns=X.columns)
    print(f'Feature importances:')
    display(features_df.mean().sort_values(ascending=False).head(15))
    return metric_classes

In [6]:
iterations=1000
task_type="CPU"

UNUSED_FEATURES = ['id', 'date', 
'reform_house_population_1000', 
'osm_city_nearest_population', 
'osm_city_nearest_name',
'price_type']
CAT_FEATURES = {'floor', 
                'city',
                'reform_mean_floor_count_1000', 
                'region', 
                'realty_type'}

train_data = df[df.price_type == 1]
train_data['reform_mean_year_building'] = train_data.reform_mean_year_building.astype('int64')
metric_classes = cross_validate(
    train_data, 
    iterations=1000, 
    cat_features=CAT_FEATURES, 
    verbose=False, 
    n_splits=5, 
    unused_features=UNUSED_FEATURES
)
train_data['metric_class'] = metric_classes

Iteraition: 1


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Score: 1.553253965461026

Iteraition: 2


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Score: 1.2585407599066871

Iteraition: 3


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Score: 1.2738135038545464

Iteraition: 4


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Score: 1.3843920709323696

Iteraition: 5


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Score: 1.1631022505785804

Validation score: 1.326620510146642

Feature importances:


total_square                       12.919604
region                             11.145477
osm_catering_points_in_0.01         7.721303
osm_subway_closest_dist             7.128693
osm_hotels_points_in_0.01           5.087378
lng                                 4.518388
distance_to_moscow                  4.126468
distance_to_region_center           3.291173
osm_crossing_closest_dist           3.217331
osm_transport_stop_closest_dist     3.199502
floor                               3.188199
osm_amenity_points_in_0.01          3.154590
osm_crossing_points_in_0.01         2.716395
realty_type                         2.707221
reform_mean_year_building           2.603755
dtype: float64

In [7]:
regions_class2 = train_data[train_data.metric_class == 2].groupby('region', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class2'})
regions_class1 = train_data[train_data.metric_class == 1].groupby('region', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class1'})
regions_class0 = train_data[train_data.metric_class == 0].groupby('region', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class0'})
regions_class_minus1 = train_data[train_data.metric_class == -1].groupby('region', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class_1'})
regions_class_minus2 = train_data[train_data.metric_class == -2].groupby('region', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class_2'})
regions_classes = train_data.groupby('region', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'total'})
classes = [regions_class_minus2, regions_class_minus1, regions_class0, regions_class1, regions_class2]
regions_statistic = regions_classes.copy()
for i, v in enumerate(classes):
    regions_statistic = pd.merge(regions_statistic, v, on='region', how='outer')
regions_statistic = regions_statistic.fillna(0)[['region','total', 'class_2', 'class_1', 'class0', 'class1', 'class2']]
for j in [i for i in regions_statistic.columns if 'class' in i]:
    regions_statistic[j] = (regions_statistic[j] / regions_statistic['total']).round(3)
regions_statistic

Unnamed: 0,region,total,class_2,class_1,class0,class1,class2
0,Алтай,1,0.0,0.0,0.0,1.0,0.0
1,Алтайский край,55,0.018,0.345,0.4,0.218,0.018
2,Башкортостан,68,0.0,0.265,0.5,0.221,0.015
3,Белгородская область,118,0.0,0.28,0.331,0.322,0.068
4,Брянская область,122,0.016,0.254,0.426,0.23,0.074
5,Волгоградская область,9,0.0,0.0,0.667,0.333,0.0
6,Вологодская область,15,0.067,0.067,0.467,0.333,0.067
7,Воронежская область,22,0.0,0.136,0.455,0.318,0.091
8,Ивановская область,56,0.018,0.25,0.482,0.214,0.036
9,Иркутская область,454,0.002,0.289,0.456,0.209,0.044


In [8]:
train_data['metric_class'].value_counts().sort_index()

-2      28
-1    1211
 0    1909
 1    1093
 2     252
Name: metric_class, dtype: int64

In [9]:
regions_class2 = train_data[train_data.metric_class == 2].groupby('realty_type', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class2'})
regions_class1 = train_data[train_data.metric_class == 1].groupby('realty_type', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class1'})
regions_class0 = train_data[train_data.metric_class == 0].groupby('realty_type', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class0'})
regions_class_minus1 = train_data[train_data.metric_class == -1].groupby('realty_type', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class_1'})
regions_class_minus2 = train_data[train_data.metric_class == -2].groupby('realty_type', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class_2'})
regions_classes = train_data.groupby('realty_type', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'total'})
classes = [regions_class_minus2, regions_class_minus1, regions_class0, regions_class1, regions_class2]
regions_statistic = regions_classes.copy()
for i, v in enumerate(classes):
    regions_statistic = pd.merge(regions_statistic, v, on='realty_type', how='outer')
regions_statistic = regions_statistic.fillna(0)[['realty_type','total', 'class_2', 'class_1', 'class0', 'class1', 'class2']]
for j in [i for i in regions_statistic.columns if 'class' in i]:
    regions_statistic[j] = (regions_statistic[j] / regions_statistic['total']).round(3)
regions_statistic


Unnamed: 0,realty_type,total,class_2,class_1,class0,class1,class2
0,10,2210,0.006,0.266,0.425,0.25,0.052
1,100,1330,0.008,0.276,0.43,0.229,0.057
2,110,953,0.004,0.27,0.417,0.247,0.063


In [10]:
rtype = 100
regions_class2 = train_data[(train_data.metric_class == 2) & (train_data.realty_type == rtype)].groupby('region', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class2'})
regions_class1 = train_data[(train_data.metric_class == 1) & (train_data.realty_type == rtype)].groupby('region', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class1'})
regions_class0 = train_data[(train_data.metric_class == 0) & (train_data.realty_type == rtype)].groupby('region', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class0'})
regions_class_minus1 = train_data[(train_data.metric_class == -1) & (train_data.realty_type == rtype)].groupby('region', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class_1'})
regions_class_minus2 = train_data[(train_data.metric_class == -2) & (train_data.realty_type == rtype)].groupby('region', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'class_2'})
regions_classes = train_data[(train_data.realty_type == rtype)].groupby('region', as_index=False).agg({'total_square': 'count'}).rename(columns={'total_square': 'total'})
classes = [regions_class_minus2, regions_class_minus1, regions_class0, regions_class1, regions_class2]
regions_statistic = regions_classes.copy()
for i, v in enumerate(classes):
    regions_statistic = pd.merge(regions_statistic, v, on='region', how='outer')
regions_statistic = regions_statistic.fillna(0)[['region','total', 'class_2', 'class_1', 'class0', 'class1', 'class2']]
for j in [i for i in regions_statistic.columns if 'class' in i]:
    regions_statistic[j] = (regions_statistic[j] / regions_statistic['total']).round(3)
regions_statistic

Unnamed: 0,region,total,class_2,class_1,class0,class1,class2
0,Алтайский край,14,0.0,0.5,0.357,0.071,0.071
1,Башкортостан,27,0.0,0.296,0.481,0.185,0.037
2,Белгородская область,28,0.0,0.393,0.357,0.214,0.036
3,Брянская область,26,0.0,0.115,0.423,0.308,0.154
4,Волгоградская область,2,0.0,0.0,1.0,0.0,0.0
5,Вологодская область,4,0.0,0.0,0.75,0.25,0.0
6,Воронежская область,10,0.0,0.1,0.4,0.3,0.2
7,Ивановская область,22,0.045,0.182,0.455,0.227,0.091
8,Иркутская область,112,0.009,0.33,0.464,0.188,0.009
9,Калининградская область,18,0.0,0.444,0.389,0.111,0.056


# Обучение модели

In [11]:
iterations=1000
task_type="CPU"
verbose=1
data = df[df.price_type == 1]
data.drop(UNUSED_FEATURES, axis=1, inplace=True)
diff = set(UNUSED_FEATURES)
CAT_FEATURES.difference_update(diff)
catboost_regressor = CatBoostRegressor(iterations=iterations,cat_features=[data.columns.get_loc(c) for c in CAT_FEATURES],random_seed=17, task_type=task_type)
X_train =  data.drop('per_square_meter_price', axis=1)
y_train =  data.per_square_meter_price
catboost_regressor.fit(X_train, y_train, verbose=verbose, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.051914
0:	learn: 0.5720410	total: 21.7ms	remaining: 21.7s
1:	learn: 0.5586499	total: 36.4ms	remaining: 18.2s
2:	learn: 0.5453404	total: 52.8ms	remaining: 17.6s
3:	learn: 0.5337603	total: 71.8ms	remaining: 17.9s
4:	learn: 0.5224905	total: 84.6ms	remaining: 16.8s
5:	learn: 0.5124321	total: 95.8ms	remaining: 15.9s
6:	learn: 0.5022159	total: 107ms	remaining: 15.2s
7:	learn: 0.4925717	total: 121ms	remaining: 15s
8:	learn: 0.4840784	total: 133ms	remaining: 14.6s
9:	learn: 0.4754042	total: 145ms	remaining: 14.3s
10:	learn: 0.4675920	total: 154ms	remaining: 13.9s
11:	learn: 0.4607481	total: 165ms	remaining: 13.6s
12:	learn: 0.4546320	total: 176ms	remaining: 13.4s
13:	learn: 0.4483962	total: 187ms	remaining: 13.2s
14:	learn: 0.4426264	total: 199ms	remaining: 13.1s
15:	learn: 0.4371619	total: 210ms	remaining: 12.9s
16:	learn: 0.4319755	total: 219ms	remaining: 12.7s
17:	learn: 0.4270239	total: 230ms	remaining: 12.6s
18:	learn: 0.4225839	total: 241ms	remaining: 12.5s
19:	lea

<catboost.core.CatBoostRegressor at 0x12dc7ef50>

# Predicting

In [12]:
tdf = pd.read_csv('../data/2.ipynb_EDA_and_Feature_engineering_test.csv', index_col=[0])
id = tdf.id
tdf.drop(UNUSED_FEATURES, axis = 1, inplace=True)
test_predict = catboost_regressor.predict(tdf)
test_predict = np.exp(test_predict)
and_df = pd.DataFrame({'per_square_meter_price': test_predict})
and_df.index = id
predict_before_saving = and_df.iloc[0][0]


In [13]:
and_df.to_csv('../data/3.Validation_submission.csv')
catboost_regressor.save_model('../models/catboost.cbm')

In [14]:
price_model = CatBoostRegressor()
price_model.load_model('../models/catboost.cbm', format='cbm')
tdf = pd.read_csv('../data/2.ipynb_EDA_and_Feature_engineering_test.csv', index_col=[0])
tdf.drop(UNUSED_FEATURES, axis = 1, inplace=True)
test_predict = price_model.predict(tdf.iloc[0])
test_predict = np.exp(test_predict)

EPS = 1e-8
assert abs(predict_before_saving - test_predict) <= EPS
'Модель сохранена.'

'Модель сохранена.'

In [15]:
# Заметим, что на валидации есть данные разбиваются на 3 класса, которые было бы хорошо уметь определять.
# Таким образом, к примеру, можно оповещать оценщика, что предсказание для данного набора признаков 
# может быть ненадежным и помещение следует оценить очно.