In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import catboost
import xgboost as xgb
import glob

from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
STATUS = './status.csv'
DESCR = './comment.xlsx'
SAMPLE = './sample submission.csv'
TEST = './test.csv'
FLAT = './flat.csv'
PRICE = './price.csv'
TRAIN = './train.csv'

# Import data

In [3]:
train = pd.read_csv(TRAIN)
test = pd.read_csv(TEST)

# Main handler

In [4]:
def handler(data, test=False):
    
    ds = data.copy(deep=True)
    cols = ds.columns
    
    # Extract categorical data
    categorical_columns = list(set(train.columns) - set(train._get_numeric_data().columns))


    # One hot encoding categorical cols
    for col in categorical_columns:
        if ((ds[col][0] == "да") or (ds[col][0] == "нет")):
            ds[col] = ds[col].map(lambda x: 1 if (x == "да") else 0).copy(deep=True)
            
    # Normalizing
    ds["Количество помещений"] = (ds["Количество помещений"] / 5579).copy(deep=True) # max
    ds["price"] = (ds["price"] / 300000).copy(deep=True) # max
    ds["Детский сад"] = (ds["Детский сад"] / 1000).copy(deep=True) # max
    ds["Машиномест"] = (ds["Машиномест"] / 10000).copy(deep=True) # max
    ds["Площадь пром. зоны в радиусе 500 м"] = (ds["Площадь пром. зоны в радиусе 500 м"] / 378372).copy(deep=True)
    ds["Площадь зеленой зоны в радиусе 500 м"] = (ds["Площадь зеленой зоны в радиусе 500 м"] / 516706).copy(deep=True)
    ds["Площадь двора"] = (ds["Площадь двора"] / 16473).copy(deep=True) # max
    ds["Школа"] = (ds["Школа"] / 3250).copy(deep=True) # max
    ds["Поликлиника"] = (ds["Поликлиника"] / 600).copy(deep=True) # max
    
    # Create year columns
    #ds["year"] = ds["date1"][0:4].copy(deep=True)
    
    # Object class one hot encoding
    ohoc = pd.get_dummies(ds["Класс объекта"]) # One hot object class
    for col in ohoc:
        ds[col] = ohoc[col].copy(deep=True)
    
    # Delete useless columns 
    del ds["bulk_id"]
    if (test == False):
        del ds["plan_s"]
        del ds["plan_m"]
        del ds["plan_l"]
        del ds["vid_0"]
        del ds["vid_1"]
        del ds["vid_2"]
        del ds["start_square"]
    del ds["date1"]
    del ds["id"]
    del ds["Лифт"]
    del ds["Спортивная площадка"]
    del ds["Входные группы"]
    del ds["Система мусоротведения"]
    del ds["Класс объекта"]
    
    return ds

# CV splitter

In [5]:
def get_train_test(train, sz=0.2):
    X = train.copy(deep=True)
    y = X["value"]
    del X["value"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=sz, random_state=42)

    return X_train, X_test, y_train, y_test

In [6]:
newds = handler(train) # test, test=True
newds_tst = handler(test, test=True) # test, test=True

In [7]:
X_train, X_test, y_train, y_test = get_train_test(newds, sz=0.2)

In [8]:
X_train.head()

Unnamed: 0,spalen,price,mean_sq,mean_fl,month,month_cnt,Количество помещений,Огорожена территория,Площадь земельного участка,Детский сад,...,Станций метро от кольца,Площадь двора,Курс,Cтавка по ипотеке,Вклады до 1 года,Вклады от 1 года до 3 лет,Вклады свыше 3 лет,комфорт,стандарт,эконом
1153,2,0.698667,66.44,12,1,37,0.196092,0,6.8,0.15,...,2.0,1.0,56.4981,9.74,5.62,6.64,6.05,1,0,0
6840,0,0.336967,24.86,10,10,34,0.591683,0,22.0,1.0,...,12.0,0.800947,57.698052,10.04,5.74,5.94,6.03,0,0,1
7899,0,0.442852,23.06,5,9,33,0.35311,0,23.25,0.45,...,15.0,0.34851,57.744691,10.51,5.3,6.65,6.8,0,0,1
1242,0,0.488683,20.1,16,10,34,0.387166,0,12.8,0.385,...,12.0,0.394585,57.698052,10.04,5.74,5.94,6.03,1,0,0
8174,3,0.459471,90.01,14,6,30,0.077433,0,4.68,0.125,...,10.0,0.242822,57.893176,11.26,5.73,6.42,7.97,1,0,0


# CatBoost

In [9]:
cbr = catboost.CatBoostRegressor(iterations=1000, depth=6, learning_rate=0.5, l2_leaf_reg=3.5)
cbr.fit(X_train, y_train)

0:	learn: 373.7371178	total: 102ms	remaining: 1m 41s
1:	learn: 343.5533143	total: 139ms	remaining: 1m 9s
2:	learn: 327.4205410	total: 166ms	remaining: 55s
3:	learn: 316.6834396	total: 180ms	remaining: 44.9s
4:	learn: 308.2751126	total: 190ms	remaining: 37.9s
5:	learn: 301.8666780	total: 199ms	remaining: 32.9s
6:	learn: 298.2402005	total: 208ms	remaining: 29.5s
7:	learn: 294.1243711	total: 217ms	remaining: 26.9s
8:	learn: 291.9790723	total: 226ms	remaining: 24.9s
9:	learn: 287.5379767	total: 236ms	remaining: 23.3s
10:	learn: 285.9728536	total: 245ms	remaining: 22.1s
11:	learn: 285.1530637	total: 255ms	remaining: 21s
12:	learn: 283.5131325	total: 264ms	remaining: 20.1s
13:	learn: 281.7298078	total: 274ms	remaining: 19.3s
14:	learn: 278.6677538	total: 282ms	remaining: 18.5s
15:	learn: 276.7120898	total: 291ms	remaining: 17.9s
16:	learn: 273.9585759	total: 302ms	remaining: 17.4s
17:	learn: 272.7446599	total: 315ms	remaining: 17.2s
18:	learn: 271.3280874	total: 328ms	remaining: 16.9s
19:	le

160:	learn: 189.7508436	total: 1.74s	remaining: 9.09s
161:	learn: 189.3447809	total: 1.76s	remaining: 9.09s
162:	learn: 189.3447426	total: 1.76s	remaining: 9.04s
163:	learn: 189.2096924	total: 1.77s	remaining: 9.03s
164:	learn: 188.8518964	total: 1.78s	remaining: 9.02s
165:	learn: 188.7005689	total: 1.79s	remaining: 9s
166:	learn: 188.5197030	total: 1.8s	remaining: 8.98s
167:	learn: 188.3294590	total: 1.81s	remaining: 8.96s
168:	learn: 188.3236230	total: 1.82s	remaining: 8.94s
169:	learn: 188.2777659	total: 1.83s	remaining: 8.92s
170:	learn: 188.2216915	total: 1.84s	remaining: 8.9s
171:	learn: 187.8562905	total: 1.85s	remaining: 8.89s
172:	learn: 187.8562194	total: 1.85s	remaining: 8.84s
173:	learn: 187.5514675	total: 1.86s	remaining: 8.82s
174:	learn: 187.1041338	total: 1.87s	remaining: 8.81s
175:	learn: 186.9913291	total: 1.88s	remaining: 8.79s
176:	learn: 186.5809118	total: 1.89s	remaining: 8.77s
177:	learn: 186.5265604	total: 1.9s	remaining: 8.76s
178:	learn: 186.2616463	total: 1.9

326:	learn: 163.8519875	total: 3.59s	remaining: 7.39s
327:	learn: 163.7932217	total: 3.6s	remaining: 7.38s
328:	learn: 163.6830602	total: 3.62s	remaining: 7.37s
329:	learn: 163.6747530	total: 3.62s	remaining: 7.36s
330:	learn: 163.6492940	total: 3.64s	remaining: 7.35s
331:	learn: 163.3526259	total: 3.65s	remaining: 7.33s
332:	learn: 163.3512497	total: 3.65s	remaining: 7.32s
333:	learn: 163.3512306	total: 3.66s	remaining: 7.29s
334:	learn: 163.3161407	total: 3.67s	remaining: 7.28s
335:	learn: 163.2773290	total: 3.68s	remaining: 7.26s
336:	learn: 163.2321729	total: 3.69s	remaining: 7.25s
337:	learn: 163.2251654	total: 3.69s	remaining: 7.24s
338:	learn: 163.1447014	total: 3.7s	remaining: 7.22s
339:	learn: 162.9972979	total: 3.71s	remaining: 7.21s
340:	learn: 162.8728161	total: 3.72s	remaining: 7.19s
341:	learn: 162.7119316	total: 3.73s	remaining: 7.18s
342:	learn: 162.5018846	total: 3.74s	remaining: 7.16s
343:	learn: 162.3237496	total: 3.75s	remaining: 7.15s
344:	learn: 162.2303357	total:

493:	learn: 149.6541221	total: 5.25s	remaining: 5.38s
494:	learn: 149.5605013	total: 5.26s	remaining: 5.37s
495:	learn: 149.5519233	total: 5.27s	remaining: 5.36s
496:	learn: 149.4452117	total: 5.28s	remaining: 5.34s
497:	learn: 149.2119007	total: 5.29s	remaining: 5.33s
498:	learn: 149.1680606	total: 5.3s	remaining: 5.32s
499:	learn: 148.8757095	total: 5.31s	remaining: 5.31s
500:	learn: 148.7608702	total: 5.32s	remaining: 5.3s
501:	learn: 148.7483218	total: 5.33s	remaining: 5.28s
502:	learn: 148.7194973	total: 5.33s	remaining: 5.27s
503:	learn: 148.6629701	total: 5.34s	remaining: 5.26s
504:	learn: 148.6588817	total: 5.35s	remaining: 5.25s
505:	learn: 148.6076783	total: 5.36s	remaining: 5.24s
506:	learn: 148.5949427	total: 5.37s	remaining: 5.22s
507:	learn: 148.4364304	total: 5.38s	remaining: 5.21s
508:	learn: 148.2816720	total: 5.39s	remaining: 5.2s
509:	learn: 148.2394152	total: 5.4s	remaining: 5.19s
510:	learn: 148.2251179	total: 5.41s	remaining: 5.18s
511:	learn: 148.1578191	total: 5

658:	learn: 140.1495266	total: 6.87s	remaining: 3.56s
659:	learn: 140.1124784	total: 6.88s	remaining: 3.54s
660:	learn: 140.0990259	total: 6.89s	remaining: 3.54s
661:	learn: 140.0739256	total: 6.9s	remaining: 3.52s
662:	learn: 140.0187342	total: 6.91s	remaining: 3.51s
663:	learn: 139.9511130	total: 6.92s	remaining: 3.5s
664:	learn: 139.9483679	total: 6.93s	remaining: 3.49s
665:	learn: 139.8786984	total: 6.94s	remaining: 3.48s
666:	learn: 139.8259370	total: 6.95s	remaining: 3.47s
667:	learn: 139.8254879	total: 6.96s	remaining: 3.46s
668:	learn: 139.7737127	total: 6.97s	remaining: 3.45s
669:	learn: 139.7713826	total: 6.97s	remaining: 3.44s
670:	learn: 139.7509297	total: 6.98s	remaining: 3.42s
671:	learn: 139.6996580	total: 6.99s	remaining: 3.41s
672:	learn: 139.6175140	total: 7s	remaining: 3.4s
673:	learn: 139.4924465	total: 7.01s	remaining: 3.39s
674:	learn: 139.3720919	total: 7.02s	remaining: 3.38s
675:	learn: 139.3129822	total: 7.03s	remaining: 3.37s
676:	learn: 139.2291910	total: 7.0

821:	learn: 132.9474826	total: 8.49s	remaining: 1.84s
822:	learn: 132.7596822	total: 8.51s	remaining: 1.83s
823:	learn: 132.7375729	total: 8.52s	remaining: 1.82s
824:	learn: 132.6640345	total: 8.54s	remaining: 1.81s
825:	learn: 132.6078164	total: 8.54s	remaining: 1.8s
826:	learn: 132.6024331	total: 8.56s	remaining: 1.79s
827:	learn: 132.5951209	total: 8.57s	remaining: 1.78s
828:	learn: 132.5935211	total: 8.58s	remaining: 1.77s
829:	learn: 132.5913896	total: 8.59s	remaining: 1.76s
830:	learn: 132.5703798	total: 8.6s	remaining: 1.75s
831:	learn: 132.4745262	total: 8.61s	remaining: 1.74s
832:	learn: 132.2475061	total: 8.62s	remaining: 1.73s
833:	learn: 131.9832712	total: 8.63s	remaining: 1.72s
834:	learn: 131.8791481	total: 8.64s	remaining: 1.71s
835:	learn: 131.8421455	total: 8.65s	remaining: 1.7s
836:	learn: 131.8387037	total: 8.66s	remaining: 1.69s
837:	learn: 131.8341866	total: 8.67s	remaining: 1.68s
838:	learn: 131.7710805	total: 8.68s	remaining: 1.67s
839:	learn: 131.7655035	total: 

988:	learn: 124.3251854	total: 10.3s	remaining: 115ms
989:	learn: 124.2966775	total: 10.4s	remaining: 105ms
990:	learn: 124.2954959	total: 10.4s	remaining: 94.2ms
991:	learn: 124.2636020	total: 10.4s	remaining: 83.7ms
992:	learn: 124.1564972	total: 10.4s	remaining: 73.3ms
993:	learn: 124.0645437	total: 10.4s	remaining: 62.8ms
994:	learn: 124.0487200	total: 10.4s	remaining: 52.3ms
995:	learn: 124.0396380	total: 10.4s	remaining: 41.9ms
996:	learn: 123.9783873	total: 10.4s	remaining: 31.4ms
997:	learn: 123.9277316	total: 10.4s	remaining: 20.9ms
998:	learn: 123.9133622	total: 10.5s	remaining: 10.5ms
999:	learn: 123.8780296	total: 10.5s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7fbf3d02beb8>

In [10]:
(sum((cbr.predict(X_test) - y_test) ** 2) / len(X_test)) ** (1/2)

243.3033303211698

In [15]:
X_train.columns

Index(['spalen', 'price', 'mean_sq', 'mean_fl', 'month', 'month_cnt',
       'Количество помещений', 'Огорожена территория',
       'Площадь земельного участка', 'Детский сад', 'Школа', 'Поликлиника',
       'ФОК', 'Автомойка', 'Кладовые', 'Колясочные', 'Кондиционирование',
       'Вентлияция', 'Видеонаблюдение', 'Подземная парковка', 'Двор без машин',
       'Машиномест', 'Площадь пром. зоны в радиусе 500 м',
       'Площадь зеленой зоны в радиусе 500 м', 'До Кремля', 'До ТТК(км)',
       'До Садового(км)', 'До большой дороги на машине(км)',
       'До удобной авторазвязки на машине(км)', 'До метро пешком(км)',
       'До промки(км)', 'До парка(км)', 'До парка пешком(км)',
       'Станций метро от кольца', 'Площадь двора', 'Курс', 'Cтавка по ипотеке',
       'Вклады до 1 года', 'Вклады от 1 года до 3 лет', 'Вклады свыше 3 лет',
       'комфорт', 'стандарт', 'эконом'],
      dtype='object')

In [215]:
pd.DataFrame({"id":range(len(newds_tst)), "value":cbr.predict(newds_tst)}).to_csv("./cb-noparams10.csv",
                                                                                  encoding="utf-8",
                                                                                  index=False)