In [147]:
import pandas as pd
import numpy as np
import sklearn as sk
import catboost
import xgboost as xgb
import glob

from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
%matplotlib inline

In [11]:
glob.glob("*")

['main.ipynb',
 'status.csv',
 'comment.xlsx',
 'sample submission.csv',
 'test.csv',
 'flat.csv',
 'price.csv',
 'train.csv']

In [12]:
STATUS = './status.csv'
DESCR = './comment.xlsx'
SAMPLE = './sample submission.csv'
TEST = './test.csv'
FLAT = './flat.csv'
PRICE = './price.csv'
TRAIN = './train.csv'

# Import data

In [55]:
train = pd.read_csv(TRAIN)
test = pd.read_csv(TEST)

# Main handler

In [166]:
def handler(data, test=False):
    
    ds = data.copy(deep=True)
    cols = ds.columns
    
    # Extract categorical data
    categorical_columns = list(set(train.columns) - set(train._get_numeric_data().columns))


    # One hot encoding categorical cols
    for col in categorical_columns:
        if ((ds[col][0] == "да") or (ds[col][0] == "нет")):
            ds[col] = ds[col].map(lambda x: 1 if (x == "да") else 0).copy(deep=True)
            
    # Normalizing
    ds["Количество помещений"] = (ds["Количество помещений"] / 5579).copy(deep=True) # max
    ds["price"] = (ds["price"] / 300000).copy(deep=True) # max
    ds["Детский сад"] = (ds["Детский сад"] / 1000).copy(deep=True) # max
    ds["Машиномест"] = (ds["Машиномест"] / 10000).copy(deep=True) # max
    ds["Площадь пром. зоны в радиусе 500 м"] = (ds["Площадь пром. зоны в радиусе 500 м"] / 378372).copy(deep=True)
    ds["Площадь зеленой зоны в радиусе 500 м"] = (ds["Площадь зеленой зоны в радиусе 500 м"] / 516706).copy(deep=True)
    ds["Площадь двора"] = (ds["Площадь двора"] / 16473).copy(deep=True) # max
    ds["Школа"] = (ds["Школа"] / 3250).copy(deep=True) # max
    ds["Поликлиника"] = (ds["Поликлиника"] / 600).copy(deep=True) # max
    
    # Create year columns
    #ds["year"] = ds["date1"][0:4].copy(deep=True)
    
    # Object class one hot encoding
    ohoc = pd.get_dummies(ds["Класс объекта"]) # One hot object class
    for col in ohoc:
        ds[col] = ohoc[col].copy(deep=True)
    
    # Delete useless columns 
    del ds["bulk_id"]
    if (test == False):
        del ds["plan_s"]
        del ds["plan_m"]
        del ds["plan_l"]
        del ds["vid_0"]
        del ds["vid_1"]
        del ds["vid_2"]
        del ds["start_square"]
    del ds["date1"]
    del ds["id"]
    del ds["Лифт"]
    del ds["Спортивная площадка"]
    del ds["Входные группы"]
    del ds["Система мусоротведения"]
    del ds["Класс объекта"]
    
    return ds

# CV splitter

In [181]:
def get_train_test(train, sz=0.2):
    X = train.copy(deep=True)
    y = X["value"]
    del X["value"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=sz, random_state=42)

    return X_train, X_test, y_train, y_test

In [182]:
newds = handler(train) # test, test=True
newds_tst = handler(test, test=True) # test, test=True

In [219]:
X_train, X_test, y_train, y_test = get_train_test(newds, sz=0.2)

In [220]:
X_train.head()

Unnamed: 0,spalen,price,mean_sq,mean_fl,month,month_cnt,Количество помещений,Огорожена территория,Площадь земельного участка,Детский сад,...,Станций метро от кольца,Площадь двора,Курс,Cтавка по ипотеке,Вклады до 1 года,Вклады от 1 года до 3 лет,Вклады свыше 3 лет,комфорт,стандарт,эконом
1153,2,0.698667,66.44,12,1,37,0.196092,0,6.8,0.15,...,2.0,1.0,56.4981,9.74,5.62,6.64,6.05,1,0,0
6840,0,0.336967,24.86,10,10,34,0.591683,0,22.0,1.0,...,12.0,0.800947,57.698052,10.04,5.74,5.94,6.03,0,0,1
7899,0,0.442852,23.06,5,9,33,0.35311,0,23.25,0.45,...,15.0,0.34851,57.744691,10.51,5.3,6.65,6.8,0,0,1
1242,0,0.488683,20.1,16,10,34,0.387166,0,12.8,0.385,...,12.0,0.394585,57.698052,10.04,5.74,5.94,6.03,1,0,0
8174,3,0.459471,90.01,14,6,30,0.077433,0,4.68,0.125,...,10.0,0.242822,57.893176,11.26,5.73,6.42,7.97,1,0,0


# CatBoost

In [221]:
cbr = catboost.CatBoostRegressor(iterations=2000)
cbr.fit(X_train, y_train)

0:	learn: 450.3466676	total: 38.2ms	remaining: 1m 16s
1:	learn: 444.7828973	total: 61.4ms	remaining: 1m 1s
2:	learn: 439.8140827	total: 73ms	remaining: 48.6s
3:	learn: 434.8576612	total: 83.4ms	remaining: 41.6s
4:	learn: 429.9463435	total: 92.1ms	remaining: 36.8s
5:	learn: 425.5759250	total: 102ms	remaining: 34s
6:	learn: 421.2971971	total: 112ms	remaining: 31.9s
7:	learn: 417.1778706	total: 117ms	remaining: 29.2s
8:	learn: 413.0924185	total: 127ms	remaining: 28.2s
9:	learn: 409.2532674	total: 137ms	remaining: 27.3s
10:	learn: 405.0610170	total: 147ms	remaining: 26.6s
11:	learn: 401.1597572	total: 156ms	remaining: 25.8s
12:	learn: 397.7100017	total: 165ms	remaining: 25.3s
13:	learn: 394.3706514	total: 175ms	remaining: 24.8s
14:	learn: 391.4596609	total: 185ms	remaining: 24.4s
15:	learn: 388.3033620	total: 194ms	remaining: 24.1s
16:	learn: 384.9473954	total: 204ms	remaining: 23.8s
17:	learn: 381.6163237	total: 213ms	remaining: 23.4s
18:	learn: 378.8879741	total: 222ms	remaining: 23.2s
1

166:	learn: 284.1509138	total: 1.91s	remaining: 20.9s
167:	learn: 283.8855990	total: 1.92s	remaining: 21s
168:	learn: 283.6232519	total: 1.94s	remaining: 21s
169:	learn: 283.3420663	total: 1.95s	remaining: 21s
170:	learn: 283.2009019	total: 1.96s	remaining: 20.9s
171:	learn: 283.0122358	total: 1.97s	remaining: 20.9s
172:	learn: 282.8373159	total: 1.98s	remaining: 20.9s
173:	learn: 282.6704305	total: 1.99s	remaining: 20.8s
174:	learn: 282.4815162	total: 1.99s	remaining: 20.8s
175:	learn: 282.2507791	total: 2s	remaining: 20.8s
176:	learn: 281.9786982	total: 2.01s	remaining: 20.7s
177:	learn: 281.7501829	total: 2.02s	remaining: 20.7s
178:	learn: 281.5541191	total: 2.03s	remaining: 20.7s
179:	learn: 281.4490481	total: 2.04s	remaining: 20.6s
180:	learn: 281.2641788	total: 2.05s	remaining: 20.6s
181:	learn: 281.0179471	total: 2.06s	remaining: 20.6s
182:	learn: 280.8123765	total: 2.07s	remaining: 20.5s
183:	learn: 280.6346032	total: 2.08s	remaining: 20.5s
184:	learn: 280.4983033	total: 2.09s	

325:	learn: 259.6324388	total: 4s	remaining: 20.5s
326:	learn: 259.5531284	total: 4.01s	remaining: 20.5s
327:	learn: 259.2830631	total: 4.04s	remaining: 20.6s
328:	learn: 259.1202744	total: 4.05s	remaining: 20.6s
329:	learn: 259.0309955	total: 4.06s	remaining: 20.6s
330:	learn: 258.9449238	total: 4.08s	remaining: 20.6s
331:	learn: 258.8350254	total: 4.1s	remaining: 20.6s
332:	learn: 258.7356936	total: 4.11s	remaining: 20.6s
333:	learn: 258.5878902	total: 4.12s	remaining: 20.6s
334:	learn: 258.4818171	total: 4.13s	remaining: 20.5s
335:	learn: 258.3698005	total: 4.14s	remaining: 20.5s
336:	learn: 258.2257357	total: 4.16s	remaining: 20.5s
337:	learn: 258.1023169	total: 4.17s	remaining: 20.5s
338:	learn: 258.0355908	total: 4.18s	remaining: 20.5s
339:	learn: 257.8050336	total: 4.19s	remaining: 20.5s
340:	learn: 257.6692469	total: 4.2s	remaining: 20.4s
341:	learn: 257.5885380	total: 4.21s	remaining: 20.4s
342:	learn: 257.4953710	total: 4.23s	remaining: 20.4s
343:	learn: 257.3735309	total: 4.

496:	learn: 241.5917525	total: 5.84s	remaining: 17.7s
497:	learn: 241.4813860	total: 5.86s	remaining: 17.7s
498:	learn: 241.3719173	total: 5.87s	remaining: 17.6s
499:	learn: 241.3302369	total: 5.88s	remaining: 17.7s
500:	learn: 241.2270959	total: 5.89s	remaining: 17.6s
501:	learn: 241.1683683	total: 5.91s	remaining: 17.6s
502:	learn: 241.0725170	total: 5.92s	remaining: 17.6s
503:	learn: 240.9749491	total: 5.93s	remaining: 17.6s
504:	learn: 240.9091863	total: 5.93s	remaining: 17.6s
505:	learn: 240.8658314	total: 5.95s	remaining: 17.6s
506:	learn: 240.7518899	total: 5.96s	remaining: 17.5s
507:	learn: 240.7040997	total: 5.96s	remaining: 17.5s
508:	learn: 240.5638869	total: 5.97s	remaining: 17.5s
509:	learn: 240.4022069	total: 5.98s	remaining: 17.5s
510:	learn: 240.3384725	total: 5.99s	remaining: 17.5s
511:	learn: 240.1859906	total: 6s	remaining: 17.4s
512:	learn: 240.0995431	total: 6.01s	remaining: 17.4s
513:	learn: 240.0379436	total: 6.02s	remaining: 17.4s
514:	learn: 239.9419123	total: 

655:	learn: 230.0234297	total: 7.49s	remaining: 15.3s
656:	learn: 229.9679025	total: 7.5s	remaining: 15.3s
657:	learn: 229.8736812	total: 7.52s	remaining: 15.3s
658:	learn: 229.7993173	total: 7.53s	remaining: 15.3s
659:	learn: 229.7536443	total: 7.54s	remaining: 15.3s
660:	learn: 229.6584703	total: 7.55s	remaining: 15.3s
661:	learn: 229.6113900	total: 7.56s	remaining: 15.3s
662:	learn: 229.5437309	total: 7.57s	remaining: 15.3s
663:	learn: 229.4819069	total: 7.58s	remaining: 15.2s
664:	learn: 229.4454726	total: 7.59s	remaining: 15.2s
665:	learn: 229.4237507	total: 7.6s	remaining: 15.2s
666:	learn: 229.3637093	total: 7.61s	remaining: 15.2s
667:	learn: 229.3316936	total: 7.62s	remaining: 15.2s
668:	learn: 229.2230913	total: 7.63s	remaining: 15.2s
669:	learn: 229.1700421	total: 7.63s	remaining: 15.2s
670:	learn: 229.1292702	total: 7.64s	remaining: 15.1s
671:	learn: 229.0722816	total: 7.65s	remaining: 15.1s
672:	learn: 229.0341532	total: 7.66s	remaining: 15.1s
673:	learn: 228.9742309	total:

824:	learn: 220.7713515	total: 9.31s	remaining: 13.3s
825:	learn: 220.7489744	total: 9.32s	remaining: 13.3s
826:	learn: 220.7105907	total: 9.33s	remaining: 13.2s
827:	learn: 220.6875451	total: 9.35s	remaining: 13.2s
828:	learn: 220.6613837	total: 9.36s	remaining: 13.2s
829:	learn: 220.6197262	total: 9.38s	remaining: 13.2s
830:	learn: 220.5596312	total: 9.38s	remaining: 13.2s
831:	learn: 220.5234832	total: 9.39s	remaining: 13.2s
832:	learn: 220.4781581	total: 9.4s	remaining: 13.2s
833:	learn: 220.4260293	total: 9.41s	remaining: 13.2s
834:	learn: 220.4133953	total: 9.42s	remaining: 13.1s
835:	learn: 220.3866841	total: 9.43s	remaining: 13.1s
836:	learn: 220.3021530	total: 9.45s	remaining: 13.1s
837:	learn: 220.2722817	total: 9.45s	remaining: 13.1s
838:	learn: 220.2010626	total: 9.46s	remaining: 13.1s
839:	learn: 220.1787753	total: 9.47s	remaining: 13.1s
840:	learn: 220.1027115	total: 9.48s	remaining: 13.1s
841:	learn: 220.0676372	total: 9.49s	remaining: 13.1s
842:	learn: 220.0059720	total

979:	learn: 214.0618367	total: 10.9s	remaining: 11.4s
980:	learn: 214.0575717	total: 11s	remaining: 11.4s
981:	learn: 214.0185414	total: 11s	remaining: 11.4s
982:	learn: 213.9836433	total: 11s	remaining: 11.4s
983:	learn: 213.9745678	total: 11s	remaining: 11.3s
984:	learn: 213.9443327	total: 11s	remaining: 11.3s
985:	learn: 213.8671383	total: 11s	remaining: 11.3s
986:	learn: 213.8444987	total: 11s	remaining: 11.3s
987:	learn: 213.7294474	total: 11s	remaining: 11.3s
988:	learn: 213.7153106	total: 11s	remaining: 11.3s
989:	learn: 213.6738479	total: 11s	remaining: 11.3s
990:	learn: 213.6242979	total: 11.1s	remaining: 11.3s
991:	learn: 213.5532505	total: 11.1s	remaining: 11.2s
992:	learn: 213.5178606	total: 11.1s	remaining: 11.2s
993:	learn: 213.4779097	total: 11.1s	remaining: 11.2s
994:	learn: 213.4080494	total: 11.1s	remaining: 11.2s
995:	learn: 213.3586659	total: 11.1s	remaining: 11.2s
996:	learn: 213.3187243	total: 11.1s	remaining: 11.2s
997:	learn: 213.3114163	total: 11.1s	remaining: 

1139:	learn: 208.3237265	total: 12.6s	remaining: 9.48s
1140:	learn: 208.2928946	total: 12.6s	remaining: 9.47s
1141:	learn: 208.2623036	total: 12.6s	remaining: 9.46s
1142:	learn: 208.2384327	total: 12.6s	remaining: 9.45s
1143:	learn: 208.2201002	total: 12.6s	remaining: 9.44s
1144:	learn: 208.2018222	total: 12.6s	remaining: 9.43s
1145:	learn: 208.1954357	total: 12.6s	remaining: 9.42s
1146:	learn: 208.1899652	total: 12.7s	remaining: 9.41s
1147:	learn: 208.1480129	total: 12.7s	remaining: 9.4s
1148:	learn: 208.1134025	total: 12.7s	remaining: 9.39s
1149:	learn: 208.1052714	total: 12.7s	remaining: 9.38s
1150:	learn: 208.1047835	total: 12.7s	remaining: 9.36s
1151:	learn: 208.0873577	total: 12.7s	remaining: 9.35s
1152:	learn: 208.0683955	total: 12.7s	remaining: 9.34s
1153:	learn: 208.0601576	total: 12.7s	remaining: 9.33s
1154:	learn: 208.0355069	total: 12.7s	remaining: 9.31s
1155:	learn: 207.9881723	total: 12.7s	remaining: 9.3s
1156:	learn: 207.9404952	total: 12.8s	remaining: 9.29s
1157:	learn:

1298:	learn: 203.3766810	total: 14.4s	remaining: 7.77s
1299:	learn: 203.3607571	total: 14.4s	remaining: 7.76s
1300:	learn: 203.3162464	total: 14.4s	remaining: 7.75s
1301:	learn: 203.3099562	total: 14.4s	remaining: 7.74s
1302:	learn: 203.3092959	total: 14.4s	remaining: 7.73s
1303:	learn: 203.2595652	total: 14.5s	remaining: 7.72s
1304:	learn: 203.1746553	total: 14.5s	remaining: 7.71s
1305:	learn: 203.1746487	total: 14.5s	remaining: 7.69s
1306:	learn: 203.1503004	total: 14.5s	remaining: 7.68s
1307:	learn: 203.1055426	total: 14.5s	remaining: 7.67s
1308:	learn: 203.0983470	total: 14.5s	remaining: 7.66s
1309:	learn: 203.0590688	total: 14.5s	remaining: 7.65s
1310:	learn: 203.0128031	total: 14.5s	remaining: 7.64s
1311:	learn: 202.9533552	total: 14.5s	remaining: 7.63s
1312:	learn: 202.9486402	total: 14.6s	remaining: 7.61s
1313:	learn: 202.9466628	total: 14.6s	remaining: 7.6s
1314:	learn: 202.9301316	total: 14.6s	remaining: 7.59s
1315:	learn: 202.9085559	total: 14.6s	remaining: 7.58s
1316:	learn

1461:	learn: 198.9464697	total: 16.2s	remaining: 5.97s
1462:	learn: 198.8959375	total: 16.2s	remaining: 5.96s
1463:	learn: 198.8677556	total: 16.2s	remaining: 5.95s
1464:	learn: 198.8485388	total: 16.3s	remaining: 5.94s
1465:	learn: 198.8288390	total: 16.3s	remaining: 5.93s
1466:	learn: 198.8176326	total: 16.3s	remaining: 5.91s
1467:	learn: 198.7543526	total: 16.3s	remaining: 5.9s
1468:	learn: 198.7319272	total: 16.3s	remaining: 5.89s
1469:	learn: 198.7028495	total: 16.3s	remaining: 5.88s
1470:	learn: 198.6918808	total: 16.3s	remaining: 5.87s
1471:	learn: 198.6918489	total: 16.3s	remaining: 5.86s
1472:	learn: 198.6458002	total: 16.3s	remaining: 5.84s
1473:	learn: 198.6269875	total: 16.3s	remaining: 5.83s
1474:	learn: 198.6026563	total: 16.4s	remaining: 5.82s
1475:	learn: 198.5717629	total: 16.4s	remaining: 5.81s
1476:	learn: 198.5646752	total: 16.4s	remaining: 5.8s
1477:	learn: 198.5424605	total: 16.4s	remaining: 5.79s
1478:	learn: 198.5338719	total: 16.4s	remaining: 5.77s
1479:	learn:

1622:	learn: 195.0689486	total: 17.9s	remaining: 4.15s
1623:	learn: 195.0374930	total: 17.9s	remaining: 4.14s
1624:	learn: 195.0100501	total: 17.9s	remaining: 4.13s
1625:	learn: 194.9730995	total: 17.9s	remaining: 4.12s
1626:	learn: 194.9276735	total: 17.9s	remaining: 4.11s
1627:	learn: 194.8606045	total: 17.9s	remaining: 4.1s
1628:	learn: 194.8425611	total: 17.9s	remaining: 4.09s
1629:	learn: 194.8348956	total: 18s	remaining: 4.08s
1630:	learn: 194.8296187	total: 18s	remaining: 4.06s
1631:	learn: 194.7780937	total: 18s	remaining: 4.05s
1632:	learn: 194.7474340	total: 18s	remaining: 4.04s
1633:	learn: 194.7354018	total: 18s	remaining: 4.03s
1634:	learn: 194.7309223	total: 18s	remaining: 4.02s
1635:	learn: 194.7225485	total: 18s	remaining: 4.01s
1636:	learn: 194.6943023	total: 18s	remaining: 4s
1637:	learn: 194.6609641	total: 18s	remaining: 3.99s
1638:	learn: 194.6066895	total: 18.1s	remaining: 3.98s
1639:	learn: 194.5625763	total: 18.1s	remaining: 3.97s
1640:	learn: 194.5393595	total: 

1775:	learn: 191.4204091	total: 19.5s	remaining: 2.46s
1776:	learn: 191.3909945	total: 19.5s	remaining: 2.45s
1777:	learn: 191.3527042	total: 19.5s	remaining: 2.44s
1778:	learn: 191.3229324	total: 19.6s	remaining: 2.43s
1779:	learn: 191.2964865	total: 19.6s	remaining: 2.42s
1780:	learn: 191.2664571	total: 19.6s	remaining: 2.41s
1781:	learn: 191.2386401	total: 19.6s	remaining: 2.4s
1782:	learn: 191.2047279	total: 19.6s	remaining: 2.38s
1783:	learn: 191.1679487	total: 19.6s	remaining: 2.37s
1784:	learn: 191.1436172	total: 19.6s	remaining: 2.36s
1785:	learn: 191.1282758	total: 19.6s	remaining: 2.35s
1786:	learn: 191.1194247	total: 19.6s	remaining: 2.34s
1787:	learn: 191.1094298	total: 19.7s	remaining: 2.33s
1788:	learn: 191.0842864	total: 19.7s	remaining: 2.32s
1789:	learn: 191.0466776	total: 19.7s	remaining: 2.31s
1790:	learn: 191.0426479	total: 19.7s	remaining: 2.3s
1791:	learn: 191.0372543	total: 19.7s	remaining: 2.29s
1792:	learn: 191.0346826	total: 19.7s	remaining: 2.27s
1793:	learn:

1931:	learn: 188.0955476	total: 21.1s	remaining: 744ms
1932:	learn: 188.0780931	total: 21.2s	remaining: 733ms
1933:	learn: 188.0540930	total: 21.2s	remaining: 723ms
1934:	learn: 188.0421383	total: 21.2s	remaining: 712ms
1935:	learn: 188.0240227	total: 21.2s	remaining: 701ms
1936:	learn: 188.0138333	total: 21.2s	remaining: 690ms
1937:	learn: 188.0096730	total: 21.2s	remaining: 679ms
1938:	learn: 187.9882141	total: 21.2s	remaining: 668ms
1939:	learn: 187.9291175	total: 21.2s	remaining: 657ms
1940:	learn: 187.8908093	total: 21.3s	remaining: 646ms
1941:	learn: 187.8713554	total: 21.3s	remaining: 635ms
1942:	learn: 187.8694978	total: 21.3s	remaining: 624ms
1943:	learn: 187.8625006	total: 21.3s	remaining: 613ms
1944:	learn: 187.8273415	total: 21.3s	remaining: 602ms
1945:	learn: 187.8092546	total: 21.3s	remaining: 591ms
1946:	learn: 187.7866218	total: 21.3s	remaining: 580ms
1947:	learn: 187.7702797	total: 21.3s	remaining: 569ms
1948:	learn: 187.7341350	total: 21.3s	remaining: 558ms
1949:	lear

<catboost.core.CatBoostRegressor at 0x7efe19f6dc18>

In [215]:
pd.DataFrame({"id":range(len(newds_tst)), "value":cbr.predict(newds_tst)}).to_csv("./cb-noparams10.csv",
                                                                                  encoding="utf-8",
                                                                                  index=False)