In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import os

In [115]:
raw_path = '../data/raw/'
train = pd.read_csv(os.path.join(raw_path, 'train.csv'))
test = pd.read_csv(os.path.join(raw_path, 'test.csv'))

In [126]:
train['Functional']

0        Typ
1        Typ
2        Typ
3        Typ
4        Typ
        ... 
1455     Typ
1456    Min1
1457     Typ
1458     Typ
1459     Typ
Name: Functional, Length: 1460, dtype: object

In [64]:
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [144]:
train['PavedDrive']

0       Y
1       Y
2       Y
3       Y
4       Y
       ..
1455    Y
1456    Y
1457    Y
1458    Y
1459    Y
Name: PavedDrive, Length: 1460, dtype: object

In [136]:
TARGET_COLS = ['SalePrice']
ID_COL = 'Id'
CAT_COLS = [
 'MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
 'SaleType', 'SaleCondition', 'FireplaceQu']
OHE_COLS = ['PavedDrive']

In [6]:
def set_idx(df: pd.DataFrame, idx_col: str) -> pd.DataFrame:
    df = df.set_index(idx_col)
    return df

In [7]:
def cast_types(df: pd.DataFrame) -> pd.DataFrame:
    df[CAT_COLS] = df[CAT_COLS].astype('category')
    ohe_int_cols = train[OHE_COLS].select_dtypes('number').columns
    df[ohe_int_cols] = df[ohe_int_cols].astype(np.int8)
    return df

In [8]:
null_value = train.isna().sum().sort_values(ascending=False)
null_value[null_value > 0]

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
MasVnrType         8
Electrical         1
dtype: int64

In [66]:
def drop_columns(df: pd.DataFrame) -> pd.DataFrame:
    df.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis=1, inplace=True)
    return df

In [137]:
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    df = set_idx(df, ID_COL)
    df = cast_types(df)
    df = drop_columns(df)
    df = df.dropna()
    return df

In [138]:
prep_train = preprocess_data(train)

In [113]:
for type in prep_train[CAT_COLS].dtypes:
    if type != object and type != "category":
        print(type)

In [117]:
prep_train

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000
7,20,RL,75.0,10084,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,8,2007,WD,Normal,307000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1448,60,RL,80.0,10000,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,12,2007,WD,Normal,240000
1452,20,RL,78.0,9262,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,5,2009,New,Partial,287090
1456,60,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,8,2007,WD,Normal,175000
1457,20,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2010,WD,Normal,210000


In [139]:
prep_train, target = prep_train.drop(TARGET_COLS, axis=1), prep_train[TARGET_COLS]

In [72]:
import catboost as cb
from sklearn.model_selection import train_test_split

In [73]:
from sklearn.model_selection import GridSearchCV
cv_params = {'n_estimators': [400, 500, 600, 700, 800]}

In [140]:
train_idx, val_idx = train_test_split(
        prep_train.index, test_size=0.2, random_state=42)

In [134]:
prep_train.loc[train_idx].columns[55]

'FireplaceQu'

In [135]:
i = 0
for x in prep_train.loc[train_idx]['FireplaceQu']:
    print(i, x)
    i += 1

0 Fa
1 TA
2 Gd
3 Ex
4 Gd
5 Gd
6 Gd
7 Gd
8 Ex
9 Gd
10 Gd
11 Gd
12 Gd
13 Gd
14 Gd
15 Gd
16 Po
17 Gd
18 TA
19 Gd
20 Gd
21 Ex
22 TA
23 TA
24 Gd
25 TA
26 Gd
27 Fa
28 Gd
29 TA
30 Gd
31 Gd
32 TA
33 TA
34 TA
35 TA
36 Gd
37 Gd
38 Gd
39 Gd
40 Ex
41 Fa
42 Gd
43 TA
44 Gd
45 Po
46 TA
47 TA
48 Gd
49 Gd
50 Gd
51 TA
52 Gd
53 TA
54 Gd
55 Gd
56 TA
57 Gd
58 TA
59 Gd
60 Gd
61 TA
62 Gd
63 Gd
64 TA
65 TA
66 Gd
67 TA
68 Gd
69 Gd
70 TA
71 Gd
72 Ex
73 Ex
74 TA
75 TA
76 Po
77 TA
78 Gd
79 TA
80 Fa
81 Gd
82 Gd
83 Ex
84 TA
85 Gd
86 Gd
87 Gd
88 Gd
89 Gd
90 TA
91 Gd
92 Gd
93 Gd
94 Gd
95 Gd
96 Gd
97 Gd
98 Po
99 Gd
100 Gd
101 Gd
102 TA
103 Gd
104 Gd
105 Ex
106 Gd
107 Gd
108 TA
109 Gd
110 Gd
111 Gd
112 Gd
113 Gd
114 Gd
115 TA
116 Gd
117 Gd
118 TA
119 TA
120 Gd
121 Ex
122 TA
123 TA
124 TA
125 Gd
126 TA
127 TA
128 Gd
129 Gd
130 Gd
131 Gd
132 TA
133 TA
134 Gd
135 TA
136 Gd
137 Gd
138 Gd
139 TA
140 Gd
141 Gd
142 Gd
143 TA
144 Gd
145 Gd
146 Gd
147 TA
148 TA
149 TA
150 Gd
151 TA
152 Gd
153 Fa
154 Gd
155 TA
156 TA
157 TA
158 

In [141]:
cat = cb.CatBoostRegressor(cat_features= CAT_COLS)
cat = GridSearchCV(estimator=cat, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
cat.fit(prep_train.loc[train_idx], target.loc[train_idx])

Fitting 5 folds for each of 5 candidates, totalling 25 fits


3 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\nixon\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\nixon\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\catboost\core.py", line 5730, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
  File "C:\

Learning rate set to 0.043508
0:	learn: 92458.4166565	total: 200ms	remaining: 2m 39s
1:	learn: 90421.2421060	total: 257ms	remaining: 1m 42s
2:	learn: 88263.8057891	total: 308ms	remaining: 1m 21s
3:	learn: 86489.3122202	total: 333ms	remaining: 1m 6s
4:	learn: 84505.6069727	total: 387ms	remaining: 1m 1s
5:	learn: 82621.9048046	total: 427ms	remaining: 56.5s
6:	learn: 80733.7015937	total: 496ms	remaining: 56.2s
7:	learn: 78830.7877837	total: 539ms	remaining: 53.3s
8:	learn: 77002.2554261	total: 622ms	remaining: 54.7s
9:	learn: 75405.4715285	total: 679ms	remaining: 53.6s
10:	learn: 73608.5307395	total: 735ms	remaining: 52.7s
11:	learn: 72035.0733642	total: 786ms	remaining: 51.6s
12:	learn: 70630.3416612	total: 855ms	remaining: 51.8s
13:	learn: 69243.7276681	total: 915ms	remaining: 51.4s
14:	learn: 67689.7386481	total: 1000ms	remaining: 52.3s
15:	learn: 66451.8012019	total: 1.03s	remaining: 50.3s
16:	learn: 65119.7278138	total: 1.08s	remaining: 49.6s
17:	learn: 63579.6116969	total: 1.11s	rem

GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostRegressor object at 0x000001F68C21CCD0>,
             n_jobs=4, param_grid={'n_estimators': [400, 500, 600, 700, 800]},
             scoring='r2', verbose=1)

In [143]:
from sklearn.metrics import mean_squared_error 
cat_prediction = cat.predict(prep_train.loc[val_idx])
np.sqrt(mean_squared_error(np.log(target.loc[val_idx]), np.log(cat_prediction)))  

0.11527609015375974

In [147]:
cat.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostRegressor object at 0x000001F68C21CCD0>,
             n_jobs=4, param_grid={'n_estimators': [400, 500, 600, 700, 800]},
             scoring='r2', verbose=1)>

In [148]:
cat_ = cb.CatBoostRegressor(cat_features= CAT_COLS)
cat_.fit(prep_train.loc[train_idx], target.loc[train_idx])

Learning rate set to 0.03629
0:	learn: 92850.2780841	total: 91.1ms	remaining: 1m 30s
1:	learn: 91139.2783660	total: 175ms	remaining: 1m 27s
2:	learn: 89315.0193023	total: 274ms	remaining: 1m 31s
3:	learn: 87799.6461008	total: 302ms	remaining: 1m 15s
4:	learn: 86100.1824633	total: 397ms	remaining: 1m 18s
5:	learn: 84468.0983972	total: 463ms	remaining: 1m 16s
6:	learn: 82830.9204075	total: 567ms	remaining: 1m 20s
7:	learn: 81305.5177518	total: 645ms	remaining: 1m 19s
8:	learn: 79732.8832570	total: 744ms	remaining: 1m 21s
9:	learn: 78252.3145129	total: 844ms	remaining: 1m 23s
10:	learn: 76841.5857590	total: 940ms	remaining: 1m 24s
11:	learn: 75369.8001718	total: 1s	remaining: 1m 22s
12:	learn: 74071.9803401	total: 1.08s	remaining: 1m 21s
13:	learn: 72765.4430886	total: 1.17s	remaining: 1m 22s
14:	learn: 71306.8912625	total: 1.24s	remaining: 1m 21s
15:	learn: 69996.8354560	total: 1.33s	remaining: 1m 21s
16:	learn: 68724.6835234	total: 1.43s	remaining: 1m 22s
17:	learn: 67621.6909557	total:

<catboost.core.CatBoostRegressor at 0x1f68c1ee5b0>

In [157]:
cat_prediction = cat_.predict(prep_train.loc[val_idx])
np.sqrt(mean_squared_error(np.log(target.loc[val_idx]), np.log(cat_prediction)))  

0.11153758803951379

In [None]:
cat.save_model(os.path.join('C:\\Users\\nixon\\hse\\DS\\hse_workshop_regression\\models\\catboost', "catboost.cbm"))

In [152]:
trained_model = cb.CatBoostRegressor().load_model('C:\\Users\\nixon\\hse\\DS\\hse_workshop_regression\\models\\catboost\\catboost.cbm')

In [154]:
y_pred = trained_model.predict(prep_train.loc[val_idx])
np.sqrt(mean_squared_error(np.log(target.loc[val_idx]), np.log(y_pred)))  

  np.sqrt(mean_squared_error(np.log(target.loc[val_idx]), np.log(y_pred)))


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [156]:
cat_prediction

array([303192.00849649, 324299.90731262, 218722.41487184, 115591.04009045,
       198338.34946632, 364991.13901354, 223427.39427628, 193741.82793631,
       300161.94133443, 188749.4932457 , 152962.37671696, 149896.84732752,
       368036.09446091, 198018.03380986, 269411.4650521 , 158737.37692474,
       149821.74531602, 202850.7789386 , 183296.21341853, 156582.28346436,
       239915.01477681, 178759.28242584, 173195.6494699 , 235226.07835936,
       202925.21878792, 162149.27817672, 227953.29468291, 274259.02804752,
       151435.08754038, 218827.98470679, 208286.49215523, 208771.7326275 ,
       263627.96042547, 245005.10157731, 432681.92760473, 431255.29798781,
       275374.0675063 , 197753.84094195, 251982.73814395, 241725.82738991,
       126083.94684697, 196517.32456997, 116325.90418595, 134817.35167356,
       128568.6851806 , 278930.8181468 , 133236.03976584, 300441.98480793,
       239349.91554253, 232051.09326206, 197738.44307913, 108149.76120026,
       282510.19839381, 4