In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('data/train.csv')

In [3]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [5]:
data.shape

(1460, 81)

#### Выделяем категориальные фичи и заполняем пропуски.

In [6]:
category_features = list(data.dtypes[data.dtypes == object].index)
data[category_features] = data[category_features].fillna('nan')

#### Выделяем непрерывные фичи и заполняем пропуски нулями. Если гаража нет - его площадь равна нулю.

In [7]:
numeric_features = [feature for feature in data if feature not in (category_features + ['Id', 'SalePrice'])]
data[numeric_features] = data[numeric_features].fillna(0)

#### Формируем датасет для обучения RandomForestRegressor и получения списка важных фич. 

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
XE = pd.get_dummies(data[category_features + numeric_features], columns = category_features)
ye = data['SalePrice']
XE_train, XE_validation, ye_train, ye_validation = train_test_split( XE, ye, test_size = 0.3, random_state = 0 )

In [10]:
XE.shape

(1460, 304)

In [17]:
from sklearn.ensemble import RandomForestRegressor
estimator = RandomForestRegressor(n_estimators=20, max_depth=5, min_samples_leaf=5, max_features=0.5, n_jobs=-1)

In [18]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], 'max_depth': [3,4,5,6,7,8]}
search = GridSearchCV(estimator, param_grid, cv=3)
search.fit(XE_train, ye_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=5, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'max_depth': [3, 4, 5, 6, 7, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [28]:
search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=5, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [19]:
search.best_estimator_.score(XE_validation, ye_validation)

0.8426427463385828

In [21]:
search.best_estimator_.feature_importances_

array([6.91350279e-04, 1.76698135e-03, 1.43217335e-02, 3.68997967e-01,
       3.16938654e-03, 3.95634684e-02, 7.58379890e-03, 4.42899636e-03,
       2.76319371e-02, 4.78048162e-05, 4.10495333e-03, 4.25516199e-02,
       2.56954484e-02, 6.67307277e-03, 0.00000000e+00, 1.51151390e-01,
       1.51669342e-03, 0.00000000e+00, 7.82315616e-03, 1.30048394e-03,
       5.27147200e-03, 9.13581396e-04, 1.76073870e-03, 4.03089704e-03,
       3.41868840e-03, 1.07609106e-01, 2.58187444e-02, 7.79107108e-03,
       2.28490514e-03, 3.65370096e-05, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.37492690e-03, 1.01703017e-03,
       0.00000000e+00, 4.06683433e-05, 0.00000000e+00, 1.00418576e-03,
       1.21327683e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 9.06446842e-04, 0.00000000e+00,
       0.00000000e+00, 1.23700119e-04, 0.00000000e+00, 0.00000000e+00,
       1.05048295e-04, 2.57319362e-04, 0.00000000e+00, 0.00000000e+00,
      

#### Выделим фичи у которых важность больше 0.005

In [22]:
important_features = [feature for feature, importance in zip(XE.columns, search.best_estimator_.feature_importances_) if importance > 0.005]

In [23]:
important_features

['LotArea',
 'OverallQual',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'FullBath',
 'BedroomAbvGr',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'ExterQual_Ex',
 'ExterQual_TA',
 'BsmtQual_Ex',
 'KitchenQual_Ex',
 'KitchenQual_Gd']

#### Полученный список фичей выглядит вполне адекватно. Основными ценообразующими параметрами оказались площади различных частей, год постройки и перестройки, наличие и состояние определенных частей недвижимости.

#### Формируем данные и делим их на train и validation.

In [93]:
X_train, X_validation, y_train, y_validation = train_test_split( XE[important_features], ye, test_size = 0.3, random_state = 0 )

#### Формируем данные для линейных моделей.

In [94]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features = list(set(numeric_features) & set(important_features))
scaler.fit(X_train[scaled_features])

X_train[scaled_features] = scaler.transform(X_train[scaled_features])
X_validation[scaled_features] = scaler.transform(X_validation[scaled_features])

  return self.partial_fit(X, y)
  
  import sys


#### Инициализируем стекинг.

In [95]:
from sklearn.model_selection import StratifiedKFold
stack = StratifiedKFold(n_splits=10, random_state=555)

#### Инициализируем 3 модели 1го уровня и 1 мета модель.

In [96]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
model1 = LinearRegression()
model2 = RandomForestRegressor(n_estimators=20, max_depth=8, min_samples_leaf=5, max_features=0.5, n_jobs=-1)
model3 = SGDRegressor(alpha = 0.01, max_iter = 1000)
meta_model = LinearRegression()

#### Обучим все модели используя k-fold.

In [97]:
for i, (train_ind, test_ind) in enumerate(stack.split(X_train, y_train)):
    model1.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
    model2.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
    model3.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
    
    meta_test1 = model1.predict(X_train.iloc[test_ind])
    meta_test2 = model2.predict(X_train.iloc[test_ind])
    meta_test3 = model3.predict(X_train.iloc[test_ind])
    
    X_meta_train = pd.DataFrame(np.stack((meta_test1, meta_test2, meta_test3), axis=1), columns=['model1', 'model2', 'model3'])
    meta_model.fit(X_meta_train, y_train.iloc[test_ind])



#### Делаем предсказание всеми моделями и мета моделью.

In [98]:
y_predict1 = model1.predict(X_validation)
y_predict2 = model2.predict(X_validation)
y_predict3 = model3.predict(X_validation)

X_meta_validation = pd.DataFrame(np.stack((y_predict1, y_predict2, y_predict3), axis=1), columns=['model1', 'model2', 'model3'])
y_predict = meta_model.predict(X_meta_validation)

In [99]:
model1.score(X_validation, y_validation)

0.734993440521336

In [100]:
model2.score(X_validation, y_validation)

0.8495417221407673

In [101]:
model3.score(X_validation, y_validation)

0.7342513648158813

In [102]:
meta_model.score(X_meta_validation, y_validation)

0.2943516782341231

In [103]:
from sklearn.metrics import mean_squared_error

In [104]:
print('model 1 - {}'.format(mean_squared_error(y_validation, y_predict1)))
print('model 2 - {}'.format(mean_squared_error(y_validation, y_predict2)))
print('model 3 - {}'.format(mean_squared_error(y_validation, y_predict3)))
print('meta model - {}'.format(mean_squared_error(y_validation, y_predict)))

model 1 - 1799127264.0997217
model 2 - 1021459960.6083751
model 3 - 1804165209.7126005
meta model - 4790640416.798494


#### В итоге получилось что-то совершенно не работающее :(