In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('data/train.csv')

In [3]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [5]:
data.shape

(1460, 81)

#### Выделяем категориальные фичи и заполняем пропуски.

In [6]:
category_features = list(data.dtypes[data.dtypes == object].index)
data[category_features] = data[category_features].fillna('nan')

#### Выделяем непрерывные фичи и заполняем пропуски нулями. Если гаража нет - его площадь равна нулю.

In [7]:
numeric_features = [feature for feature in data if feature not in (category_features + ['Id', 'SalePrice'])]
data[numeric_features] = data[numeric_features].fillna(0)

#### Формируем датасет для обучения RandomForestRegressor и получения списка важных фич. 

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
XE = pd.get_dummies(data[category_features + numeric_features], columns = category_features)
ye = data['SalePrice']
XE_train, XE_validation, ye_train, ye_validation = train_test_split( XE, ye, test_size = 0.3, random_state = 0 )

In [10]:
XE.shape

(1460, 304)

In [11]:
from sklearn.ensemble import RandomForestRegressor
estimator = RandomForestRegressor(n_estimators=20, max_depth=5, min_samples_leaf=5, max_features=0.5, n_jobs=-1)

In [12]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], 'max_depth': [3,4,5,6,7,8]}
search = GridSearchCV(estimator, param_grid, cv=3)
search.fit(XE_train, ye_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=5, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'max_depth': [3, 4, 5, 6, 7, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=5, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [14]:
search.best_estimator_.score(XE_validation, ye_validation)

0.8564112681422302

In [15]:
search.best_estimator_.feature_importances_

array([2.13610376e-03, 1.29055864e-03, 8.95065710e-03, 3.62472334e-01,
       2.70031065e-03, 3.16202604e-02, 7.16898346e-03, 3.54070333e-03,
       3.54533074e-02, 0.00000000e+00, 6.53828455e-03, 3.17685924e-02,
       4.13350675e-02, 1.37850008e-02, 0.00000000e+00, 1.29482830e-01,
       1.37536975e-03, 0.00000000e+00, 3.76179883e-02, 4.51487467e-03,
       1.78014537e-03, 2.53072289e-03, 1.35718980e-03, 9.29211112e-03,
       5.36069320e-03, 1.27204038e-01, 8.46873189e-03, 3.52325850e-03,
       2.36731628e-03, 4.57868955e-05, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.71934493e-03, 1.74299339e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.65664155e-03,
       1.74447587e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 8.63144313e-05, 4.04945722e-04, 0.00000000e+00,
       0.00000000e+00, 1.65047071e-04, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

#### Выделим фичи у которых важность больше 0.005

In [16]:
important_features = [feature for feature, importance in zip(XE.columns, search.best_estimator_.feature_importances_) if importance > 0.005]

In [17]:
important_features

['LotArea',
 'OverallQual',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'FullBath',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'ExterQual_TA',
 'BsmtQual_Ex']

#### Полученный список фичей выглядит вполне адекватно. Основными ценообразующими параметрами оказались площади различных частей, год постройки и перестройки, наличие и состояние определенных частей недвижимости.

#### Формируем данные и делим их на train и validation.

In [18]:
X_train, X_validation, y_train, y_validation = train_test_split( XE[important_features], ye, test_size = 0.3, random_state = 0 )

#### Формируем данные для линейных моделей.

In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features = list(set(numeric_features) & set(important_features))
scaler.fit(X_train[scaled_features])

X_train[scaled_features] = scaler.transform(X_train[scaled_features])
X_validation[scaled_features] = scaler.transform(X_validation[scaled_features])

  return self.partial_fit(X, y)
  
  import sys


#### Инициализируем стекинг.

In [20]:
from sklearn.model_selection import StratifiedKFold
stack = StratifiedKFold(n_splits=10, random_state=555)

#### Инициализируем 3 модели 1го уровня и 1 мета модель.

In [30]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
model1 = LinearRegression()
model2 = RandomForestRegressor(n_estimators=20, max_depth=8, min_samples_leaf=5, max_features=0.5, n_jobs=-1)
model3 = SGDRegressor(alpha = 0.01, max_iter = 1000)
meta_model = LinearRegression()

#### Обучим все модели используя k-fold.

In [31]:
meta_test1 = np.zeros_like(y_train, dtype=float)
meta_test2 = np.zeros_like(y_train, dtype=float)
meta_test3 = np.zeros_like(y_train, dtype=float)
for i, (train_ind, test_ind) in enumerate(stack.split(X_train, y_train)):
    model1.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
    model2.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
    model3.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
    
    meta_test1[test_ind] = model1.predict(X_train.iloc[test_ind])
    meta_test2[test_ind] = model2.predict(X_train.iloc[test_ind])
    meta_test3[test_ind] = model3.predict(X_train.iloc[test_ind])
    
X_meta_train = pd.DataFrame(np.stack((meta_test1, meta_test2, meta_test3), axis=1), columns=['model1', 'model2', 'model3'])
meta_model.fit(X_meta_train, y_train)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

#### Делаем предсказание всеми моделями и мета моделью.

In [32]:
y_predict1 = model1.predict(X_validation)
y_predict2 = model2.predict(X_validation)
y_predict3 = model3.predict(X_validation)

X_meta_validation = pd.DataFrame(np.stack((y_predict1, y_predict2, y_predict3), axis=1), columns=['model1', 'model2', 'model3'])
y_predict = meta_model.predict(X_meta_validation)

In [42]:
print('score model 1    - {}'.format(model1.score(X_validation, y_validation)))
print('score model 2    - {}'.format(model2.score(X_validation, y_validation)))
print('score model 3    - {}'.format(model3.score(X_validation, y_validation)))
print('score meta model - {}'.format(meta_model.score(X_meta_validation, y_validation)))

score model 1    - 0.7293477761043297
score model 2    - 0.8398587012650897
score model 3    - 0.7314023995719454
score meta model - 0.7303532966923678


In [51]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score

In [52]:
print('mean_squared_error:')
print('model 1    - {}'.format(mean_squared_error(y_validation, y_predict1)))
print('model 2    - {}'.format(mean_squared_error(y_validation, y_predict2)))
print('model 3    - {}'.format(mean_squared_error(y_validation, y_predict3)))
print('meta model - {}'.format(mean_squared_error(y_validation, y_predict)))

mean_squared_error:
model 1    - 1837455631.5053272
model 2    - 1087197906.456017
model 3    - 1823506810.369282
meta model - 1830629160.8394246


In [53]:
print('r2_score:')
print('model 1    - {}'.format(r2_score(y_validation, y_predict1)))
print('model 2    - {}'.format(r2_score(y_validation, y_predict2)))
print('model 3    - {}'.format(r2_score(y_validation, y_predict3)))
print('meta model - {}'.format(r2_score(y_validation, y_predict)))

r2_score:
model 1    - 0.7293477761043297
model 2    - 0.8398587012650897
model 3    - 0.7314023995719454
meta model - 0.7303532966923678


In [54]:
print('explained_variance_score:')
print('model 1    - {}'.format(explained_variance_score(y_validation, y_predict1)))
print('model 2    - {}'.format(explained_variance_score(y_validation, y_predict2)))
print('model 3    - {}'.format(explained_variance_score(y_validation, y_predict3)))
print('meta model - {}'.format(explained_variance_score(y_validation, y_predict)))

explained_variance_score:
model 1    - 0.729382902391332
model 2    - 0.8398748973637539
model 3    - 0.7314418951650017
meta model - 0.7380498345197806


#### В итоге получилось что я мета-модель хотя бы не ухудшила качество по сравнению с худшей моделью. Надцать попыток "до" моя мета-модель ухудшала качество решения до 30%.