In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('data/train.csv')

In [3]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [5]:
data.shape

(1460, 81)

#### Выделяем категориальные фичи и заполняем пропуски.

In [31]:
category_features = list(data.dtypes[data.dtypes == object].index)
data[category_features] = data[category_features].fillna('nan')

#### Выделяем непрерывные фичи и заполняем пропуски нулями. Если гаража нет - его площадь равна нулю.

In [32]:
numeric_features = [feature for feature in data if feature not in (category_features + ['Id', 'SalePrice'])]
data[numeric_features] = data[numeric_features].fillna(0)

#### Т.к. у нас задача регресии, но предполагается использование категориальных моделей, то было бы не плохо ввести категориальность для целевой переменной. Например результат деления нацело на 100 000.

In [55]:
data['SalePriceCategory'] = data['SalePrice'] // 100000

In [56]:
data['SalePriceCategory'].describe()

count    1460.000000
mean        1.328082
std         0.826998
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         7.000000
Name: SalePriceCategory, dtype: float64

#### Формируем датасет для обучения RandomForestClassifier и получения списка важных фич. Разбивать не буду, т.к. интересует лишь информация об важности фич.

In [57]:
XE = pd.get_dummies(data[category_features + numeric_features], columns = category_features)
ye = data['SalePriceCategory']

In [58]:
XE.shape

(1460, 304)

In [65]:
from sklearn.ensemble import RandomForestClassifier
estimator = RandomForestClassifier(n_estimators=20, max_depth=5, min_samples_leaf=5, max_features=0.5, n_jobs=-1)
estimator.fit(XE, ye)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [66]:
estimator.feature_importances_

array([9.48657164e-03, 2.61203141e-03, 1.97186497e-02, 2.82689640e-01,
       1.36068360e-02, 5.32930095e-03, 1.03641705e-02, 8.18354590e-03,
       1.20010784e-02, 0.00000000e+00, 4.90270058e-03, 7.87910763e-02,
       6.02865896e-02, 2.11430693e-02, 0.00000000e+00, 1.97661974e-01,
       1.17103119e-03, 0.00000000e+00, 2.41255623e-03, 0.00000000e+00,
       6.41779607e-03, 0.00000000e+00, 5.84387708e-03, 3.00182675e-03,
       1.25089789e-02, 1.83152054e-02, 4.19479486e-02, 4.75313194e-03,
       1.84971675e-03, 0.00000000e+00, 0.00000000e+00, 6.61660835e-04,
       0.00000000e+00, 0.00000000e+00, 1.91817175e-03, 1.04878097e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.84258649e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 2.60726372e-04, 0.00000000e+00,
       0.00000000e+00, 2.15155470e-05, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 4.52821885e-04, 0.00000000e+00, 0.00000000e+00,
      

#### Выделим фичи у которых важность больше 0.01

In [67]:
important_features = [feature for feature, importance in zip(XE.columns, estimator.feature_importances_) if importance > 0.005]

In [68]:
important_features

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'BedroomAbvGr',
 'TotRmsAbvGrd',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'ExterQual_Gd',
 'ExterQual_TA',
 'BsmtQual_Ex',
 'BsmtQual_Gd',
 'CentralAir_N',
 'CentralAir_Y',
 'KitchenQual_TA',
 'GarageCond_TA']

#### Полученный список фичей выглядит вполне адекватно. Основными ценообразующими параметрами оказались площади различных частей, год постройки и перестройки, наличие и состояние определенных частей недвижимости.

#### Формируем данные и делим их на train и validation.

In [87]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split( XE[important_features], data['SalePrice'], test_size = 0.3, random_state = 0 )

#### Формируем данные для линейных моделей.

In [88]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features = list(set(numeric_features) & set(important_features))
scaler.fit(X_train[scaled_features])

XL_train = X_train
XL_train[scaled_features] = scaler.transform(X_train[scaled_features])

XL_validation = X_validation
XL_validation[scaled_features] = scaler.transform(XL_validation[scaled_features])

  return self.partial_fit(X, y)
  import sys
  # Remove the CWD from sys.path while we load stuff.


#### Формируем данные для категориальных моделей.

In [89]:
XC_train = X_train

#### Инициализируем стекинг.

In [90]:
from sklearn.model_selection import StratifiedKFold
stack = StratifiedKFold(n_splits=10, random_state=555)

#### Позаимствуем полезную функцию из классной работы. Заменив predict_proba, на predict.

In [132]:
def get_meta_features(clf, X_train, y_train, X_test, stack):
    meta_train = np.zeros_like(y_train, dtype=float)
    meta_test = np.zeros_like(y_validation, dtype=float)
    
    for i, (train_ind, test_ind) in enumerate(stack.split(X_train, y_train)):
        
        clf.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
        meta_train[test_ind] = clf.predict(X_train.iloc[test_ind])
        meta_test += clf.predict(X_test)
    
    return meta_train, meta_test / stack.n_splits

In [133]:
meta_train = []
meta_test = []
column_names = []

#### Обучаем линейную модель.

In [134]:
from sklearn.linear_model import LinearRegression
model1 = LinearRegression()
meta_train_1, meta_test_1 = get_meta_features(model1, XL_train, y_train, XL_validation, stack)
meta_train.append(meta_train_1)
meta_test.append(meta_test_1)
column_names.append('model1_prediction')



#### Обучаем дерево.

In [135]:
from sklearn.tree import DecisionTreeClassifier
model2 = DecisionTreeClassifier(max_depth=15, min_samples_leaf=20)
meta_train_2, meta_test_2 = get_meta_features(model2, XC_train, y_train, X_validation, stack)
meta_train.append(meta_train_2)
meta_test.append(meta_test_2)
column_names.append('model2_prediction')



#### Обучаем еще одну регрессионную модель.

In [136]:
from sklearn.linear_model import SGDRegressor
model3 = SGDRegressor(alpha = 0.01, max_iter = 1000)
meta_train_3, meta_test_3 = get_meta_features(model3, XL_train, y_train, XL_validation, stack)
meta_train.append(meta_train_3)
meta_test.append(meta_test_3)
column_names.append('model3_prediction')



#### Формируем данные для обучения и тестирования мета-модели.

In [138]:
X_meta_train = pd.DataFrame(np.stack(meta_train, axis=1), columns=column_names)
X_meta_test = pd.DataFrame(np.stack(meta_test, axis=1), columns=column_names)

In [139]:
clf_meta = LinearRegression()
clf_meta.fit(X_meta_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [141]:
y_pred_meta_test = clf_meta.predict(X_meta_test)

In [148]:
from sklearn.metrics import mean_squared_error

In [149]:
mean_squared_error(y_validation, y_pred_meta_test)

1909878929.4838593

#### В итоге получилось что-то совершенно не работающее :(

In [None]:
е