In [3]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [4]:
all_data = pd.read_csv('./train.csv')
all_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [5]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(all_data, test_size=0.2, random_state=39)

y_train = train["SalePrice"]
X_train = train.drop("SalePrice", axis=1)

y_test = test["SalePrice"]
X_test = test.drop("SalePrice", axis=1)

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnDropper(BaseEstimator, TransformerMixin):

  def __init__(self, cols):
    self.cols = cols
    return

  def fit(self, X, y=None):
    return self
  
  def transform(self, X, y=None):
    return X.drop(self.cols, axis=1)


In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class ZeroReplacer(BaseEstimator, TransformerMixin):

  def __init__(self, cols, replace=np.nan):
    self.cols = cols
    self.replace = replace
    return

  def fit(self, X, y=None):
    return self
  
  def transform(self, X, y=None):
    X[self.cols] = X[self.cols].replace({ 0: self.replace })
    return X


In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class ValueLimiter(BaseEstimator, TransformerMixin):

  def __init__(self, col_limit_pairs, replace=np.nan):
    self.col_limit_pairs = col_limit_pairs
    self.replace = replace
    return

  def fit(self, X, y=None):
    return self
  
  def transform(self, X, y=None):
    for col, limit in self.col_limit_pairs:
      mask = X[col] > limit
      X.loc[mask, col] = np.nan
    return X


In [9]:
lot_area_limited = ValueLimiter([("LotArea", 40000)]).transform(all_data[["LotArea"]])
lot_area_limited

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,LotArea
0,8450.0
1,9600.0
2,11250.0
3,9550.0
4,14260.0
...,...
1455,7917.0
1456,13175.0
1457,9042.0
1458,9717.0


In [10]:
lot_area_limited.query("LotArea > 40000")

Unnamed: 0,LotArea


In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder

class Binner(BaseEstimator, TransformerMixin):

  def __init__(self, col_range_pairs):
    self.col_range_pairs = col_range_pairs
    self.label_encoder = LabelEncoder()
    return

  def fit(self, X, y=None):
    return self
  
  def transform(self, X, y=None):
    for col, range in self.col_range_pairs:
      new_col = col + "_bin"
      bins = pd.cut(X[col], range)
      bin_labels = self.label_encoder.fit_transform(bins)
      X.insert(len(X.columns), new_col, bin_labels)
    return X


In [12]:
lot_frontage_binned = Binner([("LotFrontage", 10)]).transform(all_data[["LotFrontage"]])
lot_frontage_binned

Unnamed: 0,LotFrontage,LotFrontage_bin
0,65.0,1
1,80.0,2
2,68.0,1
3,60.0,1
4,84.0,2
...,...,...
1455,62.0,1
1456,85.0,2
1457,66.0,1
1458,68.0,1


In [13]:
lot_frontage_binned["LotFrontage_bin"].value_counts()

1    643
2    298
7    259
0    205
3     40
4      9
5      4
6      2
Name: LotFrontage_bin, dtype: int64

In [59]:
drop_col = [
  "Alley",
  "OverallCond",
  "BsmtFinType2",
  "BsmtFinSF2",
  "BsmtUnfSF",
  "LowQualFinSF",
  "BsmtFullBath",
  "BsmtHalfBath",
  "HalfBath",
  "BedroomAbvGr",
  "KitchenAbvGr",
  "WoodDeckSF",
  "OpenPorchSF",
  "EnclosedPorch",
  "3SsnPorch",
  "ScreenPorch",
  "PoolArea",
  "PoolQC",
  "Fence",
  "MiscFeature",
  "MiscVal",
  "MoSold",
  "YrSold",
  "TotRmsAbvGrd",
  "Functional",
]

In [15]:
replace_zero_col = [
  "MasVnrArea",
  "BsmtFinSF1",
  "2ndFlrSF",
]

In [16]:
limits = [
  ("LotArea", 40000),
  ("BsmtFinSF1", 5000),
  ("TotalBsmtSF", 3000),
  ("1stFlrSF", 1200),
  ("GarageArea", 1200),
]

In [17]:
bins = [
  ("LotFrontage", 10),
  ("LotArea", 10),
  ("YearBuilt", 10),
  ("YearRemodAdd", 10),
  ("MasVnrArea", 10),
  ("BsmtFinSF1", 10),
  ("TotalBsmtSF", 10),
  ("2ndFlrSF", 10),
  ("GrLivArea", 10),
  ("GarageYrBlt", 10),
  ("GarageArea", 10),
]

In [18]:
binned = []

for col, _ in bins:
  binned.append(col)

binned

['LotFrontage',
 'LotArea',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'TotalBsmtSF',
 '2ndFlrSF',
 'GrLivArea',
 'GarageYrBlt',
 'GarageArea']

In [19]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
  ('drop', ColumnDropper(drop_col)),
  ('replace_zero', ZeroReplacer(replace_zero_col)),
  ('limit', ValueLimiter(limits)),
  ('bin', Binner(bins)),
  ('drop_binned', ColumnDropper(binned)),
])

In [20]:
_X_train = pipeline.fit_transform(X_train)
_X_train

Unnamed: 0,Id,MSSubClass,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,LotArea_bin,YearBuilt_bin,YearRemodAdd_bin,MasVnrArea_bin,BsmtFinSF1_bin,TotalBsmtSF_bin,2ndFlrSF_bin,GrLivArea_bin,GarageYrBlt_bin,GarageArea_bin
488,489,190,RL,Pave,Reg,Lvl,AllPub,Corner,Gtl,OldTown,...,2,2,3,10,2,3,2,3,6,4
347,348,20,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,NAmes,...,4,6,1,0,5,5,10,2,5,4
296,297,50,RM,Pave,Reg,Lvl,AllPub,Inside,Gtl,IDOTRR,...,3,5,0,10,1,3,2,2,4,2
1405,1406,120,RM,Pave,IR1,HLS,AllPub,Inside,Mod,Crawfor,...,0,9,9,1,6,6,10,2,9,4
421,422,20,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NWAmes,...,4,7,8,0,5,6,10,2,6,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
916,917,20,C (all),Pave,Reg,Lvl,AllPub,Inside,Gtl,IDOTRR,...,2,5,0,10,0,1,10,0,5,2
1088,1089,160,RM,Pave,Reg,Lvl,AllPub,Inside,Gtl,Edwards,...,0,9,8,0,10,3,3,2,9,3
1065,1066,60,RL,Pave,Reg,Lvl,AllPub,Inside,Mod,ClearCr,...,3,8,7,10,5,4,4,3,8,4
105,106,60,FV,Pave,Reg,Lvl,AllPub,Inside,Gtl,Somerst,...,2,9,8,4,10,3,4,3,9,5


In [21]:
_X_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'Street', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', 'FullBath', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
       'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars',
       'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition',
       'LotFrontage_bin', 'LotArea_bin', 'YearBuilt_bin', 'YearRemodAdd_bin',
       'MasVnrArea_bin', 'BsmtFinSF1_bin', 'TotalBsmtSF_bin', '2ndFlrSF_bin',
       'GrLivArea_bin', 'GarageYrBlt_bin', 'GarageArea_bin'],
      dtype='object')

In [24]:
one_hot_cols_base = [
  "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "Street", "LotShape", "LandContour", "Utilities",
  "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "YearBuilt",
  "YearRemodAdd", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "MasVnrArea", "ExterQual", "ExterCond",
  "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinSF1", "TotalBsmtSF", "Heating", "HeatingQC",
  "CentralAir", "Electrical", "1stFlrSF", "2ndFlrSF", "GrLivArea", "KitchenQual", "FireplaceQu", "GarageType",
  "GarageYrBlt", "GarageFinish", "GarageArea", "GarageQual", "GarageCond", "PavedDrive", "SaleType", "SaleCondition",
]

In [36]:
one_hot_cols = []

for col in one_hot_cols_base:
  if col in binned:
    one_hot_cols.append(col + "_bin")
  else:
    one_hot_cols.append(col)

pd.Series(one_hot_cols)

0           MSSubClass
1             MSZoning
2      LotFrontage_bin
3          LotArea_bin
4               Street
5             LotShape
6          LandContour
7            Utilities
8            LotConfig
9            LandSlope
10        Neighborhood
11          Condition1
12          Condition2
13            BldgType
14          HouseStyle
15       YearBuilt_bin
16    YearRemodAdd_bin
17           RoofStyle
18            RoofMatl
19         Exterior1st
20         Exterior2nd
21          MasVnrType
22      MasVnrArea_bin
23           ExterQual
24           ExterCond
25          Foundation
26            BsmtQual
27            BsmtCond
28        BsmtExposure
29        BsmtFinType1
30      BsmtFinSF1_bin
31     TotalBsmtSF_bin
32             Heating
33           HeatingQC
34          CentralAir
35          Electrical
36            1stFlrSF
37        2ndFlrSF_bin
38       GrLivArea_bin
39         KitchenQual
40         FireplaceQu
41          GarageType
42     GarageYrBlt_bin
43        G

In [49]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

tr = ColumnTransformer([ 
  ('onehot', OneHotEncoder(handle_unknown='ignore'), ["SaleCondition"]),
  # ('onehot', OneHotEncoder(handle_unknown='ignore'), one_hot_cols),
  # ("passthrough", "passthrough", []),
])

_transformed = tr.fit_transform(X_train)
_transformed.toarray()

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.]])

In [51]:
tr.transformers_[0]

('onehot', OneHotEncoder(handle_unknown='ignore'), ['SaleCondition'])

In [54]:
_cats = tr.transformers_[0][1].categories_
_cats

[array(['Abnorml', 'AdjLand', 'Alloca', 'Family', 'Normal', 'Partial'],
       dtype=object)]

In [55]:
pd.DataFrame(_transformed.toarray(), columns=_cats)

Unnamed: 0,Abnorml,AdjLand,Alloca,Family,Normal,Partial
0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...
1163,1.0,0.0,0.0,0.0,0.0,0.0
1164,0.0,0.0,0.0,0.0,1.0,0.0
1165,0.0,0.0,0.0,0.0,1.0,0.0
1166,0.0,0.0,0.0,0.0,1.0,0.0


In [60]:
pass_cols = [
  "OverallQual",
  "FullBath",
  "Fireplaces",
  "GarageCars",
]

In [63]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

col_transformer = ColumnTransformer([ 
  ('onehot', OneHotEncoder(handle_unknown='ignore'), one_hot_cols),
  ("passthrough", "passthrough", pass_cols),
])

_transformed = col_transformer.fit_transform(_X_train)
_transformed.toarray()

array([[0., 0., 0., ..., 2., 1., 2.],
       [1., 0., 0., ..., 2., 2., 2.],
       [0., 0., 0., ..., 1., 0., 1.],
       ...,
       [0., 0., 0., ..., 2., 0., 2.],
       [0., 0., 0., ..., 2., 1., 2.],
       [0., 0., 0., ..., 2., 1., 3.]])

In [66]:
col_transformer.transformers_[0][1].categories_

[array([ 20,  30,  40,  45,  50,  60,  70,  75,  80,  85,  90, 120, 160,
        180, 190]),
 array(['C (all)', 'FV', 'RH', 'RL', 'RM'], dtype=object),
 array([0, 1, 2, 3, 4, 5, 6, 7]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array(['Grvl', 'Pave'], dtype=object),
 array(['IR1', 'IR2', 'IR3', 'Reg'], dtype=object),
 array(['Bnk', 'HLS', 'Low', 'Lvl'], dtype=object),
 array(['AllPub', 'NoSeWa'], dtype=object),
 array(['Corner', 'CulDSac', 'FR2', 'FR3', 'Inside'], dtype=object),
 array(['Gtl', 'Mod', 'Sev'], dtype=object),
 array(['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr',
        'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel',
        'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown',
        'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber',
        'Veenker'], dtype=object),
 array(['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNe',
        'RRNn'], dtype=object),
 array(['Artery', 'Feedr', 'N

In [67]:
whole_pipeline = Pipeline([
  ('pipeline', pipeline),
  ('onehot', col_transformer),
])

In [69]:
def display_scores(scores):
  print("Scores:", scores)
  print("Mean:", scores.mean())
  print("Standard deviation:", scores.std())

In [89]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

_X_train = whole_pipeline.fit_transform(X_train)
clf = RandomForestClassifier(random_state=38)
clf.fit(_X_train, y_train)
scores = cross_val_score(clf, _X_train, y_train, cv=3, scoring="neg_mean_squared_error")
display_scores(scores)



Scores: [-1.70538807e+09 -2.26186028e+09 -2.32055700e+09]
Mean: -2095935119.7105882
Standard deviation: 277196166.2549819


In [71]:
display_scores(np.sqrt(-scores))

Scores: [41296.34456878 47559.0189551  48172.16001402]
Mean: 45675.84117930167
Standard deviation: 3106.871782645329


In [72]:
from sklearn.model_selection import GridSearchCV

param_grid = [
  { 
    "max_features": [2, 8, 32],
    "min_samples_split": [2, 8, 32],
    "n_estimators": [10, 50, 100],
  }
]

grid_search = GridSearchCV(
  RandomForestClassifier(random_state=38),
  param_grid,
  cv=5,
  scoring="neg_mean_squared_error")
grid_search.fit(_X_train, y_train)

grid_search.best_score_, grid_search.best_params_



(-1757640742.8222442,
 {'max_features': 8, 'min_samples_split': 8, 'n_estimators': 100})

In [73]:
from sklearn.model_selection import GridSearchCV

param_grid = [
  { 
    "max_features": [6, 8, 10, 20],
    "min_samples_split": [6, 8, 10, 20],
    "n_estimators": [80, 100, 120],
  }
]

grid_search = GridSearchCV(
  RandomForestClassifier(random_state=38),
  param_grid,
  cv=5,
  scoring="neg_mean_squared_error")
grid_search.fit(_X_train, y_train)

grid_search.best_score_, grid_search.best_params_



(-1648329065.8246396,
 {'max_features': 8, 'min_samples_split': 6, 'n_estimators': 120})

In [74]:
from sklearn.model_selection import GridSearchCV

param_grid = [
  { 
    "max_features": [7, 8, 9],
    "min_samples_split": [5, 6, 7],
    "n_estimators": [110, 120, 140, 180],
  }
]

grid_search = GridSearchCV(
  RandomForestClassifier(random_state=38),
  param_grid,
  cv=5,
  scoring="neg_mean_squared_error")
grid_search.fit(_X_train, y_train)

grid_search.best_score_, grid_search.best_params_



(-1648329065.8246396,
 {'max_features': 8, 'min_samples_split': 6, 'n_estimators': 120})

In [90]:
_X_test = whole_pipeline.transform(X_test)
scores = cross_val_score(grid_search.best_estimator_, _X_test, y_test, cv=3, scoring="neg_mean_squared_error")
display_scores(np.sqrt(-scores))



Scores: [44866.85831446 39998.37286639 35227.28698099]
Mean: 40030.83938728238
Standard deviation: 3935.405479278608


In [78]:
house_price_test = pd.read_csv('./test.csv')

In [82]:
_test = pipeline.fit_transform(house_price_test)
_test

Unnamed: 0,Id,MSSubClass,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,LotArea_bin,YearBuilt_bin,YearRemodAdd_bin,MasVnrArea_bin,BsmtFinSF1_bin,TotalBsmtSF_bin,2ndFlrSF_bin,GrLivArea_bin,GarageYrBlt_bin,GarageArea_bin
0,1461,20,RH,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,2,6,1,10,1,3,10,1,2,6
1,1462,20,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,NAmes,...,3,6,1,0,2,4,10,1,2,2
2,1463,60,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,...,3,9,7,10,1,3,3,2,3,4
3,1464,60,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,...,2,9,7,0,1,3,3,2,3,3
4,1465,120,RL,Pave,IR1,HLS,AllPub,Inside,Gtl,StoneBr,...,0,8,6,10,0,4,10,1,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,Pave,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,...,0,6,3,10,7,1,2,1,5,0
1455,2916,160,RM,Pave,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,...,0,6,3,10,0,1,2,1,2,2
1456,2917,20,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,...,4,6,7,10,3,4,10,1,2,4
1457,2918,85,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,...,2,8,6,10,0,3,10,1,5,0


In [95]:
len(_transformed.toarray()[0])

716

In [94]:
_test = whole_pipeline.transform(house_price_test)
len(_test.toarray()[0])

716

In [96]:
predict = grid_search.best_estimator_.predict(_test)
predict

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

testにnanがあるカラムを削除

In [97]:
pass_cols = [
  "OverallQual",
  "FullBath",
  "Fireplaces",
  # "GarageCars",
]

In [98]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

col_transformer = ColumnTransformer([ 
  ('onehot', OneHotEncoder(handle_unknown='ignore'), one_hot_cols),
  ("passthrough", "passthrough", pass_cols),
])

In [99]:
whole_pipeline = Pipeline([
  ('pipeline', pipeline),
  ('onehot', col_transformer),
])

In [100]:
from sklearn.model_selection import GridSearchCV

param_grid = [
  { 
    "max_features": [8],
    "min_samples_split": [6],
    "n_estimators": [115, 120, 130],
  }
]

_X_train = whole_pipeline.fit_transform(X_train)
grid_search = GridSearchCV(
  RandomForestClassifier(random_state=38),
  param_grid,
  cv=5,
  scoring="neg_mean_squared_error")
grid_search.fit(_X_train, y_train)

grid_search.best_score_, grid_search.best_params_



(-1773498491.8156343,
 {'max_features': 8, 'min_samples_split': 6, 'n_estimators': 120})

In [101]:
_X_test = whole_pipeline.transform(X_test)
scores = cross_val_score(grid_search.best_estimator_, _X_test, y_test, cv=3, scoring="neg_mean_squared_error")
display_scores(np.sqrt(-scores))



Scores: [46973.69942417 45051.42642272 40185.21046355]
Mean: 44070.112103478874
Standard deviation: 2856.9364637609337


In [102]:
file = "./predict-2.csv"

In [103]:
_test = whole_pipeline.transform(house_price_test)
predict = grid_search.best_estimator_.predict(_test)

id = np.array(house_price_test["Id"]).astype(int)
df = pd.DataFrame(predict, id, columns = ["SalePrice"])
df.to_csv(file, index_label = ["Id"])

In [105]:
importances = grid_search.best_estimator_.feature_importances_
importances

array([5.41188503e-03, 1.85314936e-03, 5.31854230e-04, 1.08680084e-03,
       2.72459821e-03, 4.01627056e-03, 1.99295088e-03, 8.53118786e-04,
       2.26215223e-03, 1.03865536e-03, 1.45524969e-03, 1.93981595e-03,
       1.71910411e-03, 5.94694774e-04, 1.50778850e-03, 7.00094410e-04,
       2.00153573e-03, 1.15947380e-03, 4.80980038e-03, 4.00021899e-03,
       4.67373735e-03, 6.52524605e-03, 6.22300291e-03, 1.86683249e-03,
       6.06551188e-04, 5.06718674e-04, 6.48315912e-05, 5.88304536e-03,
       3.19273414e-03, 7.56480088e-03, 7.02371540e-03, 5.13270825e-03,
       2.30925484e-03, 9.31463054e-04, 5.94785834e-04, 3.70365010e-04,
       3.64664992e-04, 2.84405287e-04, 7.97725477e-04, 1.36152180e-04,
       4.49605450e-04, 6.57577746e-03, 2.54290898e-03, 5.88203691e-04,
       6.97023991e-03, 2.13709417e-03, 2.34551293e-03, 1.65817583e-03,
       3.93211495e-03, 6.07366134e-05, 1.43938695e-04, 5.19472317e-03,
       2.74876593e-03, 2.84444578e-03, 3.94535696e-04, 7.06150581e-03,
      

In [122]:
cats = []
for cat in whole_pipeline[1].transformers_[0][1].categories_:
  cats.extend(cat)


cats.extend(whole_pipeline[1].transformers_[1][2])
cats

[20,
 30,
 40,
 45,
 50,
 60,
 70,
 75,
 80,
 85,
 90,
 120,
 160,
 180,
 190,
 'C (all)',
 'FV',
 'RH',
 'RL',
 'RM',
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 'Grvl',
 'Pave',
 'IR1',
 'IR2',
 'IR3',
 'Reg',
 'Bnk',
 'HLS',
 'Low',
 'Lvl',
 'AllPub',
 'NoSeWa',
 'Corner',
 'CulDSac',
 'FR2',
 'FR3',
 'Inside',
 'Gtl',
 'Mod',
 'Sev',
 'Blmngtn',
 'Blueste',
 'BrDale',
 'BrkSide',
 'ClearCr',
 'CollgCr',
 'Crawfor',
 'Edwards',
 'Gilbert',
 'IDOTRR',
 'MeadowV',
 'Mitchel',
 'NAmes',
 'NPkVill',
 'NWAmes',
 'NoRidge',
 'NridgHt',
 'OldTown',
 'SWISU',
 'Sawyer',
 'SawyerW',
 'Somerst',
 'StoneBr',
 'Timber',
 'Veenker',
 'Artery',
 'Feedr',
 'Norm',
 'PosA',
 'PosN',
 'RRAe',
 'RRAn',
 'RRNe',
 'RRNn',
 'Artery',
 'Feedr',
 'Norm',
 'PosA',
 'PosN',
 'RRAe',
 'RRAn',
 'RRNn',
 '1Fam',
 '2fmCon',
 'Duplex',
 'Twnhs',
 'TwnhsE',
 '1.5Fin',
 '1.5Unf',
 '1Story',
 '2.5Fin',
 '2.5Unf',
 '2Story',
 'SFoyer',
 'SLvl',
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9

In [123]:
sorted(zip(importances, cats), reverse=True)

[(0.011642862449886797, 'OverallQual'),
 (0.008153927671328111, 'Fireplaces'),
 (0.00756480088043565, 1),
 (0.007061505814058443, 'Inside'),
 (0.007023715404832338, 2),
 (0.007009946060933941, 3),
 (0.006970239906382274, 'Reg'),
 (0.0069516462124107935, 'RFn'),
 (0.00687280170408533, 'No'),
 (0.00676183572457667, 4),
 (0.006646348319679834, 'FullBath'),
 (0.006612546230730658, 'Unf'),
 (0.006575777455374164, 'IR1'),
 (0.006531392876312679, 1),
 (0.0065252460542856916, 1),
 (0.006481793188813867, nan),
 (0.006425756353082009, 'Attchd'),
 (0.006376113504780651, 2),
 (0.006355514548223268, 10),
 (0.006228576214514093, 'BrkFace'),
 (0.006223002906473713, 2),
 (0.006084510266358585, 'None'),
 (0.006019578572775574, 0),
 (0.005995036077214866, 1),
 (0.005992310010803542, 'GLQ'),
 (0.0059809724340431384, 3),
 (0.00592151031703722, 'Hip'),
 (0.0059078093172826186, nan),
 (0.00588304535696479, 7),
 (0.005871361433465216, 'Gd'),
 (0.005869501194541018, 'TA'),
 (0.0058600741664767385, '1Story'),


In [128]:
oh_cols = whole_pipeline[1].transformers_[0][2]
enc = whole_pipeline[1].transformers_[0][1]
cats = enc.get_feature_names(oh_cols).tolist()

cats.extend(whole_pipeline[1].transformers_[1][2])

sorted(zip(importances, cats), reverse=True)

[(0.011642862449886797, 'OverallQual'),
 (0.008153927671328111, 'Fireplaces'),
 (0.00756480088043565, 'LotArea_bin_1'),
 (0.007061505814058443, 'LotConfig_Inside'),
 (0.007023715404832338, 'LotArea_bin_2'),
 (0.007009946060933941, 'TotalBsmtSF_bin_3'),
 (0.006970239906382274, 'LotShape_Reg'),
 (0.0069516462124107935, 'GarageFinish_RFn'),
 (0.00687280170408533, 'BsmtExposure_No'),
 (0.00676183572457667, 'GarageArea_bin_4'),
 (0.006646348319679834, 'FullBath'),
 (0.006612546230730658, 'BsmtFinType1_Unf'),
 (0.006575777455374164, 'LotShape_IR1'),
 (0.006531392876312679, 'GrLivArea_bin_1'),
 (0.0065252460542856916, 'LotFrontage_bin_1'),
 (0.006481793188813867, '1stFlrSF_nan'),
 (0.006425756353082009, 'GarageType_Attchd'),
 (0.006376113504780651, 'GrLivArea_bin_2'),
 (0.006355514548223268, 'BsmtFinSF1_bin_10'),
 (0.006228576214514093, 'MasVnrType_BrkFace'),
 (0.006223002906473713, 'LotFrontage_bin_2'),
 (0.006084510266358585, 'MasVnrType_None'),
 (0.006019578572775574, 'MasVnrArea_bin_0'),
