In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
all_data = pd.read_csv('./train.csv')
all_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(all_data, test_size=0.2, random_state=39)

y_train = train["SalePrice"]
X_train = train.drop("SalePrice", axis=1)

y_test = test["SalePrice"]
X_test = test.drop("SalePrice", axis=1)

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class ValueLimiter(BaseEstimator, TransformerMixin):

  def __init__(self, col_limit_pairs, replace=np.nan):
    self.col_limit_pairs = col_limit_pairs
    self.replace = replace
    return

  def fit(self, X, y=None):
    return self
  
  def transform(self, X, y=None):
    for col, limit in self.col_limit_pairs:
      mask = X[col] > limit
      X.loc[mask, col] = np.nan
    return X


In [5]:
limits = [
  ("LotArea", 40000),
  ("BsmtFinSF1", 5000),
  ("TotalBsmtSF", 3000),
  ("1stFlrSF", 1200),
  ("GarageArea", 1200),
]

In [6]:
one_hot_cols = [
  "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "Street", "LotShape", "LandContour", "Utilities",
  "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "YearBuilt",
  "YearRemodAdd", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "MasVnrArea", "ExterQual", "ExterCond",
  "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinSF1", "TotalBsmtSF", "Heating", "HeatingQC",
  "CentralAir", "Electrical", "1stFlrSF", "2ndFlrSF", "GrLivArea", "KitchenQual", "FireplaceQu", "GarageType",
  "GarageYrBlt", "GarageFinish", "GarageArea", "GarageQual", "GarageCond", "PavedDrive", "SaleType", "SaleCondition",
]

In [62]:
pass_cols = []

for col in X_train.columns:
  if col not in one_hot_cols:
    pass_cols.append(col)

pass_cols

['Id',
 'MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [63]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

col_transformer = ColumnTransformer(
  [ 
    ('onehot', OneHotEncoder(handle_unknown='ignore'), one_hot_cols),
    # ("others", "passthrough", pass_cols),
  ], 
  remainder='drop'
)

_transformed = col_transformer.fit_transform(X_train)
oh_enc = col_transformer.transformers_[0][1]
one_hot_col_names = oh_enc.get_feature_names(one_hot_cols)
pd.DataFrame(_transformed.toarray(), columns=one_hot_col_names)
# _transformed.toarray()

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_Pave,Alley_nan,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1164,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1165,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1166,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [111]:
one_hot_cols = [
  "MSZoning", "Street", "Alley", "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle",
  "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Heating", 
  "HeatingQC", "CentralAir", "Electrical", "KitchenQual", "Functional", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PavedDrive", "PoolQC",
  "Fence", "MiscFeature", "SaleType", "SaleCondition",
]

In [114]:
impute_cols = [
  "LotFrontage", "MasVnrArea", "GarageYrBlt", "MSSubClass", "LotArea", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd", "BsmtFinSF1", "BsmtFinSF2",
  "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
  "TotRmsAbvGrd", "Fireplaces", "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

In [115]:
drop_cols = [
  "Id",
  "MoSold",
  "YrSold",
]

In [116]:
cols = X_train.columns.to_list()
_cols = [col for col in cols if not (col in drop_cols or col in one_hot_cols or col in impute_cols)]
_cols

[]

In [119]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

col_transformer = ColumnTransformer(
  [ 
    ('onehot', OneHotEncoder(handle_unknown='ignore'), one_hot_cols),
    ('impute', SimpleImputer(), impute_cols),
    ("drop", "drop", drop_cols),
  ], 
  remainder='passthrough'
)

_transformed = col_transformer.fit_transform(X_train)

oh_enc = col_transformer.named_transformers_['onehot']
columns = oh_enc.get_feature_names(one_hot_cols).tolist()
columns.extend(impute_cols)
_col_transformed = pd.DataFrame(_transformed.toarray(), columns=columns)
_col_transformed

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_Pave,Alley_nan,...,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,2.0,520.0,142.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,2.0,2.0,498.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,282.0,289.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,2.0,482.0,128.0,53.0,0.0,0.0,155.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,2.0,529.0,240.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,308.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1164,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,2.0,380.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0
1165,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,2.0,533.0,296.0,44.0,0.0,0.0,0.0,0.0,0.0
1166,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,2.0,648.0,140.0,45.0,0.0,0.0,0.0,0.0,0.0


In [95]:
col_transformer.get_feature_names()[:10]

['onehot__x0_C (all)',
 'onehot__x0_FV',
 'onehot__x0_RH',
 'onehot__x0_RL',
 'onehot__x0_RM',
 'onehot__x1_Grvl',
 'onehot__x1_Pave',
 'onehot__x2_Grvl',
 'onehot__x2_Pave',
 'onehot__x2_nan']

In [100]:
col_transformer.named_transformers_['onehot']

OneHotEncoder(handle_unknown='ignore')

In [138]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
  ('pipeline', ValueLimiter(limits)),
  ('onehot', col_transformer),
  # ('classifier', RandomForestClassifier(random_state=38))
])

In [102]:
def display_scores(scores):
  print("Scores:", scores)
  print("Mean:", scores.mean())
  print("Standard deviation:", scores.std())

In [139]:
from sklearn.model_selection import GridSearchCV

param_grid = [
  { 
    "max_features": [7, 8, 9],
    "min_samples_split": [5, 6, 7],
    "n_estimators": [110, 120, 140, 180],
  }
]

_X_train = pipeline.fit_transform(X_train)
grid_search = GridSearchCV(
  RandomForestClassifier(random_state=38),
  param_grid,
  cv=5,
  scoring="neg_mean_squared_error")
grid_search.fit(_X_train, y_train)

grid_search.best_score_, grid_search.best_params_



(-1695839332.5173545,
 {'max_features': 8, 'min_samples_split': 6, 'n_estimators': 180})

In [141]:
from sklearn.model_selection import GridSearchCV

param_grid = [
  { 
    "max_features": [8, 10, 16],
    "min_samples_split": [6],
    "n_estimators": [170, 180, 190],
  }
]

_X_train = pipeline.fit_transform(X_train)
grid_search = GridSearchCV(
  RandomForestClassifier(random_state=38),
  param_grid,
  cv=5,
  scoring="neg_mean_squared_error")
grid_search.fit(_X_train, y_train)

grid_search.best_score_, grid_search.best_params_



(-1695839332.5173545,
 {'max_features': 8, 'min_samples_split': 6, 'n_estimators': 180})

In [142]:
from sklearn.model_selection import cross_val_score

_X_test = pipeline.transform(X_test)
scores = cross_val_score(grid_search.best_estimator_, _X_test, y_test, cv=3, scoring="neg_mean_squared_error")
display_scores(np.sqrt(-scores))



Scores: [39763.22499329 45842.90008211 39378.47242054]
Mean: 41661.53249864759
Standard deviation: 2960.842761396474


In [143]:
house_price_test = pd.read_csv('./test.csv')

In [146]:
file = "./predict-3.csv"

In [147]:
_test = pipeline.transform(house_price_test)
predict = grid_search.best_estimator_.predict(_test)

id = np.array(house_price_test["Id"]).astype(int)
df = pd.DataFrame(predict, id, columns = ["SalePrice"])
df.to_csv(file, index_label = ["Id"])

0.20392

In [149]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

drop_cols = [
  "Id",
  # "MoSold",
  # "YrSold",
]

col_transformer = ColumnTransformer(
  [ 
    ('onehot', OneHotEncoder(handle_unknown='ignore'), one_hot_cols),
    ('impute', SimpleImputer(), impute_cols),
    ("drop", "drop", drop_cols),
  ], 
  remainder='passthrough'
)

_transformed = col_transformer.fit_transform(X_train)
_transformed.toarray()[:10]

array([[   0.,    0.,    0., ...,    0.,    5., 2006.],
       [   0.,    0.,    0., ...,    0.,   12., 2009.],
       [   0.,    0.,    0., ...,    0.,    6., 2007.],
       ...,
       [   0.,    0.,    0., ...,    0.,   11., 2009.],
       [   0.,    0.,    0., ...,    0.,    3., 2010.],
       [   0.,    0.,    0., ...,    0.,   12., 2008.]])

In [150]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
  ('pipeline', ValueLimiter(limits)),
  ('onehot', col_transformer),
  # ('classifier', RandomForestClassifier(random_state=38))
])

In [153]:
from sklearn.model_selection import cross_val_score

_X_test = pipeline.fit_transform(X_test)
rfc = RandomForestClassifier(random_state=38, max_features=8, min_samples_split=6, n_estimators=180)
rfc.fit(_X_test, y_test)
scores = cross_val_score(rfc, _X_test, y_test, cv=3, scoring="neg_mean_squared_error")
display_scores(np.sqrt(-scores))



Scores: [45285.92547574 38668.31991983 34474.31833959]
Mean: 39476.18791172004
Standard deviation: 4450.632931454732


In [154]:
_X_test = pipeline.transform(X_test)
scores = cross_val_score(grid_search.best_estimator_, _X_test, y_test, cv=3, scoring="neg_mean_squared_error")
display_scores(np.sqrt(-scores))



Scores: [45285.92547574 38668.31991983 34474.31833959]
Mean: 39476.18791172004
Standard deviation: 4450.632931454732


In [156]:
file = "./predict-3-2.csv"

_test = pipeline.transform(house_price_test)
predict = rfc.predict(_test)

id = np.array(house_price_test["Id"]).astype(int)
df = pd.DataFrame(predict, id, columns = ["SalePrice"])
df.to_csv(file, index_label = ["Id"])

0.23644