In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
all_data = pd.read_csv('./train.csv')
all_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(all_data, test_size=0.2, random_state=39)

y_train = train["SalePrice"]
X_train = train.drop("SalePrice", axis=1)

y_test = test["SalePrice"]
X_test = test.drop("SalePrice", axis=1)

In [4]:
use_columns = [
  'OverallQual',
  'YearBuilt',
  'YearRemodAdd',
  'MasVnrArea',
  'TotalBsmtSF',
  '1stFlrSF',
  'GrLivArea',
  'FullBath',
  'TotRmsAbvGrd',
  'Fireplaces',
  'GarageYrBlt',
  'GarageCars',
  'GarageArea',
]

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):

  def __init__(self):
    return

  def fit(self, X, y=None):
    return self
  
  def transform(self, X, y=None):
    return X[use_columns]


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

pipeline = Pipeline([
  ('selector', ColumnSelector()),
  ('imputer', SimpleImputer()),
])

In [7]:
_X_train = pipeline.fit_transform(X_train)
_X_train

array([[5.000e+00, 1.900e+03, 1.970e+03, ..., 1.970e+03, 2.000e+00,
        5.200e+02],
       [6.000e+00, 1.960e+03, 1.960e+03, ..., 1.960e+03, 2.000e+00,
        4.980e+02],
       [5.000e+00, 1.950e+03, 1.950e+03, ..., 1.950e+03, 1.000e+00,
        2.820e+02],
       ...,
       [7.000e+00, 1.996e+03, 1.997e+03, ..., 1.996e+03, 2.000e+00,
        5.330e+02],
       [8.000e+00, 2.003e+03, 2.004e+03, ..., 2.003e+03, 2.000e+00,
        6.480e+02],
       [8.000e+00, 1.998e+03, 1.998e+03, ..., 1.998e+03, 3.000e+00,
        1.069e+03]])

In [8]:
def display_scores(scores):
  print("Scores:", scores)
  print("Mean:", scores.mean())
  print("Standard deviation:", scores.std())

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(random_state=38)
clf.fit(_X_train, y_train)
scores = cross_val_score(clf, _X_train, y_train, cv=3, scoring="neg_mean_squared_error")
display_scores(scores)



Scores: [-1.46778900e+09 -1.43716777e+09 -2.49667833e+09]
Mean: -1800545034.704579
Standard deviation: 492399291.10722965


In [33]:
rmse_scores = np.sqrt(-scores)
display_scores(rmse_scores)

Scores: [38311.73453077 37909.99561249 49966.77229938]
Mean: 42062.8341475487
Standard deviation: 5591.334203961495


In [35]:
_X_test = pipeline.fit_transform(X_test)
scores = cross_val_score(clf, _X_test, y_test, cv=3, scoring="neg_mean_squared_error")
display_scores(np.sqrt(-scores))



Scores: [37272.1541716  33204.41071279 41658.03360883]
Mean: 37378.19949774164
Standard deviation: 3451.9916208405502


In [38]:
predict = clf.predict(_X_test)
for p, l in zip(predict[:10], y_test[:10]):
  print("predict:", p, ", actual: ", l)

predict: 157000 , actual:  155000
predict: 128000 , actual:  150500
predict: 112000 , actual:  96500
predict: 171900 , actual:  164700
predict: 116000 , actual:  105000
predict: 190000 , actual:  172000
predict: 180000 , actual:  163500
predict: 372402 , actual:  297000
predict: 206000 , actual:  194000
predict: 145000 , actual:  133000


In [42]:
from sklearn.model_selection import GridSearchCV

param_grid = [
  { 
    "max_features": [2, 4, 8, 12, 13],
    "min_samples_split": [2, 4, 8, 12],
    "n_estimators": [10, 50, 100],
  }
]

grid_search = GridSearchCV(
  RandomForestClassifier(random_state=38),
  param_grid,
  cv=5,
  scoring="neg_mean_squared_error",
  return_train_score=True)
grid_search.fit(_X_train, y_train)

grid_search.best_score_, grid_search.best_params_



(-1636266885.7497268,
 {'max_features': 8, 'min_samples_split': 2, 'n_estimators': 100})

In [43]:
_X_test = pipeline.fit_transform(X_test)
scores = cross_val_score(grid_search.best_estimator_, _X_test, y_test, cv=3, scoring="neg_mean_squared_error")
display_scores(np.sqrt(-scores))



Scores: [38923.72360077 37949.87613361 41896.12136946]
Mean: 39589.90703461264
Standard deviation: 1678.503925324633


In [12]:
house_price_test = pd.read_csv('./test.csv')

In [47]:
file = "./predict-1.csv"
_test = pipeline.fit_transform(house_price_test)
predict = grid_search.best_estimator_.predict(_test)

id = np.array(house_price_test["Id"]).astype(int)
df = pd.DataFrame(predict, id, columns = ["SalePrice"])
df.to_csv(file, index_label = ["Id"])

Score => 0.21613

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rfc = RandomForestClassifier(random_state=38, max_features=8, min_samples_split=2, n_estimators=100)
rfc.fit(_X_train, y_train)
scores = cross_val_score(rfc, _X_train, y_train, cv=3, scoring="neg_mean_squared_error")
display_scores(scores)



Scores: [-1.49690738e+09 -1.67272676e+09 -2.59798665e+09]
Mean: -1922540265.0760596
Standard deviation: 482976175.1598754


In [14]:
_X_test = pipeline.transform(X_test)
scores = cross_val_score(rfc, _X_test, y_test, cv=3, scoring="neg_mean_squared_error")
display_scores(np.sqrt(-scores))



Scores: [38525.64125955 38527.97597302 41996.47352733]
Mean: 39683.36358663146
Standard deviation: 1635.6160024182618


In [16]:
file = "./predict-1-2.csv"
_test = pipeline.transform(house_price_test)
predict = rfc.predict(_test)

id = np.array(house_price_test["Id"]).astype(int)
df = pd.DataFrame(predict, id, columns = ["SalePrice"])
df.to_csv(file, index_label = ["Id"])