In [3]:
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [84]:
features = df.drop('SalePrice', axis=1)
labels = df[['SalePrice']]

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)

In [85]:
features.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


In [86]:
labels.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


In [87]:
from sklearn.metrics import mean_squared_error
import numpy as np

def log_rmse(actual, predicted, use_log=False):
    if use_log == True:
        actual = np.log(actual)
        predicted = np.log(predicted)
    return mean_squared_error(actual, predicted, squared=False)

In [113]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

def extract_columns(X):
    return X[[
        'OverallQual', 'GrLivArea', 'GarageArea', 'TotalBsmtSF',
        '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt',
        'YearRemodAdd', '2ndFlrSF'
    ]]

transformer = FunctionTransformer(extract_columns)

pipeline = Pipeline([
    ('transformer', transformer),
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(n_jobs=5))
])

pipeline.fit(X_train, y_train.values.ravel())

predictions = pipeline.predict(X_test)

log_rmse(y_test, predictions, True)

0.17052914432006913

In [115]:
from sklearn.model_selection import RandomizedSearchCV

param_map = {
    'regressor__max_features': ['auto', None],
    'regressor__n_estimators': list(map(lambda x: int(x), np.linspace(100, 1000, num=60)))
}

randomsearch = RandomizedSearchCV(pipeline, param_distributions=param_map, verbose=3, cv=5, n_jobs=-1, n_iter=50)

best = randomsearch.fit(X_train, y_train.values.ravel())

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   55.2s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  8.8min finished


In [117]:
best.best_params_

{'regressor__n_estimators': 298, 'regressor__max_features': None}

In [118]:
best.best_estimator_.score(X_test, y_test)

0.8625014562263559

In [119]:
predictions = best.best_estimator_.predict(X_test)
log_rmse(y_test, predictions, True)

0.16809022042070312

In [120]:
best.best_estimator_.fit(features, labels)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('transformer',
                 FunctionTransformer(func=<function extract_columns at 0x7fe0bd6faea0>)),
                ('scaler', StandardScaler()),
                ('regressor',
                 RandomForestRegressor(max_features=None, n_estimators=298,
                                       n_jobs=5))])

In [121]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [127]:
df_test.shape

(1459, 80)

In [128]:
df_test.fillna(value=0, inplace=True)

In [129]:
df_test[[
        'OverallQual', 'GrLivArea', 'GarageArea', 'TotalBsmtSF',
        '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt',
        'YearRemodAdd', '2ndFlrSF'
    ]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   OverallQual   1459 non-null   int64  
 1   GrLivArea     1459 non-null   int64  
 2   GarageArea    1459 non-null   float64
 3   TotalBsmtSF   1459 non-null   float64
 4   1stFlrSF      1459 non-null   int64  
 5   FullBath      1459 non-null   int64  
 6   TotRmsAbvGrd  1459 non-null   int64  
 7   YearBuilt     1459 non-null   int64  
 8   YearRemodAdd  1459 non-null   int64  
 9   2ndFlrSF      1459 non-null   int64  
dtypes: float64(2), int64(8)
memory usage: 114.1 KB


In [131]:
best.best_estimator_.predict(df_test)

array([128114.76510067, 147731.54362416, 170545.70805369, ...,
       147634.13758389, 109300.        , 233164.72483221])

In [132]:
df_predictions = pd.DataFrame({
    'Id': df_test.Id,
    'SalePrice': best.best_estimator_.predict(df_test)
})
df_predictions.head()

Unnamed: 0,Id,SalePrice
0,1461,128114.765101
1,1462,147731.543624
2,1463,170545.708054
3,1464,181494.966443
4,1465,216955.587248


In [133]:
df_predictions.to_csv('predictions.csv', index=False)