# **HOUSE PRICES - ADVANCED REGRESSION TECHNIQUES**

https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques

In [236]:
# Common libraries.
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

## **1. LOOK AT THE DATASET:**

In [237]:
# Read the Kaggle training and test splits from the local data directory.
train_path, test_path = "data/train.csv", "data/test.csv"
train_set = pd.read_csv(train_path)
test_set = pd.read_csv(test_path)

In [238]:
# Preview the first five rows of the raw training data.
train_set.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [239]:
# List every training column that contains at least one missing value.
has_missing = train_set.isna().any()
nan_columns = train_set.columns[has_missing]
nan_columns.to_list()

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [240]:
# Keep the regression target aside before it is removed from the features.
train_labels = train_set["SalePrice"]

# Drop SalePrice from the train set and set Id as the index for both train and test.
train_set = train_set.drop(columns="SalePrice").set_index("Id")
test_set = test_set.set_index("Id")

In [241]:
# Split the column names by dtype: numeric vs. object (string-like) columns.
num_attribs = train_set.select_dtypes(include=["number"]).columns
cat_attribs = train_set.select_dtypes(include=["object"]).columns

# Print the category frequencies of every object-typed column.
for feature in cat_attribs:
    counts = train_set[feature].value_counts()
    print(counts)

RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64
Pave    1454
Grvl       6
Name: Street, dtype: int64
Grvl    50
Pave    41
Name: Alley, dtype: int64
Reg    925
IR1    484
IR2     41
IR3     10
Name: LotShape, dtype: int64
Lvl    1311
Bnk      63
HLS      50
Low      36
Name: LandContour, dtype: int64
AllPub    1459
NoSeWa       1
Name: Utilities, dtype: int64
Inside     1052
Corner      263
CulDSac      94
FR2          47
FR3           4
Name: LotConfig, dtype: int64
Gtl    1382
Mod      65
Sev      13
Name: LandSlope, dtype: int64
NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
SawyerW     59
BrkSide     58
Crawfor     51
Mitchel     49
NoRidge     41
Timber      38
IDOTRR      37
ClearCr     28
StoneBr     25
SWISU       25
MeadowV     17
Blmngtn     17
BrDale      16
Veenker     11
NPkVill      9
Blueste      2
Name: Neighborhood, dtype:

There are so many features that it is hard to draw conclusions just by looking at them, so we simply prepare a
pipeline to fill the NaN values.

---

## **2. PREPROCESSING:**

In [242]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# Numeric columns: fill missing values with the column median, then standardise.
num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

# Object columns: fill missing values with the most frequent category, then map
# each category to an integer code. Categories that were not present when the
# encoder was fitted are encoded as -1 instead of raising, so transforming a
# separate split (e.g. the test set) cannot fail on a novel label.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
)

# Route each column to the matching pipeline based on its dtype.
preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)

In [243]:
# Fit the column transformer on a copy of the training features and transform them.
train_set_prepared = preprocessing.fit_transform(train_set.copy())

# Wrap the transformed values in a DataFrame labelled with the transformer's
# output feature names so the result can be inspected.
train_set_after_preproc = pd.DataFrame(
    train_set_prepared,
    columns=preprocessing.get_feature_names_out(),  # type: ignore
)
train_set_after_preproc.head()

Unnamed: 0,pipeline-1__MSSubClass,pipeline-1__LotFrontage,pipeline-1__LotArea,pipeline-1__OverallQual,pipeline-1__OverallCond,pipeline-1__YearBuilt,pipeline-1__YearRemodAdd,pipeline-1__MasVnrArea,pipeline-1__BsmtFinSF1,pipeline-1__BsmtFinSF2,...,pipeline-2__GarageType,pipeline-2__GarageFinish,pipeline-2__GarageQual,pipeline-2__GarageCond,pipeline-2__PavedDrive,pipeline-2__PoolQC,pipeline-2__Fence,pipeline-2__MiscFeature,pipeline-2__SaleType,pipeline-2__SaleCondition
0,0.073375,-0.220875,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1,-0.872563,0.46032,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
2,0.073375,-0.084636,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
3,0.309859,-0.44794,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,-0.499274,-0.288653,...,5.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,0.0
4,0.073375,0.641972,0.375148,1.374795,-0.5172,0.951632,0.733308,1.366489,0.463568,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0


---

## **3. MODEL SELECTION:**

### **3.1. TRY DIFFERENT MODELS:**

In [244]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

In [245]:
from sklearn.tree import DecisionTreeRegressor

# Depth-capped, seeded decision tree evaluated with 10-fold cross-validation.
tree_reg = DecisionTreeRegressor(max_depth=10, random_state=42)
tree_neg_rmses = cross_val_score(
    tree_reg,
    train_set_prepared,
    train_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)
tree_rmses = -tree_neg_rmses  # the scorer reports negated RMSE; flip the sign
pd.Series(tree_rmses).describe()

count       10.000000
mean     36720.107976
std       7419.946107
min      28871.119860
25%      31990.345531
50%      36519.816978
75%      37880.818516
max      55407.697906
dtype: float64

In [246]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost over depth-10 trees, evaluated with 10-fold cross-validation.
# The ensemble is seeded so that the reported scores are reproducible across
# runs, in line with the other seeded models in this notebook; AdaBoost uses
# this seed for its own resampling and for the base estimators it creates.
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=10), n_estimators=100,
                            random_state=42)
# The scorer reports negated RMSE, so flip the sign to get plain RMSE per fold.
ada_rmses = -cross_val_score(ada_reg, train_set_prepared, train_labels, cv=10,
                             scoring="neg_root_mean_squared_error")
pd.Series(ada_rmses).describe()

count       10.000000
mean     28826.341650
std       6753.491801
min      23208.926179
25%      24046.507782
50%      26037.814241
75%      31417.583131
max      42423.833876
dtype: float64

In [247]:
from sklearn.ensemble import RandomForestRegressor

# Depth-capped, seeded random forest evaluated with 10-fold cross-validation.
forest_reg = RandomForestRegressor(max_depth=10, random_state=42)
forest_neg_rmses = cross_val_score(
    forest_reg,
    train_set_prepared,
    train_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)
forest_rmses = -forest_neg_rmses  # the scorer reports negated RMSE; flip the sign
pd.Series(forest_rmses).describe()

count       10.000000
mean     28839.188743
std       6422.501359
min      22287.093736
25%      24375.702578
50%      26274.809822
75%      31770.144974
max      40477.378502
dtype: float64

In [248]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with default settings, 10-fold cross-validation.
knn_reg = KNeighborsRegressor()
knn_neg_rmses = cross_val_score(
    knn_reg,
    train_set_prepared,
    train_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)
knn_rmses = -knn_neg_rmses  # the scorer reports negated RMSE; flip the sign
pd.Series(knn_rmses).describe()

count       10.000000
mean     39282.386093
std       9072.903807
min      28660.760935
25%      32740.891154
50%      37134.378895
75%      41141.189268
max      56425.193534
dtype: float64

In [249]:
from sklearn.svm import SVR

# Support vector regressor with default settings, 10-fold cross-validation.
svm_reg = SVR()
svm_neg_rmses = cross_val_score(
    svm_reg,
    train_set_prepared,
    train_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)
svm_rmses = -svm_neg_rmses  # the scorer reports negated RMSE; flip the sign
pd.Series(svm_rmses).describe()

count       10.000000
mean     80877.785005
std       9896.768193
min      69009.831331
25%      72657.735326
50%      80508.785968
75%      84920.169540
max      98720.158918
dtype: float64

In [250]:
from sklearn.ensemble import GradientBoostingRegressor

# Seeded gradient boosting regressor (otherwise default settings), 10-fold cross-validation.
gbr_reg = GradientBoostingRegressor(random_state=42)
gbr_neg_rmses = cross_val_score(
    gbr_reg,
    train_set_prepared,
    train_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)
gbr_reg_rmses = -gbr_neg_rmses  # the scorer reports negated RMSE; flip the sign
pd.Series(gbr_reg_rmses).describe()

count       10.000000
mean     26134.266626
std       5842.591523
min      18977.926760
25%      22500.694287
50%      24794.726606
75%      28204.633082
max      38730.352869
dtype: float64

### **3.2. FEATURE IMPORTANCES AND TUNE GBR:**

In [251]:
# Fit the random forest on the full prepared training set and read off how
# much each transformed feature contributes.
forest_reg.fit(train_set_prepared, train_labels)
feature_names = preprocessing.get_feature_names_out()
feature_importances = forest_reg.feature_importances_

# Pair every feature name with its importance and rank them, largest first.
importances = list(zip(feature_names, feature_importances))  # type: ignore
importances.sort(key=lambda pair: pair[1], reverse=True)
importances[:5]

[('pipeline-1__OverallQual', 0.5848882202693158),
 ('pipeline-1__GrLivArea', 0.11081678435457351),
 ('pipeline-1__TotalBsmtSF', 0.037923373022714646),
 ('pipeline-1__2ndFlrSF', 0.030385548360054557),
 ('pipeline-1__BsmtFinSF1', 0.030284817281182246)]

In [252]:
# Keep only the features whose forest importance exceeds this threshold.
eps = 0.001
important_features = [name for name, weight in importances if weight > eps]

# Label the prepared matrix with the feature names, then select the kept columns.
train_frame = pd.DataFrame(train_set_prepared, columns=feature_names)  # type: ignore
train_set_trucated = train_frame[important_features]
train_set_trucated.head()

Unnamed: 0,pipeline-1__OverallQual,pipeline-1__GrLivArea,pipeline-1__TotalBsmtSF,pipeline-1__2ndFlrSF,pipeline-1__BsmtFinSF1,pipeline-1__GarageCars,pipeline-1__1stFlrSF,pipeline-1__GarageArea,pipeline-1__LotArea,pipeline-1__YearBuilt,...,pipeline-2__LandContour,pipeline-2__SaleCondition,pipeline-2__Exterior1st,pipeline-2__BsmtFinType1,pipeline-1__MSSubClass,pipeline-1__HalfBath,pipeline-2__MasVnrType,pipeline-2__LotShape,pipeline-1__YrSold,pipeline-2__Exterior2nd
0,0.651479,0.370333,-0.459303,1.161852,0.575425,0.311725,-0.793434,0.351,-0.207142,1.050994,...,3.0,4.0,12.0,2.0,0.073375,1.227585,1.0,3.0,0.138777,13.0
1,-0.071836,-0.482512,0.466465,-0.795163,1.171992,0.311725,0.25714,-0.060731,-0.091886,0.156734,...,3.0,4.0,8.0,0.0,-0.872563,-0.761621,2.0,3.0,-0.614439,8.0
2,0.651479,0.515013,-0.313369,1.189351,0.092907,0.311725,-0.627826,0.631726,0.07348,0.984752,...,3.0,4.0,12.0,2.0,0.073375,1.227585,1.0,0.0,0.138777,13.0
3,0.651479,0.383659,-0.687324,0.937276,-0.499274,1.650307,-0.521734,0.790804,-0.096897,-1.863632,...,3.0,0.0,13.0,0.0,0.309859,-0.761621,2.0,0.0,-1.367655,15.0
4,1.374795,1.299326,0.19968,1.617877,0.463568,1.650307,-0.045611,1.698485,0.375148,0.951632,...,3.0,4.0,12.0,2.0,0.073375,1.227585,1.0,0.0,0.138777,13.0


In [253]:
# Tune the GBR.
from scipy.stats import randint

# Search space for the randomized search: integer ranges are sampled from
# scipy distributions, `subsample` is drawn from an explicit list.
param_grid = {
    "n_estimators": randint(50, 200),
    "max_depth": randint(5, 12),
    "max_leaf_nodes": randint(7, 15),
    "min_samples_split": randint(2, 5),
    "min_samples_leaf": randint(1, 4),
    "subsample": [0.6, 0.8],
}

# Seeded randomized search over the gradient boosting regressor, scored by
# 10-fold cross-validated (negated) RMSE on the reduced feature set.
gbr_rnd_search = RandomizedSearchCV(
    gbr_reg,
    param_distributions=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=10,
    random_state=42,
)
gbr_rnd_search.fit(train_set_trucated, train_labels)

# best_score_ is the negated RMSE of the best candidate; flip the sign.
best_rmse = -gbr_rnd_search.best_score_
best_rmse

24881.308181190456

---

## **4. FINAL PREDICTIONS:**

In [254]:
# Apply the already-fitted preprocessing to the test features and keep the
# same subset of columns the tuned model was trained on.
test_set_prepared = preprocessing.transform(test_set.copy())
test_frame = pd.DataFrame(test_set_prepared, columns=feature_names)  # type: ignore
test_set_trucated = test_frame[important_features]

# Predict with the best estimator found by the randomized search.
best_model = gbr_rnd_search.best_estimator_
results = best_model.predict(test_set_trucated)  # type: ignore

# Assemble the Id / SalePrice table and write it out with Id as the index column.
df = pd.DataFrame({"Id": test_set.index, "SalePrice": results})
submission = df.set_index("Id")
submission.to_csv("data/submission.csv")
df

Unnamed: 0,Id,SalePrice
0,1461,126898.678769
1,1462,159503.231182
2,1463,184557.906844
3,1464,187958.094375
4,1465,179753.060326
...,...,...
1454,2915,78540.804978
1455,2916,83444.811552
1456,2917,175305.532448
1457,2918,118784.298263


#### **BEST RESULT ON KAGGLE: 0.13082**