# House Prices: Advanced Regression Techniques (Kaggle Competition) - Predictions, 1st Pass

## Get the Data

In [1]:
from zipfile import ZipFile

# Having some trouble with Kaggle API at the moment, but in future try to download data programmatically if possible

ZIP_PATH = "data/house-prices-advanced-regression-techniques.zip"

with ZipFile(ZIP_PATH, 'r') as zip:
    zip.extractall('data')

In [2]:
import pandas as pd

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

## Data Preparation


### Feature Selection

In [3]:
# Remove outliers from bivariate analysis of GrLivArea vs SalePrice
# train = train[train['GrLivArea'] < 4500]
# This helped my score on cross validation but worsened my final test score so I've commented it out

In [4]:
train_X = train[['GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt', 'Neighborhood', 'OverallQual', 'ExterQual', 'BsmtQual', 'KitchenQual']]
train_y = train[['SalePrice']]

### Data Preprocessing (remove outliers, impute missing values, standardize and encode data)

In [5]:
train_X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   GrLivArea     1460 non-null   int64 
 1   GarageCars    1460 non-null   int64 
 2   TotalBsmtSF   1460 non-null   int64 
 3   FullBath      1460 non-null   int64 
 4   YearBuilt     1460 non-null   int64 
 5   Neighborhood  1460 non-null   object
 6   OverallQual   1460 non-null   int64 
 7   ExterQual     1460 non-null   object
 8   BsmtQual      1423 non-null   object
 9   KitchenQual   1460 non-null   object
dtypes: int64(6), object(4)
memory usage: 114.2+ KB


There are some null values to deal with. 
* We will impute `LotFrontage` with the median
* The 37 values missing from `BsmtQual` are homes with no basement (there are 37 homes with `TotalBsmtSF` = 0). We will add a new category for these.

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

num_features = ['GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_features = ['Neighborhood']
cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder())
])

ord_features = ['OverallQual', 'ExterQual', 'BsmtQual', 'KitchenQual']
ord_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='NoBsmt')),
    ('ordinal', OrdinalEncoder())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
        ('ord', ord_transformer, ord_features),
    ]
)

train_X = preprocessor.fit_transform(train_X)

## Model Selection

In [7]:
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR

models = [
    {'name': 'Linear Regression', 'obj': LinearRegression()},
    {'name': 'Stochastic Gradient Descent', 'obj': SGDRegressor()},
    {'name': 'K Neighbors Regressor', 'obj': KNeighborsRegressor()},
    {'name': 'Decision Tree Regressor', 'obj': DecisionTreeRegressor()},
    {'name': 'Random Forest Regressor', 'obj': RandomForestRegressor()},
    {'name': 'Gradient Boosting Regressor', 'obj': GradientBoostingRegressor()},
    {'name': 'XGBoost', 'obj': XGBRegressor()},
    {'name': 'Kernel Ridge', 'obj': KernelRidge()},
    {'name': 'Support Vector Regressor', 'obj': SVR()}
]

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error

for mdl in models:  
    cv_mae = cross_val_score(mdl['obj'], train_X, train_y, cv=5, scoring='neg_mean_absolute_error')
    print()
    print(mdl['name'])
    print("MAE mean:", -(cv_mae.mean()))
    print("MAE Standard deviation:", cv_mae.std())


Linear Regression
MAE mean: 21765.337509236742
MAE Standard deviation: 999.5519134016062

Stochastic Gradient Descent
MAE mean: 26046.499566542374
MAE Standard deviation: 1318.8172481831807

K Neighbors Regressor
MAE mean: 21252.800000000003
MAE Standard deviation: 1562.2850121773479

Decision Tree Regressor
MAE mean: 26533.03595890411
MAE Standard deviation: 2485.125712422217

Random Forest Regressor
MAE mean: 19564.329108013597
MAE Standard deviation: 1268.0813798307554

Gradient Boosting Regressor
MAE mean: 19042.400500172924
MAE Standard deviation: 892.9086546588333

XGBoost
MAE mean: 20308.317513912676
MAE Standard deviation: 1680.6798919034466

Kernel Ridge
MAE mean: 23574.102265299924
MAE Standard deviation: 1246.27711528727

Support Vector Regressor
MAE mean: 55472.38517888852
MAE Standard deviation: 3360.014995497445


## Model Tuning

### Random Forest

In [9]:
from pprint import pprint
rfr = RandomForestRegressor(random_state=33)
pprint(rfr.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 33,
 'verbose': 0,
 'warm_start': False}


In [10]:
from sklearn.model_selection import GridSearchCV
# I tried multiple different grids to achieve a better score, refining the parameters with each iteration. These are the last set I tried
param_grid = {   
        'bootstrap': [True],
        'n_estimators': [300, 350, 400], 
        'max_features': [4, 5, 6], 
        'max_depth': [45, 60, 75, 90],
}

rfr = RandomForestRegressor(random_state=33)
grid_search_rfr = GridSearchCV(rfr, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=6)
grid_search_rfr.fit(train_X, train_y)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=33),
             param_grid={'bootstrap': [True], 'max_depth': [45, 60, 75, 90],
                         'max_features': [4, 5, 6],
                         'n_estimators': [300, 350, 400]},
             scoring='neg_mean_absolute_error')

In [11]:
print(grid_search_rfr.best_params_)
print(grid_search_rfr.best_score_)

{'bootstrap': True, 'max_depth': 45, 'max_features': 6, 'n_estimators': 350}
-18299.24061890085


### Gradient Boosting

In [12]:
from pprint import pprint
rfr = GradientBoostingRegressor(random_state=33)
pprint(rfr.get_params())

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': 33,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}


In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = {
        'n_estimators': [1000, 1100, 1500, 2000], 
        'max_features': [4, 5], 
        'max_depth': [4, 5],
        'learning_rate': [0.015, 0.025, 0.05]
}

gbr = GradientBoostingRegressor(random_state=33)
grid_search_gbr = GridSearchCV(gbr, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=6)
grid_search_gbr.fit(train_X, train_y)

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=33),
             param_grid={'learning_rate': [0.015, 0.025, 0.05],
                         'max_depth': [4, 5], 'max_features': [4, 5],
                         'n_estimators': [1000, 1100, 1500, 2000]},
             scoring='neg_mean_absolute_error')

In [14]:
print(grid_search_gbr.best_params_)
print(grid_search_gbr.best_score_)

{'learning_rate': 0.015, 'max_depth': 4, 'max_features': 5, 'n_estimators': 1500}
-17501.519157536157


### XGBoost

In [10]:
from pprint import pprint
xgb = XGBRegressor(random_state=33)
pprint(xgb.get_params())

{'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'objective': 'reg:squarederror',
 'random_state': 33,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}


In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {
        'n_estimators': [125, 100, 125],
        'learning_rate': [0.2, 0.3, 0.4, 0.8]
}

gbr = GradientBoostingRegressor(random_state=33)
grid_search_xgb = GridSearchCV(gbr, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=6)
grid_search_xgb.fit(train_X, train_y)

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=33),
             n_jobs=6,
             param_grid={'learning_rate': [0.2, 0.3, 0.4, 0.8],
                         'n_estimators': [125, 100, 125]},
             scoring='neg_mean_absolute_error')

In [20]:
print(grid_search_xgb.best_params_)
print(grid_search_xgb.best_score_)

{'learning_rate': 0.2, 'n_estimators': 125}
-18586.02242074211


## Make Predictions

### Random Forest

In [18]:
mdl_final = grid_search_rfr.best_estimator_

test_X = test # the target variable has already been removed from the test set provided by kaggle

test_X = preprocessor.fit_transform(test_X)
predictions = mdl_final.predict(test_X)
predictions

array([112943.61619048, 154828.71428571, 172441.43428571, ...,
       141400.99428571, 106869.33333333, 210524.66      ])

### Gradient Boosting

In [16]:
mdl_final = grid_search_gbr.best_estimator_

test_X = test # the target variable has already been removed from the test set provided by kaggle

test_X = preprocessor.fit_transform(test_X)
predictions = mdl_final.predict(test_X)
predictions

array([124099.13779855, 156702.78084556, 172073.44575827, ...,
       146248.53243224, 118460.76941357, 215356.59828922])

In [21]:
mdl_final = grid_search_xgb.best_estimator_

test_X = test # the target variable has already been removed from the test set provided by kaggle

test_X = preprocessor.fit_transform(test_X)
predictions = mdl_final.predict(test_X)
predictions

array([123355.39968837, 156199.76444146, 163199.05083515, ...,
       144774.17875048, 119185.63634525, 239402.15948509])

In [22]:
submission_dict = {'Id': test['Id'], 'SalePrice': predictions}
submission_df = pd.DataFrame(data=submission_dict)
submission_df.to_csv('predictions.csv', index=False)