# House Prices: Advanced Regression Techniques
https://www.kaggle.com/c/house-prices-advanced-regression-techniques#evaluation

In [1]:
# Label: SalePrice
# Feature ideas:
# - mean, max, min, std SalePrice for buildings with same MSSubClass (building class)
# - mean, max, min, std SalePrice for buildings with same MSZoning (zoning classification)
# - PCA features

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np
import seaborn as sns
sns.set(style="ticks", color_codes=True)
import ml_helpers

# | Flags
LOG_TRAFO = True
PLOTS = False
ONE_HOT = True
impute_method = 'mm'# 'mm', 'drop'

# | constants
label_name = 'SalePrice'

# Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm
# of the predicted value and the logarithm of the observed sales price

train_path = "./data/train.csv"
test_path = "./data/test.csv"

df = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

df = df.set_index('Id', drop=True)
df_test = df_test.set_index('Id', drop=True)

if LOG_TRAFO:
    df[label_name] = np.log(df[label_name])

    
df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,12.247694
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,12.109011
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,12.317167
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,11.849398
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,12.429216


In [3]:
import importlib
importlib.reload(ml_helpers)

<module 'ml_helpers' from 'C:\\Users\\Nicolas\\workspace\\Kaggle\\house-prices-advanced-regression-techniques\\ml_helpers.py'>

## Drop columns where >90% of the values are missing

In [4]:
# | drop all columns where more than 90% of the values are N/A
orig_columns = set(df.columns)
df = df.dropna(axis=1, thresh=int(0.9*df.shape[0]))

dropped_columns = orig_columns - set(df.columns)

# | test-set
df_test = df_test.drop(list(dropped_columns), axis=1)

## Get column names of numerical and categorical features

In [5]:
cat_colnames = df.select_dtypes(include=['object']).columns.tolist()
num_colnames = df.select_dtypes(exclude=['object']).columns.tolist()
num_colnames.remove(label_name)

df[cat_colnames].head()

Unnamed: 0_level_0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
2,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
4,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,SBrkr,Gd,Typ,Detchd,Unf,TA,TA,Y,WD,Abnorml
5,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal


## Map nominal values of categorical features to numerical values

In [6]:
df[cat_colnames], cat_mapping_dict = ml_helpers.categorical_to_numerical(df[cat_colnames])
df_test[cat_colnames] = ml_helpers.categorical_to_numerical(df_test[cat_colnames], cat_mapping_dict)
# TODO: We save the label encoder as a separate object so that we can transform both 
# the training and later the test and validation datasets using the same encoding scheme.
df[cat_colnames].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[feature_name] = df[feature_name].astype('category')


Unnamed: 0_level_0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,1,3,3,0,4,0,5,2,2,...,4.0,2,6,1.0,1.0,4.0,4.0,2,8,4
2,3,1,3,3,0,2,0,24,1,2,...,4.0,3,6,1.0,1.0,4.0,4.0,2,8,4
3,3,1,0,3,0,4,0,5,2,2,...,4.0,2,6,1.0,1.0,4.0,4.0,2,8,4
4,3,1,0,3,0,0,0,6,2,2,...,4.0,2,6,5.0,2.0,4.0,4.0,2,8,0
5,3,1,0,3,0,2,0,15,2,2,...,4.0,2,6,1.0,1.0,4.0,4.0,2,8,4


## Impute NaN values

In [7]:
df = ml_helpers.impute_missing_values(df, impute_method)
df_test = ml_helpers.impute_missing_values(df_test, impute_method)

df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,3,8450,1,3,3,0,4,0,5,...,0,0,0,0,0,2,2008,8,4,12.247694
2,20,3,9600,1,3,3,0,2,0,24,...,0,0,0,0,0,5,2007,8,4,12.109011
3,60,3,11250,1,0,3,0,4,0,5,...,0,0,0,0,0,9,2008,8,4,12.317167
4,70,3,9550,1,0,3,0,0,0,6,...,272,0,0,0,0,2,2006,8,0,11.849398
5,60,3,14260,1,0,3,0,2,0,15,...,0,0,0,0,0,12,2008,8,4,12.429216


## Removing outliers

In [8]:
# | Plotting pairwise scatter plots
if PLOTS:
    for i in range(0, len(num_colnames), 5):
        end_index = i+5
        if end_index >= len(num_colnames):
            end_index = len(num_colnames) 
        sns.pairplot(data=df[num_colnames+[label_name]], x_vars=num_colnames[i:end_index], y_vars=['SalePrice'])

In [9]:
nr_orig = len(df)
df = df.drop(df[(df['GrLivArea'] > 4000) & (df['SalePrice'] < 300000)].index)
df = df.drop(df[df['1stFlrSF'] > 4000].index)
df = df.drop(df[df['TotalBsmtSF'] > 4000].index)
df = df.drop(df[df['LotArea'] > 100000].index)
df = df.drop(df[df['MiscVal'] > 5000].index)
df = df.drop(df[df['BsmtFinSF1'] > 4000].index)

print('{} ({}%) Outliers removed'.format(nr_orig-len(df), (nr_orig-len(df))*100/nr_orig))

10 (0.684931506849315%) Outliers removed


## Split into Features & Label

In [10]:
df_labels = df.pop(label_name)

## Standardization of the numerical columns

In [11]:
df[num_colnames] = (df[num_colnames]-df[num_colnames].mean())/df[num_colnames].std()
df_test[num_colnames] = (df_test[num_colnames]-df_test[num_colnames].mean())/df_test[num_colnames].std()

## One-hot encoding of categorical features

In [12]:
if ONE_HOT:
    for nr, col_name in enumerate(cat_colnames):
        # | TRAIN SET
        df_dummies = pd.get_dummies(df[col_name], prefix='{}category'.format(nr))
        df = pd.concat([df, df_dummies], axis=1)
        df = df.drop(col_name, axis=1)
        
        # | TEST SET
        df_dummies = pd.get_dummies(df_test[col_name], prefix='{}category'.format(nr))
        df_test = pd.concat([df_test, df_dummies], axis=1)
        df_test = df_test.drop(col_name, axis=1)
    
    # Get missing columns in the test set
    missing_cols = set( df.columns ) - set( df_test.columns )
    # Add a missing column in test set with default value equal to 0
    for col in missing_cols:
        df_test[col] = 0
    # Ensure the order of column in the test set is in the same order than in train set
    df_test = df_test[df.columns]
    df.head()

This code also ensures that column resulting from category in the test dataset but not present in the training dataset will be removed. The alternative would be to concatenate the train & test set features, and apply onehot to both of them together.

## Training and Validation

### Random Forest

In [13]:
# | CV
rf_model = RandomForestRegressor(n_estimators=100)
print(np.sqrt(-cross_val_score(rf_model, df, df_labels, cv=5, scoring="neg_mean_squared_error")).mean())

# | retraining on whole data
rf_model = RandomForestRegressor(n_estimators=100)
rf_model.fit(df, df_labels)

0.13905269533826548


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

### XGBOOST

In [14]:
from xgboost import XGBRegressor
# TODO: XGBoost does not take categorical features in input.

# | CV
xg_model = XGBRegressor(max_depth=5, n_estimators=400)
print(np.sqrt(-cross_val_score(xg_model, df.values, df_labels, cv=5, scoring="neg_mean_squared_error")).mean())

# | retraining on whole data
xg_model = XGBRegressor(max_depth=5, n_estimators=400)
xg_model.fit(df.values, df_labels)

0.12017174584384457


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=400,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [15]:
from sklearn.ensemble import GradientBoostingRegressor

# | CV
gbr_model = GradientBoostingRegressor(max_depth=4, n_estimators=150)
print(np.sqrt(-cross_val_score(gbr_model, df.values, df_labels, cv=5, scoring="neg_mean_squared_error")).mean())

# | retraining on whole data
gbr_model = GradientBoostingRegressor(max_depth=4, n_estimators=150)
gbr_model.fit(df.values, df_labels)

0.12096120636574961


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=150, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [16]:
from sklearn.linear_model import Lasso

# | CV
lasso_model = Lasso(alpha=0.00047)
print(np.sqrt(-cross_val_score(lasso_model, df.values, df_labels, cv=5, scoring="neg_mean_squared_error")).mean())

# | retraining on whole data
lasso_model = Lasso(alpha=0.00047)
lasso_model.fit(df.values, df_labels)

0.12102268459806112


Lasso(alpha=0.00047, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [17]:
from sklearn.linear_model import Ridge

# | CV
ridge_model = Ridge(alpha=13)
print(np.sqrt(-cross_val_score(ridge_model, df.values, df_labels, cv=5, scoring="neg_mean_squared_error")).mean())

# | retraining on whole data
ridge_model = Ridge(alpha=13)
ridge_model.fit(df.values, df_labels)

0.12114567098586923


Ridge(alpha=13, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

## Predictions on test set

In [18]:
model = xg_model

# | predictions (hybrid/ensemble)
# y_test_predicted = 0.2*rf_model.predict(df_test) + 0.4*xg_model.predict(df_test) + 0.4*gbr_model.predict(df_test)

x = df_test.values
# y_test_predicted = 0.3*gbr_model.predict(x) + 0.3*xg_model.predict(x) + 0.2*lasso_model.predict(x) + 0.2*ridge_model.predict(x)
y_test_predicted = 0.5*lasso_model.predict(x) + 0.5*ridge_model.predict(x)



if LOG_TRAFO:
    y_test_predicted = np.exp(y_test_predicted)

submission_df = pd.DataFrame({"Id": df_test.index, "SalePrice": y_test_predicted})
# submission_df.to_csv('submission7.csv', index=False, float_format='%.16g')
submission_df.to_csv('./submissions/submission16.csv', index=False)