In [1]:
import pandas as pd

TEST = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
TRAIN = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'

test_raw_df = pd.read_csv(filepath_or_buffer=TEST)
# Keep the ids for the submission file. Take an explicit copy: a prediction column
# is added to this frame later, and writing into a slice of test_raw_df would
# trigger pandas' SettingWithCopyWarning.
submission_df = test_raw_df[['Id']].copy()
# 'Id' is dropped from both frames so it is never used as a feature
test_raw_df = test_raw_df.drop(columns=['Id'])
train_raw_df = pd.read_csv(filepath_or_buffer=TRAIN).drop(columns=['Id'])
train_raw_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


Can we improve results by removing target value outliers from our training data?

In [2]:
from plotly import express

# distribution of the target, to look for outlying sale prices
sale_price_histogram = express.histogram(train_raw_df, x='SalePrice')
sale_price_histogram

Here's our data processing pipeline; let's do as much of it as we can before we start comparing models.

In [3]:
from sklearn.impute import KNNImputer


TARGET = 'SalePrice'
# training rows whose sale price is at or above this value are treated as outliers
OUTLIER_PRICE_CEILING = 500000

# numeric (int64/float64) predictors, target excluded
keys = [key for key, value in train_raw_df.drop(columns=[TARGET]).dtypes.to_dict().items() if str(value) in {'int64', 'float64'}]
# numeric predictors with no missing values in the test data
COLUMNS = [key for key in keys if test_raw_df[key].isna().sum() == 0]

# Our imputing is a work in progress: for now we KNN-impute only these three
# basement columns, alongside the numeric columns that are already complete.
knn_columns = COLUMNS + ['BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF']

# fit the imputer on the training data only, then apply it to both frames;
# the original indexes are kept so the column-wise concat below lines rows up
imputer = KNNImputer()
train_knn_df = pd.DataFrame(data=imputer.fit_transform(X=train_raw_df[knn_columns]), columns=knn_columns, index=train_raw_df.index)
test_knn_df = pd.DataFrame(data=imputer.transform(X=test_raw_df[knn_columns]), columns=knn_columns, index=test_raw_df.index)

# now get the dummies we want to bolt on: the remaining columns that are complete
# in both train and test, minus four columns we exclude by hand
other_columns = [
    column for column in test_raw_df.columns
    if column not in COLUMNS and column not in knn_columns
    and 0 == train_raw_df[column].isna().sum() and 0 == test_raw_df[column].isna().sum()
]
dummy_columns = [column for column in other_columns if column not in {'Condition2', 'Heating', 'HouseStyle', 'RoofMatl'}]

train_dummies_df = pd.get_dummies(data=train_raw_df[dummy_columns])
# train and test are encoded independently, so force the test dummies onto the
# training columns (same names, same order); a category missing from test becomes
# an all-zero column and a category unseen in training is dropped
test_dummies_df = pd.get_dummies(data=test_raw_df[dummy_columns]).reindex(columns=train_dummies_df.columns, fill_value=0)

train_df = pd.concat(axis='columns', objs=[train_knn_df, train_dummies_df])
train_df[TARGET] = train_raw_df[TARGET].tolist()

# before we go let's remove some outliers
with_outliers_shape = train_df.shape
train_df = train_df[train_df[TARGET] < OUTLIER_PRICE_CEILING]

test_df = pd.concat(axis='columns', objs=[test_knn_df, test_dummies_df])
# the models are fit on train_df and applied to test_df, so the features must match exactly
assert list(test_df.columns) == [column for column in train_df.columns if column != TARGET], 'train/test feature mismatch'
print(with_outliers_shape, train_df.shape, test_df.shape)

(1460, 123) (1451, 123) (1459, 122)


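The competition scores submissions by the RMSE between the logarithm of the predicted price and the logarithm of the observed price, so we need a matching metric.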

In [4]:
import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred) -> float:
    """Root mean squared error between log(1 + y_true) and log(1 + y_pred).

    The error is measured on log prices, so it reflects relative rather than
    absolute price differences.

    Computed directly with numpy rather than
    ``mean_squared_error(squared=False)``: the ``squared`` argument is
    deprecated in recent scikit-learn releases and later removed.

    Parameters:
        y_true: observed values (array-like of numbers).
        y_pred: predicted values, same shape as ``y_true``.

    Returns:
        The RMSE of the log-transformed values as a Python float.

    Raises:
        ValueError: if the inputs are empty, differ in shape, or produce a
            non-finite logarithm (e.g. a prediction <= -1). Callers rely on
            this to skip models whose predictions cannot be scored.
    """
    true_values = np.asarray(y_true, dtype=float)
    predicted_values = np.asarray(y_pred, dtype=float)
    if true_values.shape != predicted_values.shape:
        raise ValueError('y_true and y_pred differ in shape: {} vs {}'.format(true_values.shape, predicted_values.shape))
    if true_values.size == 0:
        raise ValueError('y_true and y_pred are empty')
    # log1p(x) == log(1 + x); silence numpy's warnings because bad values are
    # reported through the ValueError below instead
    with np.errstate(invalid='ignore', divide='ignore'):
        log_error = np.log1p(true_values) - np.log1p(predicted_values)
    if not np.all(np.isfinite(log_error)):
        raise ValueError('log(1 + y) is not finite for some y_true or y_pred')
    return float(np.sqrt(np.mean(np.square(log_error))))

Let's spend some time tuning our ridge regression parameters:
* alpha : l2 regularization strength

Remember that we have learned elsewhere that our error metric is sensitive to the train/test split, so to fairly compare different alpha values we need to take the mean RMSE across a range of random state values.

Let's also try tuning our random forest: the number of estimators and the split criterion.

In [5]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


CRITERION = 'absolute_error'

# start the registry with a single decision tree
MODELS = {
    'tree': DecisionTreeRegressor(
        criterion=CRITERION,
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=2024,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        ccp_alpha=0.0,
    ),
}

# ridge alpha study: alpha runs from 15.5 to 25.0 in steps of 0.5
for doubled_alpha in range(31, 51):
    alpha = doubled_alpha / 2.0
    ridge_name = 'ridge alpha {}'.format(alpha)
    MODELS[ridge_name] = Ridge(alpha=alpha, solver='lbfgs', positive=True, tol=1e-4, max_iter=10000, random_state=2024)

# random forest study: 10 to 120 estimators, each with both split criteria
FOREST_VARIANTS = (
    ('rf {} estimators/squared_error', 'squared_error'),
    ('rf {} estimators/absolute_error', 'absolute_error'),
)
for n_estimators in range(10, 121):
    for name_template, forest_criterion in FOREST_VARIANTS:
        name = name_template.format(n_estimators)
        MODELS[name] = RandomForestRegressor(random_state=2024, n_estimators=n_estimators, criterion=forest_criterion)
    
# the feature matrix and target are the same for every model and split: build them once
features_df = train_df.drop(columns=[TARGET])
target = train_df[TARGET]

# (mean RMSE, model name) for every model that could be scored on at least one split
mean_scores = []
for name, model in MODELS.items():
    scores = []
    failed_splits = []
    # average over several train/validation splits, because a single split is noisy
    for random_state in range(10):
        X_train, X_test, y_train, y_test = train_test_split(features_df, target, test_size=0.20, random_state=random_state)
        try:
            model.fit(X=X_train, y=y_train)
            scores.append(rmse(y_true=y_test, y_pred=model.predict(X=X_test)))
        except ValueError as error:
            # best effort: a split that cannot be fit or scored is skipped, but
            # recorded so that the skip is visible in the output
            failed_splits.append((random_state, error))
    if failed_splits:
        print('{} : skipped {} of 10 splits, last error: {}'.format(name, len(failed_splits), failed_splits[-1][1]))
    if not scores:
        # nothing to average; leave the model out of the ranking instead of dividing by zero
        continue
    mean_score = sum(scores)/len(scores)
    mean_scores.append((mean_score, name))
    print('{:7.6f} : {}'.format(mean_score, name))

0.208637 : tree
0.193577 : ridge alpha 15.5
0.195130 : ridge alpha 16.0
0.193754 : ridge alpha 16.5
0.193755 : ridge alpha 17.0
0.193329 : ridge alpha 17.5
0.192628 : ridge alpha 18.0
0.193705 : ridge alpha 18.5
0.193171 : ridge alpha 19.0
0.192063 : ridge alpha 19.5
0.194017 : ridge alpha 20.0
0.192632 : ridge alpha 20.5
0.193605 : ridge alpha 21.0
0.193779 : ridge alpha 21.5
0.193666 : ridge alpha 22.0
0.194019 : ridge alpha 22.5
0.193042 : ridge alpha 23.0
0.192960 : ridge alpha 23.5
0.192825 : ridge alpha 24.0
0.193263 : ridge alpha 24.5
0.194568 : ridge alpha 25.0
0.148999 : rf 10 estimators/squared_error
0.152737 : rf 10 estimators/absolute_error
0.147856 : rf 11 estimators/squared_error
0.150890 : rf 11 estimators/absolute_error
0.147565 : rf 12 estimators/squared_error
0.150067 : rf 12 estimators/absolute_error
0.147407 : rf 13 estimators/squared_error
0.149253 : rf 13 estimators/absolute_error
0.146981 : rf 14 estimators/squared_error
0.148838 : rf 14 estimators/absolute_error

In [6]:
SUBMISSION = '/kaggle/working/submission.csv'

# mean_scores holds (mean RMSE, model name) tuples, so the smallest tuple is the best model
best_model_name = min(mean_scores)[1]
print('best model: {}'.format(best_model_name))

# Refit the selected model on all of the training data. Look it up by
# best_model_name: the loop variable `name` left over from the evaluation cell
# only ever points at the last model that was registered.
best_model = MODELS[best_model_name].fit(X=train_df.drop(columns=[TARGET]), y=train_df[TARGET])
# assign() returns a new frame, which avoids writing into a slice of the test
# data and keeps this cell safe to re-run
submission_df = submission_df.assign(**{TARGET: best_model.predict(X=test_df)})
submission_df.to_csv(path_or_buf=SUBMISSION, index=False)
print('done.')

best model: rf 114 estimators/squared_error
done.


Let's post-process our mean scores and visualize.

In [7]:
from plotly import express
# keep only the ridge entries; their names look like 'ridge alpha <value>'
ridge_points = [(score, label) for score, label in mean_scores if 'alpha' in label]
alpha_rmse = [score for score, _ in ridge_points]
alpha_value = [float(label.split()[2]) for _, label in ridge_points]
express.scatter(x=alpha_value, y=alpha_rmse, title='alpha vs. rmse', trendline='lowess')

We have a couple of puzzles here:
* The relationship between the alpha regularization parameter and the RMSE is linear but noisy; if we didn't do better with the random forest it might make sense to tune the Ridge alpha further.
* We are measuring the RMSE from our test split as a proxy for the test data RMSE and the relationship between the two values is unknown.

In [8]:
from plotly import express
# keep only the random forest entries; their names look like 'rf <count> estimators/<criterion>'
forest_points = [(score, label) for score, label in mean_scores if 'estimators' in label]
estimators_rmse = [score for score, _ in forest_points]
estimator_count = [int(label.split()[1]) for _, label in forest_points]
estimator_criterion = [label.split('/')[-1] for _, label in forest_points]
# Colour by criterion: each estimator count was scored once per split criterion,
# and plotting both as one series would make the trendline average across them.
express.scatter(
    x=estimator_count,
    y=estimators_rmse,
    color=estimator_criterion,
    title='estimator count vs. rmse',
    trendline='lowess',
    labels={'x': 'estimator count', 'y': 'mean rmse', 'color': 'criterion'},
)

The estimator count vs RMSE curve similarly might lead us to false precision; the RMSE for the test split has a nice sharp minimum at 114 estimators but we actually get a better result for the test data for 100 estimators.