In [1]:
import pandas as pd

# Path of the training CSV inside the Kaggle input directory.
TRAIN = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'

# Read the training rows, then discard the 'Id' column before previewing the frame.
df = pd.read_csv(TRAIN)
df = df.drop(columns='Id')
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
# Path of the test CSV inside the Kaggle input directory.
TEST = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'

# 'Id' is kept on the test frame: the submission cells read test_df['Id'].
test_df = pd.read_csv(TEST)

In [3]:
# Column the models are trained to predict.
TARGET = 'SalePrice'

# A feature is used only if it has fewer than this many missing values in the test set.
MAX_TEST_MISSING = 30

# Candidate features: every non-target training column stored as int64 or float64.
feature_dtypes = df.drop(columns=[TARGET]).dtypes
keys = [column for column, dtype in feature_dtypes.items() if str(dtype) in ('int64', 'float64')]

# Per-column count of missing test values, computed once and looked up per candidate.
test_missing_counts = test_df[keys].isna().sum()
numerical_columns = [column for column in keys if test_missing_counts[column] < MAX_TEST_MISSING]

In [4]:
# Combined (train + test) missing-value count for each int64/float64 feature
# that has at least one missing value in the test set.
{
    column: df[column].isna().sum() + test_df[column].isna().sum()
    for column in keys
    if test_df[column].isna().sum() > 0
}


{'LotFrontage': 486,
 'MasVnrArea': 23,
 'BsmtFinSF1': 1,
 'BsmtFinSF2': 1,
 'BsmtUnfSF': 1,
 'TotalBsmtSF': 1,
 'BsmtFullBath': 2,
 'BsmtHalfBath': 2,
 'GarageYrBlt': 159,
 'GarageCars': 1,
 'GarageArea': 1}

Let's use a KNN imputer to fill in a few values so we can use more columns to build our model.

In [5]:
from sklearn.impute import KNNImputer

# The imputer is fitted on the selected columns of the train and test rows stacked together.
combined_numeric = pd.concat([df, test_df])[numerical_columns]
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
imputer.fit(combined_numeric)

# transform() returns a plain array, so wrap each result back into a frame with the original column names.
imputed_df = pd.DataFrame(imputer.transform(df[numerical_columns]), columns=numerical_columns)
test_imputed_df = pd.DataFrame(imputer.transform(test_df[numerical_columns]), columns=numerical_columns)

In [6]:
from plotly import express

# Distribution of the target. The axis is labelled with the column name and the
# figure gets a title so the plot can be read on its own (the default label is "x").
express.histogram(x=df[TARGET].tolist(), labels={'x': TARGET}, title='Distribution of {}'.format(TARGET))

In [7]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 20% of the imputed training rows; the fixed random_state keeps the split
# identical for every model fitted on X_train / evaluated on X_test below.
X_train, X_test, y_train, y_test = train_test_split(imputed_df[numerical_columns], df[TARGET], test_size=0.20, random_state=2024)
linreg = LinearRegression(positive=False).fit(X=X_train, y=y_train)

# RMSE between the logs of the true and predicted prices. The root is taken explicitly:
# mean_squared_error's `squared` argument is deprecated since scikit-learn 1.4 and
# scheduled for removal. np.log requires the predictions to be strictly positive.
linreg_log_rmse = np.sqrt(mean_squared_error(y_true=np.log(y_test), y_pred=np.log(linreg.predict(X=X_test))))
print('rmse: {:5.4f}'.format(linreg_log_rmse))

rmse: 0.1659


In [8]:
from plotly import express
# One bar per feature. A bar chart plots each coefficient directly, whereas
# histogram(x=..., y=...) aggregates and labels the axis "sum of y".
express.bar(x=numerical_columns, y=linreg.coef_, labels={'x': 'feature', 'y': 'coefficient'},
            title='LinearRegression coefficients (fit on the training split)')

In [9]:
# File the submission is written to.
SUBMISSION = '/kaggle/working/submission.csv'

# A separate estimator is fitted on every imputed training row, leaving `linreg` untouched.
linreg_full = LinearRegression(positive=False)
linreg_full.fit(X=imputed_df[numerical_columns], y=df[TARGET])

# Pair each test Id with its predicted price and write the two columns without the index.
linreg_result_df = pd.DataFrame(data={
    'Id': test_df['Id'],
    'SalePrice': linreg_full.predict(X=test_imputed_df[numerical_columns]),
})
linreg_result_df.to_csv(SUBMISSION, index=False)

In [10]:
from plotly import express
# One bar per feature. A bar chart plots each coefficient directly, whereas
# histogram(x=..., y=...) aggregates and labels the axis "sum of y".
express.bar(x=numerical_columns, y=linreg_full.coef_, labels={'x': 'feature', 'y': 'coefficient'},
            title='LinearRegression coefficients (fit on all training rows)')

In [11]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Ridge regression constrained to non-negative coefficients (positive=True),
# fitted on the same training split as the other models.
ridge = Ridge(tol=1e-4, random_state=2024, max_iter=10000, positive=True, solver='lbfgs').fit(X=X_train, y=y_train)

# RMSE between the logs of the true and predicted prices. The root is taken explicitly:
# mean_squared_error's `squared` argument is deprecated since scikit-learn 1.4 and
# scheduled for removal. np.log requires the predictions to be strictly positive.
ridge_log_rmse = np.sqrt(mean_squared_error(y_true=np.log(y_test), y_pred=np.log(ridge.predict(X=X_test))))
print('rmse: {:5.4f}'.format(ridge_log_rmse))

rmse: 0.1752


In [12]:
from plotly import express
# One bar per feature. A bar chart plots each coefficient directly, whereas
# histogram(x=..., y=...) aggregates and labels the axis "sum of y".
express.bar(x=ridge.feature_names_in_, y=ridge.coef_, labels={'x': 'feature', 'y': 'coefficient'},
            title='Ridge (positive=True) coefficients')

Our linear regressor result and our ridge result look substantially different.

In [13]:
from plotly import express

# True hold-out prices on the x axis against the ridge predictions for the same rows on the y axis.
ridge_holdout_pred = ridge.predict(X=X_test)
express.scatter(x=y_test, y=ridge_holdout_pred)

In [14]:
from plotly import express
from sklearn.linear_model import Lasso

# Lasso regression fitted on the same training split as the other models.
lasso = Lasso(tol=1e-4, random_state=2024, max_iter=10000, positive=False, ).fit(X=X_train, y=y_train)

# RMSE between the logs of the true and predicted prices. The root is taken explicitly:
# mean_squared_error's `squared` argument is deprecated since scikit-learn 1.4 and
# scheduled for removal. np.log requires the predictions to be strictly positive.
lasso_log_rmse = np.sqrt(mean_squared_error(y_true=np.log(y_test), y_pred=np.log(lasso.predict(X=X_test))))
print('rmse: {:5.4f}'.format(lasso_log_rmse))

# One bar per feature; a bar chart plots each coefficient directly instead of a "sum of y" histogram.
express.bar(x=lasso.feature_names_in_, y=lasso.coef_, labels={'x': 'feature', 'y': 'coefficient'},
            title='Lasso coefficients').show()
# Linear-regression predictions against lasso predictions for the hold-out rows, both axes logarithmic.
express.scatter(x=linreg.predict(X=X_test), y=lasso.predict(X=X_test), log_x=True, log_y=True).show()

rmse: 0.1658


What does our delta between the linear regression result and the lasso result look like? When we plot them against each other they look essentially identical.

In [15]:
from plotly import express

# Sorted per-row difference between the linear-regression and lasso predictions on the
# hold-out rows. The previous expression subtracted the lasso predictions from y_test,
# which shows the lasso residuals rather than the delta between the two models.
linreg_lasso_delta = linreg.predict(X=X_test) - lasso.predict(X=X_test)
express.line(y=sorted(linreg_lasso_delta)).show()

Our best model so far is Ridge with positive=True.
An RMSE of about 0.1836 on our hold-out split corresponds to an RMSE of about 0.2201 on the competition test data. If a new model does not improve on the hold-out RMSE, we shouldn't make a new submission.

In [16]:
from plotly import express
from sklearn.neural_network import MLPRegressor

# Index 3 selects 'relu'; the list keeps the alternative activations one edit away.
activation = ['identity', 'logistic', 'tanh', 'relu'][3]
# Three hidden layers (400, 200, 100) trained with adam on the same training split as the other models.
neural = MLPRegressor(hidden_layer_sizes=(400, 200, 100,), activation=activation, solver='adam', alpha=1e-3, batch_size='auto', 
                      learning_rate='adaptive', learning_rate_init=1e-2, power_t=0.5, max_iter=1000, shuffle=True, random_state=2024,
                      tol=1e-5, verbose=False, warm_start=False, momentum=0.8, nesterovs_momentum=True, early_stopping=False, beta_1=0.9, 
                      beta_2=0.999, epsilon=1e-08, n_iter_no_change=20, max_fun=15000).fit(X=X_train, y=y_train)
print('done after {} iterations.'.format(neural.n_iter_))

# RMSE between the logs of the true and predicted prices. The root is taken explicitly:
# mean_squared_error's `squared` argument is deprecated since scikit-learn 1.4 and
# scheduled for removal. np.log requires the predictions to be strictly positive.
neural_log_rmse = np.sqrt(mean_squared_error(y_true=np.log(y_test), y_pred=np.log(neural.predict(X=X_test))))
print('rmse: {:5.4f}'.format(neural_log_rmse))

# Linear-regression predictions against network predictions for the hold-out rows, linear axes.
express.scatter(x=linreg.predict(X=X_test), y=neural.predict(X=X_test), log_x=False, log_y=False).show()

done after 154 iterations.
rmse: 0.1829


In [17]:
from plotly import express

# Sorted per-row difference between the linear-regression and network predictions on the hold-out rows.
linreg_neural_delta = linreg.predict(X=X_test) - neural.predict(X=X_test)
express.line(y=sorted(linreg_neural_delta)).show()

In [18]:
from sklearn.base import clone

# File the submission is written to.
SUBMISSION = '/kaggle/working/submission.csv'

# Fit an unfitted copy (same hyper-parameters, same random_state) on every training row.
# Refitting `neural` itself would silently replace the split-trained model that the
# evaluation cells rely on, so re-running them afterwards would show different results.
# imputed_df / test_imputed_df already hold imputer.transform() of the train and test
# frames, so they are reused instead of being recomputed here.
neural_full = clone(neural).fit(X=imputed_df[numerical_columns], y=df[TARGET])
neural_result_df = pd.DataFrame(data={
    'Id': test_df['Id'],
    'SalePrice': neural_full.predict(X=test_imputed_df[numerical_columns]),
})
neural_result_df.to_csv(path_or_buf=SUBMISSION, index=False)

A test RMSE of 0.1980 corresponds to a score of 0.2203 for this neural network regressor.

In [19]:
# ridge.fit(X=df[numerical_columns], y=df[TARGET])
# pd.DataFrame(data={'Id': test_df['Id'], 'SalePrice': ridge.predict(X=test_df[numerical_columns])}).to_csv(path_or_buf=SUBMISSION, index=False)

In [20]:
from sklearn.tree import DecisionTreeRegressor

# Index 2 selects 'absolute_error'; the list keeps the alternative criteria one edit away.
CRITERION = ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'][2]
# Decision tree with no depth or leaf limits, fitted on the same training split as the other models.
tree = DecisionTreeRegressor(criterion=CRITERION, splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1,
                             min_weight_fraction_leaf=0.0, max_features=None, random_state=2024, max_leaf_nodes=None,
                             min_impurity_decrease=0.0, ccp_alpha=0.0, ).fit(X=X_train, y=y_train)
y_tree_pred = tree.predict(X=X_test)

# RMSE between the logs of the true and predicted prices. The root is taken explicitly:
# mean_squared_error's `squared` argument is deprecated since scikit-learn 1.4 and
# scheduled for removal.
tree_log_rmse = np.sqrt(mean_squared_error(y_true=np.log(y_test), y_pred=np.log(y_tree_pred)))
print('rmse: {:5.4f}'.format(tree_log_rmse))

# True hold-out prices against tree predictions, then the sorted residuals.
express.scatter(x=y_test, y=y_tree_pred, log_x=False, log_y=False).show()
express.scatter(y=sorted(y_test - y_tree_pred)).show()

rmse: 0.1948


An RMSE of 0.2179 on the test split corresponds to a score of about 0.2148, and an RMSE of 0.2076 on the test split corresponds to a score of about 0.2079.

In [21]:
from sklearn.base import clone

# Fit an unfitted copy (same hyper-parameters, same random_state) on every training row.
# Refitting `tree` itself would silently replace the split-trained model evaluated
# earlier. imputed_df / test_imputed_df already hold imputer.transform() of the train
# and test frames, so they are reused instead of being recomputed here.
tree_full = clone(tree).fit(X=imputed_df[numerical_columns], y=df[TARGET])

# A separate name keeps the hold-out predictions in `y_tree_pred` intact.
y_tree_submission = tree_full.predict(X=test_imputed_df[numerical_columns])
pd.DataFrame(data={'Id': test_df['Id'], 'SalePrice': y_tree_submission}).to_csv(path_or_buf=SUBMISSION, index=False)

# One bar per feature; a bar chart plots each importance directly instead of a "sum of y" histogram.
express.bar(x=numerical_columns, y=tree_full.feature_importances_, labels={'x': 'feature', 'y': 'importance'},
            title='DecisionTreeRegressor feature importances (fit on all training rows)').show()