In [1]:
import pandas as pd

# Path of the competition's training CSV under /kaggle/input.
TRAIN = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'

# Read the training rows and use the 'Id' column as the row index, so it is
# not treated as a feature later on.
df = pd.read_csv(TRAIN, index_col=['Id'])

# Show the first rows as the cell output.
df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
# Path of the competition's test CSV under /kaggle/input.
TEST = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'

# Unlike the training frame, 'Id' stays a regular column here; it is read
# back as test_df['Id'] when the submission files are built.
test_df = pd.read_csv(TEST)

In [3]:
# Name of the column the models are trained to predict.
TARGET = 'SalePrice'

# Candidate features: every int64/float64 column of the training frame except
# the target, in the frame's original column order.
keys = df.drop(columns=[TARGET]).select_dtypes(include=['int64', 'float64']).columns.tolist()

# Keep only candidates that are complete in BOTH frames. The estimators used
# below reject NaN, so a gap in the training data would break .fit() just as
# a gap in the test data would break .predict().
numerical_columns = [
    key for key in keys
    if df[key].notna().all() and test_df[key].notna().all()
]

In [4]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[numerical_columns], df[TARGET], test_size=0.20, random_state=2024
)

# Ordinary least squares baseline with unconstrained coefficient signs.
linreg = LinearRegression(positive=False).fit(X=X_train, y=y_train)

# Score as the RMSE between log prices. The square root is taken explicitly
# because mean_squared_error's `squared=False` flag was deprecated in
# scikit-learn 1.4 and removed in 1.6; np.sqrt of the MSE is the same value.
linreg_rmse = np.sqrt(
    mean_squared_error(y_true=np.log(y_test), y_pred=np.log(linreg.predict(X=X_test)))
)
print('rmse: {:5.4f}'.format(linreg_rmse))

rmse: 0.1671


In [5]:
from plotly import express
# One bar per feature showing its fitted coefficient. A bar chart draws each
# value directly, whereas a histogram aggregates y per x bin and labels the
# axis as a sum.
express.bar(
    x=numerical_columns,
    y=linreg.coef_,
    labels={'x': 'feature', 'y': 'coefficient'},
    title='LinearRegression coefficients (train split)',
)

In [6]:
# Output path of the submission file.
# NOTE(review): the ridge submission cell further down writes to this same
# path, so whichever of the two cells runs last determines the file contents.
SUBMISSION = '/kaggle/working/submission.csv'

# Refit the linear model on every training row (no hold-out) before
# predicting on the test frame.
linreg_full = LinearRegression(positive=False)
linreg_full.fit(X=df[numerical_columns], y=df[TARGET])

# Pair each test 'Id' with its predicted price and write the two columns
# without the DataFrame index.
linreg_result_df = pd.DataFrame(
    data={
        'Id': test_df['Id'],
        'SalePrice': linreg_full.predict(X=test_df[numerical_columns]),
    }
)
linreg_result_df.to_csv(SUBMISSION, index=False)

In [7]:
from plotly import express
# One bar per feature showing the coefficient of the model refitted on all
# training rows. A bar chart draws each value directly, whereas a histogram
# aggregates y per x bin and labels the axis as a sum.
express.bar(
    x=numerical_columns,
    y=linreg_full.coef_,
    labels={'x': 'feature', 'y': 'coefficient'},
    title='LinearRegression coefficients (full training data)',
)

In [8]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Ridge regression on the training split with coefficients constrained to be
# non-negative (positive=True), fitted with the 'lbfgs' solver.
ridge = Ridge(
    tol=1e-4, random_state=2024, max_iter=10000, positive=True, solver='lbfgs'
).fit(X=X_train, y=y_train)

# Score as the RMSE between log prices. The square root is taken explicitly
# because mean_squared_error's `squared=False` flag was deprecated in
# scikit-learn 1.4 and removed in 1.6; np.sqrt of the MSE is the same value.
ridge_rmse = np.sqrt(
    mean_squared_error(y_true=np.log(y_test), y_pred=np.log(ridge.predict(X=X_test)))
)
print('rmse: {:5.4f}'.format(ridge_rmse))

rmse: 0.1836


In [9]:
from plotly import express
# One bar per feature showing its ridge coefficient. A bar chart draws each
# value directly, whereas a histogram aggregates y per x bin and labels the
# axis as a sum.
express.bar(
    x=ridge.feature_names_in_,
    y=ridge.coef_,
    labels={'x': 'feature', 'y': 'coefficient'},
    title='Ridge coefficients (train split)',
)

In [10]:
# Output path of the submission file.
# NOTE(review): the linear-regression submission cell above writes to this
# same path, so running this cell replaces that file.
SUBMISSION = '/kaggle/working/submission.csv'

# Refit the non-negative ridge model on every training row (no hold-out)
# before predicting on the test frame.
ridge_full = Ridge(
    tol=1e-4, random_state=2024, max_iter=10000, positive=True, solver='lbfgs'
)
ridge_full.fit(X=df[numerical_columns], y=df[TARGET])

# Pair each test 'Id' with its predicted price and write the two columns
# without the DataFrame index.
ridge_result_df = pd.DataFrame(
    data={
        'Id': test_df['Id'],
        'SalePrice': ridge_full.predict(X=test_df[numerical_columns]),
    }
)
ridge_result_df.to_csv(SUBMISSION, index=False)

In [11]:
from plotly import express

# Compare the two full-data models on the test frame: each point is one test
# row, x is the linear-regression prediction and y is the ridge prediction.
linreg_test_predictions = linreg_full.predict(X=test_df[numerical_columns])
ridge_test_predictions = ridge_full.predict(X=test_df[numerical_columns])
express.scatter(x=linreg_test_predictions, y=ridge_test_predictions)

In [12]:
from plotly import express
from sklearn.linear_model import Lasso

# Lasso regression on the training split, with the regularisation strength
# left at the library default and coefficient signs unconstrained.
lasso = Lasso(tol=1e-4, random_state=2024, max_iter=10000, positive=False).fit(X=X_train, y=y_train)

# Score as the RMSE between log prices. The square root is taken explicitly
# because mean_squared_error's `squared=False` flag was deprecated in
# scikit-learn 1.4 and removed in 1.6; np.sqrt of the MSE is the same value.
lasso_rmse = np.sqrt(
    mean_squared_error(y_true=np.log(y_test), y_pred=np.log(lasso.predict(X=X_test)))
)
print('rmse: {:5.4f}'.format(lasso_rmse))

# One bar per feature showing its lasso coefficient. A bar chart draws each
# value directly, whereas a histogram aggregates y per x bin.
express.bar(
    x=lasso.feature_names_in_,
    y=lasso.coef_,
    labels={'x': 'feature', 'y': 'coefficient'},
    title='Lasso coefficients (train split)',
).show()

# Hold-out predictions of the plain linear model (x) against the lasso model
# (y), both on log axes.
express.scatter(x=linreg.predict(X=X_test), y=lasso.predict(X=X_test), log_x=True, log_y=True).show()

rmse: 0.1671


In [13]:
from plotly import express

# Per-row difference between the linear-regression and lasso predictions on
# the hold-out split, plotted in row order.
prediction_gap = linreg.predict(X=X_test) - lasso.predict(X=X_test)
gap_figure = express.line(y=prediction_gap)
gap_figure.show()

An RMSE of about 0.1671 on our held-out split (20% of the training data) corresponds to an RMSE of about 0.2201 on the competition test data.