In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

# Load the house-prices training CSV directly from the project repository.
df = pd.read_csv(
    "https://raw.githubusercontent.com/matthewgraca/4661-house-prices-regression/refs/heads/main/data/train.csv"
)

# Regression target.
y = df['SalePrice']

# The four numeric predictors used by this baseline model.
X = df[[
    'GrLivArea',
    'TotalBsmtSF',
    'YearBuilt',
    'LotArea',
]]

# ---- Linear Regression ----
print("---- Linear Regression ----")

# Standardize the features, then fit ordinary least squares. Keeping both
# steps in one pipeline means the scaler is re-fit inside every CV fold.
linear_model = make_pipeline(StandardScaler(), LinearRegression())

# 10-fold cross-validation. The scorer reports *negative* RMSE (higher is
# better), so the sign is flipped after averaging over the folds.
linear_rmse_scores = cross_val_score(
    linear_model,
    X,
    y,
    cv=10,
    scoring='neg_root_mean_squared_error',
)
linear_rmse = -np.mean(linear_rmse_scores)

# RMSE relative to the mean sale price (printed below as a percentage).
linear_nrmse = linear_rmse / y.mean()

print(f"Linear Regression RMSE: {linear_rmse}")
print(f"Normalized RMSE: {linear_nrmse:.2%}")

# Refit on the full data set so the learned coefficients can be inspected.
# Because the inputs are standardized, absolute coefficient size is used as
# a rough importance ranking.
linear_model.fit(X, y)
fitted_regressor = linear_model.named_steps['linearregression']
linear_coefficients = fitted_regressor.coef_

linear_feature_importance = (
    pd.DataFrame({
        'Feature': X.columns,
        'Coefficient Magnitude': np.absolute(linear_coefficients),
    })
    .sort_values(by='Coefficient Magnitude', ascending=False)
)

print("\nFeature Importance based on Coefficient Magnitude (Linear Regression):")
print(linear_feature_importance)


---- Linear Regression ----
Linear Regression RMSE: 42777.562423587035
Normalized RMSE: 23.64%

Feature Importance based on Coefficient Magnitude (Linear Regression):
       Feature  Coefficient Magnitude
0    GrLivArea           41435.552830
2    YearBuilt           25994.942500
1  TotalBsmtSF           18433.148333
3      LotArea            4873.010266


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error


# Load the house-prices training CSV directly from the project repository.
df = pd.read_csv("https://raw.githubusercontent.com/matthewgraca/4661-house-prices-regression/refs/heads/main/data/train.csv")

# Regression target.
y = df['SalePrice']

# The four numeric predictors that get expanded into polynomial terms.
X = df[['GrLivArea', 'TotalBsmtSF', 'YearBuilt', 'LotArea']]

# ---- Polynomial Regression ----
print("\n---- Polynomial Regression ----")

# Highest total degree of the generated polynomial / interaction terms.
degree = 2

# Expand -> standardize -> ridge regression.
# include_bias=False: the default constant "1" column has zero variance, so
# StandardScaler would turn it into an all-zero column, and Ridge already
# fits its own intercept. Dropping it removes a dead feature (and its
# always-zero row in the importance table) without changing predictions.
poly_model = make_pipeline(
    PolynomialFeatures(degree, include_bias=False),
    StandardScaler(),
    Ridge(alpha=1.0),
)

# 10-fold cross-validation. The scorer reports *negative* RMSE (higher is
# better), so the sign is flipped after averaging over the folds.
poly_rmse_scores = cross_val_score(poly_model, X, y, cv=10, scoring='neg_root_mean_squared_error')
poly_rmse = -poly_rmse_scores.mean()  # Convert to positive RMSE
poly_nrmse = poly_rmse / y.mean()  # RMSE relative to the mean sale price

print(f"Polynomial Regression (degree {degree}) RMSE: {poly_rmse}")
print(f"Normalized RMSE: {poly_nrmse:.2%}")

# Refit on the full data set to read off the coefficients and the names of
# the generated polynomial terms they belong to.
poly_model.fit(X, y)
poly_coefficients = poly_model.named_steps['ridge'].coef_
poly_feature_names = poly_model.named_steps['polynomialfeatures'].get_feature_names_out(X.columns)

# Rank the expanded terms by absolute coefficient size (features are
# standardized before the ridge step, so magnitudes share a common scale).
poly_feature_importance = pd.DataFrame({
    'Feature': poly_feature_names,
    'Coefficient Magnitude': np.abs(poly_coefficients)
}).sort_values(by='Coefficient Magnitude', ascending=False)

print("\nFeature Importance based on Coefficient Magnitude (Polynomial Regression):")
print(poly_feature_importance.head(10))



---- Polynomial Regression ----
Polynomial Regression (degree 2) RMSE: 44530.093998325254
Normalized RMSE: 24.61%

Feature Importance based on Coefficient Magnitude (Polynomial Regression):
                  Feature  Coefficient Magnitude
10  TotalBsmtSF YearBuilt          123883.293969
2             TotalBsmtSF           85980.729779
7     GrLivArea YearBuilt           85087.386106
1               GrLivArea           66813.024254
12            YearBuilt^2           52957.977166
13      YearBuilt LotArea           48436.184343
11    TotalBsmtSF LotArea           42175.180680
5             GrLivArea^2           36563.471756
3               YearBuilt           35911.154148
8       GrLivArea LotArea           28770.744816
