In [58]:
import kagglehub
import os 
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error


# Download latest version
path = kagglehub.dataset_download("shree1992/housedata")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\USER PC\.cache\kagglehub\datasets\shree1992\housedata\versions\2


In [59]:
file_path = os.path.join(path, 'data.csv')
df = pd.read_csv(file_path)
df

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,3.130000e+05,3.0,1.50,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2.384000e+06,5.0,2.50,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,3.420000e+05,3.0,2.00,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,4.200000e+05,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,5.500000e+05,4.0,2.50,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,2014-07-09 00:00:00,3.081667e+05,3.0,1.75,1510,6360,1.0,0,0,4,1510,0,1954,1979,501 N 143rd St,Seattle,WA 98133,USA
4596,2014-07-09 00:00:00,5.343333e+05,3.0,2.50,1460,7573,2.0,0,0,3,1460,0,1983,2009,14855 SE 10th Pl,Bellevue,WA 98007,USA
4597,2014-07-09 00:00:00,4.169042e+05,3.0,2.50,3010,7014,2.0,0,0,3,3010,0,2009,0,759 Ilwaco Pl NE,Renton,WA 98059,USA
4598,2014-07-10 00:00:00,2.034000e+05,4.0,2.00,2090,6630,1.0,0,0,3,1070,1020,1974,0,5148 S Creston St,Seattle,WA 98178,USA


In [60]:

# ---------------------------------------------------------
# Convert price to thousands
# ---------------------------------------------------------
df["price"] = df["price"] / 1000

# =========================================================
# 2. REMOVE OUTLIERS (IQR METHOD)
# =========================================================
Q1 = df["price"].quantile(0.25)
Q3 = df["price"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df["price"] >= lower_bound) & (df["price"] <= upper_bound)]

# =========================================================
# 3. FEATURE SELECTION
# =========================================================
# Remove zero or negative prices before log-transform
df = df[df["price"] > 0]

# Log-transform price
y = np.log(df["price"])

# Drop target & unnecessary columns
X = df.drop(columns=["price", "date", "street"])

# Numeric features (drop redundant ones)
numeric_features = [
    "bedrooms","bathrooms","sqft_living","sqft_lot","floors",
    "waterfront","view","condition",
    "yr_built","yr_renovated"
]

categorical_features = ["city", "statezip", "country"]

# =========================================================
# 4. PREPROCESSING
# =========================================================
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# =========================================================
# 5. PIPELINE
# =========================================================
model = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("reg", LinearRegression())
    ]
)

# =========================================================
# 6. TRAIN / TEST SPLIT
# =========================================================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42
)

# =========================================================
# 7. TRAIN
# =========================================================
model.fit(X_train, y_train)

# =========================================================
# 8. EVALUATE
# =========================================================
# Exponentiate predictions to get actual price
y_pred = np.exp(model.predict(X_test))
y_test_actual = np.exp(y_test)

r2 = r2_score(y_test_actual, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))

print("\nMODEL PERFORMANCE")
print("--------------------")
print("R² Score:", round(r2, 4))
print("RMSE:", round(rmse, 2), "($ thousands)")

# =========================================================
# 9. COEFFICIENTS
# =========================================================
num_features = numeric_features

cat_features = model.named_steps["prep"] \
    .named_transformers_["cat"] \
    .get_feature_names_out(categorical_features)

feature_names = list(num_features) + list(cat_features)

coefficients = model.named_steps["reg"].coef_

coef_table = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": coefficients
})

coef_table["Abs_Effect"] = coef_table["Coefficient"].abs()
coef_table = coef_table.sort_values("Abs_Effect", ascending=False)

print("\nTOP COEFFICIENTS")
print("-----------------")
print(coef_table[["Feature", "Coefficient"]].head(20))

# =========================================================
# 10. SAMPLE PREDICTIONS
# =========================================================
comparison = pd.DataFrame({
    "Actual_Price_(Thousands)": y_test_actual.values,
    "Predicted_Price_(Thousands)": y_pred
})

print("\nSAMPLE PREDICTIONS")
print("--------------------")
print(comparison.head(10))



MODEL PERFORMANCE
--------------------
R² Score: 0.7574
RMSE: 106.65 ($ thousands)

TOP COEFFICIENTS
-----------------
                     Feature  Coefficient
52         city_Yarrow Point    -0.769939
125        statezip_WA 98198    -0.601598
104        statezip_WA 98109     0.555084
121        statezip_WA 98168    -0.508871
123        statezip_WA 98178    -0.473081
105        statezip_WA 98112     0.462038
110        statezip_WA 98119     0.425212
10               city_Algona    -0.408566
124        statezip_WA 98188    -0.404086
100        statezip_WA 98105     0.403164
12   city_Beaux Arts Village     0.386422
19            city_Covington    -0.376269
98         statezip_WA 98102     0.372064
18           city_Clyde Hill     0.358258
56         statezip_WA 98004     0.353483
13             city_Bellevue     0.325669
118        statezip_WA 98148    -0.323805
35        city_Normandy Park     0.319478
102        statezip_WA 98107     0.300129
111        statezip_WA 98122     0.29622

In [61]:
from sklearn.linear_model import Lasso, Ridge

In [63]:
# -------------------------------------
# Ridge Regression (with preprocessing)
# -------------------------------------
ridge = Pipeline([
    ("prep", preprocessor),
    ("reg", Ridge(alpha=1.0))
])

ridge.fit(X_train, y_train)

ridge_pred = ridge.predict(X_test)
ridge_pred_actual = np.exp(ridge_pred)
y_test_actual = np.exp(y_test)

ridge_mse = mean_squared_error(y_test_actual, ridge_pred_actual)

print("\nRidge Coefficients:", ridge.named_steps["reg"].coef_)
print("Ridge Intercept:", ridge.named_steps["reg"].intercept_)
print("Ridge MSE:", ridge_mse)


# -------------------------------------
# Lasso Regression (with preprocessing)
# -------------------------------------
lasso = Pipeline([
    ("prep", preprocessor),
    ("reg", Lasso(alpha=0.01))
])

lasso.fit(X_train, y_train)

lasso_pred = lasso.predict(X_test)
lasso_pred_actual = np.exp(lasso_pred)

lasso_mse = mean_squared_error(y_test_actual, lasso_pred_actual)

print("\nLasso Coefficients:", lasso.named_steps["reg"].coef_)
print("Lasso Intercept:", lasso.named_steps["reg"].intercept_)
print("Lasso MSE:", lasso_mse)



Ridge Coefficients: [-9.21083555e-03  3.44895174e-02  2.24270819e-01  1.35339195e-02
  1.05216215e-02  1.69489481e-02  3.45362243e-02  3.99865953e-02
  1.31234361e-02  1.46699980e-02 -3.28731511e-01 -2.44141201e-01
  1.87099223e-01  3.33056007e-01  3.50226713e-02  4.04459599e-02
  2.22310322e-02 -2.00881801e-02  2.93814398e-01 -3.48668759e-01
  5.45436175e-02 -5.06398679e-02 -1.81246160e-01  5.30715840e-02
 -2.01551340e-01  6.85018007e-02  1.36346163e-01  2.46218900e-02
 -2.07224965e-01  1.65398808e-01  1.42706164e-01 -1.16028269e-01
  2.28199801e-01 -5.61131430e-02  1.17863731e-01  2.28966010e-01
 -3.88274560e-02 -1.95478510e-01  5.07372823e-02 -2.06815770e-02
  2.04297474e-01 -1.86079235e-01  1.77170444e-01  3.09243214e-03
  1.66074276e-01  4.32729841e-02 -2.05681086e-01  2.19800229e-02
  6.90948965e-02 -6.21183188e-02 -1.27329980e-02  1.11525068e-01
 -5.03101163e-01 -1.81221419e-01 -2.21577148e-01 -1.38320389e-01
  3.44312671e-01  7.82007657e-02 -2.10499082e-02  1.94948377e-02
 -1.