# Housing Price Prediction – Modeling
This notebook builds regression models using the preprocessed housing dataset to predict log-transformed housing prices. 
It follows the modeling portion of the project, focusing on applying multiple algorithms and tuning them.

In [2]:
# Imports and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
# Load Preprocessed Data
df = pd.read_csv('processed_housing.csv')
df.head()

Unnamed: 0,bedrooms,bathrooms,floors,waterfront,view,condition,grade,zipcode,sale_year,sale_month,was_renovated,effective_year,effective_age,price_log,sqft_living_log,sqft_lot_log,sqft_living15_log,sqft_lot15_log
0,2,1.0,1.0,0,0,3,7,98178,2014,10,0,1955,59,12.351475,7.074117,8.639588,7.201171,8.639588
1,3,2.25,2.0,0,0,3,7,98125,2014,12,1,1991,23,13.195616,7.85205,8.887791,7.433075,8.941153
2,2,1.0,1.0,0,0,3,6,98028,2015,2,0,1933,82,12.100718,6.647688,9.21044,7.908755,8.995041
3,4,3.0,1.0,0,0,5,7,98136,2014,12,0,1965,49,13.311331,7.58121,8.517393,7.215975,8.517393
4,3,2.0,1.0,0,0,3,8,98074,2015,2,0,1987,28,13.142168,7.427144,8.997271,7.496097,8.923191


In [4]:
# Define Target and Features
X = df.drop('price_log', axis=1)
y = df['price_log']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
# Preprocessing Setup
numeric_features = ['bedrooms', 'bathrooms', 'floors', 'waterfront', 'view',
                    'condition', 'grade', 'sale_year', 'sale_month',
                    'was_renovated', 'effective_year', 'effective_age',
                    'sqft_living_log', 'sqft_lot_log', 'sqft_living15_log',
                    'sqft_lot15_log']
categorical_features = ['zipcode']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [6]:
# Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor

gb_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42))
])

gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred_gb)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_gb))
r2 = r2_score(y_test, y_pred_gb)

print("Gradient Boosting Results:")
print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")

Gradient Boosting Results:
MAE: 0.180
RMSE: 0.234
R²: 0.808


In [None]:
# Linear Regression Part- Sangit

from sklearn.linear_model import LinearRegression

lr_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)

mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)

print("Linear Regression Results:")
print(f"MAE: {mae_lr:.3f}")
print(f"RMSE: {rmse_lr:.3f}")
print(f"R²: {r2_lr:.3f}")

Linear Regression Results:
MAE: 0.133
RMSE: 0.181
R²: 0.886


In [8]:
# TODO Rose: Implement Decision Tree model pipeline