In [10]:
!pip install matplotlib==3.5.3



In [11]:
import json
import pathlib
import pickle
from typing import List
from typing import Tuple

import numpy as np
import pandas
from sklearn import model_selection
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [12]:
# Input CSV locations (relative paths).
SALES_PATH = "../data/kc_house_data.csv"  # home sale records
DEMOGRAPHICS_PATH = "../data/zipcode_demographics.csv"  # demographics table

# Subset of home sale columns that load_data() reads: 'price' becomes the
# target and 'zipcode' is the join key to the demographics table.
SALES_COLUMN_SELECTION = [
    'price',
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'sqft_above',
    'sqft_basement',
    'zipcode',
]

# Directory where output artifacts will be saved.
OUTPUT_DIR = "model"

In [13]:
def load_data(
    sales_path: str, demographics_path: str, sales_column_selection: List[str]
) -> Tuple[pandas.DataFrame, pandas.Series]:
    """Load the target and feature data by merging sales and demographics.

    Args:
        sales_path: path to CSV file with home sale data
        demographics_path: path to CSV file with demographics data; it must
            contain a 'zipcode' column, which is used as the join key
        sales_column_selection: list of columns to read from the sales data;
            it must include 'price' (the target) and 'zipcode' (the join key)

    Returns:
        Tuple with two elements: a DataFrame and a Series of the same
        length.  The DataFrame contains features for machine learning, the
        series contains the target variable (home sale price).

    """
    # Read zipcode as a string in both files so the join keys share a dtype.
    data = pandas.read_csv(sales_path,
                           usecols=sales_column_selection,
                           dtype={'zipcode': str})
    # Honour the caller-supplied path instead of a hard-coded location.
    demographics = pandas.read_csv(demographics_path,
                                   dtype={'zipcode': str})

    # Left join keeps every sale; a sale whose zipcode is missing from the
    # demographics table gets NaN in the demographic columns.  The join key
    # itself is dropped so it is not used as a feature.
    merged_data = data.merge(demographics, how="left",
                             on="zipcode").drop(columns="zipcode")
    # Remove the target variable from the dataframe, features will remain
    y = merged_data.pop('price')
    x = merged_data

    return x, y

In [14]:
"""Load data, train model, and export artifacts."""
# Build the feature frame and the price target from the two CSV inputs.
x, y = load_data(
    sales_path=SALES_PATH,
    demographics_path=DEMOGRAPHICS_PATH,
    sales_column_selection=SALES_COLUMN_SELECTION,
)
# Hold out a test split; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x,
    y,
    random_state=42,
)

In [15]:
# Baseline model: robust feature scaling followed by k-nearest-neighbours
# regression, trained on the training split only.
model = pipeline.make_pipeline(
    preprocessing.RobustScaler(),
    neighbors.KNeighborsRegressor(),
)
model.fit(x_train, y_train)

# Predictions on both splits, kept for inspection and evaluation.
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

In [16]:
# --- MODEL EVALUATION ---
# r2_score, mean_absolute_error, mean_squared_error and np come from the
# notebook's import cell, so this cell also works after
# Restart Kernel -> Run All (they were previously never imported).

# 1. Make predictions on the test data
y_pred = model.predict(x_test)

# 2. Calculate performance metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the mean squared error, which puts it back in
# the same units as the target (sale price).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# 3. Print the evaluation results
print("\n--- Model Performance ---")
print(f"R-squared (R²): {r2:.3f}")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"Root Mean Squared Error (RMSE): ${rmse:,.2f}")
print("-------------------------\n")


--- Model Performance ---
R-squared (R²): 0.728
Mean Absolute Error (MAE): $102,044.70
Root Mean Squared Error (RMSE): $201,659.43
-------------------------



In [None]:
# RandomForestRegressor implementation

In [18]:
# Load data, train a RandomForest model, and evaluate it on the test split.
# RandomForestRegressor, r2_score and mean_absolute_error are imported in the
# notebook's import cell rather than mid-notebook.
# NOTE: this cell rebinds `model`, `y_pred`, `r2` and `mae`, replacing the
# KNN values produced by the cells above.
x, y = load_data(SALES_PATH, DEMOGRAPHICS_PATH, SALES_COLUMN_SELECTION)
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, random_state=42)

# --- SWAP THE MODEL ---
# Replace KNeighborsRegressor with RandomForestRegressor
# n_estimators is how many "trees" it builds. More is often better but slower.
# random_state fixes the forest's randomness; n_jobs=-1 uses all CPU cores.
# The scaler is retained so the pipeline has the same steps as the KNN one.
model = pipeline.make_pipeline(
    preprocessing.RobustScaler(),
    RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
).fit(x_train, y_train)
# ----------------------

# --- EVALUATION ---
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("\n--- RandomForest Model Performance ---")
print(f"Test R-squared (R²): {r2:.3f}")
print(f"Test Mean Absolute Error (MAE): ${mae:,.2f}")
print("-------------------------------------\n")


--- RandomForest Model Performance ---
Test R-squared (R²): 0.782
Test Mean Absolute Error (MAE): $93,758.10
-------------------------------------

