<a href="https://colab.research.google.com/github/marcorrea1/AAI2026/blob/main/Coding_Exercise_ML_Basics_Exer_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# =========================================================
# Housing Price Prediction
# Dataset: USA Housing Dataset (Kaggle)
# Path: Google Drive
# =========================================================

import os
import pandas as pd
from google.colab import drive
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# 0) The dataset was uploaded to Google Drive so it stays available between
# Colab sessions. Mount the drive only if it is not already mounted.
drive_already_mounted = os.path.exists("/content/drive/MyDrive")
if not drive_already_mounted:
    drive.mount("/content/drive")

# 1) Read the housing CSV into a DataFrame
csv_path = (
    "/content/drive/MyDrive/Coding Exercise - ML Basics/USA Housing Dataset.csv"
)
data = pd.read_csv(csv_path)

# 2) Pick the model inputs and the target.
# The two input columns are renamed (the columns, not the file) so the printed
# output uses labels similar to the assignment.
column_labels = {"sqft_living": "square_footage", "city": "location"}
X = data[list(column_labels)].rename(columns=column_labels)
y = data["price"].copy()

# 3) One-hot encode the categorical "location" column so it becomes numeric;
# any other column is passed through unchanged (remainder="passthrough").
location_encoder = OneHotEncoder(handle_unknown="ignore")
preprocess = ColumnTransformer(
    [("loc", location_encoder, ["location"])],
    remainder="passthrough",
)

# 4) Chain the preprocessing step with a linear regression
pipeline_steps = [
    ("preprocess", preprocess),
    ("regressor", LinearRegression()),
]
model = Pipeline(pipeline_steps)

# 5) Fit the whole pipeline on X and y
model.fit(X, y)

# 6) Predict a new house: 2,000 sq ft, located in the city of the first row
example_city = data["city"].iloc[0]
new_house = pd.DataFrame(
    [{"square_footage": 2000, "location": example_city}]
)

predictions = model.predict(new_house)
predicted_price = predictions[0]
print(f"Predicted price: ${predicted_price:,.2f}")

# 7) Organized coefficients
fitted_preprocess = model.named_steps["preprocess"]
regressor = model.named_steps["regressor"]

# Take the output column names from the fitted ColumnTransformer itself rather
# than rebuilding their order by hand, so every name lines up with the matching
# entry of coef_ even if the inputs or transformers change later.
# The names come back prefixed with the transformer name
# ("loc__location_<city>", "remainder__square_footage"); keep only the part
# after the first "__" (a name without a prefix is kept unchanged).
feature_names = [
    name.split("__", 1)[-1]
    for name in fitted_preprocess.get_feature_names_out()
]
coeffs = regressor.coef_
intercept = regressor.intercept_

coef_df = pd.DataFrame({"Feature": feature_names, "Coefficient": coeffs})

sqft_coef = coef_df.loc[coef_df["Feature"] == "square_footage", "Coefficient"].iloc[0]

# Slice off only the leading "location_" prefix; a plain string replace would
# also remove that text if it appeared inside a city name.
location_prefix = "location_"
loc_df = coef_df[coef_df["Feature"].str.startswith(location_prefix)].copy()
loc_df["City"] = loc_df["Feature"].str[len(location_prefix):]
loc_df = loc_df[["City", "Coefficient"]].sort_values("Coefficient", ascending=False)

# Shared number format for both city tables (thousands separator, 2 decimals)
coef_format = {"Coefficient": "{:,.2f}".format}

print("\nSquare Footage Effect:")
print(f"${sqft_coef:,.2f} per additional square foot")

print("\nTop 10 City Effects:")
print(loc_df.head(10).to_string(index=False, formatters=coef_format))

print("\nBottom 10 City Effects:")
print(loc_df.tail(10).to_string(index=False, formatters=coef_format))

print(f"\nIntercept: ${intercept:,.2f}")


Predicted price: $618,249.59

Square Footage Effect:
$241.70 per additional square foot

Top 10 City Effects:
              City  Coefficient
            Medina 1,064,072.86
        Clyde Hill   420,772.48
     Mercer Island   313,338.48
      Yarrow Point   303,116.06
          Bellevue   184,204.85
           Seattle   122,939.04
          Kirkland    86,847.49
           Redmond    68,766.43
Beaux Arts Village    57,245.08
            Vashon    24,503.83

Bottom 10 City Effects:
        City Coefficient
  Snoqualmie -128,783.83
      Duvall -136,637.82
  Des Moines -136,942.47
   Covington -148,399.53
      Renton -153,765.21
    Enumclaw -184,544.20
Maple Valley -187,414.22
      SeaTac -187,681.40
      Auburn -192,013.46
 Federal Way -231,881.14

Intercept: $11,903.72


# Predict Price for Homes in Washington


# Predict Price
The model was asked to estimate the price of a 2,000-square-foot house in Seattle. It predicts a price of $618,249.59, which is the model's expected (average) price for a house of that size in that city.

# Square Footage Effect
Each additional square foot of living space adds roughly $241.70 to the predicted price, for a house in the same city.

# Top 10 City Effects
These cities add the most to the price predicted by the model. For example, Medina has a coefficient of $1,064,073, meaning a house in this city is predicted to cost more than $1 million above the baseline for the same square footage.

# Bottom 10 City Effects
These cities have a negative impact on predicted housing prices compared to the baseline. For example, Federal Way has the most negative coefficient, -$231,881, meaning a house there is predicted to cost about $231,881 less than the baseline for the same square footage.

# Intercept
In this model the intercept is about $11,903.72. It is a constant term that the model adds to every prediction, on top of the square-footage and city effects.
