<a href="https://colab.research.google.com/github/marcorrea1/AAI2026/blob/main/Coding_Exercise_ML_Basics_Exer_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Data source: the dataset was generated with ChatGPT SJSU, using the prompt
# provided in the grading rubric.
# Prompt used (Prompt 1):
#   Housing price and square footage dataset with more than 100 records.
#   Download the CSV or Excel file, then delete other columns except 2: price and footage.
# NOTE: this notebook also reads a "location" column, so the CSV has to keep
# three columns: square_footage, location and price.

# Path of the CSV file (relative to the notebook's working directory) and the
# columns the model needs from it.
DATA_PATH = "housing_150_records.csv"
FEATURE_COLUMNS = ["square_footage", "location"]
TARGET_COLUMN = "price"

data = pd.read_csv(DATA_PATH)

# 1) Confirm that the uploaded file is the one being used
print("Dataset shape:", data.shape)

# Fail early, with a message that names the file and the columns actually found,
# instead of the bare KeyError pandas would raise on the selection below.
missing_columns = [
    column for column in FEATURE_COLUMNS + [TARGET_COLUMN]
    if column not in data.columns
]
if missing_columns:
    raise KeyError(
        f"{DATA_PATH} is missing required column(s) {missing_columns}; "
        f"found columns {list(data.columns)}"
    )

# 2) Separate the inputs (X, the features) from the target (y)
# X holds the square footage of the house and its location
# y is the price that we want the model to predict
X = data[FEATURE_COLUMNS]
y = data[TARGET_COLUMN]

# 3) Preprocess the data
# "location" holds category labels (e.g. Downtown, Rural), so it is one-hot
# encoded into 0/1 indicator columns. handle_unknown="ignore" means a location
# that was not seen during fit is encoded as all zeros rather than raising.
# "square_footage" is already numeric and is passed through unchanged.
location_encoder = OneHotEncoder(handle_unknown="ignore")
preprocess = ColumnTransformer(
    [("loc", location_encoder, ["location"])],
    remainder="passthrough",  # keeps square_footage as-is
)

# 4) Create a pipeline
# Step "preprocess": encode / pass through the input columns
# Step "regressor":  fit a linear regression on the transformed columns
pipeline_steps = [
    ("preprocess", preprocess),
    ("regressor", LinearRegression()),
]
model = Pipeline(pipeline_steps)

# 5) Train the model on the housing data
# Fitting runs the preprocessing step and then the linear regression, so the
# model learns how square footage and location relate to the house price.
model.fit(X, y)

# 6) Predict the price of a new house
# Example input: a single 2000 sq ft house located Downtown, written as one
# record with the same column names as X.
new_house = pd.DataFrame([{"square_footage": 2000, "location": "Downtown"}])
(predicted_price,) = model.predict(new_house)  # exactly one row in, one price out
print(f"Predicted price: ${predicted_price:,.2f}")

# 7) Print the model
# Each coefficient is the change in predicted price for a one-unit change in
# that feature with the other features held fixed. For example, the
# square_footage coefficient is the price increase per additional square foot.
#
# The feature names are read from the fitted ColumnTransformer so that they
# follow the same column order the regressor was trained on. Building the list
# by hand would silently mislabel coefficients if a column were added to, or
# reordered in, X. The transformer prefixes every name with "<transformer>__"
# (e.g. "loc__location_Downtown"); the prefix is stripped for display.
fitted_preprocess = model.named_steps["preprocess"]
feature_names = [
    name.split("__", 1)[-1]
    for name in fitted_preprocess.get_feature_names_out()
]
coeffs = model.named_steps["regressor"].coef_

# zip() stops at the shorter sequence, so check the lengths explicitly rather
# than risk dropping a coefficient from the report.
if len(feature_names) != len(coeffs):
    raise ValueError(
        f"Got {len(feature_names)} feature names for {len(coeffs)} coefficients"
    )

print("\nCoefficients:")
for name, coef in zip(feature_names, coeffs):
    print(f"{name}: {coef:,.2f}")

# The intercept is the constant term of the fitted line.
# NOTE: every location gets its own 0/1 column, so "all features equal to zero"
# is not a house that can occur in the training data. Read each location
# coefficient as an offset added to the intercept for that location, not as a
# difference from a baseline location.
print(f"\nIntercept: {model.named_steps['regressor'].intercept_:,.2f}")



Dataset shape: (150, 3)
Predicted price: $589,626.37

Coefficients:
location_Downtown: 80,800.96
location_Rural: -69,162.08
location_Suburb: -11,638.88
square_footage: 163.98

Intercept: 180,858.03
