In [None]:
# Works with the file system
from pathlib import Path

# Loads, cleans and analyze data
import pandas as pd

# Do math operations with numbers and arrays
import numpy as np

# Create and customize graphs
import matplotlib.pyplot as plt

# Splits dataset into training and testing datasets
from sklearn.model_selection import train_test_split

# Build linear regression model
from sklearn.linear_model import LinearRegression

# Build random forest regressor model
from sklearn.ensemble import RandomForestRegressor

# Measures how accurate the model's (both linear regression and rfr) predictions are
from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_absolute_error, r2_score

# Pandas display settings used for every table printed below:
#   - display.max_columns / display.max_rows = None -> never truncate columns or rows
#   - display.width = 120 -> wrap a printed row after about 120 characters
display_options = {
    "display.max_columns": None,
    "display.max_rows": None,
    "display.width": 120,
}

for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Raw dataset csv file name
FILE_NAME = "train.csv"

# Path to that file, relative to the current working directory.
# Built with pathlib (Path is imported at the top of the file) rather than
# string formatting; pd.read_csv accepts path objects directly.
csv_path = Path("..") / "data" / "raw" / FILE_NAME

# Load the file into a DataFrame
df_raw = pd.read_csv(csv_path)

# Print the shape of the dataset
print("Shape (rows, cols):", df_raw.shape)

# Show the first 10 rows of the dataset
df_raw.head(10)

# Check the data type and non-null count of each column
df_raw.info()

# Histogram of SalePrice: shows how house prices are spread,
# i.e. whether most houses are cheap, expensive or somewhere in between.

plt.figure(figsize=(6, 4))

# Series.plot draws on the current figure and hands back the Axes it used,
# so the labels can be set directly on that Axes object.
price_axes = df_raw["SalePrice"].plot(kind="hist", bins=40, edgecolor="black")
price_axes.set_title("Distribution of House Prices")
price_axes.set_xlabel("SalePrice")
price_axes.set_ylabel("Frequency")

plt.show()

# Based on the graph above,
# most of the bars are tall between 100k-200k.
# meaning many houses cost around there.

# The bars get shorter as prices go up. Fewer expensive houses.

# True wherever a cell is missing (NaN)
nan_mask = df_raw.isna()

# Missing-cell count per column, columns with the most gaps first
missing_counts = nan_mask.sum().sort_values(ascending=False)

# Show every column together with its number of missing values
print("Columns with missing values:\n")
print(missing_counts)

# Some columns have NaN.
# This does not always mean the recorder forgot to record the value in the dataset.
# It can also mean the value for that column does not exist.

# For example:
# A house has no pool, so Pool Quality is blank.
# This is not a mistake. That house just doesn't have that feature.
# So, NaN means "no pool". Later, NaN will be replaced with "NotAvailable".

# Another reason why some columns have NaN
# is that the recorder actually forgot to record the value.
# Maybe a house does have a garage, but the cell is empty
# because the recorder forgot to record it.

# Replace "doesn't exist" (NaN) with "NotAvailable"

# Categorical columns where NaN stands for "the house does not have this feature"
notAvailable_columns = [
    "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "MasVnrType",
]

# Map every listed column to the same placeholder and fill them in one call;
# columns that are not keys of the mapping are left untouched.
placeholder_by_column = dict.fromkeys(notAvailable_columns, "NotAvailable")
df_raw = df_raw.fillna(value=placeholder_by_column)

# Report how many missing values (NaN) are left in those columns
remaining_missing = df_raw[notAvailable_columns].isna().sum()
print("\nShow remaining missing values in these columns")
print("----------------------------------------------\n")
print(remaining_missing.sort_values(ascending=False))

# Show the first 10 rows of the dataset
df_raw.head(10)

# Fill the gaps in these 3 numeric columns with each column's own median

# Numeric columns that have missing values
empty_numeric_columns = ["LotFrontage", "MasVnrArea", "GarageYrBlt"]

# Median of every listed column (median() ignores NaN when computing)
median_by_column = {
    column_name: df_raw[column_name].median()
    for column_name in empty_numeric_columns
}

# Replace NaN with the column's median value; a column without NaN is
# unaffected, so no explicit "has missing values" check is needed.
df_raw = df_raw.fillna(value=median_by_column)


# Show the first 10 rows of the dataset
df_raw.head(10)

# Fill the missing 'Electrical' cell with the column's most common value
# (Note: only one cell is missing in the Electrical column)

electrical = df_raw["Electrical"]

# mode() skips NaN by default and returns the most frequent value(s); take the first
most_common_elec = electrical.mode().iloc[0]

# Replace NaN with that most common value
df_raw["Electrical"] = electrical.fillna(most_common_elec)

# Count rows that are exact duplicates of an earlier row
duplicates = df_raw.duplicated().sum()

# Display the duplicate count
print(f"Number of duplicate rows: {duplicates}")

# Summary statistics of the NUMERIC columns, one row per column,
# used to spot unusual (very large/small) values, i.e. outliers
df_raw.describe().transpose()

# Notes:
# 25% (1st quartile) - meaning 25% of houses are cheaper than this value
# 75% (3rd quartile) - meaning 75% of houses are cheaper than this value

# For example, SalePrice column:
# 25% = 129,975  
# 50% = 163,000  
# 75% = 214,000

# That means:
# 25% of houses cost less than 129,975
# 50% (the median) cost less than 163,000
# 75% cost less than 214,000

# how we decide what’s “too far”

# We use a simple rule based on the interquartile range (IQR).
# Q1 (25%) = price where 25% of houses are cheaper
# Q3 (75%) = price where 75% of houses are cheaper
# IQR = Q3 – Q1 (this is the “normal middle range”)

# Then,
# any value below Q1 − 1.5 × IQR
# or above Q3 + 1.5 × IQR
# is called an outlier

# That’s just a rule of thumb, not a law — it helps us find numbers that are far outside the normal spread.

# The upper bound is a cutoff value:
# any house price strictly above this value
# is considered an outlier.

# For example:
# Upper bound for SalePrice = 340037.5
# Any sale price above this upper bound
# is considered an outlier.
# Examples (not from dataset, random numbers):
# At row #400, the SalePrice is $200,000 → ❌ not an outlier (normal range)
# At row #56, the SalePrice is $500,000 → ✅ outlier (too high)
# At row #156, the SalePrice is $700,000 → ✅ outlier (way too high)

# The same logic applies to the lower bound.
# Any value strictly below the lower bound
# is considered an outlier.

def _iqr_outlier_summary(column_name, values):
    """Summarise the 1.5 x IQR outliers of one numeric column.

    column_name: label stored under the "column" key of the result.
    values: the column's values as a pandas Series.

    Returns a dict with the rounded lower/upper cutoffs, the number of
    values strictly outside them, and that number as a percentage of
    len(values), rounded to 2 decimals.
    """
    q1 = values.quantile(0.25)  # 25th percentile
    q3 = values.quantile(0.75)  # 75th percentile
    iqr = q3 - q1               # spread of the middle half of the data

    lower_bound = q1 - 1.5 * iqr  # anything below is unusually low
    upper_bound = q3 + 1.5 * iqr  # anything above is unusually high

    # Strict comparisons: a value exactly on a bound is not flagged
    is_outlier = values.lt(lower_bound) | values.gt(upper_bound)
    outlier_count = int(is_outlier.sum())

    return {
        "column": column_name,
        "lower_bound": round(lower_bound, 2),
        "upper_bound": round(upper_bound, 2),
        "outlier_count": outlier_count,
        "outlier_percentage": round(100 * outlier_count / len(values), 2),
    }


# Names of all numeric columns in the dataset
numeric_columns = df_raw.select_dtypes(include="number").columns.tolist()

# One summary dictionary per numeric column
summaries = [
    _iqr_outlier_summary(column_name, df_raw[column_name])
    for column_name in numeric_columns
]

# Turn the summaries into a table, put the columns with the most outliers
# first, and renumber the rows 0, 1, 2, ... after sorting
summary_table = pd.DataFrame(summaries)
outlier_summary = summary_table.sort_values("outlier_count", ascending=False).reset_index(drop=True)

print("Outlier summary (IQR rule) for all numeric columns:\n")
# NOTE(review): `display` is not imported in this file; presumably it is
# supplied by the Jupyter/IPython session this was exported from.
display(outlier_summary)

df_raw.head()

# Save the cleaned dataset into the processed folder.
# One path object is used for both the write and the read so the two can
# never point at different files (Path is imported at the top of the file).
cleaned_csv_path = Path("../data/processed/train_cleaned.csv")

# Create the processed folder if it is missing; without it to_csv fails
# with an OSError on a fresh checkout.
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)

df_raw.to_csv(cleaned_csv_path, index=False)

# Load the cleaned dataset back and store it in df_cleaned
df_cleaned = pd.read_csv(cleaned_csv_path)

df_cleaned.head()

# =================================== STEP 2: EDA (Starting) ===========================
# **************************************************************************************

# Checking correlation of all numeric columns with SalePrice
# In other words,
# check how strongly each one moves together with SalePrice.
# It tells you how two numbers move together:
# -- +1 (strong positive): when one goes up, the other also goes up (almost perfectly).
# --  0 (no relation): they don’t move together in any consistent way.
# -- −1 (strong negative): when one goes up, the other goes down (almost perfectly).

# Numeric part of the cleaned dataset.
# Note: from here on `numeric_columns` is a DataFrame, not a list of names.
numeric_columns = df_cleaned.select_dtypes(include="number")

# Pairwise correlation between all numeric columns
correlation_matrix = numeric_columns.corr()

# Keep only the correlations against SalePrice (the target variable),
# ordered from the largest value down to the smallest
correlation_with_target = correlation_matrix["SalePrice"].sort_values(ascending=False)

# Print the correlation values
print("Correlation of numeric columns with SalePrice:\n")
print(correlation_with_target)

# These are the numeric columns that greatly affect SalePrice column.
# We will use these numeric columns. The rest will be ignored.

# OverallQual      0.790982
# GrLivArea        0.708624
# GarageCars       0.640409
# GarageArea       0.623431
# TotalBsmtSF      0.613581
# 1stFlrSF         0.605852
# FullBath         0.560664
# TotRmsAbvGrd     0.533723
# YearBuilt        0.522897
# YearRemodAdd     0.507101

# Numeric columns chosen for modelling because they are strongly related to SalePrice (target)
top_numericals = [
    "OverallQual", "GrLivArea", "GarageCars", "GarageArea", "TotalBsmtSF",
    "1stFlrSF", "FullBath", "TotRmsAbvGrd", "YearBuilt", "YearRemodAdd",
]

# Tall figure holding a 5 rows x 2 columns grid: one panel per chosen column
grid_fig, grid_axes = plt.subplots(5, 2, figsize=(12, 20))

# Walk the panels row by row and plot each chosen column against SalePrice
for panel, numerical in zip(grid_axes.flat, top_numericals):
    panel.scatter(df_cleaned[numerical], df_cleaned["SalePrice"], alpha=0.5)
    panel.set_title(f"{numerical} vs SalePrice")
    panel.set_xlabel(numerical)
    panel.set_ylabel("SalePrice")

# Reduce overlaps between subplots so labels fit nicely
grid_fig.tight_layout()

# Display all subplots
plt.show()

# Now, we’re just looking at categorical columns 
# (text columns like Neighborhood, HouseStyle, GarageType, etc.) 
# to see if different categories affect house price differently.

# Columns stored with the "object" dtype are treated as the categorical columns
categorical_frame = df_cleaned.select_dtypes(include=["object"])

# Names of all categorical columns
category_columns = categorical_frame.columns

# Display how many categorical columns the dataset has
print("Number of categorical columns:", len(category_columns))
print()

# Display all categorical column names
print(category_columns.tolist())

# Check how each categorical column affects SalePrice
# To do this, calculate the average SalePrice for each category value of a column

# Meaning for example
# In MSZoning column, it has values such as FV, RL, RH, RM and C
# Take the SalePrice of every FV, and average them.
# Do the same for RL, RH, RM and C
# Repeat the steps above for all other category columns.

# For every categorical column, print the mean SalePrice of each category value
for curr_column in category_columns:
    # SalePrice values grouped by the category values of this column
    prices_by_category = df_cleaned.groupby(curr_column)["SalePrice"]

    # Mean per category, highest first, rounded to 2 decimal places
    average_price = prices_by_category.mean().sort_values(ascending=False).round(2)

    # Number of distinct category values in this column
    total_unique_value = df_cleaned[curr_column].nunique()

    # Header line: column name and its number of unique categories
    print(f"\nColumn: {curr_column}  (unique categories: {total_unique_value})")

    # Body: the average SalePrice of every category value
    print(average_price)

# Based on the result above

# Some columns (like Neighborhood, ExterQual, KitchenQual) 
# show huge differences between categories → they have a strong effect. 

# Others (like Utilities or PavedDrive) show small differences → weak or almost no effect.

# For example:
# Column: Neighborhood
# NoRidge    335295.32
# NridgHt    316270.62
# StoneBr    310499.00
# ...
# MeadowV     98576.47

# The lowest average price is about 98k, and the highest is 335k.
# That’s more than a 3× difference.
# That means houses in some neighborhoods sell for much higher prices than others.
# So, Neighborhood strongly affects SalePrice.

# Another column:
# Column: Utilities
# AllPub    180950.96
# NoSeWa    137500.00

# Only 2 categories, and the price difference is about 43 k.
# That’s not a huge jump compared to the average house price (≈ 180 k),
# and also, almost every house has “AllPub” utilities. very few have “NoSeWa”.
# So, Utilities barely affects price. It’s almost the same everywhere.

# Now we need to select which categorical columns greatly affect the target variable.
# This is done programmatically, instead of manually going through each of the 43 columns and picking the ones that are needed.

# Score how strongly each categorical column affects SalePrice.
# The score is the gap between the highest and the lowest per-category
# average price of that column; only those two extremes are compared.

# Maps column name -> price gap
influence_scores = {}

for curr_column in category_columns:
    # Average SalePrice of every unique category value in this column
    average_price = df_cleaned.groupby(curr_column)["SalePrice"].mean()

    # A column with a single category has no gap to measure
    if len(average_price) <= 1:
        continue

    influence_scores[curr_column] = average_price.max() - average_price.min()

# Turn the scores into a table, largest price gap first, rows renumbered from 0
score_rows = list(influence_scores.items())
influence_df = pd.DataFrame(score_rows, columns=["Column", "PriceDifference"])
influence_df = influence_df.sort_values(by="PriceDifference", ascending=False)
influence_df = influence_df.reset_index(drop=True)

# Round to 2 decimals for readability
influence_df["PriceDifference"] = influence_df["PriceDifference"].round(2)

# Show the table of categorical columns and their price gaps
print(influence_df)

# The table is already sorted, so the first 10 rows are the top 10
top_influence = influence_df.head(10)

print("Top 10 most influential categorical columns:")
print(top_influence)

# One bar chart per column named in top_influence["Column"]
for col in top_influence["Column"]:

    # Average SalePrice of each unique category value, highest first
    average = df_cleaned.groupby(col)["SalePrice"].mean().sort_values(ascending=False)

    # A fresh figure per column; the new Axes also becomes the current one
    bar_fig, bar_ax = plt.subplots(figsize=(10, 4))
    bar_ax.bar(average.index, average.values)
    bar_ax.set_title(f"Average SalePrice by {col}")
    bar_ax.set_ylabel("Mean Sale Price ($)")
    bar_ax.set_xlabel(col)

    # Tilt the category labels so long names do not overlap
    plt.xticks(rotation=45, ha="right")
    bar_fig.tight_layout()
    plt.show()

# The 10 categorical columns kept for modelling because they greatly influence the sale price
top_category = [
    "PoolQC", "ExterQual", "RoofMatl", "Neighborhood", "Condition2",
    "KitchenQual", "Exterior2nd", "BsmtQual", "FireplaceQu", "Exterior1st",
]

# ========================================= STEP 3 - FEATURE ENGINEERING (Start) =========================
# ***********************************************************************************************

# These are the columns the models will work on:

# Numerical columns:
# top_numericals = [
#     "OverallQual", 
#     "GrLivArea", 
#     "GarageCars", 
#     "GarageArea",
#     "TotalBsmtSF", 
#     "1stFlrSF", 
#     "FullBath", 
#     "TotRmsAbvGrd",
#     "YearBuilt", 
#     "YearRemodAdd"
# ]

# Categorical columns:
# top_category = [
#     "PoolQC", 
#     "ExterQual", 
#     "RoofMatl", 
#     "Neighborhood", 
#     "Condition2", 
#     "KitchenQual", 
#     "Exterior2nd", 
#     "BsmtQual", 
#     "FireplaceQu", 
#     "Exterior1st"
# ]

# Create a copy data frame
# but this copy will only have the selected 20 columns + 1 (target)

# Combine the top numeric, top categorical, and target columns into one list
selected_columns = top_numericals + top_category + ["SalePrice"]

# Take only the selected columns and make that subset an independent copy.
# Copying AFTER selecting (instead of copying the whole cleaned frame and
# then slicing it) means later column assignments on df_selected neither
# touch df_cleaned nor raise pandas' SettingWithCopyWarning, and the
# unselected columns are never duplicated in memory.
df_selected = df_cleaned[selected_columns].copy()

print(f"Shape: {df_selected.shape}")

# Display the first 5 rows of the dataset
df_selected.head()

# Save df_selected as a csv file in the processed folder
df_selected.to_csv('../data/processed/train_selected_cols.csv', index=False)

# Encode categorical columns
# because ml models can't work with text data. only numbers.
# so, we turn (encode) each categorical value from text to number

# Before we encode, 
# we need to identify which categorical column
# are ordinal or nominal.

# Ordinal
# Categories that have a clear order or ranking (example: Excellent -> Good -> Average -> Fair -> Poor)
# From the top_category variable,
# There are 5 ordinal categories:
# PoolQC, ExterQual, KitchenQual, BsmtQual, FireplaceQu
# They describe quality levels

# Nominal
# Categories with no particular order or ranking
# From the top_category variable,
# There are 5 nominal categories:
# Neighborhood, RoofMatl, Condition2, Exterior1st, Exterior2nd

# top_category = [
#     "PoolQC",        ---> Ordinal
#     "ExterQual",     ---> Ordinal
#     "RoofMatl",      ---> Nominal
#     "Neighborhood",  ---> Nominal
#     "Condition2",    ---> Nominal
#     "KitchenQual",   ---> Ordinal
#     "Exterior2nd",   ---> Nominal
#     "BsmtQual",      ---> Ordinal
#     "FireplaceQu",   ---> Ordinal
#     "Exterior1st"    ---> Nominal
# ]

# Ordinal category

# Define the order (worst -> best) for each ordinal categorical column
ordinal_orders = {
    "PoolQC":      ["NotAvailable", "Fa", "TA", "Gd", "Ex"],
    "ExterQual":   ["Po", "Fa", "TA", "Gd", "Ex"],
    "KitchenQual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "BsmtQual":    ["NotAvailable", "Fa", "TA", "Gd", "Ex"],
    "FireplaceQu": ["NotAvailable", "Po", "Fa", "TA", "Gd", "Ex"],
}

# Convert each ordered category column into numeric codes (0 = lowest, larger = better)
for col, order in ordinal_orders.items():
    # pd.Categorical encodes NaN and any label missing from `order` as -1,
    # which would silently rank such rows below the worst category.
    # Stop with a clear message instead. This also catches re-running this
    # step on columns that are already encoded, since numeric codes are not
    # valid labels.
    unexpected_mask = ~df_selected[col].isin(order)
    if unexpected_mask.any():
        unexpected_values = df_selected.loc[unexpected_mask, col].unique().tolist()
        raise ValueError(
            f"Column {col!r} contains values with no defined rank: "
            f"{unexpected_values}. Expected only {order}."
        )

    df_selected[col] = pd.Categorical(df_selected[col], categories=order, ordered=True).codes

# Show which codes actually occur in each encoded column
# print(df_selected[list(ordinal_orders.keys())].head())
for col in ordinal_orders:
    print(f"{col} unique codes:", sorted(df_selected[col].unique()))

# Notes about cell above:
    # "PoolQC":      
    # ["NotAvailable", "Fa", "TA", "Gd", "Ex"],
    # ["NotAvailable", "Fair", "Average", "Good", "Excellent"] 

    # "ExterQual":
    # ["Po", "Fa", "TA", "Gd", "Ex"],
    # ["Poor", "Fair", "Average", "Good", "Excellent"]

    # "KitchenQual": 
    # ["Po", "Fa", "TA", "Gd", "Ex"],
    # ["Poor", "Fair", "Average", "Good", "Excellent"]

    # "BsmtQual":    
    # ["NotAvailable", "Fa", "TA", "Gd", "Ex"],
    # ["NotAvailable", "Fair", "Average", "Good", "Excellent"] 
    
    # "FireplaceQu": 
    # ["NotAvailable", "Po", "Fa", "TA", "Gd", "Ex"],
    # ["NotAvailable", "Poor", "Fair", "Average", "Good", "Excellent"]

# Important note:
# If you look at the dataset, or the result below,
# you'll find out, for example, PoolQC does not have TA

df_selected.head()

# Categorical columns whose values have no natural order
nominal_cols = ["RoofMatl", "Neighborhood", "Condition2", "Exterior2nd", "Exterior1st"]

# Remember the shape before encoding so both shapes can be reported together
shape_before_encoding = df_selected.shape

# One-hot encode the nominal columns into df_encoded.
# drop_first=True omits the first indicator column of each encoded feature.
df_encoded = pd.get_dummies(df_selected, columns=nominal_cols, drop_first=True)

print("Before encoding nominal categorical columns:", shape_before_encoding)
print("After encoding nominal categorical columns:", df_encoded.shape)
df_encoded.head()

# ========================================= STEP 4 - ML MODELING (Start) =========================
# ***********************************************************************************************

# Name of the column the models will learn to predict
target = "SalePrice"

# y (dependent variable): the target column only
y = df_encoded[target]

# X (independent variables): every column except the target
X = df_encoded.drop(columns=[target])

for part in (X, y):
    print(part.shape)

# Split features (X) and target (y)
# into training and testing sets (80% train, 20% test)

# X_train --> features used to teach the model
# y_train --> the real prices for those same training rows

# X_test --> features of rows the model has not seen during training
# y_test --> the real prices for the test rows, used only to score predictions

# random_state=42 fixes the shuffle so the split is the same on every run
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Training set:", X_train.shape)
print("Testing set:", X_test.shape)

# LINEAR REGRESSION

# Create the model and train it on the training rows (fit returns the fitted model)
linear_model = LinearRegression().fit(X_train, y_train)

# y_pred_linear_regression holds the prices predicted by the model;
# y_test holds the actual house prices for the same test rows.
# The trained model only sees the test inputs (X_test) when guessing.
y_pred_linear_regression = linear_model.predict(X_test)

# Error metrics: how far the guesses are from the actual prices
mae = mean_absolute_error(y_test, y_pred_linear_regression)
rmse = root_mean_squared_error(y_test, y_pred_linear_regression)
r2 = r2_score(y_test, y_pred_linear_regression)

# Print the results
print(f"Linear Regression — RMSE: ${rmse:,.0f}")
print(f"Linear Regression —  MAE: ${mae:,.0f}")
print(f"Linear Regression —   R²:  {r2:.3f}")

# What the results mean

# MAE - $23,244
# on average, the price guesses are off by about $23,000 per house. (Lower is better.)

# RMSE - $48,193
# when we penalize big mistakes more, 
# the “typical” miss is about $48k. (Means you likely have some big errors/outliers.)

# R2 - 0.697
# The model explains about 70% of why prices go up/down (decent, not perfect)

# Scatter plot of actual vs predicted prices for the linear regression model
lr_fig, lr_ax = plt.subplots(figsize=(6, 6))
lr_ax.scatter(y_test, y_pred_linear_regression, alpha=0.6, edgecolors="k")
lr_ax.set_xlabel("Actual Sale Price")
lr_ax.set_ylabel("Predicted Sale Price")
lr_ax.set_title("Actual vs Predicted Sale Prices (Linear Regression)")

# Diagonal reference line: a point on it means predicted == actual
price_range = [y_test.min(), y_test.max()]
lr_ax.plot(price_range, price_range,
           color='red', linestyle='--', linewidth=2, label="Perfect Prediction")

lr_ax.legend()
lr_fig.tight_layout()
plt.show()

# Table of actual vs predicted prices, both rounded to 2 decimals for readability
comparison = pd.DataFrame({
    "Actual_Price": y_test.values,
    "Predicted_Price": y_pred_linear_regression,
}).round(2)

# A small sample (the first 20 houses) keeps the output and the plot readable
sample = comparison.head(20)

# Show the first 20 rows
print(sample)

# Line plot of actual vs predicted price for the sampled houses
sample_fig, sample_ax = plt.subplots(figsize=(10, 5))
sample_ax.plot(sample.index, sample["Actual_Price"], label="Actual Price", marker="o")
sample_ax.plot(sample.index, sample["Predicted_Price"], label="Predicted Price", marker="x")
sample_ax.set_title("Actual vs Predicted Sale Prices (Sample)")
sample_ax.set_xlabel("House Index")
sample_ax.set_ylabel("Sale Price ($)")
sample_ax.legend()
plt.show()

# RANDOM FOREST REGRESSOR

# Create the random forest regressor model
random_forest_regressor_model = RandomForestRegressor(
    n_estimators=300, # number of trees in the forest
    random_state=42, # reproducible results
    n_jobs=-1 # use all CPU cores (faster training)
)
# What above means:
# Create a forest of 300 trees
# Each tree learns slightly different patterns
# The final prediction is the average of all trees' predictions

# Train the model with training data
random_forest_regressor_model.fit(X_train, y_train)

# Predict prices using the test set
y_pred_random_forest_regressor = random_forest_regressor_model.predict(X_test)

# Calculate performance metrics.
# RMSE uses root_mean_squared_error, the same helper as the linear
# regression evaluation, so both models are scored by identical code
# rather than a hand-rolled np.sqrt(mean_squared_error(...)).
rf_rmse = root_mean_squared_error(y_test, y_pred_random_forest_regressor)
rf_mae  = mean_absolute_error(y_test, y_pred_random_forest_regressor)
rf_r2   = r2_score(y_test, y_pred_random_forest_regressor)

# Display the results
print(f"Random Forest — RMSE: ${rf_rmse:,.0f}")
print(f"Random Forest —  MAE: ${rf_mae:,.0f}")
print(f"Random Forest —   R²:  {rf_r2:.3f}")

# For comparison

# Linear Regression:
# RMSE: $48,193
# MAE: $23,244
# R²:  0.697

# Random Forest Regressor
# RMSE: $31,642
# MAE: $18,563
# R²:  0.869

# Table of actual vs random-forest-predicted prices,
# both rounded to 2 decimals for readability
rf_comparison = pd.DataFrame({
    "Actual_Price": y_test.values,
    "Predicted_Price_RF": y_pred_random_forest_regressor,
}).round(2)

# Show the first 20 rows
first_rows = rf_comparison.head(20)
print(first_rows)

# Scatter plot of actual vs predicted prices for the random forest model
rf_fig, rf_ax = plt.subplots(figsize=(6, 6))
rf_ax.scatter(y_test, y_pred_random_forest_regressor, alpha=0.6, edgecolors="k")
rf_ax.set_xlabel("Actual Sale Price ($)")
rf_ax.set_ylabel("Predicted Sale Price ($)")
rf_ax.set_title("Random Forest — Actual vs Predicted Sale Prices")

# Dashed red diagonal: where predicted == actual (perfect prediction line)
rf_price_range = [y_test.min(), y_test.max()]
rf_ax.plot(rf_price_range, rf_price_range, "r--", lw=2)
plt.show()

# Pick a small sample to keep the plot readable (change n if you want)
sample = rf_comparison.head(20)

# Plot actual vs predicted as two lines over the sampled houses
rf_sample_fig, rf_sample_ax = plt.subplots(figsize=(10, 5))
rf_sample_ax.plot(sample.index, sample["Actual_Price"], label="Actual Price", marker="o")
rf_sample_ax.plot(sample.index, sample["Predicted_Price_RF"], label="Predicted Price (RF)", marker="x")
rf_sample_ax.set_title("Random Forest — Actual vs Predicted Sale Prices (Sample)")
rf_sample_ax.set_xlabel("House Index")
rf_sample_ax.set_ylabel("Sale Price ($)")
rf_sample_ax.legend()
rf_sample_fig.tight_layout()
plt.show()

# Both panels share the same axis limits, taken from the actual prices
xmin, xmax = y_test.min(), y_test.max()

fig, axes = plt.subplots(1, 2, figsize=(16, 6), constrained_layout=False)

# (panel title, predictions) for the left and right panel respectively
panels = [
    ("Linear Regression — Actual vs Predicted", y_pred_linear_regression),
    ("Random Forest — Actual vs Predicted", y_pred_random_forest_regressor),
]

for ax, (panel_title, predictions) in zip(axes, panels):
    ax.scatter(y_test, predictions, alpha=0.6, edgecolors="k")
    # Dashed red diagonal: where predicted == actual
    ax.plot([xmin, xmax], [xmin, xmax], "r--", lw=2)
    ax.set_xlim(xmin, xmax)
    ax.set_ylim(xmin, xmax)
    ax.set_title(panel_title, pad=14)
    ax.set_xlabel("Actual Sale Price ($)")
    ax.set_ylabel("Predicted Sale Price ($)")

# Add some spacing between subplots to prevent overlap
fig.subplots_adjust(wspace=0.35, left=0.07, right=0.98, top=0.92, bottom=0.10)

plt.show()

