Imports

In [None]:
pip install xgboost lightgbm

In [None]:
import zipfile
import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from xgboost import plot_importance




In [None]:
# Location of the dataset archive; the loading cell reads from this same path.
zip_path = 'Housing Price Regression.zip'

# Peek inside the archive to confirm the name of the CSV member before loading it.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive_members = archive.namelist()
    print(archive_members)


Read In Data

In [None]:
# Name of the CSV member inside the archive (replace with the actual file name
# if the archive layout differs).
csv_filename = "house_price_regression_dataset.csv"

# Read the CSV straight out of the ZIP without extracting it to disk.
with zipfile.ZipFile(zip_path, 'r') as archive, archive.open(csv_filename) as csv_file:
    df = pd.read_csv(csv_file)

# Preview the first few rows as a sanity check on the load.
df.head()

Print out Column Names

In [None]:
# Column labels of the loaded DataFrame.
df.columns

Length and width of File

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

Summary Statistics

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Check for the presence of any nulls

In [None]:
# Single boolean: True if at least one cell anywhere in df is missing.
df.isnull().values.any()


Determine Field Data Types. Might need to convert types.

In [None]:
# Data type of every column, to spot fields that need conversion before modelling.
print(df.dtypes)

In [None]:
# Name of the response column; the same column is used as the regression
# target `y` when the models are built further down.
target_column = "House_Price"

Plot all Predictor Variables to get Initial Look

`Square Footage` appears to be the main factor affecting House Price. Other features seem to have a weaker linear influence. Tree-based models (like Random Forest, XGBoost) do not require linear relationships.

Some features just don’t have much signal.

`Horizontal Bar` --> Indicates that there are only a few unique values and the model can't learn enough if there is not enough variation. Check the variance of fields: A variable can't be predictive of the target if it doesn’t vary much. 

`Shotgun Blast` ( Points scattered everywhere ) --> The feature has no predictive relationship to the target

In [None]:
# Grid of pairwise plots across the columns of df (seaborn defaults), used as a
# first visual check of how each feature relates to the others and to the price.
sns.pairplot(df)

In [None]:
# Features whose spread of values is worth inspecting: few distinct values
# means little variation for a model to learn from.
columns_to_profile = [
    'Num_Bedrooms',
    'Num_Bathrooms',
    'Garage_Size',
    'Neighborhood_Quality',
    'Lot_Size',
    'Year_Built',
]

for col in columns_to_profile:
    column_values = df[col]
    print(f"{col}: Unique values = {column_values.nunique()}, Value counts:\n{column_values.value_counts()}\n")


In [None]:

# Columns to inspect: every predictor plus the target itself.
histogram_columns = [
    'Square_Footage', 'Lot_Size', 'Num_Bedrooms', 'Num_Bathrooms',
    'Year_Built', 'Garage_Size', 'Neighborhood_Quality', 'House_Price',
]

# One figure per column: a 30-bin histogram with a KDE curve overlaid.
for col in histogram_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()


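Q-Q (quantile-quantile) plots to assess whether each variable is approximately normally distributed. *Note: the normality assumption in Linear Regression (and similar statistical models) applies to the residuals rather than to the raw features, so treat these plots as a screening tool for skew and outliers.*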

In [None]:
# scipy is only needed for the Q-Q plots in this cell; matplotlib (`plt`) is
# already imported in the imports cell at the top of the notebook.
import scipy.stats as stats

# Draw one Q-Q plot per numeric column (predictors and the target alike).
# Non-numeric columns are skipped and missing values are dropped, because
# probplot needs finite numeric samples to compute the quantiles.
for col in df.select_dtypes(include="number").columns:
    plt.figure(figsize=(6, 4))
    stats.probplot(df[col].dropna(), dist="norm", plot=plt)
    plt.title(f"Q-Q Plot of {col}")
    plt.show()


Reviewing Correlations to House Price

In [None]:
# Pairwise correlation matrix, then the single column that relates every
# field to the sale price.
correlation_matrix = df.corr()
correlation_matrix["House_Price"]


Investigate Multicollinearity

If Square Footage is highly correlated with Bedrooms, then these features might not contribute much independently.

In [None]:
# Annotated heatmap of the full correlation matrix; strongly coloured
# off-diagonal cells flag pairs of features that move together.
corr_matrix = df.corr()
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
plt.show()


It would be nice to have features such as "Location" or "Condition"

Box Plots to Identify Outliers


In [None]:
# Grid layout: a fixed number of plots per row, and as many rows as needed
# to fit one boxplot per column of df.
num_columns = 3
num_features = len(df.columns)
num_rows = int(np.ceil(num_features / num_columns))

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, num_rows * 4))
axes = axes.flatten()  # 1-D view of the grid so axes can be paired with columns

# One boxplot per column, filling the grid left-to-right, top-to-bottom.
for ax, col in zip(axes, df.columns):
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")

# Any grid slots beyond the last column are left over; remove them.
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

Using Linear Regression as a `Baseline` model.

Can a more complex model beat this? --> Occam's razor -->
Quantify the power of one good feature:
how much of the variance in price can `Square_Footage` alone explain?

In [None]:
# Target vector, taken from the column named by `target_column` (defined
# earlier) so the target name lives in exactly one place.
y = df[target_column]

# Baseline feature set: square footage only (kept 2-D as a one-column DataFrame).
X_sqft = df[['Square_Footage']]

# Full feature set: every column except the target.
X_full = df.drop(columns=[target_column])

In [None]:
# Both splits share the same test fraction and seed, so the single-feature
# and full-feature models are trained and scored on the same rows.
split_options = {"test_size": 0.2, "random_state": 22}

X1_train, X1_test, y1_train, y1_test = train_test_split(X_sqft, y, **split_options)
X2_train, X2_test, y2_train, y2_test = train_test_split(X_full, y, **split_options)

In [None]:
# Baseline: ordinary least squares on square footage alone.
lr_model = LinearRegression()
lr_model.fit(X1_train, y1_train)

# Random Forest on all features: builds an ensemble of individual decision
# trees and averages their predictions to get the final result.
rf_model = RandomForestRegressor(n_estimators=100, random_state=22)
rf_model.fit(X2_train, y2_train)

In [None]:
# Predict on the held-out rows: the linear model sees only the square-footage
# split, the Random Forest sees the full-feature split.
lr_preds = lr_model.predict(X1_test)
rf_preds = rf_model.predict(X2_test)

def evaluate(name, y_true, y_pred):
    """Print R², MAE and RMSE for one model's predictions.

    Args:
        name: Label printed as the heading of the report.
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        None. The report is written to stdout, followed by a blank line.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error, so it is in the
    # same units as the target.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    report_lines = [
        f" {name}",
        f"R² Score-Coefficient of Determination: {r2:,.2f}",
        f"MAE real-world error: {mae:,.2f}",
        f"RMSE: {rmse:,.2f}\n",
    ]
    print("\n".join(report_lines))

# (label, actual values, predictions) for each model fitted so far.
model_results = (
    ("Linear Regression (Square Footage)", y1_test, lr_preds),
    ("Random Forest (All Features)", y2_test, rf_preds),
)

for label, actual, predicted in model_results:
    evaluate(label, actual, predicted)


In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# One point per held-out house for each model.
ax.scatter(y1_test, lr_preds, alpha=0.5, label="Linear Regression", color="blue")
ax.scatter(y2_test, rf_preds, alpha=0.5, label="Random Forest", color="green")

# Diagonal reference line: a perfect model would put every point on it.
price_range = [y.min(), y.max()]
ax.plot(price_range, price_range, 'k--', lw=2, label='Perfect Prediction')

ax.set_xlabel("Actual House Price")
ax.set_ylabel("Predicted House Price")
ax.set_title("Actual vs. Predicted Prices")
ax.legend()
ax.grid(True)
plt.show()


Which ones are most predictive of House_Price?

In [None]:


# Column names of the training frame, in the order the forest was fitted on.
feature_names = X2_train.columns

# Importance score per feature from the fitted Random Forest.
importances = rf_model.feature_importances_

# Pair each feature with its score, then rank from most to least important.
importance_df = pd.DataFrame(dict(Feature=feature_names, Importance=importances))
importance_df = importance_df.sort_values(by="Importance", ascending=False)


In [None]:


# Horizontal bars, one per feature, in the ranked order of importance_df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df, x="Importance", y="Feature", ax=ax)

ax.set_title("Feature Importance - Random Forest")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Feature")
ax.grid(True)
fig.tight_layout()
plt.show()


Demonstrating Optimal Number of Trees

In [None]:
# Forest sizes to compare.
tree_counts = [10, 50, 100, 200, 500]

# NOTE(review): these forests use random_state=42 while rf_model above uses 22,
# so the 100-tree row will not exactly reproduce the earlier Random Forest score.
for n in tree_counts:
    model = RandomForestRegressor(n_estimators=n, random_state=42).fit(X2_train, y2_train)
    preds = model.predict(X2_test)
    score = r2_score(y2_test, preds)
    print(f"{n} trees → R²: {score:.3f}")


In [None]:

# Gradient-boosted trees, trained on the same full-feature split as the
# Random Forest so the two are directly comparable.
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X2_train, y2_train)

# Predict on the held-out rows.
xgb_preds = xgb_model.predict(X2_test)

# Re-print the earlier models' results so all three can be read side by side.
evaluate("Linear Regression (Square Footage)", y1_test, lr_preds)
evaluate("Random Forest (All Features)", y2_test, rf_preds)

# Report XGBoost through the same helper instead of a hand-copied set of
# print statements, so its metrics use the same labels and number format.
evaluate("XGBoost (All Features)", y2_test, xgb_preds)


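XGBoost finds importance in features that Random Forest ignores.
Random Forest does not go back and correct past errors.
Each tree is built independently from a bootstrapped sample of the data, and each split may consider only a random subset of features (controlled by `max_features`; note that scikit-learn's `RandomForestRegressor` considers all features at every split by default).
If one or two features dominate the early splits, other features may be ignored.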

XGBoost Builds Trees in Sequence. Each tree is built to correct the errors of previous trees.
XGBoost finds value in more features because it builds smarter, targeted trees — not just “randomly sampled” ones.


In [None]:
# plot_importance returns the Axes it drew on; set the title on that Axes
# directly rather than through the pyplot state machine.
ax = plot_importance(xgb_model, height=0.8)
ax.set_title("XGBoost Feature Importance")
plt.show()

# Comments / Areas of Improvements / Constructive Criticism