Imports

In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns


import zipfile


# Location of the downloaded archive, relative to the notebook's working directory.
zip_path = 'Housing Price Regression.zip'

# List the archive members so the CSV's exact name can be confirmed before reading it.
with zipfile.ZipFile(zip_path, 'r') as archive:
    member_names = archive.namelist()
    print(member_names)



Read In Data

In [None]:
# Name of the CSV member inside the archive.
csv_filename = "house_price_regression_dataset.csv"

# Stream the CSV straight out of the archive; nothing is extracted to disk.
with zipfile.ZipFile(zip_path, 'r') as archive, archive.open(csv_filename) as csv_file:
    df = pd.read_csv(csv_file)

# Show the first rows as the cell output.
df.head()

Print out Column Names

In [None]:
# Column labels of the loaded dataset (shown as the cell output).
df.columns

Number of Rows and Columns (Length and Width) of the Dataset

In [None]:
# (number of rows, number of columns) of the DataFrame.
df.shape

Summary Statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Check for the presence of any nulls

In [None]:
# True if at least one cell anywhere in the DataFrame is missing, otherwise False.
df.isna().to_numpy().any()


Determine Field Data Types. Might need to convert types.

In [None]:
# Print the dtype of every column to spot fields that may need converting.
print(df.dtypes)

In [None]:
# Name of the column treated as the prediction target.
target_column = "House_Price"

Plot all Predictor Variables to get Initial Look

`Square Footage` appears to be the main factor affecting House Price. The other features seem to have less linear influence. Tree-based models (like Random Forest, XGBoost) do not require linear relationships.

Some features just don’t have much signal.

`Horizontal Bar` --> Indicates that there are only a few unique values and the model can't learn enough if there is not enough variation. Check the variance of fields: A variable can't be predictive of the target if it doesn’t vary much. 

`Shotgun Blast` ( Points scattered everywhere ) --> The feature has no predictive relationship to the target

In [None]:
# Scatter-plot matrix of every pair of columns, with each column's own distribution on the diagonal.
sns.pairplot(df)

In [None]:
# Columns suspected of taking only a handful of distinct values.
columns_to_profile = (
    'Num_Bedrooms',
    'Num_Bathrooms',
    'Garage_Size',
    'Neighborhood_Quality',
    'Lot_Size',
    'Year_Built',
)

# For each one, report how many distinct values it has and how often each occurs.
for col in columns_to_profile:
    print(f"{col}: Unique values = {df[col].nunique()}, Value counts:\n{df[col].value_counts()}\n")


In [None]:

# Every feature plus the target; each gets its own histogram with a KDE overlay.
histogram_columns = (
    'Square_Footage',
    'Lot_Size',
    'Num_Bedrooms',
    'Num_Bathrooms',
    'Year_Built',
    'Garage_Size',
    'Neighborhood_Quality',
    'House_Price',
)

for col in histogram_columns:
    # One small standalone figure per column.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()


Q-Q (quantile-quantile) plots to assess whether the data is normally distributed. Normality (strictly speaking, of the residuals) is a key assumption in Linear Regression and other statistical models.

In [None]:
import matplotlib.pyplot as plt
import scipy.stats as stats

# Normal Q-Q plot for every column in the DataFrame (features and target alike).
for col, column_values in df.items():
    fig, ax = plt.subplots(figsize=(6, 4))
    # probplot draws the ordered values against normal quantiles on the given Axes.
    stats.probplot(column_values, dist="norm", plot=ax)
    # Replace probplot's default title with one that names the column.
    ax.set_title(f"Q-Q Plot of {col}")
    plt.show()


Reviewing Correlations to House Price

In [None]:
# Pairwise correlation of every column with the target.
# Uses the `target_column` name declared earlier in the notebook instead of
# repeating the literal, so the target only has to be changed in one place.
df.corr()[target_column]


Investigate Multicollinearity

If Square Footage is highly correlated with Bedrooms, then these features might not contribute much independently.

In [None]:
# Full pairwise correlation matrix, drawn as an annotated heatmap (two decimals per cell).
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.show()


It would be nice to have features such as "Location" or "Condition"

Box Plots to Identify Outliers


In [None]:
# Draw one box plot per column, laid out on a grid, to eyeball outliers.
num_columns = 3  # Plots per row; adjust based on preference
num_features = len(df.columns)
# Keep at least one row so plt.subplots never receives a zero-sized grid.
num_rows = max(1, int(np.ceil(num_features / num_columns)))

# squeeze=False makes plt.subplots always return a 2-D array of Axes, so the
# flatten() below also works when the grid collapses to a single cell (1x1).
fig, axes = plt.subplots(
    num_rows, num_columns, figsize=(15, num_rows * 4), squeeze=False
)
axes = axes.flatten()

# Plot each column on its own Axes.
for ax, col in zip(axes, df.columns):
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")

# Remove the unused grid cells. Slicing by num_features avoids depending on a
# loop index that would be undefined if the DataFrame had no columns.
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()