# 🏡 Exploratory Data Analysis (EDA) for Real Estate Pricing
This notebook provides a comprehensive EDA of a housing dataset, aimed at uncovering key factors that influence real estate pricing.
**Date**: 2025-07-11

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Notebook-wide configuration.
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Apply seaborn's white-grid theme to all subsequent figures.
sns.set_theme(style="whitegrid")

In [4]:
# Load the 'housing_data' sheet of the workbook into `df`.
# NOTE(review): the path is relative, so it is resolved against the kernel's
# working directory. The saved output of this cell shows a FileNotFoundError
# for this exact name - confirm the file name and location before re-running.
file_path = "housing_data.csv.xlsx"

try:
    # The context manager closes the workbook handle once the sheet is parsed,
    # instead of leaving it open for the lifetime of the kernel.
    with pd.ExcelFile(file_path) as excel_file:
        df = excel_file.parse('housing_data')
except FileNotFoundError as err:
    # Same exception type as before, but with a message that says what to do.
    raise FileNotFoundError(
        f"Could not open {file_path!r}. Place the workbook in the notebook's "
        "working directory or update `file_path`."
    ) from err

df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'housing_data.csv.xlsx'

## 🧹 Data Cleaning

In [None]:
# Count missing values per column and list the 20 columns with the most gaps
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(20)

In [None]:
# Cleaning pipeline, applied in order:
#   1. keep only the columns that have non-null values in at least 70% of rows
#   2. drop every row that still contains a missing value (nothing is imputed)
#   3. drop exact duplicate rows
df_cleaned = (
    df
    .dropna(axis=1, thresh=len(df) * 0.7)
    .dropna()
    .drop_duplicates()
)
df_cleaned.info()

## 📊 Univariate Analysis

In [None]:
# Histogram of sale prices with a kernel-density curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_cleaned['SalePrice'], kde=True, color='skyblue', ax=ax)
ax.set_title('Distribution of Sale Prices')
ax.set_xlabel('Sale Price')
ax.set_ylabel('Count')
plt.show()

## 🔗 Multivariate Analysis

In [None]:
# Correlation of each numeric feature with SalePrice, strongest first.
plt.figure(figsize=(12,8))
# Correlate numeric columns only: with pandas >= 2.0, DataFrame.corr() raises
# if the frame still contains text columns. select_dtypes works the same way
# on older pandas versions too.
numeric_features = df_cleaned.select_dtypes(include='number')
corr = numeric_features.corr()
sns.heatmap(corr[['SalePrice']].sort_values(by='SalePrice', ascending=False), annot=True, cmap='viridis')
plt.title('Correlation with Sale Price')
plt.show()

## 🏗️ Feature Engineering

In [None]:
# Derive two features from existing columns:
#   PricePerSqft = SalePrice / GrLivArea
#   PropertyAge  = YrSold - YearBuilt
df_cleaned = df_cleaned.assign(
    PricePerSqft=lambda frame: frame['SalePrice'] / frame['GrLivArea'],
    PropertyAge=lambda frame: frame['YrSold'] - frame['YearBuilt'],
)
df_cleaned[['SalePrice', 'GrLivArea', 'PricePerSqft', 'PropertyAge']].head()

## 📐 Size Impact on Sale Price

In [None]:
# Sale price distribution for each value of BedroomAbvGr
fig, ax = plt.subplots()
sns.boxplot(data=df_cleaned, x='BedroomAbvGr', y='SalePrice', ax=ax)
ax.set_title('Sale Price vs. Number of Bedrooms')
plt.show()

In [None]:
# Scatter of sale price against GrLivArea
fig, ax = plt.subplots()
sns.scatterplot(data=df_cleaned, x='GrLivArea', y='SalePrice', ax=ax)
ax.set_title('Sale Price vs. Living Area')
plt.show()

## 📈 Market Trends and Historical Pricing

In [None]:
# Average sale price per calendar month of sale.
#
# The sale date is assembled from the year and month columns, pinned to the
# first day of the month. MoSold is left untouched: passing bare integers to
# pd.to_datetime would read them as nanoseconds since the epoch and turn the
# column into meaningless 1970 timestamps. Building the date from YrSold alone
# would also collapse every sale onto January 1 of its year.
# NOTE(review): assumes MoSold holds calendar month numbers (1-12) - confirm.
sale_date_parts = pd.DataFrame({
    'year': df_cleaned['YrSold'],
    'month': df_cleaned['MoSold'],
    'day': 1,
})
df_cleaned['YearMonth'] = pd.to_datetime(sale_date_parts)

# Mean sale price for each month, plotted as a time series
df_grouped = df_cleaned.groupby('YearMonth')['SalePrice'].mean()
df_grouped.plot(figsize=(12,6), title='Average Sale Price Over Time')
plt.ylabel('Average Sale Price')
plt.xlabel('Date')
plt.show()

## 🏡 Customer Preferences and Amenities

In [5]:
# Sale price distribution for each value of GarageCars
fig, ax = plt.subplots()
sns.boxplot(data=df_cleaned, x='GarageCars', y='SalePrice', ax=ax)
ax.set_title('Sale Price vs. Garage Capacity')
plt.show()

NameError: name 'df_cleaned' is not defined

In [6]:
# Sale price distribution for each distinct PoolArea value
fig, ax = plt.subplots()
sns.boxplot(data=df_cleaned, x='PoolArea', y='SalePrice', ax=ax)
ax.set_title('Sale Price vs. Pool Area')
plt.show()

NameError: name 'df_cleaned' is not defined