# House Price Prediction - Data Exploration

This notebook provides exploratory data analysis (EDA) for the USA Real Estate dataset.

## Objectives
- Understand the dataset structure and characteristics
- Identify data quality issues
- Explore relationships between features and target
- Generate insights for feature engineering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot appearance: matplotlib's default style combined with seaborn's 'husl' palette
plt.style.use('default')
sns.set_palette('husl')

# Pandas display limits: no cap on the number of columns, at most 100 rows
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## 1. Data Loading

In [None]:
# Load the dataset
data_path = "../data/raw/df_imputed.csv"

# Fail fast when the file is missing: merely printing a message would let the
# notebook carry on and crash later with a NameError on `df` (or, in a reused
# kernel, silently analyse a stale `df` left over from an earlier run).
if not Path(data_path).exists():
    raise FileNotFoundError(
        f"Dataset not found at {data_path}. "
        "Please place the df_imputed.csv file in the data/raw/ directory"
    )

df = pd.read_csv(data_path)
print(f"Dataset loaded successfully: {df.shape}")

## 2. Basic Dataset Information

In [None]:
# Dataset overview
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()
print("\nDataset Shape:", df.shape)
print("\nColumn Names:", list(df.columns))

In [None]:
# First few rows: head() is the cell's last expression, so the notebook renders
# the returned frame as a table (head() returns 5 rows unless told otherwise)
df.head()

In [None]:
# Basic statistics: describe() called with its default arguments
# NOTE(review): with defaults this presumably summarises only the numeric
# columns -- confirm against the dtypes reported by df.info()
df.describe()

## 3. Data Quality Assessment

In [None]:
# Tally the null entries of every column, as a count and as a share of all rows
missing_data = df.isnull().sum()
missing_percentage = missing_data.div(len(df)).mul(100)

# One row per column, the columns with the most gaps first
missing_summary = (
    missing_data.to_frame('Missing Count')
    .assign(**{'Missing Percentage': missing_percentage})
    .sort_values('Missing Count', ascending=False)
)

# Report only the columns that actually contain missing values
has_missing = missing_summary['Missing Count'].gt(0)
print("Missing Values Summary:")
print(missing_summary.loc[has_missing])

## 4. Target Variable Analysis

In [None]:
# Price distribution
# Boxplot statistics are percentile-based, so a single NaN in the input turns
# them into NaN and leaves the panel empty. Missing prices are therefore
# dropped once here and the cleaned series is reused by every panel.
prices = df['price'].dropna()

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Original price distribution
axes[0,0].hist(prices, bins=50, alpha=0.7, edgecolor='black')
axes[0,0].set_title('Price Distribution')
axes[0,0].set_xlabel('Price ($)')
axes[0,0].set_ylabel('Frequency')

# Log-transformed price. np.log1p computes log(1 + price), so the axis label
# names that transform rather than a plain log.
axes[0,1].hist(np.log1p(prices), bins=50, alpha=0.7, edgecolor='black')
axes[0,1].set_title('Log-Transformed Price Distribution')
axes[0,1].set_xlabel('Log(1 + Price)')
axes[0,1].set_ylabel('Frequency')

# Box plot
axes[1,0].boxplot(prices)
axes[1,0].set_title('Price Box Plot')
axes[1,0].set_ylabel('Price ($)')

# Price statistics by state (top 10 states by number of listings)
top_states = df['state'].value_counts().head(10).index
state_prices = df[df['state'].isin(top_states)]
# One price series per state, in the same order as `top_states` so the tick
# labels below line up with the boxes; .loc avoids chained indexing.
prices_by_state = [
    state_prices.loc[state_prices['state'] == state, 'price'].dropna()
    for state in top_states
]
axes[1,1].boxplot(prices_by_state)
axes[1,1].set_title('Price Distribution by State (Top 10)')
axes[1,1].set_xticklabels(top_states, rotation=45)
axes[1,1].set_ylabel('Price ($)')

plt.tight_layout()
plt.show()

## 5. Feature Analysis

In [None]:
# Histograms of the numerical features, one panel per feature
numerical_cols = ['bed', 'bath', 'acre_lot', 'house_size']

# 2x2 grid, flattened so the panels can be walked in step with the features
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    # 'acre_lot' -> 'Acre Lot' for the title and axis label
    pretty_name = col.replace('_', ' ').title()
    ax.hist(df[col], bins=30, alpha=0.7, edgecolor='black')
    ax.set_title(f'{pretty_name} Distribution')
    ax.set_xlabel(pretty_name)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 6. Correlation Analysis

In [None]:
# Pairwise correlations between the target (price) and the numerical features
numerical_features = ['price', 'bed', 'bath', 'acre_lot', 'house_size']
correlation_matrix = df.loc[:, numerical_features].corr()

# Annotated heatmap: diverging colormap centred on zero, three decimals per cell
heatmap_options = dict(
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix - Numerical Features')
plt.tight_layout()
plt.show()

## 7. Categorical Features Analysis

In [None]:
# Cardinality and most frequent values of each categorical feature
categorical_cols = ['city', 'state', 'status']

print("Categorical Feature Cardinality:")
for col in categorical_cols:
    values = df[col]
    print(f"{col}: {values.nunique()} unique values")

    # The ten most frequent categories, most common first
    print(f"Top categories in {col}:")
    print(values.value_counts().head(10))
    print()

## 8. Feature-Target Relationships

In [None]:
# Scatter plots of numerical features vs price
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

# Sample data for plotting (to avoid overcrowding). The sample is drawn once,
# before the loop: with a fixed random_state every draw returns the same rows,
# so re-sampling for each feature only repeated identical work.
sample_df = df.sample(n=min(10000, len(df)), random_state=42)

for i, col in enumerate(numerical_cols):
    axes[i].scatter(sample_df[col], sample_df['price'], alpha=0.5)
    axes[i].set_xlabel(col.replace('_', ' ').title())
    axes[i].set_ylabel('Price ($)')
    axes[i].set_title(f'Price vs {col.replace("_", " ").title()}')

plt.tight_layout()
plt.show()

## 9. Outlier Detection

In [None]:
# Outlier analysis using IQR method
def detect_outliers_iqr(df, column, multiplier=1.5):
    """Select the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where ``Q1`` and ``Q3`` are the 0.25 and 0.75 quantiles of the column and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect. It is only read, never modified.
    column : str
        Name of the column whose values are tested.
    multiplier : float, default 1.5
        Width of the fences in units of the IQR. The default reproduces the
        previously hard-coded factor; a larger value flags fewer rows.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)``: the rows of ``df`` falling
        strictly outside the fences, followed by the two fence values.
    """
    values = df[column]
    # Both quantiles come from a single call instead of two separate passes.
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    # Strict comparisons: a value exactly on a fence is not an outlier, and NaN
    # compares False on both sides, so missing values are never flagged.
    outliers = df[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

print("Outlier Analysis:")
total_rows = len(df)
for col in numerical_features:
    outliers, lower, upper = detect_outliers_iqr(df, col)
    flagged = len(outliers)
    column_values = df[col]

    # One report per feature: share of flagged rows, the IQR fences, and the
    # observed value range; a blank line separates consecutive reports.
    report_lines = [
        f"{col}:",
        f"  Outliers: {flagged} ({(flagged / total_rows) * 100:.2f}%)",
        f"  Bounds: [{lower:.2f}, {upper:.2f}]",
        f"  Range: [{column_values.min():.2f}, {column_values.max():.2f}]",
    ]
    print("\n".join(report_lines), end="\n\n")

## 10. Key Insights and Recommendations

Based on the exploratory analysis, document key findings and recommendations for:
- Data preprocessing steps
- Feature engineering opportunities
- Model selection considerations
- Potential data quality issues to address

In [None]:
# Summary statistics
print("=== DATA EXPLORATION SUMMARY ===")
print(f"Dataset size: {df.shape[0]:,} rows, {df.shape[1]} columns")
print(f"Target variable (price) range: ${df['price'].min():,.0f} - ${df['price'].max():,.0f}")
print(f"Median price: ${df['price'].median():,.0f}")
print(f"Missing values: {df.isnull().sum().sum()} total")
print(f"Categorical features with high cardinality: {[col for col in categorical_cols if df[col].nunique() > 1000]}")

# Feature correlation with target, ranked by strength. Ordering by the signed
# value would push a strong negative correlation to the bottom of a list titled
# "most correlated", so the ranking uses the absolute value while the printed
# figure keeps its sign.
price_correlations = df[numerical_features].corr()['price']
strength_order = price_correlations.abs().sort_values(ascending=False).index
price_correlations = price_correlations.loc[strength_order]

print("\nFeatures most correlated with price:")
# The target's correlation with itself is always 1 and carries no information.
for feature, corr in price_correlations.drop('price').items():
    print(f"  {feature}: {corr:.3f}")