# 📊 Exploratory Data Analysis - California Housing Dataset

This notebook performs comprehensive exploratory data analysis on the California Housing dataset to understand:
- Data distributions and patterns
- Feature correlations
- Outliers and anomalies
- Geographic patterns
- Insights for feature engineering

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the un-suffixed 'seaborn' name. Pick whichever is
# available so this cell does not fail on an older Matplotlib install.
preferred_style = 'seaborn-v0_8'
plt.style.use(preferred_style if preferred_style in plt.style.available else 'seaborn')
sns.set_palette("husl")

# Configure pandas display options so wide frames are shown without
# truncating columns.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print("📚 Libraries imported successfully!")

In [None]:
# Make the project's src/ directory importable, then pull in the loader helpers
import sys
sys.path.append('../src')
from data_loader import load_dataset_from_csv, get_dataset_info

# The loader returns a pair that is unpacked into features (X) and target (y)
X, y = load_dataset_from_csv('../data/california_housing.csv')

# Work on one frame: a copy of the features with the target appended as an
# extra column (assumes X is a pandas DataFrame — TODO confirm against data_loader)
df = X.assign(MedHouseVal=y)

print(f"Dataset loaded: {df.shape}")
print("\nFirst 5 rows:")
df.head()

## 📈 Basic Statistical Summary

In [None]:
# Descriptive statistics for every column, rendered with the rich display
print("📊 STATISTICAL SUMMARY")
print("=" * 50)
display(df.describe())

# Column dtypes
print("\n📋 DATA TYPES")
print("=" * 30)
print(df.dtypes)

# Per-column null counts; only columns that actually contain nulls are listed
print("\n🔍 MISSING VALUES")
print("=" * 30)
missing_data = df.isnull().sum()
if missing_data.sum() > 0:
    print(missing_data[missing_data > 0])
else:
    print("✅ No missing values!")

## 🎯 Target Variable Analysis

In [None]:
# Target variable analysis: histogram, box plot, normal Q-Q plot and a
# log-scale histogram laid out on a single 2x2 figure
from scipy import stats

house_values = df['MedHouseVal']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('🎯 Target Variable (Median House Value) Analysis', fontsize=16, fontweight='bold')
(ax_hist, ax_box), (ax_qq, ax_log) = axes

# Top-left: raw distribution
ax_hist.hist(house_values, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
ax_hist.set(title='Distribution of House Values',
            xlabel='Median House Value ($100k)',
            ylabel='Frequency')

# Top-right: box plot
ax_box.boxplot(house_values)
ax_box.set(title='Box Plot of House Values',
           ylabel='Median House Value ($100k)')

# Bottom-left: sample quantiles against a normal distribution
stats.probplot(house_values, dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot (Normal Distribution)')

# Bottom-right: distribution of the natural log of the target
log_values = np.log(house_values)
ax_log.hist(log_values, bins=50, alpha=0.7, color='lightcoral', edgecolor='black')
ax_log.set(title='Log-Transformed House Values',
           xlabel='Log(Median House Value)',
           ylabel='Frequency')

plt.tight_layout()
plt.show()

# Summary numbers; mean/median/std are multiplied by 100 for the "$...k" display
print(f"📊 Target Variable Statistics:")
print(f"   Mean: ${house_values.mean()*100:.0f}k")
print(f"   Median: ${house_values.median()*100:.0f}k")
print(f"   Std Dev: ${house_values.std()*100:.0f}k")
print(f"   Skewness: {house_values.skew():.3f}")
print(f"   Kurtosis: {house_values.kurtosis():.3f}")

## 🔗 Feature Correlation Analysis

In [None]:
# Pairwise correlations between all columns of the combined frame
correlation_matrix = df.corr()

# Boolean mask covering the diagonal and upper triangle, so each pair of
# columns is drawn only once in the heatmap
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

plt.figure(figsize=(12, 10))
heatmap_options = dict(
    mask=mask,
    annot=True,
    cmap='RdYlBu_r',
    center=0,
    square=True,
    fmt='.3f',
    cbar_kws={"shrink": .8},
)
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title('🔗 Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Rank columns by the absolute value of their correlation with the target
# (the sign of the correlation is not shown in this listing)
target_corr = correlation_matrix['MedHouseVal'].abs().sort_values(ascending=False)
print("\n🎯 Features most correlated with House Value:")
print("=" * 45)
for feature, corr in target_corr.drop('MedHouseVal').items():
    print(f"   {feature:<12}: {corr:.3f}")

## 📊 Feature Distribution Analysis

In [None]:
# Feature distributions: one normalised histogram + KDE curve per feature
from scipy.stats import gaussian_kde  # imported once, not on every loop iteration

features = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('📊 Feature Distributions', fontsize=16, fontweight='bold')

# axes.flat walks the 2x3 grid row by row, matching the order of `features`
for ax, feature in zip(axes.flat, features):
    # Drop nulls up front: both the histogram range detection and gaussian_kde
    # fail on NaN input. This is a no-op when the column has no missing values.
    values = df[feature].dropna()

    # density=True puts the histogram on the same y-scale as the KDE curve
    ax.hist(values, bins=50, alpha=0.7, density=True, color='lightblue')

    # Gaussian KDE evaluated on 100 evenly spaced points over the observed range
    kde = gaussian_kde(values)
    x_range = np.linspace(values.min(), values.max(), 100)
    ax.plot(x_range, kde(x_range), 'r-', linewidth=2)

    ax.set_title(f'{feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')

    # Mark the mean with a dashed vertical line and show its value in the legend
    mean_val = values.mean()
    ax.axvline(mean_val, color='red', linestyle='--', alpha=0.7, label=f'Mean: {mean_val:.2f}')
    ax.legend()

plt.tight_layout()
plt.show()

## 🗺️ Geographic Analysis

In [None]:
# Geographic scatter plot
plt.figure(figsize=(15, 10))

# One point per row, positioned by longitude/latitude and coloured by the target
scatter = plt.scatter(df['Longitude'], df['Latitude'],
                     c=df['MedHouseVal'],
                     cmap='viridis',
                     alpha=0.6,
                     s=20)

plt.colorbar(scatter, label='Median House Value ($100k)')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('🗺️ Geographic Distribution of House Values in California', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.show()

# Interactive geographic plot with Plotly.
# The frame is down-sampled for rendering performance. Two safeguards:
#   * n is capped at len(df) — DataFrame.sample raises ValueError when asked for
#     more rows than exist (without replacement);
#   * random_state is fixed so Restart & Run All reproduces the same figure.
map_sample = df.sample(n=min(5000, len(df)), random_state=42)

fig = px.scatter(map_sample,
                 x='Longitude',
                 y='Latitude',
                 color='MedHouseVal',
                 size='Population',
                 hover_data=['MedInc', 'HouseAge', 'AveRooms'],
                 title='🗺️ Interactive California Housing Map',
                 color_continuous_scale='Viridis')

fig.update_layout(width=800, height=600)
fig.show()

## 🔍 Outlier Detection

In [None]:
# Outlier detection using IQR method
def detect_outliers_iqr(data, feature):
    """Select the rows whose `feature` value lies outside the 1.5 * IQR fences.

    Parameters
    ----------
    data : frame-like object indexed by column label (a pandas DataFrame in
        this notebook).
    feature : label of the column to inspect.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)`` where ``outliers`` holds the
        rows of ``data`` strictly below ``lower_bound`` or strictly above
        ``upper_bound``.
    """
    column = data[feature]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    spread = third_quartile - first_quartile

    # Fences sit 1.5 interquartile ranges beyond each quartile
    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    too_low = column < lower_bound
    too_high = column > upper_bound
    return data[too_low | too_high], lower_bound, upper_bound

# Run the IQR check on every numeric column and keep the results per column
numeric_features = df.select_dtypes(include=[np.number]).columns
outlier_summary = {}

print("🔍 OUTLIER ANALYSIS")
print("=" * 40)

total_rows = len(df)
for feature in numeric_features:
    outliers, lower, upper = detect_outliers_iqr(df, feature)
    n_outliers = len(outliers)
    outlier_percentage = (n_outliers / total_rows) * 100

    # Stored for the box-plot annotations and the final insights summary
    outlier_summary[feature] = dict(
        count=n_outliers,
        percentage=outlier_percentage,
        lower_bound=lower,
        upper_bound=upper,
    )

    print(f"{feature:<12}: {n_outliers:>4} outliers ({outlier_percentage:>5.1f}%)")

# Visualize outliers: one box plot per numeric column.
# The grid is sized from the number of columns instead of a fixed 2x4. This
# notebook references eight feature columns plus MedHouseVal, i.e. at least
# nine numeric columns, and a 2x4 grid only has eight slots — the ninth column
# would index axes[2, 0] and raise IndexError.
n_cols = 4
n_features = len(numeric_features)
n_rows = max(1, (n_features + n_cols - 1) // n_cols)  # ceiling division

# squeeze=False keeps `axes` two-dimensional even when there is a single row;
# 5x5 inches per panel matches the original 20x10 figure for a 2x4 layout.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False)
fig.suptitle('🔍 Outlier Detection - Box Plots', fontsize=16, fontweight='bold')

panels = axes.ravel()
for ax, feature in zip(panels, numeric_features):
    ax.boxplot(df[feature])
    ax.set_title(f'{feature}')
    ax.set_ylabel('Value')

    # Annotate each panel with the IQR outlier count computed earlier
    outlier_count = outlier_summary[feature]['count']
    ax.text(0.5, 0.95, f'Outliers: {outlier_count}',
            transform=ax.transAxes,
            ha='center', va='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

# Hide grid slots that have no column to show
for ax in panels[n_features:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 📈 Feature Relationships with Target

In [None]:
# Each feature plotted against the target, with a fitted straight line and
# the correlation coefficient shown in the corner of the panel
features_to_plot = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('📈 Feature Relationships with House Values', fontsize=16, fontweight='bold')

for i, feature in enumerate(features_to_plot):
    # Position in the 2x3 grid, filled row by row
    row, col = divmod(i, 3)
    ax = axes[row, col]
    x_values = df[feature]
    y_values = df['MedHouseVal']

    ax.scatter(x_values, y_values, alpha=0.5, s=10)

    # Degree-1 least-squares fit, evaluated at every observed x
    z = np.polyfit(x_values, y_values, 1)
    ax.plot(x_values, np.polyval(z, x_values), "r--", alpha=0.8, linewidth=2)

    ax.set(xlabel=feature,
           ylabel='Median House Value',
           title=f'{feature} vs House Value')

    # Correlation between this feature and the target
    corr = x_values.corr(y_values)
    ax.text(0.05, 0.95, f'r = {corr:.3f}',
            transform=ax.transAxes,
            bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))

plt.tight_layout()
plt.show()

## 🎯 Key Insights and Recommendations

In [None]:
# Generate insights
print("🎯 KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS")
print("=" * 60)

# 1. Target variable insights.
# The skew direction and the mean/median relation are derived from the data
# rather than printed unconditionally, so the text stays true if the data changes.
house_values = df['MedHouseVal']
target_skew = house_values.skew()
target_mean = house_values.mean()
target_median = house_values.median()
if target_skew > 0:
    skew_label = "right-skewed"
elif target_skew < 0:
    skew_label = "left-skewed"
else:
    skew_label = "symmetric"

print("\n📊 TARGET VARIABLE (House Values):")
print(f"   • Distribution is {skew_label} (skewness: {target_skew:.3f})")
print(f"   • Range: ${house_values.min()*100:.0f}k - ${house_values.max()*100:.0f}k")
if target_mean > target_median:
    print(f"   • Mean > Median indicates positive skew")
elif target_mean < target_median:
    print("   • Mean < Median indicates negative skew")
else:
    print("   • Mean = Median")
print(f"   • Consider log transformation for modeling")

# 2. Feature correlations.
# Drop the target's self-correlation explicitly and take the top three, rather
# than relying on the target sitting at position 0 of the sorted series.
print("\n🔗 STRONGEST PREDICTORS:")
target_corr_sorted = correlation_matrix['MedHouseVal'].abs().sort_values(ascending=False)
for feature, corr in target_corr_sorted.drop('MedHouseVal').head(3).items():
    print(f"   • {feature}: {corr:.3f} correlation")

# 3. Geographic insights
# NOTE(review): `Longitude > -122` keeps rows EAST of 122°W, so it is not a
# coastline filter; the "coastal" wording and count below should be confirmed
# or replaced with a real distance-to-coast measure.
# MedHouseVal > 3 means above $300k, given the $100k unit used in the plots above.
print("\n🗺️ GEOGRAPHIC PATTERNS:")
coastal_high = df[(df['Longitude'] > -122) & (df['MedHouseVal'] > 3)]
print(f"   • Coastal areas show higher property values")
print(f"   • {len(coastal_high)} high-value coastal properties identified")
print(f"   • Location features (Lat/Long) are important predictors")

# 4. Outlier insights: up to three features with more than 5% IQR outliers
print("\n🔍 OUTLIER ANALYSIS:")
high_outlier_features = [f for f, info in outlier_summary.items() if info['percentage'] > 5]
for feature in high_outlier_features[:3]:
    pct = outlier_summary[feature]['percentage']
    print(f"   • {feature}: {pct:.1f}% outliers - consider capping or transformation")
if not high_outlier_features:
    # Avoid an empty section when nothing crosses the threshold
    print("   • No feature has more than 5% IQR outliers")

# 5. Feature engineering recommendations
print("\n🛠️ FEATURE ENGINEERING RECOMMENDATIONS:")
print("   • Create 'RoomsPerHousehold' = AveRooms / AveOccup")
print("   • Create 'BedroomRatio' = AveBedrms / AveRooms")
print("   • Create 'PopulationDensity' = Population / (some area measure)")
print("   • Consider polynomial features for MedInc (strongest predictor)")
print("   • Geographic clustering based on Lat/Long")

# 6. Preprocessing recommendations.
# The missing-value line is computed from the frame instead of being asserted.
n_missing = int(df.isnull().sum().sum())
print("\n⚙️ PREPROCESSING RECOMMENDATIONS:")
print("   • Apply StandardScaler or RobustScaler for feature scaling")
print("   • Consider log transformation for target variable")
print("   • Handle outliers using IQR-based capping")
if n_missing == 0:
    print("   • No missing values - dataset is clean!")
else:
    print(f"   • {n_missing} missing values found - impute or drop before modeling")

print("\n" + "=" * 60)
print("✅ EDA COMPLETE - Ready for preprocessing and modeling!")