In [1]:
"""
Complete Visualization Generator for House Price Prediction Project
Creates all charts and plots needed for Report and Presentation

Run this after training your model to generate all visualizations
Saves all plots to 'outputs/visualizations/' folder
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Make sure the folder that receives every figure exists before plotting
os.makedirs('outputs/visualizations', exist_ok=True)

# Shared look-and-feel applied to all figures produced below
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Start-up banner
banner_rule = "=" * 70
print(banner_rule)
print("HOUSE PRICE PREDICTION - VISUALIZATION GENERATOR")
print(banner_rule)

# ============================================
# LOAD DATA
# ============================================
print("\n📂 Loading data...")

# Reads the training set from 'train.csv' in the working directory.
# Only a missing file is reported as "not found"; any other failure (parse
# error, permission problem, keyboard interrupt) propagates with its real
# traceback instead of being mislabelled by a catch-all handler.
try:
    train_df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("❌ Error: train.csv not found!")
    # Raise SystemExit directly rather than calling the interactive exit()
    # helper, and use a non-zero status so callers can detect the failure.
    raise SystemExit(1)
else:
    print(f"✓ Loaded training data: {train_df.shape}")

# ============================================
# 1. PRICE DISTRIBUTION
# ============================================
print("\n📊 Creating Price Distribution Plot...")

sale_price = train_df['SalePrice']
median_price = sale_price.median()

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_raw, ax_log = axes

# Left panel: prices in dollars, with the median marked by a dashed line
ax_raw.hist(sale_price, bins=50, color='skyblue', edgecolor='black', alpha=0.7)
ax_raw.set(xlabel='Sale Price ($)', ylabel='Frequency')
ax_raw.set_title('Distribution of House Prices (Original)', fontweight='bold')
ax_raw.axvline(median_price, color='red', linestyle='--', linewidth=2,
               label=f'Median: ${median_price:,.0f}')
ax_raw.legend()
ax_raw.grid(alpha=0.3)

# Right panel: the same prices after a log1p transform
ax_log.hist(np.log1p(sale_price), bins=50, color='lightcoral',
            edgecolor='black', alpha=0.7)
ax_log.set(xlabel='Log(Sale Price)', ylabel='Frequency')
ax_log.set_title('Distribution of House Prices (Log-Transformed)', fontweight='bold')
ax_log.grid(alpha=0.3)

fig.tight_layout()
fig.savefig('outputs/visualizations/01_price_distribution.png', dpi=300, bbox_inches='tight')
plt.close(fig)
print("✓ Saved: 01_price_distribution.png")

# ============================================
# 2. CORRELATION HEATMAP
# ============================================
print("\n📊 Creating Correlation Heatmap...")

# Pairwise correlations between all numeric columns
numerical_features = train_df.select_dtypes(include=[np.number]).columns
correlation_matrix = train_df[numerical_features].corr()

# Keep the 16 columns with the largest |correlation| to SalePrice
# (SalePrice correlates perfectly with itself, so this is the target
# plus 15 features)
target_strength = correlation_matrix['SalePrice'].abs()
top_features = target_strength.sort_values(ascending=False).head(16).index

heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    train_df[top_features].corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Heatmap - Top 15 Features vs SalePrice',
                     fontweight='bold', fontsize=16)
heatmap_fig.tight_layout()
heatmap_fig.savefig('outputs/visualizations/02_correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.close(heatmap_fig)
print("✓ Saved: 02_correlation_heatmap.png")

# ============================================
# 3. TOP FEATURES CORRELATION BAR CHART
# ============================================
print("\n📊 Creating Top Features Correlation Chart...")

# Ten features with the strongest linear relationship to SalePrice,
# ranked by magnitude (direction is ignored). The target is removed by
# label rather than by position, so it can never slip into the chart
# even if another column ties with it at |r| = 1.
top_corr = (
    correlation_matrix['SalePrice']
    .drop('SalePrice')
    .abs()
    .sort_values(ascending=False)
    .head(10)
)

plt.figure(figsize=(12, 8))
colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(top_corr)))
plt.barh(range(len(top_corr)), top_corr.values, color=colors, edgecolor='black')
plt.yticks(range(len(top_corr)), top_corr.index)
# The bars show |r|, so the axis label says so explicitly
plt.xlabel('Absolute Correlation Coefficient', fontweight='bold')
plt.title('Top 10 Features Correlated with Sale Price', fontweight='bold', fontsize=16)
plt.grid(axis='x', alpha=0.3)

# Print each value just to the right of its bar
for i, value in enumerate(top_corr.values):
    plt.text(value + 0.02, i, f'{value:.3f}', va='center', fontweight='bold')

plt.tight_layout()
plt.savefig('outputs/visualizations/03_top_features_correlation.png', dpi=300, bbox_inches='tight')
plt.close()
print("✓ Saved: 03_top_features_correlation.png")

# ============================================
# 4. SCATTER PLOTS - KEY RELATIONSHIPS
# ============================================
print("\n📊 Creating Scatter Plots...")

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# One entry per panel, in row-major order:
# (feature column, colormap, x-axis label, panel title)
scatter_panels = [
    ('OverallQual', 'viridis', 'Overall Quality (1-10)', 'Overall Quality vs Sale Price'),
    ('GrLivArea', 'plasma', 'Living Area (sq ft)', 'Living Area vs Sale Price'),
    ('YearBuilt', 'coolwarm', 'Year Built', 'Year Built vs Sale Price'),
    ('TotalBsmtSF', 'spring', 'Basement Area (sq ft)', 'Basement Area vs Sale Price'),
]

for panel_ax, (column, cmap_name, x_label, panel_title) in zip(axes.flat, scatter_panels):
    # Points are coloured by sale price, so colour and height carry the same value
    panel_ax.scatter(train_df[column], train_df['SalePrice'],
                     alpha=0.6, c=train_df['SalePrice'], cmap=cmap_name, s=50)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Sale Price ($)')
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.grid(alpha=0.3)

fig.tight_layout()
fig.savefig('outputs/visualizations/04_scatter_plots.png', dpi=300, bbox_inches='tight')
plt.close(fig)
print("✓ Saved: 04_scatter_plots.png")

# ============================================
# 5. NEIGHBORHOOD ANALYSIS
# ============================================
print("\n📊 Creating Neighborhood Analysis...")

# Median sale price per neighborhood, most expensive first
neighborhood_prices = (
    train_df.groupby('Neighborhood')['SalePrice']
    .median()
    .sort_values(ascending=False)
)
overall_median = train_df['SalePrice'].median()
bar_positions = range(len(neighborhood_prices))

# Green bars sit above the overall median, coral bars at or below it
bar_colors = ['green' if price > overall_median else 'coral'
              for price in neighborhood_prices.values]

fig, ax = plt.subplots(figsize=(14, 8))
ax.barh(bar_positions, neighborhood_prices.values, color=bar_colors, edgecolor='black')
ax.set_yticks(bar_positions)
ax.set_yticklabels(neighborhood_prices.index)
ax.set_xlabel('Median Sale Price ($)', fontweight='bold')
ax.set_title('Median House Prices by Neighborhood', fontweight='bold', fontsize=16)
ax.axvline(overall_median, color='red', linestyle='--', linewidth=2,
           label=f'Overall Median: ${overall_median:,.0f}')
ax.legend()
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
fig.savefig('outputs/visualizations/05_neighborhood_prices.png', dpi=300, bbox_inches='tight')
plt.close(fig)
print("✓ Saved: 05_neighborhood_prices.png")

# ============================================
# 6. BOX PLOTS - CATEGORICAL FEATURES
# ============================================
print("\n📊 Creating Box Plots...")

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Category values shown on each panel's x-axis, in display order
quality_levels = sorted(train_df['OverallQual'].unique())
# House styles are ordered by ascending median price
house_styles = train_df.groupby('HouseStyle')['SalePrice'].median().sort_values().index
bedroom_counts = sorted(train_df['BedroomAbvGr'].unique())
# Bedroom counts above 6 are left out of the chart
shown_bedrooms = [count for count in bedroom_counts if count <= 6]
bath_counts = sorted(train_df['FullBath'].unique())

# (axes, column, categories, tick-label options, x-axis label, panel title)
boxplot_panels = [
    (axes[0, 0], 'OverallQual', quality_levels, {},
     'Overall Quality', 'Price Distribution by Quality Rating'),
    (axes[0, 1], 'HouseStyle', house_styles, {'rotation': 45},
     'House Style', 'Price Distribution by House Style'),
    (axes[1, 0], 'BedroomAbvGr', shown_bedrooms, {},
     'Number of Bedrooms', 'Price Distribution by Bedroom Count'),
    (axes[1, 1], 'FullBath', bath_counts, {},
     'Number of Full Bathrooms', 'Price Distribution by Bathroom Count'),
]

for panel_ax, column, categories, tick_options, x_label, panel_title in boxplot_panels:
    # One array of sale prices per category value
    data_to_plot = [train_df.loc[train_df[column] == category, 'SalePrice'].values
                    for category in categories]
    panel_ax.boxplot(data_to_plot)
    panel_ax.set_xticklabels(categories, **tick_options)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Sale Price ($)')
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.grid(alpha=0.3)

fig.tight_layout()
fig.savefig('outputs/visualizations/06_boxplots_categorical.png', dpi=300, bbox_inches='tight')
plt.close(fig)
print("✓ Saved: 06_boxplots_categorical.png")

# ============================================
# 7. MISSING VALUES BAR CHART
# ============================================
print("\n📊 Creating Missing Values Visualization...")

# Null count per column, largest first, keeping only columns that have nulls
missing_data = (
    train_df.isnull()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)

if missing_data.empty:
    print("⚠️ No missing values found in dataset")
else:
    total_rows = len(train_df)
    bar_positions = range(len(missing_data))
    shades = plt.cm.Reds(np.linspace(0.3, 0.9, len(missing_data)))

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh(bar_positions, missing_data.values, color=shades, edgecolor='black')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(missing_data.index)
    ax.set_xlabel('Number of Missing Values', fontweight='bold')
    ax.set_title('Missing Values by Feature', fontweight='bold', fontsize=16)
    ax.grid(axis='x', alpha=0.3)

    # Annotate each bar with its count and its share of all rows
    for row, count in enumerate(missing_data.values):
        ax.text(count + 10, row, f'{count} ({count/total_rows*100:.1f}%)',
                va='center', fontweight='bold')

    fig.tight_layout()
    fig.savefig('outputs/visualizations/07_missing_values.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("✓ Saved: 07_missing_values.png")

# ============================================
# 8. MODEL COMPARISON (if models exist)
# ============================================
print("\n📊 Creating Model Comparison Chart...")

# Hard-coded performance figures (update with your actual results)
model_results = {
    'Model': ['Lasso', 'Ridge', 'Gradient\nBoosting', 'Random\nForest', 'Linear\nRegression'],
    'R2_Score': [0.9002, 0.8967, 0.8945, 0.8876, 0.8789],
    'RMSE': [0.1256, 0.1289, 0.1312, 0.1378, 0.1445]
}


def _draw_metric_panel(panel_ax, metric, pick_best, base_color, y_label, panel_title, y_limits=None):
    """Draw one metric as a bar per model, highlighting the best model in green.

    `pick_best` is `max` or `min`, depending on whether higher or lower is
    better for `metric`. Returns the bar container produced by `Axes.bar`.
    """
    values = model_results[metric]
    best_value = pick_best(values)
    bar_colors = ['green' if value == best_value else base_color for value in values]

    drawn = panel_ax.bar(model_results['Model'], values,
                         color=bar_colors, edgecolor='black', linewidth=2)
    panel_ax.set_ylabel(y_label, fontweight='bold')
    panel_ax.set_title(panel_title, fontweight='bold')
    if y_limits is not None:
        panel_ax.set_ylim(y_limits)
    panel_ax.grid(axis='y', alpha=0.3)

    # Value label centred just above each bar
    for bar, value in zip(drawn, values):
        panel_ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.002,
                      f'{value:.4f}', ha='center', va='bottom', fontweight='bold')
    return drawn


fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left: R² with a fixed y-range; right: RMSE with the default y-range
bars1 = _draw_metric_panel(axes[0], 'R2_Score', max, 'skyblue', 'R² Score',
                           'Model Performance - R² Score (Higher is Better)',
                           y_limits=[0.85, 0.92])
bars2 = _draw_metric_panel(axes[1], 'RMSE', min, 'coral', 'RMSE',
                           'Model Performance - RMSE (Lower is Better)')

fig.tight_layout()
fig.savefig('outputs/visualizations/08_model_comparison.png', dpi=300, bbox_inches='tight')
plt.close(fig)
print("✓ Saved: 08_model_comparison.png")

# ============================================
# 9. FEATURE IMPORTANCE (Lasso Coefficients)
# ============================================
print("\n📊 Creating Feature Importance Chart...")

# Best-effort section: the chart is skipped (with a message) when the model
# artefacts are absent or unusable; the rest of the script still runs.
#
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file. Only load house_model.pkl / feature_names.pkl that you created.
try:
    with open('house_model.pkl', 'rb') as f:
        model = pickle.load(f)
    with open('feature_names.pkl', 'rb') as f:
        feature_names = pickle.load(f)
except Exception as e:
    # Only loading failures are reported here, so the message stays accurate
    print(f"⚠️ Could not load model: {e}")
    print("   Run model training first to generate feature importance")
else:
    if not hasattr(model, 'coef_'):
        print("⚠️ Model doesn't have coefficients (might be tree-based)")
    else:
        from matplotlib.patches import Patch

        try:
            # Flatten so a coef_ of shape (1, n_features) is accepted as well
            # as a 1-D one; a genuine length mismatch with feature_names still
            # raises and is reported below.
            coefficients = pd.Series(np.ravel(model.coef_), index=feature_names)
            top_features = coefficients.abs().sort_values(ascending=False).head(20)

            # Bar length is |coefficient|; colour carries the sign
            colors = ['green' if coef > 0 else 'red' for coef in coefficients[top_features.index]]

            importance_fig = plt.figure(figsize=(12, 10))
            try:
                plt.barh(range(len(top_features)), top_features.values, color=colors,
                         edgecolor='black', alpha=0.7)
                plt.yticks(range(len(top_features)), top_features.index)
                plt.xlabel('Absolute Coefficient Value', fontweight='bold')
                plt.title('Top 20 Most Important Features (Lasso Model)', fontweight='bold', fontsize=16)
                plt.grid(axis='x', alpha=0.3)

                # Legend explaining the sign colours
                legend_elements = [Patch(facecolor='green', edgecolor='black', label='Positive Impact'),
                                   Patch(facecolor='red', edgecolor='black', label='Negative Impact')]
                plt.legend(handles=legend_elements, loc='lower right')

                plt.tight_layout()
                plt.savefig('outputs/visualizations/09_feature_importance.png', dpi=300, bbox_inches='tight')
                print("✓ Saved: 09_feature_importance.png")
            finally:
                # Always release the figure, even if drawing or saving failed
                plt.close(importance_fig)
        except Exception as e:
            print(f"⚠️ Could not create feature importance chart: {e}")

# ============================================
# 10. PRICE STATISTICS SUMMARY
# ============================================
print("\n📊 Creating Price Statistics Summary...")

sale_price = train_df['SalePrice']

# Dollar-valued statistics, in the order they appear in the table
dollar_stats = [
    ('Mean', sale_price.mean()),
    ('Median', sale_price.median()),
    ('Std Dev', sale_price.std()),
    ('Min', sale_price.min()),
    ('Max', sale_price.max()),
    ('Q1 (25%)', sale_price.quantile(0.25)),
    ('Q3 (75%)', sale_price.quantile(0.75)),
]

# The row count is a plain number; every other row is formatted as whole dollars
price_stats = {
    'Statistic': ['Count'] + [name for name, _ in dollar_stats],
    'Value': [f"{len(train_df):,}"] + [f"${amount:,.0f}" for _, amount in dollar_stats],
}

fig, ax = plt.subplots(figsize=(10, 6))
ax.axis('tight')
ax.axis('off')

table_rows = [list(pair) for pair in zip(price_stats['Statistic'], price_stats['Value'])]
table = ax.table(
    cellText=table_rows,
    colLabels=['Statistic', 'Value'],
    cellLoc='left',
    loc='center',
    colWidths=[0.4, 0.6],
)

table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1, 2.5)

# Header row (row 0): white bold text on a coloured background
for col in range(2):
    header_cell = table[(0, col)]
    header_cell.set_facecolor('#667eea')
    header_cell.set_text_props(weight='bold', color='white')

# Shade every even-numbered body row in light grey
for row in range(2, len(price_stats['Statistic']) + 1, 2):
    for col in (0, 1):
        table[(row, col)].set_facecolor('#f0f0f0')

plt.title('Sale Price Statistics Summary', fontweight='bold', fontsize=16, pad=20)
plt.savefig('outputs/visualizations/10_price_statistics.png', dpi=300, bbox_inches='tight')
plt.close()
print("✓ Saved: 10_price_statistics.png")

# ============================================
# 11. YEAR BUILT DISTRIBUTION
# ============================================
print("\n📊 Creating Year Built Distribution...")

year_built = train_df['YearBuilt']
median_year = year_built.median()

fig, ax = plt.subplots(figsize=(14, 6))
ax.hist(year_built, bins=50, color='teal', edgecolor='black', alpha=0.7)
ax.set_xlabel('Year Built', fontweight='bold')
ax.set_ylabel('Frequency', fontweight='bold')
ax.set_title('Distribution of House Construction Years', fontweight='bold', fontsize=16)
# Dashed red line marks the median construction year
ax.axvline(median_year, color='red', linestyle='--', linewidth=2,
           label=f'Median: {median_year:.0f}')
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()
fig.savefig('outputs/visualizations/11_year_built_distribution.png', dpi=300, bbox_inches='tight')
plt.close(fig)
print("✓ Saved: 11_year_built_distribution.png")

# ============================================
# 12. LIVING AREA DISTRIBUTION
# ============================================
print("\n📊 Creating Living Area Distribution...")

living_area = train_df['GrLivArea']
median_area = living_area.median()

fig, ax = plt.subplots(figsize=(14, 6))
ax.hist(living_area, bins=50, color='orange', edgecolor='black', alpha=0.7)
ax.set_xlabel('Living Area (sq ft)', fontweight='bold')
ax.set_ylabel('Frequency', fontweight='bold')
ax.set_title('Distribution of Above Ground Living Area', fontweight='bold', fontsize=16)
# Dashed red line marks the median living area
ax.axvline(median_area, color='red', linestyle='--', linewidth=2,
           label=f'Median: {median_area:.0f} sq ft')
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()
fig.savefig('outputs/visualizations/12_living_area_distribution.png', dpi=300, bbox_inches='tight')
plt.close(fig)
print("✓ Saved: 12_living_area_distribution.png")

# ============================================
# SUMMARY
# ============================================
print("\n" + "=" * 70)
print("✅ VISUALIZATION GENERATION COMPLETE!")
print("=" * 70)
print("\n📁 All visualizations saved to: outputs/visualizations/")

# Every file this script can produce. Sections 7 and 9 skip their chart under
# some conditions, so the report below is built from what is on disk rather
# than assuming all twelve were written.
# Caveat: a file left over from an earlier run is also counted as present.
expected_files = [
    '01_price_distribution.png',
    '02_correlation_heatmap.png',
    '03_top_features_correlation.png',
    '04_scatter_plots.png',
    '05_neighborhood_prices.png',
    '06_boxplots_categorical.png',
    '07_missing_values.png',
    '08_model_comparison.png',
    '09_feature_importance.png',
    '10_price_statistics.png',
    '11_year_built_distribution.png',
    '12_living_area_distribution.png',
]
output_dir = 'outputs/visualizations'
generated_files = [name for name in expected_files
                   if os.path.isfile(os.path.join(output_dir, name))]
missing_files = [name for name in expected_files if name not in generated_files]

print("\n📊 Generated Files:")
for name in generated_files:
    print(f"   {name}")

# Tell the user which charts were skipped instead of listing them as generated
if missing_files:
    print("\n⚠️ Not generated (see messages above):")
    for name in missing_files:
        print(f"   {name}")

print("\n💡 Usage:")
print("   - Use these images in your PowerPoint presentation")
print("   - Include in your research report")
print("   - Reference in your EDA section")
print("\n🎉 Ready for submission!")

HOUSE PRICE PREDICTION - VISUALIZATION GENERATOR

📂 Loading data...
✓ Loaded training data: (1460, 81)

📊 Creating Price Distribution Plot...
✓ Saved: 01_price_distribution.png

📊 Creating Correlation Heatmap...
✓ Saved: 02_correlation_heatmap.png

📊 Creating Top Features Correlation Chart...
✓ Saved: 03_top_features_correlation.png

📊 Creating Scatter Plots...
✓ Saved: 04_scatter_plots.png

📊 Creating Neighborhood Analysis...
✓ Saved: 05_neighborhood_prices.png

📊 Creating Box Plots...
✓ Saved: 06_boxplots_categorical.png

📊 Creating Missing Values Visualization...
✓ Saved: 07_missing_values.png

📊 Creating Model Comparison Chart...
✓ Saved: 08_model_comparison.png

📊 Creating Feature Importance Chart...
✓ Saved: 09_feature_importance.png

📊 Creating Price Statistics Summary...
✓ Saved: 10_price_statistics.png

📊 Creating Year Built Distribution...
✓ Saved: 11_year_built_distribution.png

📊 Creating Living Area Distribution...
✓ Saved: 12_living_area_distribution.png

✅ VISUALIZATION 