# House Price Prediction - Regression Analysis

This notebook implements a complete machine learning pipeline for predicting house prices using the Housing Prices Dataset from Kaggle (`yasserh/housing-prices-dataset`, downloaded with `kagglehub`).

## Project Overview
- **Problem Type**: Regression
- **Target**: House Price
- **Models**: Linear Regression, Random Forest, XGBoost
- **Approach**: Simple preprocessing first, focus on prediction accuracy

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
# Suppress every warning category so library chatter does not clutter cell output.
# NOTE(review): this blanket filter also hides deprecation warnings — consider narrowing it.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib's stock style sheet plus seaborn's "husl" colour cycle
plt.style.use('default')
sns.set_palette(palette="husl")

print("Libraries imported successfully!")

## 1. Data Acquisition

In [None]:
import kagglehub

# Fetch the most recent published version of the dataset; the call returns
# the local directory that holds the downloaded files.
path = kagglehub.dataset_download(handle="yasserh/housing-prices-dataset")

print(f"Path to dataset files: {path}")

In [None]:
# Load the dataset
import os

# os.listdir returns entries in arbitrary order, so sort to make the choice of
# "first CSV" reproducible across machines and runs. The extension check is
# case-insensitive so files named *.CSV are found as well.
csv_files = sorted(f for f in os.listdir(path) if f.lower().endswith('.csv'))
print("Available CSV files:", csv_files)

# Fail with a clear message instead of an IndexError when the download is empty
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in dataset directory: {path}")

# Load the main dataset (the first CSV file in sorted order)
data_file = os.path.join(path, csv_files[0])
df = pd.read_csv(data_file)

print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
df.head()

## 2. Data Exploration

In [None]:
# Structural overview of the raw frame: dimensions, per-column dtypes, null counts
print("Dataset Info:")
print(f"Shape: {df.shape}")

null_counts = df.isna().sum()  # isna() is the same method as isnull()
for heading, column_summary in (("\nData types:", df.dtypes), ("\nMissing values:", null_counts)):
    print(heading)
    print(column_summary)

In [None]:
# Statistical summary
# The heading is printed as plain text; the describe() frame is left as the cell's
# last expression so the notebook renders it as a rich table.
print("Statistical Summary:")
df.describe()

In [None]:
# List every column with its position so the target can be picked by eye if needed
print("Column names:")
for position, column_name in enumerate(df.columns):
    print(f"{position}: {column_name}")

# Candidate targets: any column whose name contains "price" (case-insensitive)
price_columns = list(filter(lambda name: 'price' in name.lower(), df.columns))
print(f"\nPotential target columns: {price_columns}")

In [None]:
# Pick the target: the first "price"-like column if one was found, otherwise the last column
target_col = df.columns[-1] if not price_columns else price_columns[0]
print(f"Using '{target_col}' as target variable")

target_values = df[target_col]

# Side-by-side view of the target: histogram on the left, boxplot on the right
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

hist_ax.hist(target_values, bins=50, alpha=0.7)
hist_ax.set_title(f'Distribution of {target_col}')
hist_ax.set_xlabel(target_col)
hist_ax.set_ylabel('Frequency')

box_ax.boxplot(target_values)
box_ax.set_title(f'Boxplot of {target_col}')
box_ax.set_ylabel(target_col)

fig.tight_layout()
plt.show()

# Headline statistics for the target
print(f"Target variable stats:")
for label, stat in (
    ("Mean", target_values.mean()),
    ("Median", target_values.median()),
    ("Std", target_values.std()),
):
    print(f"{label}: ${stat:,.2f}")

## 3. Data Preprocessing and Cleaning

In [None]:
# Work on a copy so the raw frame `df` stays untouched by the cleaning steps
df_clean = df.copy()

# Split predictors by dtype; the target is left out of the numerical list
numerical_cols = [
    name
    for name in df_clean.select_dtypes(include=[np.number]).columns
    if name != target_col
]
categorical_cols = list(df_clean.select_dtypes(include=['object']).columns)

print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"Target column: {target_col}")

In [None]:
# Remove extreme outliers from target variable (top and bottom 1%)
# The bounds and the filter are both computed from the untouched `df`, so re-running
# this cell always yields the same frame instead of trimming a further 1% each time.
Q1 = df[target_col].quantile(0.01)   # 1st percentile (not the first quartile)
Q99 = df[target_col].quantile(0.99)  # 99th percentile

print(f"Before outlier removal: {len(df)} rows")
print(f"Removing values below ${Q1:,.2f} and above ${Q99:,.2f}")

# between() is inclusive on both ends, i.e. Q1 <= value <= Q99.
# .copy() gives df_clean its own data so later column assignments do not touch `df`.
df_clean = df[df[target_col].between(Q1, Q99)].copy()
print(f"After outlier removal: {len(df_clean)} rows ({len(df) - len(df_clean)} removed)")

In [None]:
# Handle missing values
print("Missing values before cleaning:")
missing_before = df_clean.isnull().sum()
print(missing_before[missing_before > 0])

# Fill numerical missing values with median.
# The filled column is assigned back explicitly: `df_clean[col].fillna(..., inplace=True)`
# operates on a temporary Series and does not update the frame under pandas Copy-on-Write.
for col in numerical_cols:
    if df_clean[col].isnull().any():
        median_value = df_clean[col].median()
        df_clean[col] = df_clean[col].fillna(median_value)
        print(f"Filled {col} with median: {median_value:.2f}")

# Fill categorical missing values with mode (most frequent value);
# fall back to 'Unknown' when the column has no non-null value at all.
for col in categorical_cols:
    if df_clean[col].isnull().any():
        modes = df_clean[col].mode()
        mode_value = modes.iloc[0] if not modes.empty else 'Unknown'
        df_clean[col] = df_clean[col].fillna(mode_value)
        print(f"Filled {col} with mode: {mode_value}")

print(f"\nMissing values after cleaning: {df_clean.isnull().sum().sum()}")

In [None]:
# Integer-encode every categorical column on a fresh copy of the cleaned frame.
# Label encoding is used for simplicity (can switch to one-hot if needed).
df_encoded = df_clean.copy()

# One encoder per column, kept so the mapping can be looked up or inverted later
label_encoders = {name: LabelEncoder() for name in categorical_cols}

for name, encoder in label_encoders.items():
    # Values are cast to str first so mixed-type columns encode without error
    df_encoded[name] = encoder.fit_transform(df_encoded[name].astype(str))
    print(f"Encoded {name}: {len(encoder.classes_)} unique categories")

print(f"\nDataset shape after encoding: {df_encoded.shape}")
print("All columns are now numerical!")

## 4. Feature-Target Correlation Analysis

In [None]:
# Calculate correlation with target, ordered by absolute strength (sign is kept)
correlations = df_encoded.corr()[target_col].sort_values(key=abs, ascending=False)
correlations = correlations.drop(target_col)  # Remove self-correlation

print("Top 10 features most correlated with target:")
print(correlations.head(10))

# Plot correlation
plt.figure(figsize=(10, 6))
top_features = correlations.head(15)
plt.barh(range(len(top_features)), top_features.values)
plt.yticks(range(len(top_features)), top_features.index)
plt.xlabel('Correlation with Target')
plt.title('Top 15 Features Correlated with House Price')
# barh draws position 0 at the bottom; flip the axis so the strongest feature is on
# top, matching the orientation of the feature-importance plots later in the notebook.
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Data Splitting

In [None]:
# Separate predictors from the target
y = df_encoded[target_col]
X = df_encoded.drop(columns=target_col)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Two-stage split giving roughly 70/15/15 train/validation/test:
# first hold out 15% for testing, then carve validation out of the remainder.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.176, random_state=42  # 0.176 ≈ 0.15/0.85
)

print(f"\nData splits:")
for split_name, split_features in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    n_samples = split_features.shape[0]
    print(f"{split_name}: {n_samples} samples ({n_samples/len(X)*100:.1f}%)")

In [None]:
# Standardise features: statistics are learned from the training split only and
# then applied unchanged to the validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("Features scaled using StandardScaler")
for split_name, scaled_matrix in (
    ("Train", X_train_scaled),
    ("Validation", X_val_scaled),
    ("Test", X_test_scaled),
):
    print(f"{split_name} mean: {scaled_matrix.mean():.3f}, std: {scaled_matrix.std():.3f}")

## 6. Model Development and Training

In [None]:
# Define evaluation function
def evaluate_model(y_true, y_pred, model_name):
    """Print and return standard regression metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions aligned with ``y_true``.
    model_name : str
        Label used in the printed heading.

    Returns
    -------
    dict
        Keys ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'R2'`` mapped to the metric values.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    print(f"\n{model_name} Results:")
    print(f"MAE: ${mae:,.2f}")
    # MSE is in squared target units, so it is printed without a currency symbol
    print(f"MSE: {mse:,.2f}")
    print(f"RMSE: ${rmse:,.2f}")
    print(f"R²: {r2:.4f}")

    return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}

# Store results: maps "<Model>_<Split>" keys to the metric dicts returned above
results = {}

### 6.1 Linear Regression (Baseline)

In [None]:
# Fit the baseline linear model on the standardised training features
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Predict on the training and validation splits (same scaling as used for fitting)
lr_train_pred, lr_val_pred = (
    lr_model.predict(features) for features in (X_train_scaled, X_val_scaled)
)

# Score both splits and record the metrics under their result keys
print("=== LINEAR REGRESSION ===")
results.update({
    'Linear_Regression_Train': evaluate_model(y_train, lr_train_pred, "Linear Regression (Train)"),
    'Linear_Regression_Val': evaluate_model(y_val, lr_val_pred, "Linear Regression (Validation)"),
})

### 6.2 Random Forest

In [None]:
# Random Forest is fitted on the original unscaled features (RF doesn't require scaling)
rf_params = dict(n_estimators=100, random_state=42, n_jobs=-1)
rf_model = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Predict on the training and validation splits
rf_train_pred, rf_val_pred = (rf_model.predict(features) for features in (X_train, X_val))

# Score both splits and record the metrics under their result keys
print("=== RANDOM FOREST ===")
results.update({
    'Random_Forest_Train': evaluate_model(y_train, rf_train_pred, "Random Forest (Train)"),
    'Random_Forest_Val': evaluate_model(y_val, rf_val_pred, "Random Forest (Validation)"),
})

### 6.3 XGBoost

In [None]:
# Install and import XGBoost
try:
    import xgboost as xgb
except ImportError:
    print("Installing XGBoost...")
    !pip install xgboost
    import xgboost as xgb

# Train XGBoost
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    n_jobs=-1
)
xgb_model.fit(X_train, y_train)

# Predictions
xgb_train_pred = xgb_model.predict(X_train)
xgb_val_pred = xgb_model.predict(X_val)

# Evaluate
print("=== XGBOOST ===")
results['XGBoost_Train'] = evaluate_model(y_train, xgb_train_pred, "XGBoost (Train)")
results['XGBoost_Val'] = evaluate_model(y_val, xgb_val_pred, "XGBoost (Validation)")

## 7. Model Comparison and Results

In [None]:
# Create comparison DataFrame (one row per "<Model>_<Split>" key, one column per metric)
comparison_df = pd.DataFrame(results).T

# Derive the short row labels from the result keys themselves
# (e.g. 'Random_Forest_Val' -> 'RF_Val') instead of assigning a fixed list by position:
# a positional list silently mislabels rows whenever the training cells were executed
# in a different order, and fails outright if a model cell was skipped.
model_abbreviations = {'Linear_Regression': 'LR', 'Random_Forest': 'RF', 'XGBoost': 'XGB'}
short_labels = []
for result_key in comparison_df.index:
    model_key, sep, split_name = result_key.rpartition('_')
    short_labels.append(f"{model_abbreviations.get(model_key, model_key)}{sep}{split_name}")
comparison_df.index = pd.Index(short_labels, name='Model')

print("\n=== MODEL COMPARISON ===")
print(comparison_df.round(2))

# Extract validation results only, labelled with the human-readable model name
# ('Linear_Regression_Val' -> 'Linear Regression'); later cells match on these names.
val_keys = [result_key for result_key in results if result_key.endswith('_Val')]
val_results = pd.DataFrame({result_key: results[result_key] for result_key in val_keys}).T
val_results.index = [result_key[:-len('_Val')].replace('_', ' ') for result_key in val_keys]

print("\n=== VALIDATION SET PERFORMANCE ===")
print(val_results.round(2))

In [None]:
# 2x2 figure: three metric bar charts plus a text panel naming the best model
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

metrics = ['MAE', 'RMSE', 'R2']
models = val_results.index

# (axis, metric column, panel title, y-axis label) for each bar chart
bar_panels = [
    (axes[0, 0], 'MAE', 'Mean Absolute Error (Lower is Better)', 'MAE ($)'),
    (axes[0, 1], 'RMSE', 'Root Mean Squared Error (Lower is Better)', 'RMSE ($)'),
    (axes[1, 0], 'R2', 'R² Score (Higher is Better)', 'R²'),
]
for panel_ax, metric_name, panel_title, y_label in bar_panels:
    panel_ax.bar(models, val_results[metric_name])
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel(y_label)
    panel_ax.tick_params(axis='x', rotation=45)

# Best model summary
best_mae_model = val_results['MAE'].idxmin()
best_r2_model = val_results['R2'].idxmax()

# Text-only panel; positions are in axes-fraction coordinates
summary_ax = axes[1, 1]
summary_lines = [
    (0.8, 'Best Model Summary:', {'fontsize': 14, 'fontweight': 'bold'}),
    (0.6, f'Lowest MAE: {best_mae_model}', {'fontsize': 12}),
    (0.5, f'MAE: ${val_results.loc[best_mae_model, "MAE"]:,.0f}', {'fontsize': 12}),
    (0.3, f'Highest R²: {best_r2_model}', {'fontsize': 12}),
    (0.2, f'R²: {val_results.loc[best_r2_model, "R2"]:.3f}', {'fontsize': 12}),
]
for y_position, line_text, font_kwargs in summary_lines:
    summary_ax.text(0.1, y_position, line_text, transform=summary_ax.transAxes, **font_kwargs)
summary_ax.set_xlim(0, 1)
summary_ax.set_ylim(0, 1)
summary_ax.axis('off')

plt.tight_layout()
plt.show()

## 8. Final Model Evaluation on Test Set

In [None]:
# Select best model based on validation R²
best_model_name = val_results['R2'].idxmax()
print(f"Best model based on validation R²: {best_model_name}")

# Pair each model with the test matrix it expects: the linear model was fitted on
# standardised inputs, the tree ensembles on the raw features.
# Any name not listed here falls through to XGBoost.
test_inputs_by_model = {
    'Linear Regression': (lr_model, X_test_scaled),
    'Random Forest': (rf_model, X_test),
}
best_model, best_test_features = test_inputs_by_model.get(best_model_name, (xgb_model, X_test))
test_pred = best_model.predict(best_test_features)

# Evaluate on test set
print(f"\n=== FINAL TEST SET EVALUATION ({best_model_name}) ===")
test_results = evaluate_model(y_test, test_pred, f"{best_model_name} (Test Set)")

In [None]:
# Diagnostic plots for the selected model on the test set
fig, (scatter_ax, resid_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left: predicted vs actual, with the y = x reference line spanning the actual range
actual_min, actual_max = y_test.min(), y_test.max()
scatter_ax.scatter(y_test, test_pred, alpha=0.6)
scatter_ax.plot([actual_min, actual_max], [actual_min, actual_max], 'r--', lw=2)
scatter_ax.set_xlabel('Actual Price')
scatter_ax.set_ylabel('Predicted Price')
scatter_ax.set_title(f'{best_model_name}: Predictions vs Actual')
scatter_ax.grid(True, alpha=0.3)

# Right: residuals (actual minus predicted) against the predicted value
residuals = y_test - test_pred
resid_ax.scatter(test_pred, residuals, alpha=0.6)
resid_ax.axhline(y=0, color='r', linestyle='--')
resid_ax.set_xlabel('Predicted Price')
resid_ax.set_ylabel('Residuals')
resid_ax.set_title(f'{best_model_name}: Residual Plot')
resid_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 9. Feature Importance Analysis

In [None]:
# Explain the selected model: signed coefficients for the linear model,
# impurity/gain-based importances for RF and XGBoost.
if best_model_name == 'Linear Regression':
    # Rank coefficients by magnitude but plot the signed value
    coef_df = pd.DataFrame({'feature': X.columns, 'coefficient': lr_model.coef_})
    coef_df['abs_coefficient'] = coef_df['coefficient'].abs()
    coef_df = coef_df.sort_values('abs_coefficient', ascending=False)

    top_15_coef = coef_df.head(15)
    bar_positions = list(range(len(top_15_coef)))
    # Negative effects in red, positive effects in blue
    colors = np.where(top_15_coef['coefficient'] < 0, 'red', 'blue').tolist()

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_positions, top_15_coef['coefficient'], color=colors)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_15_coef['feature'])
    ax.set_xlabel('Coefficient Value')
    ax.set_title('Linear Regression: Top 15 Feature Coefficients')
    ax.invert_yaxis()  # largest magnitude on top
    ax.grid(axis='x', alpha=0.3)
    fig.tight_layout()
    plt.show()

    print("Top 10 Features by Absolute Coefficient:")
    print(coef_df.head(10))

elif best_model_name in ('Random Forest', 'XGBoost'):
    feature_importance = (
        pd.DataFrame({'feature': X.columns, 'importance': best_model.feature_importances_})
        .sort_values('importance', ascending=False)
    )

    top_15_features = feature_importance.head(15)
    bar_positions = list(range(len(top_15_features)))

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_positions, top_15_features['importance'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_15_features['feature'])
    ax.set_xlabel('Feature Importance')
    ax.set_title(f'{best_model_name}: Top 15 Most Important Features')
    ax.invert_yaxis()  # most important feature on top
    ax.grid(axis='x', alpha=0.3)
    fig.tight_layout()
    plt.show()

    print("Top 10 Most Important Features:")
    print(feature_importance.head(10))

## 10. Summary and Conclusions

In [None]:
# Final text report: dataset size, target statistics, test metrics of the selected
# model, and the validation scores of every model.
rule = "="*60
raw_rows, raw_cols = df.shape
clean_rows = len(df_clean)
clean_target = df_clean[target_col]

print("\n" + rule)
print("                    PROJECT SUMMARY")
print(rule)

print("\n📊 Dataset Information:")
print(f"   • Original size: {raw_rows:,} samples, {raw_cols} features")
print(f"   • After cleaning: {clean_rows:,} samples ({(clean_rows/len(df)*100):.1f}% retained)")
print(f"   • Features used: {X.shape[1]} (after encoding)")

print(f"\n🎯 Target Variable ({target_col}):")
print(f"   • Mean: ${clean_target.mean():,.0f}")
print(f"   • Median: ${clean_target.median():,.0f}")
print(f"   • Range: ${clean_target.min():,.0f} - ${clean_target.max():,.0f}")

print(f"\n🏆 Best Model: {best_model_name}")
print(f"   • Test R²: {test_results['R2']:.4f}")
print(f"   • Test RMSE: ${test_results['RMSE']:,.0f}")
print(f"   • Test MAE: ${test_results['MAE']:,.0f}")

print("\n📈 Model Performance Comparison (Validation):")
for display_name, metric_row in val_results.iterrows():
    print(f"   • {display_name}: R² = {metric_row['R2']:.4f}, MAE = ${metric_row['MAE']:,.0f}")

print("\n💡 Key Insights:")
print(f"   • Model explains {test_results['R2']*100:.1f}% of price variance")
print(f"   • Average prediction error: ${test_results['MAE']:,.0f}")
print("   • Data preprocessing successfully handled missing values and outliers")
print("   • Simple approaches work well - complex feature engineering may not be needed")

print("\n" + rule)
print("                   PROJECT COMPLETE ✅")
print(rule)