# House Prices Prediction - Ames, Iowa

A comprehensive machine learning project for predicting house prices using the Ames Housing dataset.

## Table of Contents
1. [Data Loading and Initial Exploration](#data-loading)
2. [Data Preprocessing (including Feature Engineering)](#data-preprocessing)
3. [Feature Selection](#feature-selection)
4. [Model Training and Hyperparameter Tuning](#model-training)
5. [Model Stacking](#model-stacking)
6. [Final Predictions and Results](#results)


## 1. Data Loading and Initial Exploration {#data-loading}


In [None]:
# Import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning packages
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, HuberRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")


In [None]:
# Read the training and test CSV files from the working directory
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train (1).csv', 'test (1).csv')
)

# Stack both frames row-wise (training rows first) under a fresh 0..n-1 index
# so that preprocessing is applied to them together
combined_data = pd.concat((train_data, test_data), axis=0, ignore_index=True)

# Report the size of each frame
print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
print(f"Combined data shape: {combined_data.shape}")

# Preview the training data (rendered as the cell's output)
print("\nFirst few rows of training data:")
train_data.head()


## 2. Data Preprocessing {#data-preprocessing}


In [None]:
class HousePricePreprocessor:
    """
    Preprocessing pipeline for the combined Ames Housing train/test frame.

    ``preprocess`` applies, in order:
    - Outlier removal (labelled training rows only)
    - Missing value imputation
    - Feature engineering and ordinal encoding
    - One-hot encoding of the remaining categorical columns
    - Removal of redundant columns

    Row labels of the input frame are preserved by every step, so processed
    rows can be traced back to the original data through the index.
    """

    def __init__(self):
        # Result of the most recent preprocess() call
        self.processed_data = None
        # Reserved attributes; not populated by this class
        self.target = None
        self.feature_names = None

    def preprocess(self, data):
        """
        Main preprocessing pipeline for the housing data.

        Parameters:
        -----------
        data : pd.DataFrame
            Combined training and test data. Test rows are recognised by a
            missing ``SalePrice``.

        Returns:
        --------
        pd.DataFrame : Processed data ready for modeling (still containing
            the ``SalePrice`` column when the input had one)
        """
        # Create a copy to avoid modifying original data
        df = data.copy()

        # Step 1: Handle outliers
        df = self._handle_outliers(df)

        # Step 2: Impute missing values
        df = self._impute_missing_values(df)

        # Step 3: Feature engineering
        df = self._create_features(df)

        # Step 4: Encode categorical variables
        df = self._encode_categorical_variables(df)

        # Step 5: Remove unnecessary columns
        df = self._remove_unnecessary_columns(df)

        self.processed_data = df
        return df

    @staticmethod
    def _replace_codes(series, mapping):
        """Replace the values of ``series`` found in ``mapping``; others are kept."""
        # infer_objects() restores a numeric dtype on pandas versions where
        # replace() no longer downcasts an object column on its own.
        return series.replace(mapping).infer_objects()

    def _handle_outliers(self, df):
        """
        Remove extreme outliers from the labelled (training) rows.

        A row is an outlier when GrLivArea > 4500 sq ft or LotFrontage > 300 ft.
        Rows whose ``SalePrice`` is missing are treated as test rows and are
        never removed, because every test row still needs a prediction.
        When the frame has no ``SalePrice`` column at all, every outlier row
        is removed.
        """
        is_outlier = (df['GrLivArea'] > 4500) | (df['LotFrontage'] > 300)

        if 'SalePrice' in df.columns:
            # Restrict removal to rows that carry a target value
            is_outlier = is_outlier & df['SalePrice'].notna()

        return df.drop(index=df.index[is_outlier.to_numpy()])

    def _impute_missing_values(self, df):
        """Handle missing values with appropriate imputation strategies."""
        # MSZoning: Fill with most common value (RL)
        df['MSZoning'] = df['MSZoning'].fillna('RL')

        # LotFrontage: Fill with neighborhood mean
        lot_frontage = df.groupby('Neighborhood')['LotFrontage'].transform(
            lambda x: x.fillna(x.mean())
        )
        # A neighborhood without any observed frontage has a NaN mean; fall
        # back to the overall mean so the integer cast cannot fail on NaN.
        df['LotFrontage'] = lot_frontage.fillna(df['LotFrontage'].mean()).astype(int)

        # Alley: NA means no alley access; encode as a 0/1 "has alley" flag
        df['Alley'] = df['Alley'].notna().astype(int)

        # Drop columns that are not used as features by this pipeline
        df = df.drop(['Condition2', 'Utilities', 'RoofMatl', 'LowQualFinSF', 'MiscVal'], axis=1)

        # GarageYrBlt: Fill with YearBuilt
        df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])

        # Categorical columns where a missing value is filled with the mode
        mode_filled = ['Exterior1st', 'Exterior2nd', 'Electrical',
                       'KitchenQual', 'Functional', 'SaleType']
        for var in mode_filled:
            df[var] = df[var].fillna(df[var].mode()[0])

        # Categorical columns where a missing value means the feature is absent
        none_filled = ['MasVnrType',
                       'BsmtQual', 'BsmtFinType1', 'BsmtFinType2', 'BsmtCond', 'BsmtExposure',
                       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
                       'GarageCond', 'PoolQC', 'Fence']
        for var in none_filled:
            df[var] = df[var].fillna('None')

        # Numeric columns where a missing value is filled with zero
        zero_filled = ['MasVnrArea',
                       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                       'GarageCars', 'GarageArea', 'BsmtHalfBath', 'BsmtFullBath']
        for var in zero_filled:
            df[var] = df[var].fillna(0)

        return df

    def _create_features(self, df):
        """Create new engineered features and ordinal-encode quality columns."""
        # Age-related features
        df['Age'] = df['YrSold'] - df['YearBuilt']
        df['YearsSinceRemodel'] = df['YrSold'] - df['YearRemodAdd']
        df['Remodeled'] = (df['YearBuilt'] != df['YearRemodAdd']).astype(int)
        df['RecentRemodel'] = (df['YearRemodAdd'] == df['YrSold']).astype(int)
        df['YearSoldYearBuilt'] = (df['YearBuilt'] == df['YrSold']).astype(int)
        df['SqYearBuilt'] = df['YearBuilt'] ** 2

        # Area features
        area_cols = ['LotArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
                     'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
                     '3SsnPorch', 'ScreenPorch']
        df['TotalArea'] = df[area_cols].sum(axis=1)
        df['Area1st2nd'] = df['1stFlrSF'] + df['2ndFlrSF']
        df['Has2ndFloor'] = (df['2ndFlrSF'] > 0).astype(int)

        # Quality mappings. The exterior scale starts at 0 for 'Po' because
        # it has no 'None' level; the general scale reserves 0 for 'None'.
        quality_dict = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
        exterior_quality_dict = {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
        bsmt_exposure_dict = {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
        bsmt_finish_dict = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}

        # (columns, mapping) pairs applied in place
        ordinal_encodings = [
            (['ExterQual', 'ExterCond'], exterior_quality_dict),
            (['BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu',
              'GarageQual', 'GarageCond', 'PoolQC'], quality_dict),
            (['BsmtExposure'], bsmt_exposure_dict),
            (['BsmtFinType1', 'BsmtFinType2'], bsmt_finish_dict),
        ]
        for columns, mapping in ordinal_encodings:
            for col in columns:
                df[col] = self._replace_codes(df[col], mapping)

        # Functional mapping
        functional_dict = {None: 0, 'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4,
                           'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8}
        df['Functional'] = df['Functional'].map(functional_dict).astype(int)

        # Fence mapping
        fence_dict = {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}
        df['Fence'] = df['Fence'].map(fence_dict).astype(int)

        # Binary features
        df['CentralAir'] = (df['CentralAir'] == 'Y').astype(int)
        presence_flags = {
            'HasFireplace': 'Fireplaces',
            'HasPool': 'PoolArea',
            'HasWoodDeck': 'WoodDeckSF',
            'HasEnclosedPorch': 'EnclosedPorch',
            'Has3SsnPorch': '3SsnPorch',
            'HasScreenPorch': 'ScreenPorch',
        }
        for flag, source in presence_flags.items():
            df[flag] = (df[source] > 0).astype(int)
        df['HasShed'] = (df['MiscFeature'] == 'Shed').astype(int)

        # Lot shape: 1 for regular lots, 0 for any irregular shape
        df['LotShape'] = self._replace_codes(
            df['LotShape'], {'Reg': 1, 'IR1': 0, 'IR2': 0, 'IR3': 0}
        )

        # Land slope
        df['IsSlopeGentle'] = (df['LandSlope'] == 'Gtl').astype(int)
        df = df.drop('LandSlope', axis=1)

        # MSSubClass encoding (codes missing from the mapping are kept as-is)
        df['IsNewerSubClass'] = df['MSSubClass'].replace({
            20: 1, 30: 0, 40: 0, 45: 0, 50: 0, 60: 1, 70: 0, 75: 0, 80: 0,
            85: 0, 90: 0, 120: 1, 150: 0, 160: 1, 180: 0, 190: 0
        })

        # Railroad proximity
        railroad_list = ['RRNn', 'RRAn', 'RRNe', 'RRAe']
        df['RRProximity'] = df['Condition1'].isin(railroad_list).astype(int)

        # Convert to string for dummification
        df['MSSubClass'] = df['MSSubClass'].astype(str)
        df['MoSold'] = df['MoSold'].astype(str)

        return df

    def _encode_categorical_variables(self, df):
        """Create dummy variables for categorical features."""
        # Get dummies for all categorical variables
        df = pd.get_dummies(df, drop_first=True)
        return df

    def _remove_unnecessary_columns(self, df):
        """Remove columns that don't add value or cause overfitting."""
        # Remove railroad condition dummies (already captured in RRProximity)
        railroad_cols = [col for col in df.columns if 'Condition1_RR' in col]
        df = df.drop(railroad_cols, axis=1)

        # Remove Id column
        if 'Id' in df.columns:
            df = df.drop('Id', axis=1)

        return df

    def get_processed_data(self):
        """Return the processed data."""
        return self.processed_data

# Initialize preprocessor
preprocessor = HousePricePreprocessor()
print("Preprocessor class created successfully!")


In [None]:
# Apply preprocessing
print("Starting data preprocessing...")
processed_data = preprocessor.preprocess(combined_data)

# Split back into train and test sets.
# Rows are told apart by whether they carry a SalePrice label rather than by
# position: preprocessing removes outlier rows, so the first len(train_data)
# rows of the processed frame are not guaranteed to be the training rows.
# NOTE(review): assumes every training row has a SalePrice - confirm.
is_train_row = processed_data['SalePrice'].notna()
train_processed = processed_data[is_train_row].copy()
test_processed = processed_data[~is_train_row].copy()

# Number of training rows that survived preprocessing
train_size = len(train_processed)

# Extract target variable
target = train_processed['SalePrice'].copy()
train_processed = train_processed.drop('SalePrice', axis=1)
test_processed = test_processed.drop('SalePrice', axis=1)

print(f"Processed training data shape: {train_processed.shape}")
print(f"Processed test data shape: {test_processed.shape}")
print(f"Number of features: {train_processed.shape[1]}")
print(f"Missing values in processed data: {train_processed.isnull().sum().sum()}")

# Display feature information
print(f"\nFeature types:")
print(f"Numeric features: {train_processed.select_dtypes(include=[np.number]).shape[1]}")
print(f"Categorical features: {train_processed.select_dtypes(include=['object']).shape[1]}")


## 3. Feature Selection {#feature-selection}


In [None]:
class FeatureSelector:
    """
    Chooses feature subsets either per model with RFECV or by thresholding
    the absolute correlation between each feature and the target.
    """

    def __init__(self):
        # model name -> list of selected column names (latest RFECV run)
        self.selected_features = {}
        # model name -> {column name: importance}, for estimators exposing
        # ``feature_importances_``
        self.feature_importance = {}

    def select_features_rfecv(self, X, y, models, cv=3):
        """
        Run Recursive Feature Elimination with Cross-Validation once per model.

        Parameters:
        -----------
        X : pd.DataFrame
            Training features
        y : pd.Series
            Target variable
        models : dict
            Maps a model name to the estimator instance handed to RFECV
        cv : int
            Number of cross-validation folds

        Returns:
        --------
        dict : Model name -> list of selected feature names
        """
        chosen = {}

        for name, estimator in models.items():
            print(f"Selecting features for {name}...")

            # Eliminate one feature per step, scored by negative MSE
            selector = RFECV(
                estimator=estimator,
                step=1,
                cv=cv,
                scoring='neg_mean_squared_error',
                n_jobs=-1,
            )
            selector.fit(X, y)

            # Column names retained by the selector
            kept_columns = X.columns[selector.get_support()].tolist()
            chosen[name] = kept_columns

            print(f"  Selected {len(kept_columns)} features")

            # Keep importances when the final estimator provides them;
            # zip() pairs each kept column with its importance value
            fitted = selector.estimator_
            if hasattr(fitted, 'feature_importances_'):
                self.feature_importance[name] = dict(
                    zip(kept_columns, fitted.feature_importances_)
                )

        self.selected_features = chosen
        return chosen

    def get_correlation_features(self, X, y, threshold=0.1):
        """
        Pick the features whose absolute correlation with the target exceeds
        ``threshold``.

        Parameters:
        -----------
        X : pd.DataFrame
            Training features
        y : pd.Series
            Target variable
        threshold : float
            Minimum absolute correlation (exclusive)

        Returns:
        --------
        list : Selected feature names, in column order
        """
        # Absolute correlation of every column with the target
        abs_corr = X.corrwith(y).abs()

        # NaN correlations never compare greater than the threshold
        selected = [column for column, value in abs_corr.items() if value > threshold]

        print(f"Selected {len(selected)} features based on correlation threshold {threshold}")
        return selected

# Create the selector that records the per-model feature subsets
feature_selector = FeatureSelector()

# Estimators used for feature selection, keyed by model name
models = dict(
    Ridge=Ridge(alpha=1.0),
    Lasso=Lasso(alpha=1.0),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
    GradientBoosting=GradientBoostingRegressor(n_estimators=100, random_state=42),
    XGBoost=XGBRegressor(n_estimators=100, random_state=42),
)

# Run RFECV once per estimator on the processed training data
print("Performing feature selection using RFECV...")
selected_features = feature_selector.select_features_rfecv(
    X=train_processed, y=target, models=models
)

# Show at most the first ten selected features of every model
for model_name, features in selected_features.items():
    preview = features[:10]
    suffix = '...' if len(features) > 10 else ''
    print(f"\n{model_name} selected features ({len(features)}):")
    print(f"  {preview}{suffix}")


## 4. Model Training and Hyperparameter Tuning {#model-training}


In [None]:
class ModelTrainer:
    """
    Tunes each base model with GridSearchCV on its own feature subset and
    keeps the resulting estimators for later prediction.

    State is held in plain dicts keyed by model name:
    - trained_models: best estimator returned by the grid search
    - best_params: hyperparameters of that estimator
    - cv_scores: its mean cross-validated MSE (lower is better)
    - selected_features: the feature columns it was trained on
    """

    def __init__(self):
        self.trained_models = {}
        self.best_params = {}
        self.cv_scores = {}
        self.selected_features = {}

    def train_models(self, X, y, selected_features, cv=3):
        """
        Train and tune multiple models with their selected features.

        Parameters:
        -----------
        X : pd.DataFrame
            Training features
        y : pd.Series
            Target variable
        selected_features : dict
            Selected features for each model. Keys must be among 'Ridge',
            'Lasso', 'RandomForest', 'GradientBoosting' and 'XGBoost'.
        cv : int
            Number of cross-validation folds

        Raises:
        -------
        ValueError : If ``selected_features`` names a model that is not
            supported. Raised before any model is trained.
        """
        # Define parameter grids for each model
        param_grids = {
            'Ridge': {'alpha': np.logspace(-3, 3, 10)},
            'Lasso': {'alpha': np.logspace(-3, 3, 10)},
            'RandomForest': {
                'n_estimators': [100, 200, 300],
                'max_depth': [10, 20, None],
                'min_samples_split': [2, 5, 10]
            },
            'GradientBoosting': {
                'n_estimators': [100, 200, 300],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7]
            },
            'XGBoost': {
                'n_estimators': [100, 200, 300],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7]
            }
        }

        # Factories for the untuned base estimators (built only when needed)
        model_factories = {
            'Ridge': lambda: Ridge(),
            'Lasso': lambda: Lasso(),
            'RandomForest': lambda: RandomForestRegressor(random_state=42),
            'GradientBoosting': lambda: GradientBoostingRegressor(random_state=42),
            'XGBoost': lambda: XGBRegressor(random_state=42),
        }

        # Reject unknown names up front instead of silently reusing the
        # estimator of a previous iteration or failing midway
        unknown = [name for name in selected_features if name not in model_factories]
        if unknown:
            raise ValueError(
                f"Unsupported model name(s): {unknown}. "
                f"Supported: {sorted(model_factories)}"
            )

        # Train each model
        for model_name, features in selected_features.items():
            print(f"\nTraining {model_name}...")

            # Get features for this model
            X_model = X[features]

            # Perform grid search
            grid_search = GridSearchCV(
                model_factories[model_name](),
                param_grids[model_name],
                cv=cv,
                scoring='neg_mean_squared_error',
                n_jobs=-1,
                verbose=0
            )

            # Fit the model
            grid_search.fit(X_model, y)

            # Store results; the scorer is negated MSE, so flip the sign
            self.trained_models[model_name] = grid_search.best_estimator_
            self.best_params[model_name] = grid_search.best_params_
            self.cv_scores[model_name] = -grid_search.best_score_
            # Remember the columns so prediction does not depend on globals
            self.selected_features[model_name] = list(features)

            print(f"  Best parameters: {grid_search.best_params_}")
            print(f"  CV RMSE: {np.sqrt(-grid_search.best_score_):.4f}")

    def get_predictions(self, X, model_name):
        """
        Get predictions from a trained model.

        Parameters:
        -----------
        X : pd.DataFrame
            Features for prediction; must contain the columns the model was
            trained on
        model_name : str
            Name of the model

        Returns:
        --------
        np.array : Predictions

        Raises:
        -------
        ValueError : If the model is unknown or its feature list was not
            recorded
        """
        if model_name not in self.trained_models:
            raise ValueError(f"Model {model_name} not found")
        if model_name not in self.selected_features:
            raise ValueError(f"No feature list recorded for model {model_name}")

        # Restrict the frame to the columns this model was trained on
        X_model = X[self.selected_features[model_name]]

        # Make predictions
        return self.trained_models[model_name].predict(X_model)

    def get_model_performance(self):
        """
        Get performance summary of all trained models.

        Returns:
        --------
        pd.DataFrame : One row per model with columns 'Model', 'CV_RMSE'
            (square root of the stored CV MSE) and 'Features_Used', sorted
            by ascending 'CV_RMSE'. Empty when no model has been trained.
        """
        performance_data = [
            {
                'Model': model_name,
                # cv_scores holds MSE; report it on the RMSE scale
                'CV_RMSE': float(np.sqrt(self.cv_scores[model_name])),
                'Features_Used': len(self.selected_features.get(model_name, [])),
            }
            for model_name in self.trained_models
        ]

        # Explicit columns keep sort_values working on an empty summary
        summary = pd.DataFrame(
            performance_data, columns=['Model', 'CV_RMSE', 'Features_Used']
        )
        return summary.sort_values('CV_RMSE')

# Create the trainer that will hold the tuned estimators
model_trainer = ModelTrainer()

# Grid-search every model on its own feature subset
print("Starting model training and hyperparameter tuning...")
model_trainer.train_models(
    X=train_processed, y=target, selected_features=selected_features
)

# Build the per-model summary, then print it under its heading
performance_df = model_trainer.get_model_performance()
print("\nModel Performance Summary:")
print(performance_df)


## 5. Model Stacking {#model-stacking}


In [None]:
class ModelStacker:
    """
    Model stacking class for creating ensemble predictions.

    Level-0 predictions come from the base models held by a ModelTrainer;
    a Level-1 meta-model is then fitted on those predictions.
    """

    def __init__(self):
        # Fitted Level-1 model (None until train_stacked_model succeeds)
        self.stacked_model = None
        # Hold-out predictions of the base models, one column per model
        self.level0_predictions = None

    def create_level0_predictions(self, X, y, model_trainer, selected_features):
        """
        Create Level-0 predictions from all base models on a 20% hold-out.

        NOTE: the estimators stored in ``model_trainer.trained_models`` are
        refitted in place on the 80% training split, so later predictions
        made through ``model_trainer`` come from these refitted models.

        Parameters:
        -----------
        X : pd.DataFrame
            Training features
        y : pd.Series
            Target variable
        model_trainer : ModelTrainer
            Trained model trainer instance
        selected_features : dict
            Selected features for each model

        Returns:
        --------
        tuple : (pd.DataFrame of hold-out predictions with one column per
            model, indexed like the hold-out rows; hold-out target values)
        """
        # Split data for stacking
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        # Create Level-0 predictions
        level0_predictions = {}

        for model_name, model in model_trainer.trained_models.items():
            print(f"Creating Level-0 predictions for {model_name}...")

            # Get features for this model
            features = selected_features[model_name]

            # Train on training set (refits the trainer's estimator in place)
            model.fit(X_train[features], y_train)

            # Predict on validation set
            level0_predictions[model_name] = model.predict(X_val[features])

        # Create DataFrame of Level-0 predictions
        self.level0_predictions = pd.DataFrame(level0_predictions, index=X_val.index)

        return self.level0_predictions, y_val

    def train_stacked_model(self, X, y, meta_models=None):
        """
        Train the Level-1 (stacked) model.

        Every candidate is scored with 3-fold cross-validation; the one with
        the lowest CV RMSE is fitted on all of ``X`` and stored in
        ``self.stacked_model``.

        Parameters:
        -----------
        X : pd.DataFrame
            Level-0 predictions
        y : pd.Series
            Target variable
        meta_models : list, optional
            Names of the meta-models to try. Supported names are
            'LinearRegression', 'GradientBoosting' and 'XGBoost'; all three
            are tried when omitted.

        Raises:
        -------
        ValueError : If ``meta_models`` is empty, contains an unsupported
            name, or no candidate yields a usable CV score
        """
        # Factories are lazy so candidates are only built when requested
        model_factories = {
            'LinearRegression': lambda: LinearRegression(),
            'GradientBoosting': lambda: GradientBoostingRegressor(random_state=42),
            'XGBoost': lambda: XGBRegressor(random_state=42),
        }

        # None (not a mutable list) is the default; expand it here
        if meta_models is None:
            meta_models = ['LinearRegression', 'GradientBoosting', 'XGBoost']

        if len(meta_models) == 0:
            raise ValueError("meta_models must contain at least one model name")

        unknown = [name for name in meta_models if name not in model_factories]
        if unknown:
            raise ValueError(
                f"Unsupported meta-model name(s): {unknown}. "
                f"Supported: {sorted(model_factories)}"
            )

        best_score = float('inf')
        best_model = None
        best_name = None

        for model_name in meta_models:
            print(f"Training stacked model: {model_name}")

            model = model_factories[model_name]()

            # Cross-validation score (scorer is negated MSE)
            cv_scores = cross_val_score(model, X, y, cv=3, scoring='neg_mean_squared_error')
            cv_rmse = np.sqrt(-cv_scores.mean())

            print(f"  CV RMSE: {cv_rmse:.4f}")

            if cv_rmse < best_score:
                best_score = cv_rmse
                best_model = model
                best_name = model_name

        # A NaN score never compares lower, which would leave no winner
        if best_model is None:
            raise ValueError("No meta-model produced a usable cross-validation score")

        # Train best model on full data
        best_model.fit(X, y)
        self.stacked_model = best_model

        print(f"\nBest stacked model: {best_name} with RMSE: {best_score:.4f}")

    def predict_stacked(self, X):
        """
        Make predictions using the stacked model.

        Parameters:
        -----------
        X : pd.DataFrame
            Level-0 predictions

        Returns:
        --------
        np.array : Stacked predictions

        Raises:
        -------
        ValueError : If no stacked model has been trained yet
        """
        if self.stacked_model is None:
            raise ValueError("Stacked model not trained yet")

        return self.stacked_model.predict(X)

# Create the stacker
stacker = ModelStacker()

# Hold-out predictions of every base model, plus the matching targets
print("Creating Level-0 predictions...")
level0_preds, y_val = stacker.create_level0_predictions(
    X=train_processed,
    y=target,
    model_trainer=model_trainer,
    selected_features=selected_features,
)

# Fit the meta-model on the hold-out predictions
print("\nTraining stacked model...")
stacker.train_stacked_model(level0_preds, y_val)

# RMSE of the stacked model on the hold-out predictions.
# NOTE: these are the same rows the meta-model was just fitted on, so this
# figure is an in-sample score for the meta-model.
stacked_mse = mean_squared_error(y_val, stacker.predict_stacked(level0_preds))
stacked_score = np.sqrt(stacked_mse)
print(f"Stacked model validation RMSE: {stacked_score:.4f}")


## 6. Final Predictions and Results {#results}


In [None]:
# Create final predictions for test set
print("Creating final predictions for test set...")

# Get Level-0 predictions for test set (one entry per base model, in the
# same model order that was used to train the stacked model)
test_level0_predictions = {}

for model_name in model_trainer.trained_models:
    print(f"Generating test predictions for {model_name}...")

    # get_predictions selects the model's own feature columns itself
    test_level0_predictions[model_name] = model_trainer.get_predictions(
        test_processed, model_name
    )

# Create DataFrame of test Level-0 predictions
test_level0_df = pd.DataFrame(test_level0_predictions)

# Generate final stacked predictions
final_predictions = stacker.predict_stacked(test_level0_df)

# Look the Ids up from the original data instead of assuming a fixed
# 1461.. range, so every prediction is paired with the Id of its own row.
# Relies on test_processed keeping the row labels of combined_data.
test_ids = combined_data.loc[test_processed.index, 'Id'].to_numpy()

# Create submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': final_predictions
})

# Save submission
submission.to_csv('final_submission.csv', index=False)
print(f"Final submission saved with {len(submission)} predictions")

# Display prediction statistics
print(f"\nPrediction Statistics:")
print(f"Mean prediction: ${final_predictions.mean():,.2f}")
print(f"Min prediction: ${final_predictions.min():,.2f}")
print(f"Max prediction: ${final_predictions.max():,.2f}")
print(f"Std prediction: ${final_predictions.std():,.2f}")

# Display first few predictions
print(f"\nFirst 10 predictions:")
print(submission.head(10))


In [None]:
# Model Performance Visualization: six panels on a 2x3 grid
plt.figure(figsize=(15, 10))

# Panel 1: cross-validated error of each base model
plt.subplot(2, 3, 1)
performance_df = model_trainer.get_model_performance()
plt.bar(performance_df['Model'], performance_df['CV_RMSE'])
plt.title('Individual Model Performance (CV RMSE)')
plt.ylabel('RMSE')
plt.xticks(rotation=45)

# Panel 2: ten largest Random Forest importances (left empty when the
# selector recorded none for that model)
plt.subplot(2, 3, 2)
rf_importance = feature_selector.feature_importance.get('RandomForest')
if rf_importance is not None:
    ranked = sorted(rf_importance.items(), key=lambda pair: pair[1], reverse=True)
    top_features = ranked[:10]
    features, importance = zip(*top_features)
    plt.barh(features, importance)
    plt.title('Top 10 Features (Random Forest)')
    plt.xlabel('Importance')

# Panel 3: histogram of the final test-set predictions
plt.subplot(2, 3, 3)
plt.hist(final_predictions, bins=50, alpha=0.7, color='skyblue')
plt.title('Final Prediction Distribution')
plt.xlabel('Predicted Sale Price ($)')
plt.ylabel('Frequency')

# Panel 4: correlation between the base models' test predictions
plt.subplot(2, 3, 4)
correlation_matrix = test_level0_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Level-0 Predictions Correlation')

# Panel 5: number of selected features per model
plt.subplot(2, 3, 5)
model_names = list(selected_features.keys())
feature_counts = [len(columns) for columns in selected_features.values()]
plt.bar(model_names, feature_counts)
plt.title('Number of Features per Model')
plt.ylabel('Feature Count')
plt.xticks(rotation=45)

# Panel 6: stacked predictions against actual prices on the hold-out rows
plt.subplot(2, 3, 6)
if getattr(stacker, 'level0_predictions', None) is not None:
    stacked_val_preds = stacker.predict_stacked(stacker.level0_predictions)
    price_range = [y_val.min(), y_val.max()]
    plt.scatter(y_val, stacked_val_preds, alpha=0.6)
    # Dashed red diagonal marks perfect agreement
    plt.plot(price_range, price_range, 'r--', lw=2)
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    plt.title('Prediction vs Actual (Validation)')

plt.tight_layout()
plt.show()

# Final Summary
separator = "=" * 60
best_row = performance_df.iloc[0]  # the frame is sorted by ascending CV_RMSE
summary_lines = [
    separator,
    "FINAL PROJECT SUMMARY",
    separator,
    "Dataset: Ames Housing (2,919 properties, 80+ features)",
    "Preprocessing: Outlier removal, feature engineering, encoding",
    "Feature Selection: RFECV for each model",
    f"Models Trained: {len(model_trainer.trained_models)}",
    f"Best Individual Model: {best_row['Model']} (RMSE: {best_row['CV_RMSE']:.4f})",
    f"Stacked Model: {type(stacker.stacked_model).__name__}",
    f"Final Predictions: {len(final_predictions)} test samples",
    "Submission File: final_submission.csv",
    separator,
]
for line in summary_lines:
    print(line)
