<a href="https://colab.research.google.com/github/nazimulrahmann/house_price_prediction/blob/main/price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing Dependencies**

In [None]:
!pip install catboost

In [2]:
# === Suppress warnings ===
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings in notebook output

# === Typing and datetime ===
from typing import Union, Tuple, Dict, List, Optional  # For type hints and static typing in function definitions
from datetime import datetime  # To handle date and time data

# === Data manipulation ===
import numpy as np  # For numerical computing and array handling
import pandas as pd  # For handling tabular data using DataFrames

# === Visualization ===
import matplotlib.pyplot as plt  # Basic plotting
import seaborn as sns  # Statistical visualizations with better aesthetics

# === Statistical tools ===
from scipy import stats  # General statistical functions and tests
from scipy.stats import zscore, randint, loguniform, uniform  # Specific functions for normalization and sampling

# === Scikit-learn core tools for modeling ===
from sklearn.model_selection import train_test_split  # Split data into training and testing sets
from sklearn.model_selection import GridSearchCV  # Exhaustive hyperparameter tuning using cross-validation
from sklearn.model_selection import KFold, cross_val_score  # K-Fold cross-validation and scoring models

# === Evaluation Metrics ===
from sklearn.metrics import (
    mean_squared_error,  # Measures average squared difference between estimated and actual values
    mean_absolute_error,  # Measures average absolute difference
    r2_score,  # R-squared (coefficient of determination)
    explained_variance_score,  # How well future samples are likely to be explained
    max_error,  # Maximum residual error
    mean_absolute_percentage_error  # MAE as a percentage of true values
)

# === Preprocessing utilities ===
from sklearn.impute import SimpleImputer  # Fill missing values using mean/median/mode etc.
from sklearn.preprocessing import (
    StandardScaler,  # Standardize features (mean=0, std=1)
    MinMaxScaler,  # Scale features to a given range (default 0 to 1)
    RobustScaler,  # Scale features using statistics robust to outliers
    OneHotEncoder,  # Convert categorical variables into binary vectors
    OrdinalEncoder  # Encode categorical features as integer values
)

# === Feature engineering ===
from sklearn.feature_selection import (
    SelectKBest,  # Select top k features based on a scoring function
    f_regression,  # Scoring function used in regression-based feature selection
    VarianceThreshold  # Remove features with low variance (constant features)
)

# === Pipelines and column transformation ===
from sklearn.pipeline import Pipeline, make_pipeline  # Combine preprocessing and modeling steps
from sklearn.compose import ColumnTransformer, make_column_transformer  # Handle preprocessing for different column types

# === Regression models ===

# --- Linear Models ---
from sklearn.linear_model import (
    LinearRegression,  # Ordinary least squares regression
    Ridge,  # L2 regularized linear regression
    Lasso,  # L1 regularized linear regression (feature selection)
    ElasticNet,  # Combination of L1 and L2 penalties
    BayesianRidge,  # Bayesian linear regression
    ARDRegression,  # Bayesian regression with automatic relevance determination
    SGDRegressor,  # Linear model fitted by stochastic gradient descent
    HuberRegressor,  # Robust to outliers using Huber loss
    TheilSenRegressor,  # Robust linear model using median slopes
    RANSACRegressor,  # Fits a model robust to outliers
    PassiveAggressiveRegressor,  # Online learning algorithm
    OrthogonalMatchingPursuit  # Sparse linear regression
)

# --- Tree-based Models ---
from sklearn.tree import DecisionTreeRegressor  # Non-linear regression using decision trees
from sklearn.ensemble import (
    RandomForestRegressor,  # Ensemble of decision trees with averaging
    GradientBoostingRegressor,  # Boosted ensemble of trees for better accuracy
    AdaBoostRegressor,  # Boosting technique using weighted averages
    ExtraTreesRegressor,  # Similar to Random Forest but with more randomness
    BaggingRegressor,  # Bagging ensemble method
    StackingRegressor,  # Combines multiple models using a meta-model
    VotingRegressor,  # Averages predictions from multiple models
    HistGradientBoostingRegressor  # Faster implementation of Gradient Boosting with histogram binning
)

# --- Support Vector Machines ---
from sklearn.svm import SVR, LinearSVR, NuSVR  # Regression using support vector machines

# --- Nearest Neighbors ---
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor  # Instance-based regressors

# --- Neural Networks ---
from sklearn.neural_network import MLPRegressor  # Multi-layer perceptron for regression

# --- Gaussian Processes ---
from sklearn.gaussian_process import GaussianProcessRegressor  # Non-parametric kernel-based probabilistic model
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, RationalQuadratic  # Kernels used in Gaussian Processes

# --- Other regressors ---
from sklearn.kernel_ridge import KernelRidge  # Ridge regression with kernels
from sklearn.cross_decomposition import PLSRegression  # Partial Least Squares Regression
from sklearn.isotonic import IsotonicRegression  # Non-linear regression that preserves order
from sklearn.dummy import DummyRegressor  # Baseline model for comparison
from sklearn.compose import TransformedTargetRegressor  # Transform target variable during training

# === Advanced ensemble models from external libraries ===
from xgboost import XGBRegressor  # Gradient boosting model from XGBoost
from lightgbm import LGBMRegressor  # Gradient boosting from LightGBM
from catboost import CatBoostRegressor  # Gradient boosting from CatBoost, handles categorical variables natively

# === Model stacking ===
from mlxtend.regressor import StackingCVRegressor  # Cross-validated stacking of multiple regressors

**Data Loading**

In [3]:
# Read the training and test splits into DataFrames.
# NOTE(review): both paths are hard-coded absolute '/content/...' locations, so the
# CSV files must already exist there before this cell is run.
train_df = pd.read_csv('/content/house_price_train_data.csv')
test_df = pd.read_csv('/content/house_price_test_data.csv')

In [None]:
# Sanity check after loading: print the (rows, columns) shape of each split.
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

**Data Overview**

In [None]:
def safe_sample(train_df: pd.DataFrame, num: int):
    """Return up to ``num`` randomly chosen rows of ``train_df``.

    The sample size is capped at the number of rows in the frame, so asking
    for more rows than exist returns every row instead of raising. Sampling
    uses a fixed seed (42), so repeated calls return the same rows.

    Args:
        train_df: DataFrame to sample from.
        num: Maximum number of rows to return.

    Returns:
        A DataFrame holding the sampled rows.
    """
    available_rows = len(train_df)
    sample_size = num if num < available_rows else available_rows
    return train_df.sample(n=sample_size, random_state=42)

# Preview the training data from three angles: the start, the end, and a random draw.

# First 3 rows (head returns fewer if the frame has fewer rows)
print("\nFirst 3 rows:")
display(train_df.head(3))

# Last 3 rows
print("\nLast 3 rows:")
display(train_df.tail(3))

# 3 random rows; safe_sample caps the request at the number of available rows
print("\nRandom 3 rows:")
display(safe_sample(train_df, num=3))

In [None]:
# Print a concise summary of the training data (columns, non-null counts, dtypes)
train_df.info()

**Columns Analysis**

In [None]:
def column_types_analysis(df: pd.DataFrame):
    """Summarize how many columns of each dtype a DataFrame has.

    Prints a banner, displays the per-dtype column counts, and draws a pie
    chart of the same counts.

    Args:
        df: pandas DataFrame to analyze
    """
    banner = "=" * 50
    print(banner)
    print("COLUMN DATA TYPES ANALYSIS")
    print(banner)

    # Number of columns per dtype, most common dtype first
    dtype_frequencies = df.dtypes.value_counts()
    display(dtype_frequencies)

    # One pie slice per dtype, labelled with its share of the columns
    plt.figure(figsize=(12, 8))
    pastel_colors = sns.color_palette('pastel')
    dtype_frequencies.plot.pie(
        autopct='%1.1f%%',
        startangle=90,
        colors=pastel_colors,
    )
    plt.title('Distribution of Column Data Types')
    plt.ylabel('')  # drop the default y-axis label
    plt.show()
# Summarize and plot the dtype mix of the training data
column_types_analysis(df=train_df)

**Unique Values Analysis**

In [None]:
def unique_values_analysis(df: pd.DataFrame,
                           columns: Optional[Union[str, List[str]]] = None,
                           plot: bool = True,
                           max_unique_plot: int = 20) -> None:
    """
    Analyze and visualize unique values in specified DataFrame columns.

    Prints the number of distinct values (NaN counted as a value) per column,
    sorted in descending order. When ``plot`` is True, every column with at
    most ``max_unique_plot`` distinct values gets its own figure: a percentage
    bar chart for object/category columns or columns with at most 10 distinct
    values, and a histogram with KDE otherwise.

    Args:
        df: pandas DataFrame to analyze.
        columns: Column(s) to analyze. If None, all columns are used.
            Names that are not in ``df`` are reported and skipped; repeated
            names are analyzed once.
        plot: Whether to generate visualizations.
        max_unique_plot: Max number of unique values to plot per column.
    """

    print("="*50)
    print("UNIQUE VALUES ANALYSIS")
    print("="*50)

    # Normalize `columns` to a list of names
    if columns is None:
        requested = df.columns.tolist()
    elif isinstance(columns, str):
        requested = [columns]
    else:
        requested = list(columns)

    # Drop repeated names (order preserved) so each column maps to one count
    requested = list(dict.fromkeys(requested))

    valid_columns = [col for col in requested if col in df.columns]
    unknown_columns = [col for col in requested if col not in df.columns]
    if unknown_columns:
        # Tell the caller instead of silently ignoring misspelled names
        print(f"Skipping columns not found in the DataFrame: {unknown_columns}")

    if not valid_columns:
        print("No valid columns found to analyze.")
        return

    # Distinct values per column (NaN included), highest cardinality first
    unique_counts = df[valid_columns].nunique(dropna=False).sort_values(ascending=False)

    print("\nNumber of Unique Values per Column:")
    print(unique_counts)

    if not plot:
        return

    # Only columns with a plottable number of distinct values get a figure
    cols_to_plot = unique_counts[unique_counts <= max_unique_plot].index

    for col in cols_to_plot:
        fig, ax = plt.subplots(figsize=(10, 5))

        col_dtype = df[col].dtype
        is_categorical = (
            pd.api.types.is_object_dtype(col_dtype)
            or isinstance(col_dtype, pd.CategoricalDtype)
        )

        if is_categorical or unique_counts[col] <= 10:
            # Share of each value in percent, NaN included. No truncation is
            # needed: the column already has at most max_unique_plot values.
            value_counts = df[col].value_counts(normalize=True, dropna=False) * 100

            value_counts.plot(kind='bar', color='skyblue', ax=ax)
            ax.set_title(f'Distribution of {col}')
            ax.set_ylabel('Percentage')
            ax.tick_params(axis='x', rotation=45)
        else:
            # Numeric column with more than 10 distinct values: histogram + KDE
            numeric_data = df[col].dropna()

            sns.histplot(numeric_data, kde=True, color='skyblue', bins=30, ax=ax)
            ax.set_title(f'Distribution of {col}')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')

        fig.tight_layout()
        plt.show()
# Analyze every column of the training data (columns=None) with plotting enabled
unique_values_analysis(train_df)

**Statistical Analysis**

In [None]:
def describe_data(df: pd.DataFrame, include_all: bool = False) -> None:
    """
    Print descriptive statistics for a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to analyze.
        include_all (bool): If True, print one combined ``describe(include='all')``
                            table. If False, print a numeric table followed by a
                            categorical (object/category) table, or a short
                            notice for whichever group has no columns.
    """
    rule = "=" * 50
    print(rule)
    print("DESCRIPTIVE STATISTICS")
    print(rule)

    # Nothing to describe for an empty frame
    if df.empty:
        print("The DataFrame is empty. No statistics to display.")
        return

    # Combined view: a single table covering every column type
    if include_all:
        print(df.describe(include='all').to_string())
        return

    # Separate view: (dtype selector, heading when present, notice when absent)
    sections = (
        (np.number,
         "\nNumeric Columns Statistics:",
         "\nNo numeric columns found."),
        (['object', 'category'],
         "\nCategorical Columns Statistics:",
         "\nNo categorical columns found."),
    )
    for dtype_selector, heading, absent_notice in sections:
        selected = df.select_dtypes(include=dtype_selector).columns
        if selected.empty:
            print(absent_notice)
            continue
        print(heading)
        # to_string() avoids pandas truncating wide tables
        print(df[selected].describe().to_string())

# Default mode: numeric and categorical statistics printed as two separate tables
describe_data(train_df)

# include_all=True: a single combined table covering every column
describe_data(train_df, include_all=True)

**Univariate Numerical Column Analysis**


In [None]:
def numerical_cols_analysis(df, figsize=(15, 10), bins=30, color='skyblue'):
    """
    Perform univariate analysis on all numerical columns in a DataFrame
    with multiple visualization types.

    For every numeric column this prints missing-value, mean, median and range
    information and draws one figure containing a histogram with KDE, a box
    plot and a normal Q-Q plot. Columns with no non-missing values are
    summarized but not plotted.

    Parameters:
    - df: Pandas DataFrame
    - figsize: Tuple for figure size
    - bins: Number of bins for histograms
    - color: Color for plots

    Returns:
    - Dictionary keyed by column name; each value is a dict with 'count',
      'missing', 'mean', 'median', 'min', 'max', 'std', 'skew' and 'kurtosis'.
      Returns None when the DataFrame has no numerical columns.
    """

    # Select only numerical columns
    numerical_cols = df.select_dtypes(include=[np.number]).columns

    if len(numerical_cols) == 0:
        print("No numerical columns found in the DataFrame.")
        return None

    stats_results = {}

    for col in numerical_cols:
        print(f"\n{'='*50}")
        print(f"UNIVARIATE ANALYSIS FOR: {col}")
        print(f"{'='*50}")

        # Data summary (statistics are computed on non-missing values only)
        col_data = df[col].dropna()
        n_missing = df[col].isna().sum()
        stats_results[col] = {
            'count': len(col_data),
            'missing': n_missing,
            'mean': col_data.mean(),
            'median': col_data.median(),
            'min': col_data.min(),
            'max': col_data.max(),
            'std': col_data.std(),
            'skew': col_data.skew(),
            'kurtosis': col_data.kurtosis()
        }

        print(f"\nSummary Statistics:")
        print(f"- Missing values: {n_missing} ({n_missing/len(df):.1%})")
        print(f"- Mean: {stats_results[col]['mean']:.2f}")
        print(f"- Median: {stats_results[col]['median']:.2f}")
        print(f"- Range: {stats_results[col]['min']:.2f} to {stats_results[col]['max']:.2f}")

        # Nothing can be drawn for a column that is entirely missing
        if col_data.empty:
            print("- No non-missing values; skipping plots.")
            continue

        # Create figure with subplots
        fig, axes = plt.subplots(2, 3, figsize=figsize)
        fig.suptitle(f'Univariate Analysis: {col}', fontsize=16)

        # Histogram with KDE (this is where `bins` is applied)
        sns.histplot(col_data, bins=bins, kde=True, color=color, ax=axes[0, 0])
        axes[0, 0].set_title(f'Histogram of {col}')

        # Box plot
        sns.boxplot(x=col_data, color=color, ax=axes[0, 1])
        axes[0, 1].set_title(f'Box Plot of {col}')

        # Q-Q plot against a normal distribution
        stats.probplot(col_data, dist="norm", plot=axes[1, 0])
        axes[1, 0].set_title(f'Q-Q Plot of {col}')

        # Hide the grid cells that hold no plot so the figure has no blank frames
        for unused_ax in (axes[0, 2], axes[1, 1], axes[1, 2]):
            unused_ax.set_visible(False)

        plt.tight_layout()
        plt.show()

    return stats_results
# Run the analysis
numerical_cols_analysis(train_df)

**Univariate Categorical Column Analysis**

In [None]:
def categorical_univariate_analysis(df, figsize=(15, 10), top_n=20,
                                  color='skyblue', palette='viridis'):
    """
    Perform univariate analysis on all categorical columns in a DataFrame
    with multiple visualization types.

    For every object/category/bool column this prints missing-value and
    cardinality information plus value counts, then draws a count plot and a
    pie (or donut) chart. Columns with more than ``top_n`` distinct values are
    plotted with the remaining values grouped into an 'Other' bucket. Columns
    with no non-missing values are summarized but not plotted.

    Parameters:
    - df: Pandas DataFrame
    - figsize: Tuple for figure size
    - top_n: Show top N categories (for high-cardinality features)
    - color: Color for single-color plots
    - palette: Color palette for multi-color plots

    Returns:
    - Dictionary keyed by column name; each value is a dict with 'count',
      'missing', 'missing_pct', 'n_unique', 'value_counts' and
      'value_counts_pct'. Returns None when there are no categorical columns.
    """

    # Select categorical columns (object, category, bool types)
    categorical_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns

    if len(categorical_cols) == 0:
        print("No categorical columns found in the DataFrame.")
        return None

    stats_results = {}

    for col in categorical_cols:
        print(f"\n{'='*50}")
        print(f"CATEGORICAL ANALYSIS FOR: {col}")
        print(f"{'='*50}")

        # Data summary
        # Work on a categorical view of every selected column (bool included)
        # so the `.cat` accessor used below is always available.
        col_data = df[col].astype('category')
        value_counts = col_data.value_counts(dropna=False)
        value_counts_pct = col_data.value_counts(dropna=False, normalize=True)
        n_unique = col_data.nunique()
        n_missing = col_data.isna().sum()

        stats_results[col] = {
            'count': len(col_data),
            'missing': n_missing,
            'missing_pct': n_missing / len(col_data),
            'n_unique': n_unique,
            'value_counts': value_counts,
            'value_counts_pct': value_counts_pct
        }

        print(f"\nSummary Statistics:")
        print(f"- Missing values: {n_missing} ({n_missing/len(df):.1%})")
        print(f"- Unique values: {n_unique}")
        # Use the same cut-off as the plots so "Top N (of M)" is never
        # printed with N larger than M.
        if n_unique <= top_n:
            print("\nValue Counts:")
            print(value_counts.to_string())
            print("\nValue Percentages:")
            print(value_counts_pct.apply(lambda x: f"{x:.1%}").to_string())
        else:
            print(f"\nTop {top_n} Values (of {n_unique}):")
            print(value_counts.head(top_n).to_string())
            print("\nTop Value Percentages:")
            print(value_counts_pct.head(top_n).apply(lambda x: f"{x:.1%}").to_string())

        # Nothing can be drawn for a column that is entirely missing
        if n_unique == 0:
            print("- No non-missing values; skipping plots.")
            print("\n" + "-"*50)
            continue

        # Handle high cardinality (show only top_n categories)
        plot_data = col_data
        if n_unique > top_n:
            top_categories = value_counts.index[:top_n].tolist()
            # 'Other' must be a known category before it can be assigned
            if 'Other' not in plot_data.cat.categories:
                plot_data = plot_data.cat.add_categories('Other')
            plot_data = plot_data.where(plot_data.isin(top_categories), 'Other')

        # Categorical value_counts() lists every declared category, including
        # those with zero rows. Drop the unused ones so grouped-away values do
        # not come back as empty bars in the plots.
        plot_data = plot_data.astype('category').cat.remove_unused_categories()
        plot_counts = plot_data.value_counts()

        # Create figure with subplots
        fig, axes = plt.subplots(2, 2, figsize=figsize)
        fig.suptitle(f'Categorical Analysis: {col}', fontsize=16)

        # Count Plot (Bar plot)
        if n_unique > 15:
            # Horizontal for many categories
            sns.countplot(y=plot_data, order=plot_counts.index,
                         color=color, ax=axes[0, 0])
        else:
            sns.countplot(x=plot_data, order=plot_counts.index,
                         color=color, ax=axes[0, 0])
            # Category labels sit on the x axis only in the vertical layout
            axes[0, 0].tick_params(axis='x', rotation=45)
        axes[0, 0].set_title(f'Count Plot of {col}')

        # Pie Chart (only if reasonable number of categories)
        if n_unique <= 10 and n_unique > 1:
            plot_counts.plot.pie(autopct='%1.1f%%',
                                 colors=sns.color_palette(palette),
                                 ax=axes[0, 1])
            axes[0, 1].set_ylabel('')  # Remove y-label
            axes[0, 1].set_title(f'Pie Chart of {col}')
        else:
            # Otherwise show a donut chart of the 10 most frequent plotted values
            top_data = plot_counts.head(10)
            axes[0, 1].pie(top_data, labels=top_data.index,
                          autopct='%1.1f%%', pctdistance=0.85,
                          colors=sns.color_palette(palette))
            centre_circle = plt.Circle((0,0), 0.70, fc='white')
            axes[0, 1].add_artist(centre_circle)
            axes[0, 1].set_title(f'Donut Chart (Top {len(top_data)} Categories)')

        # Hide the bottom row, which holds no plot
        for unused_ax in (axes[1, 0], axes[1, 1]):
            unused_ax.set_visible(False)

        plt.tight_layout()
        plt.show()
        print("\n" + "-"*50)

    return stats_results
# Run the analysis on every object/category/bool column of the training data
categorical_univariate_analysis(train_df)

**Missing Values Analysis and Imputation**

In [None]:
def handle_missing_values(df, impute=True, categorical_strategy='most_frequent', numerical_strategy='mean'):
    """
    Analyze and handle missing values in a DataFrame.

    Plots the percentage of missing values per affected column (skipped when
    nothing is missing), prints the total number of missing cells, and, when
    ``impute`` is True, returns a copy with missing values filled in. The
    input DataFrame is never modified.

    Parameters:
    - df: pandas DataFrame
    - impute: bool, whether to impute missing values (default True)
    - categorical_strategy: strategy for categorical imputation ('most_frequent', 'constant')
    - numerical_strategy: strategy for numerical imputation ('mean', 'median', 'mode')

    Returns:
    - A copy of ``df`` with missing values imputed (if impute=True), otherwise
      ``df`` itself. Columns that are entirely missing have no mean/median/mode
      to fill with and are left unchanged (the 'constant' strategy still fills
      them).

    Raises:
    - ValueError: if ``impute`` is True and a strategy name is not one of the
      supported values listed above.
    """

    # Fail fast on a misspelled strategy instead of silently imputing nothing
    if impute:
        if categorical_strategy not in ('most_frequent', 'constant'):
            raise ValueError(
                f"Unsupported categorical_strategy: {categorical_strategy!r}; "
                "expected 'most_frequent' or 'constant'"
            )
        if numerical_strategy not in ('mean', 'median', 'mode'):
            raise ValueError(
                f"Unsupported numerical_strategy: {numerical_strategy!r}; "
                "expected 'mean', 'median' or 'mode'"
            )

    # Calculate missing value percentages (only columns that have any)
    missing_percent = df.isnull().mean() * 100
    missing_percent = missing_percent[missing_percent > 0].sort_values(ascending=False)

    # Plot missing values
    if missing_percent.empty:
        print("No columns contain missing values; skipping the missing-value plot.")
    else:
        plt.figure(figsize=(10, 6))
        sns.barplot(x=missing_percent.index, y=missing_percent.values, palette='viridis')
        plt.axhline(y=30, color='red', linestyle='--', label='30% threshold')
        plt.title('Percentage of Missing Values by Column')
        plt.ylabel('Percentage Missing (%)')
        plt.xlabel('Columns')
        plt.xticks(rotation=45)
        plt.legend()
        plt.tight_layout()
        plt.show()

    print(f"Total missing values before handling: {df.isnull().sum().sum()}")

    if not impute:
        return df

    # Make a copy of the original dataframe
    df_imputed = df.copy()

    # Separate categorical and numerical columns
    categorical_cols = df_imputed.select_dtypes(include=['object', 'category']).columns
    numerical_cols = df_imputed.select_dtypes(include=['int64', 'float64']).columns

    # Impute categorical columns
    for col in categorical_cols:
        column = df_imputed[col]
        if not column.isnull().any():
            continue

        if categorical_strategy == 'most_frequent':
            modes = column.mode()
            if modes.empty:
                # Entirely missing column: there is no most frequent value
                continue
            fill_value = modes.iloc[0]
        else:  # 'constant'
            fill_value = 'Missing'
            # A categorical column only accepts values that are declared categories
            if isinstance(column.dtype, pd.CategoricalDtype) and fill_value not in column.cat.categories:
                column = column.cat.add_categories(fill_value)

        # Assign the result back rather than calling fillna(inplace=True) on
        # df_imputed[col]: the in-place form works on an intermediate object
        # and does not update the frame under pandas Copy-on-Write.
        df_imputed[col] = column.fillna(fill_value)

    # Impute numerical columns
    for col in numerical_cols:
        column = df_imputed[col]
        if not column.isnull().any():
            continue

        if numerical_strategy == 'mean':
            fill_value = column.mean()
        elif numerical_strategy == 'median':
            fill_value = column.median()
        else:  # 'mode'
            modes = column.mode()
            fill_value = modes.iloc[0] if not modes.empty else float('nan')

        if pd.isna(fill_value):
            # Entirely missing column: nothing meaningful to fill with
            continue
        df_imputed[col] = column.fillna(fill_value)

    print(f"Total missing values after imputation: {df_imputed.isnull().sum().sum()}")

    return df_imputed
# Example usage: show the missing-value plot and the imputed result for inspection.
# The returned frame is not assigned, so `train_df` itself is left unchanged by this call.
handle_missing_values(train_df)

**Handling Outliers**

In [None]:
def outliers_iqr(
    data: pd.DataFrame,
    columns: list = None,
    threshold: float = 3.0,
    show_summary: bool = False
) -> pd.DataFrame:
    """
    Remove only extreme outliers from a DataFrame using the IQR method.

    A row is removed when, in at least one processed column, its value lies
    below ``Q1 - threshold * IQR`` or above ``Q3 + threshold * IQR``.
    Missing values are never treated as outliers.

    Columns are skipped (not used for filtering) when they are absent from
    the DataFrame, are not numeric, or have an interquartile range of zero.
    With IQR == 0 both fences collapse onto the quartile value, so every row
    that differs from it would be flagged, which is not an "extreme" outlier
    test for mostly-constant columns.

    Parameters:
    - data: Input DataFrame (not modified)
    - columns: List of columns to process (None for all numeric columns)
    - threshold: IQR multiplier for extreme outlier detection (default: 3.0)
    - show_summary: If True, prints number of rows removed

    Returns:
    - Cleaned DataFrame (a new object) with extreme outliers removed

    Raises:
    - ValueError: if ``data`` is not a DataFrame or is empty
    """
    if not isinstance(data, pd.DataFrame) or data.empty:
        raise ValueError("Input must be a non-empty DataFrame")

    if columns is None:
        columns = data.select_dtypes(include=np.number).columns.tolist()

    # Positional mask of rows to remove; positional so that duplicate index
    # labels cannot cause unrelated rows to be dropped.
    is_outlier = np.zeros(len(data), dtype=bool)

    for col in columns:
        if col not in data.columns:
            continue

        values = data[col]
        # Quantiles are only meaningful for numeric, non-boolean data
        if not pd.api.types.is_numeric_dtype(values) or pd.api.types.is_bool_dtype(values):
            continue

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        # Skip zero-spread columns; also covers NaN (all-missing column)
        if not iqr > 0:
            continue

        lower = q1 - threshold * iqr
        upper = q3 + threshold * iqr

        outside = (values < lower) | (values > upper)
        is_outlier |= outside.fillna(False).to_numpy(dtype=bool)

    # .copy() so callers can add columns to the result without pandas
    # treating it as a view of `data`
    cleaned_df = data.loc[~is_outlier].copy()

    if show_summary:
        print(f"Removed {int(is_outlier.sum())} extreme outliers")
        print(f"Original shape: {data.shape}, New shape: {cleaned_df.shape}")

    return cleaned_df
# Usage: screen every numeric column (no `columns` argument is passed) and
# rebind `train_df` to the filtered result. Because the name is reused,
# re-running this cell filters the already-filtered frame again.
train_df = outliers_iqr(train_df, show_summary=True)

**Feature Engineering**

In [14]:
# Add the same derived features to both the training and the test frame.
# The frames are modified in place, and the source columns are dropped at the
# end of the cell, so this cell cannot be re-run without reloading the data.
for df in [train_df, test_df]:

    # 1. Age-related features
    df['HouseAge'] = df['YrSold'] - df['YearBuilt']
    # Age of the house when sold; older houses may have lower prices unless well-maintained.

    df['RemodelAge'] = df['YrSold'] - df['YearRemodAdd']
    # Years since the last remodel; more recent remodels can positively affect price.

    df['SinceRemodel'] = df['YearRemodAdd'] - df['YearBuilt']
    # Years between construction and the recorded remodel year.

    # 2. Size-related features
    df['TotalSF'] = df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF']
    # Total square footage: both floors plus the total basement area.

    df['TotalFinishedSF'] = df['1stFlrSF'] + df['2ndFlrSF'] + df['BsmtFinSF1'] + df['BsmtFinSF2']
    # Total finished space in the house (excluding unfinished basement area).

    df['SFPerRoom'] = df['GrLivArea'] / (df['TotRmsAbvGrd'] + df['FullBath'] + 0.5 * df['HalfBath'] + 1)
    # Average square footage per room, a measure of room spaciousness.
    # +1 to denominator to avoid division by zero.

    df['TotalArea'] = df['GrLivArea'] + df['TotalBsmtSF']
    # Above-ground living area plus total basement area.

    # 3. Bathroom features
    df['TotalBaths'] = df['FullBath'] + 0.5 * df['HalfBath'] + df['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']
    # Total bathroom count including basement, with half baths weighted at 0.5.

    df['BathRatio'] = (df['FullBath'] + df['BsmtFullBath']) / (df['TotalBaths'] + 0.01)
    # Proportion of full baths out of total baths. Add 0.01 to avoid division by zero.

    # 4. Porch/outdoor features
    df['TotalPorchSF'] = df['OpenPorchSF'] + df['3SsnPorch'] + df['EnclosedPorch'] + df['ScreenPorch'] + df['WoodDeckSF']
    # Total outdoor/porch area including all types.

    df['PorchRatio'] = df['TotalPorchSF'] / (df['LotArea'] + 1)
    # Proportion of lot used for porch or deck. Normalized by lot size.

    # 5. Efficiency metrics
    df['LivingEfficiency'] = df['GrLivArea'] / df['TotalSF'].replace(0, np.nan)
    # Ratio of above-ground living area to total space.
    # A zero TotalSF is mapped to NaN first so the ratio becomes NaN (which the
    # downstream imputer can fill) instead of +/-inf.

    df['BasementFinishRatio'] = (df['BsmtFinSF1'] + df['BsmtFinSF2']) / (df['TotalBsmtSF'] + 1)
    # Portion of basement that is finished. +1 to avoid divide-by-zero errors.

    # 6. Interaction features
    df['AgeTimesQuality'] = df['HouseAge'] * df['OverallQual']
    # Captures combined effect of age and quality. Newer high-quality houses vs. old lower-quality ones.

    df['SizeTimesQuality'] = df['TotalSF'] * (df['OverallQual'] / 10)
    # Blends size and quality into one feature; quality is divided by 10 for scaling.

# Drop less informative or now-redundant features
cols_to_drop = [
    'Id',                # Unique ID, not predictive
    'YrSold',            # Already used to calculate age features
    'YearBuilt',         # Used in HouseAge and SinceRemodel
    'YearRemodAdd',      # Used in RemodelAge and SinceRemodel
    '1stFlrSF', '2ndFlrSF', 'BsmtFinSF1', 'BsmtFinSF2',
    # Used in derived features like TotalSF, TotalFinishedSF

    'OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF',
    # Used in TotalPorchSF

    'BsmtFullBath', 'BsmtHalfBath', 'HalfBath'
    # Used in TotalBaths and BathRatio (HalfBath also in SFPerRoom)
]

# Drop columns from training and testing datasets.
# The training frame must contain every listed column (a missing one raises);
# the test frame only drops the listed columns it actually has.
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=[col for col in cols_to_drop if col in test_df.columns])

**Correlation Analysis**

In [None]:
# Compute correlation matrix (numeric columns only)
corr_matrix = train_df.corr(numeric_only=True)

# Order features by the absolute value of their correlation with the target
# ('SalePrice'), strongest first, and reorder both axes of the matrix to match.
# The target itself comes first because its self-correlation is 1.
target = 'SalePrice'
sorted_features = corr_matrix[target].abs().sort_values(ascending=False).index
sorted_corr = corr_matrix.loc[sorted_features, sorted_features]

# Set up the matplotlib figure
plt.figure(figsize=(18, 14))
plt.title('Correlation Matrix Sorted by Target Variable (SalePrice)', fontsize=16)

# Draw the full (unmasked) heatmap with each cell annotated to two decimals
sns.heatmap(
    sorted_corr,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    linewidths=0.5,
    linecolor='gray',
    cbar_kws={"shrink": 0.8},
    square=True
)

# Tilt the x labels so long feature names do not overlap
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(rotation=0, fontsize=10)
plt.tight_layout()
plt.show()

**Target Column Analysis**

In [None]:
# Target column analysis: histogram of the raw (untransformed) SalePrice
sns.histplot(
    train_df,
    x=train_df['SalePrice'],color='royalblue'
)
# Show summary statistics (the describe() result is the cell's displayed output)
print("Original Price Statistics:")
train_df['SalePrice'].describe()

In [17]:
# Transform the target with log(1 + x), overwriting the SalePrice column.
# This cell is not idempotent: running it twice applies the log twice.
# Predictions made on this scale can be mapped back to prices with np.expm1.
train_df['SalePrice'] = np.log1p(train_df['SalePrice'])

In [None]:
# Histogram of the log-transformed SalePrice, for comparison with the raw one
sns.histplot(
    train_df,
    x=train_df['SalePrice'],color='coral'
)
# The describe() result is the cell's displayed output
print("Log-Transformed Price Statistics:")
train_df['SalePrice'].describe()

**Column Preparation | Data Preparation**

In [19]:
# Columns for ordinal encoding (treated as having a natural order).
# These are routed to `ode_pipeline` by the column transformer.
ode_cols = [
    'LotShape', 'LandContour', 'Utilities', 'LandSlope', 'BsmtQual', 'BsmtFinType1',
    'CentralAir', 'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual',
    'PavedDrive', 'ExterCond', 'KitchenQual', 'BsmtExposure', 'HeatingQC',
    'ExterQual', 'BsmtCond'
]

# Columns for one-hot encoding (no natural order).
# These are routed to `ohe_pipeline` by the column transformer.
ohe_cols = [
    'Street', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'Electrical', 'SaleType', 'MSZoning', 'SaleCondition',
    'Heating', 'GarageType', 'RoofMatl'
]

In [20]:
# Names of the int64/float64 feature columns; the target 'SalePrice' is left out
num_cols = [
    column_name
    for column_name in train_df.select_dtypes(include=['int64', 'float64']).columns
    if column_name != 'SalePrice'
]

**Project Pipeline**

In [21]:
# Pipeline for numeric columns:
# fill missing values with the column median, standardize, then drop
# zero-variance (constant) features.
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('var_thresh', VarianceThreshold(threshold=0.0))
])

# Pipeline for ordinal categorical columns:
# fill missing values with the most frequent category, then map categories to
# integers; categories not seen during fit are encoded as -1.
# NOTE(review): no `categories` argument is passed to OrdinalEncoder, so the
# integer codes follow the encoder's automatically determined category order
# rather than an explicit ranking. Confirm this matches the intended
# "natural order" of these columns.
ode_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Pipeline for nominal categorical columns:
# fill missing values with the most frequent category, then one-hot encode into
# a dense array; categories not seen during fit are ignored at transform time.
ohe_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# ==== Column Transformer ====
# Applies each pipeline to its own column list. remainder='drop' means any
# column not named in num_cols, ode_cols or ohe_cols is discarded.
col_trans = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('ord', ode_pipeline, ode_cols),
        ('ohe', ohe_pipeline, ohe_cols)
    ],
    remainder='drop',
    verbose_feature_names_out=False,
    n_jobs=-1
)

# ==== Final Preprocessing Pipeline ====
# Column transformation followed by a univariate feature-selection step.
# With k='all' the selector keeps every feature, so it is currently a
# pass-through placeholder.
preprocessor = Pipeline([
    ('transform', col_trans),
    ('feature_selector', SelectKBest(f_regression, k='all'))
])


**Data Preparation**

In [22]:
# Separate the feature matrix (every column except the target) from the target.
# At this point 'SalePrice' holds the log1p-transformed prices.
x = train_df.drop('SalePrice', axis=1)
y = train_df['SalePrice']

In [23]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

**Building Models**

In [24]:
# Define ALL Models with Parameter Grid
#
# Each entry maps a display name to:
#   'model'  - an unfitted estimator instance
#   'params' - a flat dict of hyperparameter name -> list of candidate values
#
# The training cell prefixes every key with 'model__' and passes the dict to
# GridSearchCV, so each grid must remain a single flat dict, and every
# combination in it should be one the estimator can actually fit (combinations
# that raise during fit are silently scored as NaN and only waste CV time).
models = {
    # ========== Linear Models ==========
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {
            'fit_intercept': [True, False],
            'positive': [True, False]
        }
    },

    'Ridge Regression': {
        'model': Ridge(random_state=42),
        'params': {
            'alpha': [0.1, 0.5, 1.0, 2.0, 5.0],
            'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
        }
    },

    'Lasso Regression': {
        'model': Lasso(random_state=42),
        'params': {
            'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
            'selection': ['cyclic', 'random']
        }
    },

    'ElasticNet': {
        'model': ElasticNet(random_state=42),
        'params': {
            'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
            'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
            'selection': ['cyclic', 'random']
        }
    },

    'Bayesian Ridge': {
        'model': BayesianRidge(),
        'params': {
            'alpha_1': [1e-6, 1e-5, 1e-4],
            'alpha_2': [1e-6, 1e-5, 1e-4],
            'lambda_1': [1e-6, 1e-5, 1e-4],
            'lambda_2': [1e-6, 1e-5, 1e-4]
        }
    },

    'ARD Regression': {
        'model': ARDRegression(),
        'params': {
            'alpha_1': [1e-6, 1e-5, 1e-4],
            'alpha_2': [1e-6, 1e-5, 1e-4],
            'lambda_1': [1e-6, 1e-5, 1e-4],
            'lambda_2': [1e-6, 1e-5, 1e-4],
            'threshold_lambda': [100, 1000, 10000]
        }
    },

    'SGD Regressor': {
        'model': SGDRegressor(random_state=42),
        'params': {
            'loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
            'penalty': ['l1', 'l2', 'elasticnet'],
            'alpha': [0.0001, 0.001, 0.01],
            'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']
        }
    },

    'Huber Regressor': {
        'model': HuberRegressor(),
        'params': {
            'epsilon': [1.1, 1.35, 1.5],
            'alpha': [0.0001, 0.001, 0.01],
            'max_iter': [100, 200, 300]
        }
    },

    'Theil-Sen Regressor': {
        'model': TheilSenRegressor(random_state=42),
        'params': {
            'max_subpopulation': [1000, 5000, 10000],
            # NOTE(review): n_subsamples must lie between n_features + 1 and
            # n_samples; 100/200 may be rejected if preprocessing yields more
            # features than that - confirm against the transformed width.
            'n_subsamples': [None, 100, 200],
            'max_iter': [100, 300, 500]
        }
    },

    'RANSAC Regressor': {
        'model': RANSACRegressor(random_state=42),
        'params': {
            'min_samples': [None, 0.1, 0.5, 0.9],
            'residual_threshold': [None, 1.0, 2.0],
            'max_trials': [50, 100, 200]
        }
    },

    # ========== Tree-based Models ==========
    'Decision Tree': {
        'model': DecisionTreeRegressor(random_state=42),
        'params': {
            'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            'max_depth': [None, 5, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None]
        }
    },

    'Random Forest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None],
            'bootstrap': [True, False]
        }
    },

    'Extra Trees': {
        'model': ExtraTreesRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
            'max_features': ['sqrt', 'log2', None]
        }
    },

    # ========== Boosting Models ==========
    'Gradient Boosting': {
        'model': GradientBoostingRegressor(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 1.0],
            'min_samples_split': [2, 5]
        }
    },

    # This grid expands to 1,944 combinations, each fitted once per CV fold.
    'XGBoost': {
        'model': XGBRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 6, 9],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'gamma': [0, 0.1, 0.2],
            'reg_alpha': [0, 0.1, 1],
            'reg_lambda': [0, 0.1, 1]
        }
    },

    'LightGBM': {
        'model': LGBMRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'num_leaves': [31, 63, 127],
            'max_depth': [-1, 10, 20],
            'min_child_samples': [20, 50],
            'reg_alpha': [0, 0.1, 1],
            'reg_lambda': [0, 0.1, 1]
        }
    },

    'CatBoost': {
        'model': CatBoostRegressor(random_state=42, verbose=0),
        'params': {
            'iterations': [100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'depth': [4, 6, 8],
            'l2_leaf_reg': [1, 3, 5]
        }
    },

    'AdaBoost': {
        'model': AdaBoostRegressor(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 1.0],
            'loss': ['linear', 'square', 'exponential']
        }
    },

    'Hist Gradient Boosting': {
        'model': HistGradientBoostingRegressor(random_state=42),
        'params': {
            'learning_rate': [0.01, 0.1, 0.2],
            'max_iter': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_leaf': [20, 50, 100]
        }
    },

    # ========== SVM Models ==========
    'SVR': {
        'model': SVR(),
        'params': {
            'C': [0.1, 1, 10, 100],
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'gamma': ['scale', 'auto', 0.1, 1],
            'degree': [2, 3, 4],
            'epsilon': [0.01, 0.1, 0.5]
        }
    },

    'Linear SVR': {
        'model': LinearSVR(random_state=42),
        'params': {
            'C': [0.1, 1, 10],
            'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
            # Only the dual formulation is searched: dual=False is rejected in
            # combination with loss='epsilon_insensitive', so a flat grid that
            # includes it would contain combinations that can never be fitted.
            'dual': [True],
            'epsilon': [0.01, 0.1, 0.5]
        }
    },

    'NuSVR': {
        'model': NuSVR(),
        'params': {
            'nu': [0.1, 0.5, 0.8],
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'gamma': ['scale', 'auto', 0.1, 1]
        }
    },

    # ========== Nearest Neighbors ==========
    'KNN Regressor': {
        'model': KNeighborsRegressor(),
        'params': {
            'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'p': [1, 2]
        }
    },

    'Radius Neighbors Regressor': {
        'model': RadiusNeighborsRegressor(),
        'params': {
            # NOTE(review): samples with no neighbour inside the radius get a
            # NaN prediction; these radii may be too small for the transformed
            # feature space - confirm, or expect this model to be reported as failed.
            'radius': [1.0, 2.0, 5.0],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        }
    },

    # ========== Neural Networks ==========
    'MLP Regressor': {
        'model': MLPRegressor(random_state=42, early_stopping=True),
        'params': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
            'activation': ['relu', 'tanh', 'logistic'],
            'alpha': [0.0001, 0.001, 0.01],
            'learning_rate': ['constant', 'invscaling', 'adaptive'],
            'learning_rate_init': [0.001, 0.01]
        }
    },

    # ========== Gaussian Processes ==========
    'Gaussian Process': {
        'model': GaussianProcessRegressor(random_state=42),
        'params': {
            'kernel': [None,
                      RBF(),
                      ConstantKernel() * RBF(),
                      RationalQuadratic()],
            'alpha': [1e-10, 1e-5, 1e-2],
            'normalize_y': [True, False]
        }
    },

    # ========== Other Regressors ==========
    'Kernel Ridge': {
        'model': KernelRidge(),
        'params': {
            'alpha': [0.1, 1.0, 10.0],
            'kernel': ['linear', 'rbf', 'polynomial'],
            'gamma': [None, 0.1, 1.0],
            'degree': [2, 3, 4]
        }
    },

    'PLS Regression': {
        'model': PLSRegression(),
        'params': {
            'n_components': [1, 2, 3, 5],
            'scale': [True, False],
            'max_iter': [500, 1000]
        }
    },

    # Baseline that ignores the features.
    'Dummy Regressor': {
        'model': DummyRegressor(),
        'params': {
            # 'constant' is omitted because it cannot be fitted without an
            # explicit `constant` value. 'quantile' requires the `quantile`
            # setting below, which 'mean' and 'median' simply ignore.
            'strategy': ['mean', 'median', 'quantile'],
            'quantile': [0.25, 0.75]
        }
    }
}

**Model Evaluation**

In [26]:
# Evaluation Function
def evaluate_regression_model(model, x_test, y_test) -> Dict[str, float]:
    """Score a fitted regressor on held-out data, print the metrics and return them.

    Parameters
    ----------
    model
        Fitted estimator (or pipeline) exposing ``predict``.
    x_test
        Features passed unchanged to ``model.predict``.
    y_test
        True target values aligned with ``x_test``.

    Returns
    -------
    dict
        Metric name -> value, with keys ``mse``, ``rmse``, ``mae``, ``r2``,
        ``explained_variance``, ``max_error`` and ``mape``. ``mape`` is the
        value returned by ``mean_absolute_percentage_error``, i.e. a fraction
        rather than a percentage.
    """
    y_pred = model.predict(x_test)

    # Compute MSE once and derive RMSE from it instead of scoring twice.
    mse = mean_squared_error(y_test, y_pred)

    metrics = {
        'mse': mse,
        'rmse': np.sqrt(mse),
        'mae': mean_absolute_error(y_test, y_pred),
        'r2': r2_score(y_test, y_pred),
        'explained_variance': explained_variance_score(y_test, y_pred),
        'max_error': max_error(y_test, y_pred),
        'mape': mean_absolute_percentage_error(y_test, y_pred)
    }

    print("\nRegression Metrics:")
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")

    return metrics

**Model Training**

In [None]:
# Model Training
# For every candidate: wrap it with the shared preprocessing in a pipeline,
# tune it with an exhaustive grid search (5-fold CV on the training split),
# then score the refitted best pipeline on the held-out test split.
results = {}      # model name -> metrics dict, or {'error': message} on failure
best_models = {}  # model name -> fitted best pipeline (successfully evaluated models only)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

for name, config in models.items():
    print(f"\n{'=' * 50}")
    print(f"Training and tuning {name}")
    print(f"{'=' * 50}")

    try:
        # Create pipeline with preprocessing and model.
        # GridSearchCV clones the pipeline, so the shared `preprocessor`
        # object itself is not fitted here.
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', config['model'])
        ])

        # Update params to include pipeline prefix
        params = {f'model__{key}': value for key, value in config['params'].items()}

        grid_search = GridSearchCV(
            pipeline,
            param_grid=params,
            cv=cv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )

        grid_search.fit(x_train, y_train)

        # Evaluate first and register the model only afterwards, so a model
        # whose evaluation raises never lingers in `best_models` while
        # `results` reports it as failed.
        results[name] = evaluate_regression_model(grid_search.best_estimator_, x_test, y_test)
        best_models[name] = grid_search.best_estimator_

        print(f"\nBest parameters for {name}:")
        print(grid_search.best_params_)

    except Exception as e:
        # Deliberate best-effort: one failing candidate must not abort the
        # whole sweep. The failure is recorded and filtered out downstream.
        print(f"Error with {name}: {str(e)}")
        results[name] = {'error': str(e)}

**Model Comparison**

In [None]:
# Model Comparison
# Drop failed models *before* building the table. Building it from every entry
# and filtering rows afterwards would leave an all-NaN 'error' column behind
# whenever at least one model failed.
successful_results = {
    model_name: model_metrics
    for model_name, model_metrics in results.items()
    if 'error' not in model_metrics
}
comparison = pd.DataFrame.from_dict(successful_results, orient='index')
print(comparison.sort_values(by='r2', ascending=False))

**Ensemble Methods**

In [None]:
# Select top models for ensemble
# Local import: `clone` gives each ensemble pipeline its own unfitted copy of
# the preprocessing steps.
from sklearn.base import clone

# NOTE(review): the ranking uses R² measured on the held-out test split, so the
# ensemble scores reported below are optimistic - consider ranking on CV scores.
top_models = comparison.nlargest(5, 'r2').index.tolist()

# Create a list of base estimators for stacking.
# Only the final 'model' step of each tuned pipeline is reused; preprocessing
# is applied once by the outer ensemble pipeline.
estimators = [(name, best_models[name].named_steps['model']) for name in top_models]

# Stacking Regressor
# Pipeline.fit fits its steps in place, so passing the shared `preprocessor`
# object to both ensembles would make them hold the very same fitted
# transformer. Cloning keeps the two pipelines independent.
stacking_reg = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', StackingRegressor(
        estimators=estimators,
        final_estimator=LinearRegression(),
        cv=5,
        n_jobs=-1
    ))
])

stacking_reg.fit(x_train, y_train)
print("\nStacking Regressor Performance:")
stacking_metrics = evaluate_regression_model(stacking_reg, x_test, y_test)
results['Stacking'] = stacking_metrics

# Voting Regressor (plain average of the base estimators' predictions)
voting_reg = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', VotingRegressor(
        estimators=estimators,
        n_jobs=-1
    ))
])

voting_reg.fit(x_train, y_train)
print("\nVoting Regressor Performance:")
voting_metrics = evaluate_regression_model(voting_reg, x_test, y_test)
results['Voting'] = voting_metrics

**Final Model Comparison**

In [None]:
# Final Comparison
# Failed models are dropped *before* the table is built. Filtering rows
# afterwards would leave an all-NaN 'error' column in the table (and in any
# CSV written from it) whenever at least one model failed.

print("\nFinal Model Comparison:")
final_successful_results = {
    model_name: model_metrics
    for model_name, model_metrics in results.items()
    if 'error' not in model_metrics
}
final_comparison = pd.DataFrame.from_dict(final_successful_results, orient='index')
print(final_comparison.sort_values(by='r2', ascending=False))

**Saving The Model**

In [None]:
# Save Best Model
from joblib import dump

# The winner is the row with the highest R² in the final comparison table
best_model_name = final_comparison['r2'].idxmax()

# Individually tuned pipelines are stored in best_models; the two ensembles
# are held in their own variables.
if best_model_name in best_models:
    best_model = best_models[best_model_name]
elif best_model_name == 'Stacking':
    best_model = stacking_reg
else:
    best_model = voting_reg

print(f"\nBest model is: {best_model_name}")

# Persist the winning fitted pipeline
dump(best_model, 'best_regressor_model.joblib')

# Write the full ranking (best R² first) to CSV
(
    final_comparison
    .sort_values(by='r2', ascending=False)
    .to_csv('regression_model_comparison.csv')
)

print("\nModel training and evaluation complete!")