## 174 - Model Selection and Stepwise Regression

In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import OLSInfluence

from pygam import LinearGAM, s, l
from pygam.datasets import wage


import seaborn as sns
import matplotlib.pyplot as plt

# dmba = Data Mining for Business Analytics
from dmba import stepwise_selection
from dmba import AIC_score


no display found. Using non-interactive Agg backend


In [15]:
# Load the house sales table. Despite the .csv extension, the fields in this
# file are separated by tabs, so the delimiter has to be given explicitly.
house = pd.read_csv('house_sales.csv', delimiter='\t')



In [45]:
# Fit a multiple linear regression of the sale price on every candidate
# predictor. The predictors mix numeric, categorical and boolean columns, so
# the non-numeric ones have to be turned into numbers before fitting.

outcome = 'AdjSalePrice'
predictors = [
    'SqFtTotLiving',
    'SqFtLot',
    'Bathrooms',
    'Bedrooms',
    'BldgGrade',
    'PropertyType',
    'NbrLivingUnits',
    'SqFtFinBasement',
    'YrBuilt',
    'YrRenovated',
    'NewConstruction',
]

# One-hot encode the categorical predictors with pd.get_dummies().
# 'PropertyType' has three categories ('Multiplex', 'Single Family',
# 'Townhouse') and cannot enter a regression as text, so it is replaced by
# binary 0/1 indicator columns. drop_first=True leaves one category out (it
# becomes the reference category); keeping all of them would make the
# indicators perfectly collinear with the intercept -- the "dummy variable
# trap" -- and the coefficients could not be estimated uniquely.
X = pd.get_dummies(house[predictors], drop_first=True, dtype=int)

# 'NewConstruction' holds True/False flags; recode them as 1/0.
# The comprehension is shorthand for: start with an empty list, loop over the
# column, append 1 for every truthy value and 0 for every falsy one, then
# assign the list back to the column.
# Truthiness caveat: only False, 0, None and empty strings/containers are
# falsy. Any non-empty string -- including 'N' or 'False' -- is truthy and
# would be recoded as 1, so this is only appropriate for boolean (or 0/1) data.
X['NewConstruction'] = [int(bool(flag)) for flag in X['NewConstruction']]

# Set up the model: sm.OLS(dependent_variable, independent_variables).
#   - endog: house[outcome], the sale price we are trying to predict.
#   - exog:  all predictors in X plus a column named 'const' filled with 1s,
#            added by X.assign(const=1).
# The column of 1s is how the intercept gets estimated. In
#     y = b0 * 1 + b1 * x1 + ... + bk * xk
# the coefficient on 'const' is b0, i.e. the predicted value when every other
# predictor is 0 (the 'b' in y = mx + b). Using 1s is the convention because
# the coefficient then equals the intercept directly.
# Constructing the OLS object only defines the equation; nothing is solved
# until .fit() is called.
house_full = sm.OLS(endog=house[outcome], exog=X.assign(const=1))

# Estimate the coefficients.
results = house_full.fit()

# The summary lists one coefficient per numeric predictor plus one per
# indicator column (every category except the dropped reference category).
# Each coefficient is the change in predicted price when
#   - a numeric variable increases by one unit, or
#   - the house belongs to that category rather than the reference category.
print(results.summary())



                            OLS Regression Results                            
Dep. Variable:           AdjSalePrice   R-squared:                       0.595
Model:                            OLS   Adj. R-squared:                  0.594
Method:                 Least Squares   F-statistic:                     2771.
Date:                Tue, 14 Jan 2025   Prob (F-statistic):               0.00
Time:                        16:12:41   Log-Likelihood:            -3.1375e+05
No. Observations:               22687   AIC:                         6.275e+05
Df Residuals:                   22674   BIC:                         6.276e+05
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
SqFtTotLiving       

### Using the dmba package



We can use the `stepwise_selection` function from the `dmba` package.

scikit-learn does not provide a classical, AIC-driven stepwise regression routine.
The functions `stepwise_selection`, `forward_selection`, and `backward_elimination` are implemented in the `dmba` package instead.


In [60]:
# Using the DMBA package (Data Mining for Business Analytics)
# Build a regression model with the stepwise selection helper from dmba.

# Stepwise selection iteratively adds or removes predictors (variables)
# from a regression model based on their contribution to the model's
# performance, measured with a criterion such as AIC (Akaike Information
# Criterion).


# Target (dependent) variable of the regression. `y` is read as a global by
# train_model() and score_model() defined further down in this cell.

outcome = 'AdjSalePrice'
y = house[outcome]


    # Define a function that returns a fitted model for a given set of variables, the train_model function
    # It trains a linear regression model using the provided predictors (variables).
    # Returns a  fitted LinearRegression model, or None if no variables are provided.
def train_model(variables):
    """Fit a linear regression of ``y`` on the requested predictor columns.

    Reads the notebook-level predictor frame ``X`` and target ``y``.

    Parameters
    ----------
    variables : sequence of str
        Names of the columns of ``X`` to use as predictors.

    Returns
    -------
    LinearRegression or None
        The fitted model, or ``None`` when ``variables`` is empty (there is
        nothing to fit; ``score_model`` scores that case as a mean-only
        baseline).
    """
    # Size check via len() so any sized container of names is accepted.
    if len(variables) < 1:
        return None
    # fit() returns the estimator itself, so the fitted model is returned.
    return LinearRegression().fit(X[variables], y)


    # Define the score_model Function
    # Define a function that returns a score for a given model and set of variables. 
    # In this case, we use the AIC_score implemented in the dmba package.
    # The purpose is to evaluate the performance of a model using AIC (Akaike Information Criterion).
    # Parameters:
    #    - model: A fitted regression model
    #    - variables: A list of column names used as predictors in the model
    # Returns the AIC score for the model.
    
def score_model(model, variables):
    """Score a candidate model with AIC (lower is better).

    AIC (Akaike Information Criterion) trades goodness of fit against model
    complexity: AIC = 2k - 2 ln(L), where k is the number of estimated
    parameters and L is the likelihood of the model given the data (driven by
    the residuals ``y - y_pred``). The value is computed by ``AIC_score`` from
    the dmba package, using the notebook-level ``X`` and ``y``.

    Parameters
    ----------
    model : fitted regression model or None
        Model returned by ``train_model``. It is ``None`` when no variables
        are selected.
    variables : sequence of str
        Names of the columns of ``X`` the model was trained on.

    Returns
    -------
    The value returned by ``AIC_score``.

    Notes
    -----
    With no variables there is no fitted model to predict with, so the score
    is that of a baseline that predicts the mean of ``y`` for every
    observation. This baseline is the starting point of the stepwise search:
    models with predictors are compared against it to see whether they
    improve on it.
    """
    if len(variables) > 0:
        # Regular case: score the model's predictions on its own predictors.
        # df is not passed here; presumably AIC_score derives it from the
        # model -- confirm against the dmba documentation.
        return AIC_score(y, model.predict(X[variables]), model)

    # Baseline case: a constant prediction, one entry per observation in y,
    # every entry equal to the mean of y. The only estimated quantity is the
    # mean, hence df=1. `model` is passed through unchanged (it is None here).
    baseline_pred = [y.mean()] * len(y)
    return AIC_score(y, baseline_pred, model, df=1)


    # Perform Stepwise Selection
    # We use stepwise selection to find the best combination of predictors for the regression model.
    # Parameters:
    #    - X.columns: The initial set of all predictors (independent variables).
    #    - train_model: The function to train the model.
    #    - score_model: The function to score the model.
    #    - verbose=True: Prints the selection process (which variables are added/removed).

    # Results
    #    - best_model: The final regression model
    #    - best_variables: The set of predictors selected by stepwise selection

# Run the stepwise search over every column of X.
#   - X.columns:   the full pool of candidate predictors
#   - train_model: fits a model for a given subset of variables
#   - score_model: returns the AIC of that model (lower is better)
#   - verbose=True prints the selection steps
# The call returns the final fitted model and the variables it uses.
best_model, best_variables = stepwise_selection(
    X.columns,
    train_model,
    score_model,
    verbose=True,
)

# Blank line between the selection log and the report.
print()

# Intercept: predicted value of the outcome when all selected predictors are 0.
print(f'Intercept: {best_model.intercept_:.3f}')

# Coefficients: one per selected predictor, paired by position with
# best_variables.
print('Coefficients:')
for name, coef in zip(best_variables, best_model.coef_):
    print(f' {name}: {coef}')

# The code uses stepwise selection to find the best predictors for a regression model.
# It trains and evaluates models using AIC as the criterion.
# Finally, it prints the intercept and coefficients of the best model
# X is a DataFrame of predictors, and y is the target variable

Variables: SqFtTotLiving, SqFtLot, Bathrooms, Bedrooms, BldgGrade, NbrLivingUnits, SqFtFinBasement, YrBuilt, YrRenovated, NewConstruction, PropertyType_Single Family, PropertyType_Townhouse
Start: score=647988.32, constant
Step: score=633013.35, add SqFtTotLiving
Step: score=630793.74, add BldgGrade
Step: score=628230.29, add YrBuilt
Step: score=627784.16, add Bedrooms
Step: score=627602.21, add Bathrooms
Step: score=627525.65, add PropertyType_Townhouse
Step: score=627525.08, add SqFtFinBasement
Step: score=627524.98, add PropertyType_Single Family
Step: score=627524.98, unchanged None

Intercept: 6178645.017
Coefficients:
 SqFtTotLiving: 199.27755304201884
 BldgGrade: 137159.56022619773
 YrBuilt: -3565.4249392492993
 Bedrooms: -51947.38367361318
 Bathrooms: 42396.164527717796
 PropertyType_Townhouse: 84479.16203300405
 SqFtFinBasement: 7.046974967553979
 PropertyType_Single Family: 22912.055187017682


The function chose a model in which several variables were dropped from house_full: SqFtLot, NbrLivingUnits, YrRenovated, and NewConstruction.