## 188 - Correlated Predictors (Interpreting the Regression Equation)

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import OLSInfluence

from pygam import LinearGAM, s, l
from pygam.datasets import wage


import seaborn as sns
import matplotlib.pyplot as plt

# dmba = Data Mining for Business Analytics
from dmba import stepwise_selection
from dmba import AIC_score


no display found. Using non-interactive Agg backend


### Previous stepwise regression

In [4]:
# Load the house-sales data; the file is tab-delimited.
house = pd.read_csv(
    'house_sales.csv',
    sep='\t',
)

In [15]:
# Rebuild the inputs used by the stepwise regression.
# More explanation and details in "174 -  Model Selection and Stepwise Regression"

outcome = 'AdjSalePrice'
predictors = [
    'SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms',
    'BldgGrade', 'PropertyType', 'NbrLivingUnits',
    'SqFtFinBasement', 'YrBuilt', 'YrRenovated',
    'NewConstruction',
]

# Response vector.
y = house[outcome]

# Design matrix: categorical predictors are expanded into integer dummy columns,
# and drop_first=True omits the first level of each one.
X = pd.get_dummies(house[predictors], drop_first=True, dtype=int)

# Recode NewConstruction as 0/1 (any truthy value becomes 1).
X['NewConstruction'] = [int(bool(nc)) for nc in X['NewConstruction']]

    
    # Define a function that returns a fitted model for a given set of variables, the train_model function

def train_model(variables):
    """Fit a linear regression of ``y`` on the selected columns of ``X``.

    Parameters
    ----------
    variables : sequence of column labels of the module-level ``X``.

    Returns
    -------
    The fitted ``LinearRegression``, or ``None`` when ``variables`` is empty.
    """
    # Nothing to fit for an empty predictor set.
    if not len(variables):
        return None
    # fit() returns the estimator itself, so the call can be chained.
    return LinearRegression().fit(X[variables], y)


    # Define the score_model Function
    # Returns the AIC score for the model.
    
def score_model(model, variables):
    """Return the AIC score of ``model`` for the given predictor set.

    Parameters
    ----------
    model : the object returned by ``train_model`` for ``variables``.
    variables : sequence of column labels of the module-level ``X``.

    Returns
    -------
    The value computed by ``AIC_score`` against the module-level ``y``.
    """
    if len(variables) > 0:
        return AIC_score(y, model.predict(X[variables]), model)

    # No predictors: score a constant prediction equal to the mean of y.
    # df=1 is passed explicitly (presumably one estimated parameter - confirm against dmba).
    return AIC_score(y, [y.mean()] * len(y), model, df=1)


    # Perform Stepwise Selection
    # We use stepwise selection to find the best combination of predictors for the regression model.
# Search over all columns of X: train_model fits each candidate set and
# score_model ranks it by AIC.
best_model, best_variables = stepwise_selection(
    X.columns,
    train_model,
    score_model,
    verbose=True,
)

# Report the selected model.
print()

# Intercept: the value of the dependent variable when all predictors are 0.
print(f'Intercept: {best_model.intercept_:.3f}')

# Coefficients: the relationship between each selected predictor and the target variable.
print('Coefficients:')
for predictor, estimate in zip(best_variables, best_model.coef_):
    print(f' {predictor}: {estimate}')

# The code uses stepwise selection to find the best predictors for a regression model.
# It trains and evaluates models using AIC as the criterion.
# Finally, it prints the intercept and coefficients of the best model

Variables: SqFtTotLiving, SqFtLot, Bathrooms, Bedrooms, BldgGrade, NbrLivingUnits, SqFtFinBasement, YrBuilt, YrRenovated, NewConstruction, PropertyType_Single Family, PropertyType_Townhouse
Start: score=647988.32, constant
Step: score=633013.35, add SqFtTotLiving
Step: score=630793.74, add BldgGrade
Step: score=628230.29, add YrBuilt
Step: score=627784.16, add Bedrooms
Step: score=627602.21, add Bathrooms
Step: score=627525.65, add PropertyType_Townhouse
Step: score=627525.08, add SqFtFinBasement
Step: score=627524.98, add PropertyType_Single Family
Step: score=627524.98, unchanged None

Intercept: 6178645.017
Coefficients:
 SqFtTotLiving: 199.27755304201884
 BldgGrade: 137159.56022619773
 YrBuilt: -3565.4249392492993
 Bedrooms: -51947.38367361318
 Bathrooms: 42396.164527717796
 PropertyType_Townhouse: 84479.16203300405
 SqFtFinBasement: 7.046974967553979
 PropertyType_Single Family: 22912.055187017682


### Creation of ZipGroups from previous Notebook

In [45]:
# Number of sales per zip code, shown as a single wide row.
print(pd.DataFrame(house['ZipCode'].value_counts()).transpose())

predictors = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 
              'Bedrooms', 'BldgGrade']
outcome = 'AdjSalePrice'

# Baseline regression without any location information.
house_lm = LinearRegression()
house_lm.fit(house[predictors], house[outcome])

# Residual (actual minus predicted price) of every sale, tagged with its zip code.
zip_residuals = pd.DataFrame({
    'ZipCode': house['ZipCode'],
    'residual' : house[outcome] - house_lm.predict(house[predictors]),
})

# One row per zip code: number of sales and median residual,
# ordered from the lowest to the highest median residual.
zip_groups = pd.DataFrame([
    {
        'ZipCode': zipCode,
        'count': len(x),
        'median_residual': x.residual.median()
    } 
    for zipCode, x in zip_residuals.groupby('ZipCode')
]).sort_values('median_residual')

# Running total of sales along that ordering.
zip_groups['cum_count'] = np.cumsum(zip_groups['count'])

# Split the cumulative counts into five quantile bins, labelled 0..4.
zip_groups['ZipGroup'] = pd.qcut(zip_groups['cum_count'], 5, labels=False, retbins=False)

#print(zip_groups.ZipGroup.value_counts().sort_index())

# Joining the ZipGroup column

to_join = zip_groups[['ZipCode', 'ZipGroup']].set_index('ZipCode')
# Drop a ZipGroup column left over from an earlier run of this cell first:
# joining onto a frame that already has it raises
# "columns overlap but no suffix specified", so the cell could only run once.
house = house.drop(columns='ZipGroup', errors='ignore').join(to_join, on='ZipCode')
house['ZipGroup'] = house['ZipGroup'].astype('category')

ZipCode  98038  98103  98042  98115  98117  98052  98034  98033  98059  98074  \
count      788    671    641    620    619    614    575    517    513    502   

ZipCode  ...  98051  98024  98354  98050  98057  98288  98224  98068  98113  \
count    ...     32     31      9      7      4      4      3      1      1   

ZipCode  98043  
count        1  

[1 rows x 80 columns]


## Correlated predictors

In [39]:
# We print the results from the stepwise regression:

# Intercept first, then one line per selected predictor.
print(f'Intercept: {best_model.intercept_:.3f}')
print('Coefficients:')
fitted_terms = zip(best_variables, best_model.coef_)
for predictor, estimate in fitted_terms:
    print(f' {predictor}: {estimate}')

# The coefficient for Bedrooms is negative! This implies that adding a bedroom to a house will reduce its value. How can this be? 
# This is because the predictor variables are correlated: larger houses tend to have more bedrooms, 
# and it is the size that drives house value, not the number of bedrooms.

# Having correlated predictors can make it difficult to interpret the sign and value of regression coefficients 
# (and can inflate the standard error of the estimates). 


Intercept: 6178645.017
Coefficients:
 SqFtTotLiving: 199.27755304201884
 BldgGrade: 137159.56022619773
 YrBuilt: -3565.4249392492993
 Bedrooms: -51947.38367361318
 Bathrooms: 42396.164527717796
 PropertyType_Townhouse: 84479.16203300405
 SqFtFinBasement: 7.046974967553979
 PropertyType_Single Family: 22912.055187017682


In [41]:
# The variables for bedrooms, house size, and number of bathrooms are all correlated

# This is illustrated by the following example, which fits another regression 
# removing the variables SqFtTotLiving, SqFtFinBasement, and Bathrooms from the equation:

outcome = 'AdjSalePrice'
predictors = ['Bedrooms', 'BldgGrade', 'PropertyType', 'YrBuilt']

# Dummy-encode the categorical predictor(s); drop_first=True omits the first level.
X = pd.get_dummies(house[predictors], drop_first=True)

# fit() returns the estimator, so construction and fitting are chained.
reduced_lm = LinearRegression().fit(X, house[outcome])

print(f'Intercept: {reduced_lm.intercept_:.3f}')
print('Coefficients:')
for column, estimate in zip(X.columns, reduced_lm.coef_):
    print(f' {column}: {estimate}')


Intercept: 4913973.344
Coefficients:
 Bedrooms: 27150.537230215377
 BldgGrade: 248997.79366213758
 YrBuilt: -3211.7448621550866
 PropertyType_Single Family: -19898.495340502435
 PropertyType_Townhouse: -47355.4368733449


## Confounding variables


In [47]:
# Here we have added ZipGroup to the predictors

outcome = 'AdjSalePrice'
predictors = [
    'SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms',
    'BldgGrade', 'PropertyType', 'ZipGroup',
]

# Dummy-encode PropertyType and ZipGroup; drop_first=True omits the first level of each.
X = pd.get_dummies(house[predictors], drop_first=True)

# fit() returns the estimator, so construction and fitting are chained.
confounding_lm = LinearRegression().fit(X, house[outcome])

print(f'Intercept: {confounding_lm.intercept_:.3f}')
print('Coefficients:')
for column, estimate in zip(X.columns, confounding_lm.coef_):
    print(f' {column}: {estimate}')

# The coefficient for Bedrooms is still negative. While this is unintuitive, this is a well-known phenomenon in real estate. 
# For homes of the same livable area and number of bathrooms, having more and therefore smaller bedrooms 
# is associated with less valuable homes.


Intercept: -666637.469
Coefficients:
 SqFtTotLiving: 210.61266005580157
 SqFtLot: 0.45498713854659023
 Bathrooms: 5928.425640001543
 Bedrooms: -41682.87184074475
 BldgGrade: 98541.18352725971
 PropertyType_Single Family: 19323.625287919334
 PropertyType_Townhouse: -78198.72092762387
 ZipGroup_1: 53317.17330659817
 ZipGroup_2: 116251.58883563544
 ZipGroup_3: 178360.53178793355
 ZipGroup_4: 338408.60185652005


## Interactions and Main Effects

In Python, we need to use the statsmodels package to train linear regression models with interactions. This package was designed to be similar to R and allows models to be defined using a formula interface:

In [53]:
# You include interactions between variables using the * operator.
# Location and house size appear to have a strong interaction

#  It's using statsmodels to create an Ordinary Least Squares (OLS) regression that explores how different features 
# affect house prices, with special attention to how square footage impacts prices differently across zip code groups.

    # The "~" symbol means "is modeled by" or "is predicted by". 
    # The left side (AdjSalePrice) is what we're trying to predict, and everything on the right represents our predictors.

    # *
    # "SqFtTotLiving*ZipGroup": the asterisk (*) tells statsmodels to create what we call an "interaction term". 
    # This means the model will allow the relationship between square footage and price to be different for each zip code group. 
    # Think of it like this: in expensive neighborhoods (high ZipGroup numbers), 
    # an extra square foot might add more value to a house than in less expensive neighborhoods.
    #
    # The other terms are simpler - they're just regular predictors. The model assumes these features affect price 
    # the same way regardless of zip code group.

# OLS with a SqFtTotLiving x ZipGroup interaction plus the remaining main effects.
model = smf.ols(
    formula=(
        'AdjSalePrice ~  SqFtTotLiving*ZipGroup + SqFtLot + '
        'Bathrooms + Bedrooms + BldgGrade + PropertyType'
    ),
    data=house,
)
results = model.fit()
print(results.summary())

    # The summary shows coefficients for each zip group showing how much an extra square foot of living space is worth in that group

                            OLS Regression Results                            
Dep. Variable:           AdjSalePrice   R-squared:                       0.682
Model:                            OLS   Adj. R-squared:                  0.682
Method:                 Least Squares   F-statistic:                     3247.
Date:                Wed, 15 Jan 2025   Prob (F-statistic):               0.00
Time:                        15:54:56   Log-Likelihood:            -3.1098e+05
No. Observations:               22687   AIC:                         6.220e+05
Df Residuals:                   22671   BIC:                         6.221e+05
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept     

In [57]:
# Having a look at the summary:

# R-squared:
# The R-squared value of 0.682 tells us this model explains about 68.2% of the variation in house prices. 

# Coefficients:
# The base effect of living space (SqFtTotLiving) is $114.77 per square foot in the lowest zip group (Group 0). 
# But look at how this changes across zip groups through the interaction terms (SqFtTotLiving:ZipGroup):
#    - Group 1: adds $32.60 more per sqft (total: $147.37/sqft)
#    - Group 2: adds $41.78 more per sqft (total: $156.55/sqft)
#    - Group 3: adds $69.34 more per sqft (total: $184.11/sqft)
#    - Group 4: adds $226.68 more per sqft (total: $341.45/sqft)
# This shows a dramatic difference - an extra square foot in the highest-value neighborhoods (Group 4) 
# is worth almost three times as much as in the lowest-value areas (Group 0)!

# Building Grade:
# Building Grade has a huge positive impact: each grade point increase adds about $104,700 to the price

# Bedrooms:
# Each additional bedroom actually decreases price by about $41,800 
# (this might seem counterintuitive, but remember we're controlling for all other variables, i.e. "all else equal",
# which implies that, for the same square footage, extra bedrooms might not be good, as they reduce the size of every room)

# When we see a negative coefficient for bedrooms, it means that if we could somehow add a bedroom without changing any other feature 
# in our model, the price would tend to decrease by $41,800.
# In practice, this situation might be rare - when people add bedrooms, they often change other characteristics too. 
# They might increase the total square footage, add another bathroom, or improve the building grade. 
# But the model helps us isolate these effects to understand each one independently.

# Lot size:
# Lot size adds about $0.69 per square foot

# Confidence:
# The tiny P>|t| values (0.000) for most coefficients tell us these relationships are statistically significant. 
# However, some features like Bathrooms (P>|t| = 0.258) don't show a significant relationship with price once we account for everything else.


# NOTES
# There are two cautionary notes at the bottom:
# The high skewness (7.279) suggests some very expensive houses are pulling the distribution of residuals to the right
# The large condition number (5.80e+05) suggests some features might be closely related, 
# which could make individual coefficients less reliable

