## 194 - Testing the Assumptions: Regression Diagnostics

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import OLSInfluence

from pygam import LinearGAM, s, l
from pygam.datasets import wage


import seaborn as sns
import matplotlib.pyplot as plt




In [2]:
# Load the house sales data into a DataFrame. Despite the .csv file name,
# the file is parsed as tab-separated (sep='\t').
house = pd.read_csv('house_sales.csv', sep='\t')

### Outliers

Of the Python packages used in this notebook, `statsmodels` has the most developed support for outlier analysis, so it is the one used in the cells below.


In [3]:
# Fit an ordinary least squares regression to the King County house sales
# recorded in zip code 98105.

# Model ingredients: the outcome is the column we want to predict (the adjusted
# sale price); the predictors are the house features we expect to influence it.
outcome = 'AdjSalePrice'
predictors = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']

# Subset of the full "house" dataset: only the rows whose ZipCode is 98105,
# keeping every column.
house_98105 = house.loc[house['ZipCode'] == 98105, :]

# Build the model. .assign(const=1) appends a column of ones to the predictors
# so the fit includes a constant term (intercept): a base price that the house
# features then adjust up or down.
house_outlier = sm.OLS(
    endog=house_98105[outcome],
    exog=house_98105[predictors].assign(const=1),
)

# Run the regression and print the coefficient / goodness-of-fit summary.
result_98105 = house_outlier.fit()
print(result_98105.summary())


                            OLS Regression Results                            
Dep. Variable:           AdjSalePrice   R-squared:                       0.795
Model:                            OLS   Adj. R-squared:                  0.792
Method:                 Least Squares   F-statistic:                     238.7
Date:                Thu, 16 Jan 2025   Prob (F-statistic):          1.69e-103
Time:                        10:55:20   Log-Likelihood:                -4226.0
No. Observations:                 313   AIC:                             8464.
Df Residuals:                     307   BIC:                             8486.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
SqFtTotLiving   209.6023     24.408      8.587

In [10]:
# Use statsmodels' OLSInfluence to analyze the residuals and look for unusual
# or influential data points in the house price model for zip code 98105.


# Create an "influence" object from the regression results.
# OLSInfluence is the statsmodels class that calculates outlier and influence
# measures for an OLS result; it is a tool for asking, house by house,
# "how much does this observation stand out from / affect the fitted model?"
influence = OLSInfluence(result_98105)


# Studentized residuals:
# A residual is the difference between a house's actual price and the price the
# model predicted for it. A raw dollar error is hard to judge on its own, so a
# studentized residual divides each residual by an estimate of its standard
# error (per the statsmodels docs, the "internal" version uses the variance
# estimate from the full OLS fit). The result is expressed in standard-error
# units, which makes residuals comparable across observations: values far from
# zero (e.g. beyond about +/-3) flag unusually poorly predicted houses.

sresiduals = influence.resid_studentized_internal


# Report the observation with the most negative studentized residual, i.e. the
# house that sold for far less than the model predicted given its features.
# .idxmin() returns the index label of the smallest value in the series
# (which house it is); .min() returns the value itself (how unusually low).
print(sresiduals.idxmin(), sresiduals.min())

24333 -4.326731804078564


In [12]:
# Inspect the most unusual sale: the house with the most negative studentized
# residual found by sresiduals.idxmin().

# Look that index label up in the zip-code subset. Selecting a single row label
# with .loc returns every column for that house.
outlier = house_98105.loc[sresiduals.idxmin()]

# The outcome first: the price the house actually sold for ...
print('AdjSalePrice', outlier[outcome])

# ... then the physical characteristics the model used to predict that price.
# Comparing the two helps explain why this sale is flagged as an outlier.
print(outlier[predictors])


AdjSalePrice 119748.0
SqFtTotLiving    2900
SqFtLot          7276
Bathrooms         3.0
Bedrooms            6
BldgGrade           7
Name: 24333, dtype: object


In [14]:
# Check what kind of object the single-row .loc selection produced; left as the
# cell's last expression so the notebook displays the type.
type(outlier)

pandas.core.series.Series