# Section 4.3 — Interpreting linear models

This notebook contains the code examples from Section 4.3 *Interpreting linear models* of the **No Bullshit Guide to Statistics**.

#### Notebook setup

In [1]:
# load Python modules
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Figures setup: apply the shared plotting parameters through seaborn's theme
plt.clf()  # needed otherwise `sns.set_theme` doesn't work
# NOTE(review): the empty `<Figure ... with 0 Axes>` output of this cell presumably comes from the `plt.clf()` call
from plot_helpers import RCPARAMS
RCPARAMS.update({'figure.figsize': (8, 5)})   # good for screen
# RCPARAMS.update({'figure.figsize': (5, 1.6)})  # good for print
sns.set_theme(
    context="paper",
    style="whitegrid",
    palette="colorblind",
    rc=RCPARAMS,
)

# High-resolution please (render inline figures in 'retina' format)
%config InlineBackend.figure_format = 'retina'

# Where to store figures (not referenced by any cell shown in this notebook)
DESTDIR = "figures/lm/interpreting"

<Figure size 640x480 with 0 Axes>

In [3]:
# set random seed for repeatability (seeds NumPy's global random number generator)
np.random.seed(42)
#######################################################

In [4]:
import statsmodels.formula.api as smf

## Definitions

In [5]:
# Load the doctors dataset; n is the number of observations (rows)
doctors = pd.read_csv("../datasets/doctors.csv")
n = len(doctors)

# Ordinary least squares fit of score on alc, weed, and exrc (plus an intercept)
lm2 = smf.ols("score ~ 1 + alc + weed + exrc", data=doctors).fit()

# full results table: fit metrics, coefficient estimates, and diagnostics
lm2.summary()

0,1,2,3
Dep. Variable:,score,R-squared:,0.842
Model:,OLS,Adj. R-squared:,0.839
Method:,Least Squares,F-statistic:,270.3
Date:,"Sat, 02 Mar 2024",Prob (F-statistic):,1.05e-60
Time:,11:35:28,Log-Likelihood:,-547.63
No. Observations:,156,AIC:,1103.0
Df Residuals:,152,BIC:,1115.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,60.4529,1.289,46.885,0.000,57.905,63.000
alc,-1.8001,0.070,-25.726,0.000,-1.938,-1.662
weed,-1.0216,0.476,-2.145,0.034,-1.962,-0.081
exrc,1.7683,0.138,12.809,0.000,1.496,2.041

0,1,2,3
Omnibus:,1.14,Durbin-Watson:,1.828
Prob(Omnibus):,0.565,Jarque-Bera (JB):,0.9
Skew:,0.182,Prob(JB):,0.638
Kurtosis:,3.075,Cond. No.,31.2


## Model fit metrics

In [6]:
# coefficient of determination R^2 (R-squared in the summary table)
lm2.rsquared

0.8421649167873537

In [7]:
# adjusted R^2 (Adj. R-squared in the summary table)
lm2.rsquared_adj

0.8390497506713147

In [8]:
# ESS = explained sum of squares, as reported by statsmodels
lm2.ess

54570.51590209805

In [9]:
# sum of squared residuals divided by the number of observations (the mean squared error)
lm2.ssr / n

65.56013803717558

### Metrics 

In [10]:
from sklearn import metrics

# observed scores, fitted scores from lm2, and the mean observed score
scores = doctors["score"].values
scoreshat = lm2.fittedvalues.values
meanscore = scores.mean()

# TSS = total sum of squares: squared deviations of the scores from their mean
tss = sum((scores - meanscore) ** 2)

In [11]:
# R^2 computed several ways, which should agree up to floating-point rounding:
# the statsmodels attribute, scikit-learn's r2_score, 1 - SSR/TSS, and ESS/TSS
# NOTE(review): lm2.rsquared is listed twice in the tuple (1st and 3rd entries)
lm2.rsquared, metrics.r2_score(scores, scoreshat), lm2.rsquared, 1-lm2.ssr/tss, lm2.ess/tss

(0.8421649167873537,
 0.8421649167873537,
 0.8421649167873537,
 0.8421649167873536,
 0.8421649167873543)

In [12]:
# log-likelihood of the fitted model (Log-Likelihood in the summary table)
lm2.llf

-547.6259042117637

In [13]:
# information criteria (AIC and BIC in the summary table)
lm2.aic, lm2.bic

(1103.2518084235273, 1115.4512324525256)

### Other metrics (optional material)

In [14]:
# TSS = Total Sum of Squares: squared deviations of the observed scores from their mean
# (recomputed by hand and compared with the statsmodels attribute)
tss = sum( (scores-meanscore)**2 )
lm2.centered_tss, tss

(64797.89743589744, 64797.8974358974)

In [15]:
# SSR = Sum of Squares Residuals: squared differences between observed and fitted scores
# (computed by hand and compared with the statsmodels attribute)
ssr = sum( (scores-scoreshat)**2 )
ssr, lm2.ssr

(10227.381533799391, 10227.381533799391)

In [16]:
# ESS = Explained Sum of Squares: squared deviations of the fitted scores from the mean score
# (computed by hand and compared with the statsmodels attribute)
ess = sum( (scoreshat-meanscore)**2 )
lm2.ess, ess

(54570.51590209805, 54570.51590209801)

In [17]:
# MSE = mean squared error: scikit-learn's value vs. SSR / n
metrics.mean_squared_error(scores, scoreshat), lm2.ssr/n

(65.56013803717558, 65.56013803717558)

In [18]:
# RMSE = root mean squared error: scikit-learn's value vs. sqrt(SSR / n)
metrics.root_mean_squared_error(scores, scoreshat), np.sqrt(lm2.ssr/n)

(8.096921516056307, 8.096921516056307)

In [19]:
# MAPE = mean absolute percentage error: average of |score - fitted| / score
# (computed by hand and compared with scikit-learn's value)
mape = sum( np.abs(scores - scoreshat)/scores ) / n
metrics.mean_absolute_percentage_error(scores, scoreshat), mape

(0.2223447695690139, 0.2223447695690138)

In [20]:
# MAE = mean absolute error: average of |score - fitted|
# (computed by hand and compared with scikit-learn's value)
mae = sum( np.abs(scores - scoreshat) ) / n
metrics.mean_absolute_error(scores, scoreshat), mae

(6.415932421491938, 6.415932421491938)

In [21]:
# Mean squared error of the model:
# the explained sum of squares divided by the model degrees of freedom.
# lm2.df_model (Df Model in the summary table) is used instead of a hard-coded 3,
# so the check stays valid if predictors are added to or removed from the formula.
lm2.mse_model, lm2.ess/lm2.df_model

(18190.171967366015, 18190.171967366015)

In [22]:
# Mean squared error of the residuals:
# the sum of squared residuals divided by the residual degrees of freedom.
# lm2.df_resid (Df Residuals in the summary table) replaces the hard-coded n-4,
# so the check stays valid if the number of estimated parameters changes.
lm2.mse_resid, lm2.ssr/lm2.df_resid

(67.28540482762757, 67.28540482762757)

In [23]:
# Total mean squared error
# The centered total sum of squares divided by n-1 (not by the number of observations n).
lm2.mse_total, lm2.centered_tss/(n-1)

(418.05095119933833, 418.05095119933833)

## Parameter estimates

In [24]:
# estimated parameters: the intercept and the coefficients of alc, weed, and exrc
lm2.params

Intercept    60.452901
alc          -1.800101
weed         -1.021552
exrc          1.768289
dtype: float64

In [25]:
# estimated sigma (standard deviation of the noise term): square root of lm2.scale
# (the displayed value squared matches lm2.mse_resid shown earlier)
np.sqrt(lm2.scale)

8.202768119825622

## Confidence intervals for model parameters

In [26]:
# standard errors of the parameter estimates (std err column of the summary table)
lm2.bse

Intercept    1.289380
alc          0.069973
weed         0.476166
exrc         0.138056
dtype: float64

In [27]:
# 95% confidence intervals for the parameters (alpha=0.05)
lm2.conf_int(alpha=0.05)

Unnamed: 0,0,1
Intercept,57.90548,63.000321
alc,-1.938347,-1.661856
weed,-1.962309,-0.080794
exrc,1.495533,2.041044


## Hypothesis testing for linear models

### F-test for the overall model

In [28]:
# F-statistic for the overall model and its p-value
lm2.fvalue, lm2.f_pvalue

(270.3435018926583, 1.0512133413866907e-60)

### T-tests for individual parameters

In [29]:
# t-statistics for the individual parameters (t column of the summary table)
lm2.tvalues

Intercept    46.885245
alc         -25.725654
weed         -2.145371
exrc         12.808529
dtype: float64

In [30]:
# residual degrees of freedom (Df Residuals in the summary table)
lm2.df_resid

152.0

In [31]:
# p-values of the t-tests for the individual parameters (P>|t| column of the summary table)
lm2.pvalues

Intercept    2.756807e-92
alc          2.985013e-57
weed         3.351156e-02
exrc         6.136296e-26
dtype: float64

In [32]:
# to reproduce `weed` p-value:
# two-sided test of the `weed` t-statistic against Student's t-distribution
# with the model's residual degrees of freedom
from scipy.stats import t as tdist
t = lm2.tvalues["weed"]
rv_t = tdist(df=lm2.df_resid)  # build the frozen reference distribution once
pleft = rv_t.cdf(t)            # lower-tail probability
pright = rv_t.sf(t)            # upper-tail probability; sf avoids the rounding loss of 1 - cdf
pvalue = 2*min(pleft,pright)   # double the smaller tail for the two-sided p-value
pvalue

0.03351156181342321

## Assumptions checks and diagnostics

### Diagnostics plots

### Linearity checks

### Normality checks

### Homoscedasticity checks

### Independence checks

### Collinearity checks

## Outliers and influential observations

In [33]:
# Model fit diagnostics and plots (disabled: everything in this cell is commented out)
# influence = lm2.get_influence()
# influence.summary_frame()

# resid_studentized = influence.resid_studentized_internal
# resid_studentized

# lm2.outlier_test()
# import statsmodels.api as sm
# sm.graphics.influence_plot(lm2, criterion="cooks");
# NOTE(review): "effort" is not a predictor in lm2 (its formula uses alc, weed, exrc);
# substitute one of those names before enabling the two calls below
# sm.graphics.plot_ccpr(lm2, "effort");
# sm.graphics.plot_regress_exog(lm2, "effort");

In [34]:
# Fitted values for the observed data (disabled: kept for reference only)
# lm2.fittedvalues
# ALT. (presumably equivalent: predict on the original data frame)
# lm2.predict(doctors)

### Diagnostic checks

In [35]:
# NOTE(review): `diagn` appears to be populated as a side effect of `summary()`,
# so the call below is kept even though its output is not displayed (confirm)
lm2.summary()
# dictionary of diagnostic statistics (same values as the bottom block of the summary table)
lm2.diagn

{'jb': 0.8999138579592739,
 'jbpv': 0.6376556155083204,
 'skew': 0.18224568453683587,
 'kurtosis': 3.074795238556266,
 'omni': 1.1403999852814177,
 'omnipv': 0.5654123490825862,
 'condno': 31.229721453770164,
 'mineigval': 40.14996367643263}

## Model prediction accuracy

### Prediction

In [36]:
# predictor values for one new case
new_data = dict(alc=3, weed=1, exrc=8)
# point prediction of the score from the fitted model
lm2.predict(new_data)

0    68.177355
dtype: float64

In [37]:
# prediction results object for new_data (its standard errors and intervals are inspected below)
pred_score = lm2.get_prediction(new_data)

In [38]:
# predicted value, standard error for a new observation,
# and the 90% interval for a new observation (obs=True, alpha=0.1)
pred_score.predicted, pred_score.se_obs, pred_score.conf_int(obs=True, alpha=0.1)

(array([68.17735504]), array([8.2626007]), array([[54.50324518, 81.85146489]]))

In [39]:
# # ALT. (disabled) the 5% and 95% quantiles of a t-distribution centred on the
# # prediction with scale se_obs; these should reproduce the 90% interval above
# from scipy.stats import t as tdist
# obs_dist = tdist(df=pred_score.df, loc=pred_score.predicted, scale=pred_score.se_obs)
# obs_dist.ppf(0.05), obs_dist.ppf(0.95)

In [40]:
# predicted mean, its standard error,
# and the 90% confidence interval for the mean (obs=False, alpha=0.1)
pred_score.predicted_mean, pred_score.se_mean, pred_score.conf_int(obs=False, alpha=0.1)

(array([68.17735504]),
 array([0.99255508]),
 array([[66.53473577, 69.81997431]]))

In [41]:
# # ALT. (disabled) the 5% and 95% quantiles of a t-distribution centred on the
# # prediction with scale se_mean; these should reproduce the 90% interval above
# from scipy.stats import t as tdist
# mean_dist = tdist(df=pred_score.df, loc=pred_score.predicted, scale=pred_score.se_mean)
# mean_dist.ppf(0.05), mean_dist.ppf(0.95)

### Out-of-sample prediction accuracy

### Leave-one-out cross-validation

### Regularization

## Explanations

## Discussion

### Resampling methods for regression

### Towards machine learning

The out-of-sample prediction accuracy is a common metric used in machine learning (ML) tasks.

## Exercises

## Links

- More details about model checking
  https://ethanweed.github.io/pythonbook/05.04-regression.html#model-checking
- The paper *Statistical Modeling: The Two Cultures*, which explains the importance of out-of-sample predictions for statistical modelling.
  https://projecteuclid.org/journals/statistical-science/volume-16/issue-3/Statistical-Modeling--The-Two-Cultures-with-comments-and-a/10.1214/ss/1009213726.full