# Section 4.3 — Interpreting linear models

This notebook contains the code examples for [Section 4.3 Interpreting linear models]() of the **No Bullshit Guide to Statistics**.

#### Notebook setup

In [1]:
# load Python modules
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Figures setup: apply the shared plotting defaults used by the cells below.
# The statement order matters (see the note on `plt.clf()`).
plt.clf()  # needed otherwise `sns.set_theme` doesn't work
# RCPARAMS comes from the project-local `plot_helpers` module; it is updated in
# place below and then passed to seaborn as the matplotlib rc overrides.
from plot_helpers import RCPARAMS
RCPARAMS.update({'figure.figsize': (8, 5)})   # good for screen
# RCPARAMS.update({'figure.figsize': (5, 1.6)})  # good for print
sns.set_theme(
    context="paper",
    style="whitegrid",
    palette="colorblind",
    rc=RCPARAMS,
)

# High-resolution please
# (IPython magic: only valid when run inside a Jupyter/IPython kernel)
%config InlineBackend.figure_format = 'retina'

# Where to store figures
# NOTE(review): DESTDIR is not referenced by any cell visible in this notebook — confirm it is still needed.
DESTDIR = "figures/lm/interpreting"

<Figure size 640x480 with 0 Axes>

In [3]:
# set random seed for repeatability
# (seeds NumPy's global random state; 42 is an arbitrary fixed value)
np.random.seed(42)
#######################################################

In [4]:
import statsmodels.formula.api as smf

## Definitions

In [5]:
# Load the doctors dataset and record the number of observations
doctors = pd.read_csv("../datasets/doctors.csv")
n = len(doctors)

# Fit an ordinary least squares model for `score` with an intercept
# and the three predictors `alc`, `weed`, and `exrc`
lm2 = smf.ols(
    formula='score ~ 1 + alc + weed + exrc',
    data=doctors,
).fit()

# Display the full regression results table
lm2.summary()

0,1,2,3
Dep. Variable:,score,R-squared:,0.842
Model:,OLS,Adj. R-squared:,0.839
Method:,Least Squares,F-statistic:,270.3
Date:,"Wed, 28 Feb 2024",Prob (F-statistic):,1.05e-60
Time:,07:46:50,Log-Likelihood:,-547.63
No. Observations:,156,AIC:,1103.0
Df Residuals:,152,BIC:,1115.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,60.4529,1.289,46.885,0.000,57.905,63.000
alc,-1.8001,0.070,-25.726,0.000,-1.938,-1.662
weed,-1.0216,0.476,-2.145,0.034,-1.962,-0.081
exrc,1.7683,0.138,12.809,0.000,1.496,2.041

0,1,2,3
Omnibus:,1.14,Durbin-Watson:,1.828
Prob(Omnibus):,0.565,Jarque-Bera (JB):,0.9
Skew:,0.182,Prob(JB):,0.638
Kurtosis:,3.075,Cond. No.,31.2


## Model fit metrics

In [6]:
# R-squared and adjusted R-squared (same values as in the summary table above)
lm2.rsquared, lm2.rsquared_adj

(0.8421649167873537, 0.8390497506713147)

In [7]:
# ESS = Explained Sum of Squares of the fitted model
lm2.ess

54570.51590209805

In [8]:
# MSE = Mean Squared Error: the sum of squared residuals divided by
# the number of observations n
lm2.ssr / n

65.56013803717558

### Metrics 

In [9]:
from sklearn import metrics

# Observed scores, the model's fitted scores, and the mean of the observed scores
scores = doctors["score"].values
scoreshat = lm2.fittedvalues.values
meanscore = np.mean(scores)

# Each commented-out pair below is a manual calculation followed by the
# equivalent statsmodels / scikit-learn value it should match.

# TSS = Total Sum of Squares
# (np.sum is the vectorized reduction; the builtin `sum` would loop over
#  the array elements one by one in Python)
tss = np.sum( (scores-meanscore)**2 )
# lm2.centered_tss, tss

# SSR = Sum of Squares Residuals
# ssr = np.sum( (scores-scoreshat)**2 )
# ssr, lm2.ssr

# ESS = Explained Sum of Squares
# ess = np.sum( (scoreshat-meanscore)**2 )
# lm2.ess, ess

# R^2
# metrics.r2_score(scores, scoreshat), lm2.rsquared, 1-lm2.ssr/tss, lm2.ess/tss

# MSE
# metrics.mean_squared_error(scores, scoreshat), lm2.ssr/n

# RMSE
# metrics.root_mean_squared_error(scores, scoreshat), np.sqrt(lm2.ssr/n)

# MAPE
# mape = np.sum( np.abs(scores - scoreshat)/scores ) / n
# metrics.mean_absolute_percentage_error(scores, scoreshat), mape

# MAE
# mae = np.sum( np.abs(scores - scoreshat) ) / n
# metrics.mean_absolute_error(scores, scoreshat), mae

# Mean squared error of the model
# The explained sum of squares divided by the model degrees of freedom
# lm2.mse_model, lm2.ess/3

# Mean squared error of the residuals
# The sum of squared residuals divided by the residual degrees of freedom.
# lm2.mse_resid, lm2.ssr/(n-4)

# Total mean squared error
# The centered total sum of squares divided by n-1
# (the model plus residual degrees of freedom: 3 + (n-4)).
# lm2.mse_total, lm2.centered_tss/(n-1)

## Parameter estimates

## Confidence intervals for model parameters

In [10]:
# estimated parameters (the `coef` column of the summary table)
lm2.params

Intercept    60.452901
alc          -1.800101
weed         -1.021552
exrc          1.768289
dtype: float64

In [11]:
# standard errors of the parameter estimates (the `std err` column of the summary table)
lm2.bse

Intercept    1.289380
alc          0.069973
weed         0.476166
exrc         0.138056
dtype: float64

## Hypothesis testing for linear models

In [12]:
# t-statistics for each parameter (the `t` column of the summary table)
lm2.tvalues

Intercept    46.885245
alc         -25.725654
weed         -2.145371
exrc         12.808529
dtype: float64

In [13]:
# residual degrees of freedom (`Df Residuals` in the summary table)
lm2.df_resid

152.0

In [14]:
# p-values for each parameter (the `P>|t|` column of the summary table)
lm2.pvalues

Intercept    2.756807e-92
alc          2.985013e-57
weed         3.351156e-02
exrc         6.136296e-26
dtype: float64

In [15]:
# to reproduce `weed` p-value
# Two-sided test: take the smaller of the two tail probabilities of the
# t-statistic under Student's t with `df_resid` degrees of freedom, and double it.
from scipy.stats import t as tdist
t = lm2.tvalues["weed"]
pleft = tdist(df=lm2.df_resid).cdf(t)
# Use the survival function for the right tail: `1 - cdf(t)` loses all
# precision (rounds to 0) when t is large and positive, e.g. for `Intercept`.
pright = tdist(df=lm2.df_resid).sf(t)
pvalue = 2*min(pleft,pright)
pvalue

0.03351156181342321

## Assumptions checks and diagnostics

## Outliers and influential observations

In [16]:
# TODO: adapt these diagnostics to this notebook (original note: "move to Sec 4.3")
# NOTE(review): `fit1` and the "effort" variable are not defined in the cells
# visible in this notebook — presumably carried over from another section; confirm.
# Model fit diagnostics and plots
# influence = fit1.get_influence()
# influence.summary_frame()

# resid_studentized = influence.resid_studentized_internal
# resid_studentized

# fit1.outlier_test()
# import statsmodels.api as sm
# sm.graphics.influence_plot(fit1, criterion="cooks");
# sm.graphics.plot_ccpr(fit1, "effort");
# sm.graphics.plot_regress_exog(fit1, "effort");

In [17]:
# Fitted values of the model for the observed data
# lm2.fittedvalues
# ALT.
# lm2.predict(doctors)

### Diagnostic checks

In [18]:
# NOTE(review): `diagn` appears to be populated as a side effect of calling
# `summary()` in statsmodels — confirm before removing this seemingly redundant call.
lm2.summary()
# Dictionary of diagnostic statistics (compare with the bottom panel of the summary table)
lm2.diagn

{'jb': 0.8999138579592739,
 'jbpv': 0.6376556155083204,
 'skew': 0.18224568453683587,
 'kurtosis': 3.074795238556266,
 'omni': 1.1403999852814177,
 'omnipv': 0.5654123490825862,
 'condno': 31.229721453770164,
 'mineigval': 40.14996367643263}

## Model prediction accuracy

### Prediction

In [19]:
# Predictor values for one new case, and the model's predicted score for it
new_data = {"alc":3, "weed":1, "exrc":8}
lm2.predict(new_data)

0    68.177355
dtype: float64

In [20]:
# Prediction results object for the new case; the cells below read the
# predicted value, standard errors, and interval estimates from it
pred_score = lm2.get_prediction(new_data)

In [21]:
# predicted observation, its standard error, and the 90% interval
# for a new observation (obs=True, alpha=0.1)
pred_score.predicted, pred_score.se_obs, pred_score.conf_int(obs=True, alpha=0.1)

(array([68.17735504]), array([8.2626007]), array([[54.50324518, 81.85146489]]))

In [22]:
# # ALT.
# from scipy.stats import t as tdist
# obs_dist = tdist(df=pred_score.df, loc=pred_score.predicted, scale=pred_score.se_obs)
# obs_dist.ppf(0.05), obs_dist.ppf(0.95)

In [23]:
# predicted mean, its standard error, and the 90% confidence interval
# for the mean (obs=False, alpha=0.1)
pred_score.predicted_mean, pred_score.se_mean, pred_score.conf_int(obs=False, alpha=0.1)

(array([68.17735504]),
 array([0.99255508]),
 array([[66.53473577, 69.81997431]]))

In [24]:
# # ALT.
# from scipy.stats import t as tdist
# mean_dist = tdist(df=pred_score.df, loc=pred_score.predicted, scale=pred_score.se_mean)
# mean_dist.ppf(0.05), mean_dist.ppf(0.95)

## Explanations

## Discussion

## Exercises

## Links