# Section 4.4 — Interpreting linear models

This notebook contains the code examples from [Section 4.4 Interpreting linear models]() from the **No Bullshit Guide to Statistics**.

#### Notebook setup

In [1]:
# load Python modules
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Figures setup
plt.clf()  # needed otherwise `sns.set_theme` doesn't work
from plot_helpers import RCPARAMS
RCPARAMS.update({'figure.figsize': (8, 5)})   # good for screen
# RCPARAMS.update({'figure.figsize': (5, 1.6)})  # good for print
sns.set_theme(
    context="paper",
    style="whitegrid",
    palette="colorblind",
    rc=RCPARAMS,
)

# High-resolution please
%config InlineBackend.figure_format = 'retina'

# Where to store figures
DESTDIR = "figures/lm/interpreting"

<Figure size 640x480 with 0 Axes>

In [3]:
# set random seed for repeatability
np.random.seed(42)

In [4]:
import statsmodels.formula.api as smf

In [5]:
# TODO: move to Sec 4.3
# Model fit diagnostics and plots
# influence = fit1.get_influence()
# influence.summary_frame()

# resid_studentized = influence.resid_studentized_internal
# resid_studentized

# fit1.outlier_test()
# import statsmodels.api as sm
# sm.graphics.influence_plot(fit1, criterion="cooks");
# sm.graphics.plot_ccpr(fit1, "effort");
# sm.graphics.plot_regress_exog(fit1, "effort");

In [6]:
students = pd.read_csv("../datasets/students.csv")

model1 = smf.ols('score ~ 1 + effort', data=students)
fit1 = model1.fit()

# the coefficients of the best-fit line
fit1.params

Intercept    32.465809
effort        4.504850
dtype: float64

In [7]:
fit1.rsquared, fit1.rsquared_adj

(0.7734103402591798, 0.7559803664329628)

In [8]:
# Explained sum of squares
fit1.ess

1078.2917900307098

In [9]:
fit1.ssr / 15

21.060813997952703

In [10]:
fit1.fittedvalues
# ALT. 
# fit1.predict(students["effort"])

0     81.838969
1     71.612959
2     71.207522
3     68.144224
4     77.063828
5     81.118193
6     67.648690
7     73.595093
8     55.936080
9     67.198205
10    76.703440
11    84.406734
12    64.450247
13    61.251803
14    86.524013
dtype: float64

### Metrics 

In [11]:
from sklearn import metrics
scores = students["score"].values
scoreshat = fit1.fittedvalues.values
ybar = np.mean(scores)


# TSS = Total Sum of Squares
tss = sum( (scores-ybar)**2 ) 
# fit1.centered_tss, tss

# SSR = Sum of Squares Residuals
# ssr = sum( (scores-scoreshat)**2 )
# ssr, fit1.ssr

# ESS = Explained Sum of Squares
# ess = sum( (scoreshat-ybar)**2 ) 
# fit1.ess, ess

# R^2
# metrics.r2_score(scores, scoreshat), fit1.rsquared, 1-fit1.ssr/tss, fit1.ess/tss

# MSE
# metrics.mean_squared_error(scores, scoreshat), fit1.ssr/15

# RMSE
# metrics.root_mean_squared_error(scores, scoreshat), np.sqrt(fit1.ssr/15)

# MAPE
# metrics.mean_absolute_percentage_error(scores, scoreshat)

# MAE
# metrics.mean_absolute_error(scores, scoreshat)

# Mean squared error the model
# The explained sum of squares divided by the model degrees of freedom
# fit1.mse_model, fit1.ess/1

# Mean squared error of the residuals
# The sum of squared residuals divided by the residual degrees of freedom.
# fit1.mse_resid, fit1.ssr/(15-2)

# Total mean squared error
# The uncentered total sum of squares divided by the number of observations.
# fit1.mse_total, tss/(15-1)

## Confidence intervals

In [12]:
# estimated parameters
fit1.params

Intercept    32.465809
effort        4.504850
dtype: float64

In [13]:
# standard errors
fit1.bse

Intercept    6.155051
effort       0.676276
dtype: float64

## Hypothesis testing

In [14]:
fit1.tvalues

Intercept    5.274661
effort       6.661264
dtype: float64

In [15]:
fit1.df_resid

13.0

In [16]:
fit1.pvalues

Intercept    0.000150
effort       0.000016
dtype: float64

In [17]:
# to reproduce `effort` p-value
# from scipy.stats import t as tdist
# 2*(1-tdist(df=fit1.df_resid).cdf(fit1.tvalues["effort"]))

## Prediction

In [18]:
new_data = {"effort": 7}
fit1.predict(new_data)

0    63.999762
dtype: float64

In [19]:
pred_score = fit1.get_prediction(new_data)


In [20]:
# observation  +  90% CI
pred_score.predicted, pred_score.se_obs, pred_score.conf_int(obs=True, alpha=0.1)

(array([63.99976171]),
 array([5.25168079]),
 array([[54.69938482, 73.30013861]]))

In [21]:
# # ALT.
# from scipy.stats import t as tdist
# obs_dist = tdist(df=pred_score.df, loc=pred_score.predicted, scale=pred_score.se_obs)
# obs_dist.ppf(0.05), obs_dist.ppf(0.95)

In [22]:
pred_score.predicted_mean, pred_score.se_mean, pred_score.conf_int(obs=False, alpha=0.1)

(array([63.99976171]),
 array([1.81085943]),
 array([[60.79285027, 67.20667315]]))

In [23]:
# # ALT.
# from scipy.stats import t as tdist
# mean_dist = tdist(df=pred_score.df, loc=pred_score.predicted, scale=pred_score.se_mean)
# mean_dist.ppf(0.05), mean_dist.ppf(0.95)

In [24]:
pred_score.predicted, pred_score.conf_int(alpha=0.1, obs=True)

(array([63.99976171]), array([[54.69938482, 73.30013861]]))

In [25]:
from scipy.stats import t as tdist
tdist(df=pred_score.df, loc=pred_score.predicted, scale=pred_score.se_mean).interval(0.1)

(array([63.7677199]), array([64.23180352]))