# Section 4.1 — Simple linear regression

This notebook contains the code examples from [Section 4.1 Simple linear regression]() from the **No Bullshit Guide to Statistics**.

#### Notebook setup

In [None]:
# load Python modules
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Figures setup
plt.clf()  # needed otherwise `sns.set_theme` doesn't work
from plot_helpers import RCPARAMS
# Override the figure size in the RCPARAMS settings imported from plot_helpers
RCPARAMS.update({"figure.figsize": (5, 3)})   # good for screen
# RCPARAMS.update({"figure.figsize": (5, 1.6)})  # good for print
# Apply the seaborn theme, passing RCPARAMS as the rc overrides
sns.set_theme(
    context="paper",
    style="whitegrid",
    palette="colorblind",
    rc=RCPARAMS,
)

# High-resolution please
%config InlineBackend.figure_format = "retina"

# Where to store figures (joined with each figure's file name before
# the path is handed to `savefigure` in later cells)
DESTDIR = "figures/lm/simple"

In [None]:
from ministats import plot_residuals
from ministats import plot_residuals2
from ministats.utils import savefigure

In [None]:
# set random seed for repeatability
np.random.seed(42)

In [None]:
import warnings
# silence kurtosistest warning when using n < 20
# NOTE: the filter matches on category only (no message pattern), so every
# UserWarning is ignored from here on, not just the kurtosistest one.
warnings.filterwarnings("ignore", category=UserWarning)

$\def\stderr#1{\mathbf{se}_{#1}}$
$\def\stderrhat#1{\hat{\mathbf{se}}_{#1}}$
$\newcommand{\Mean}{\textbf{Mean}}$
$\newcommand{\Var}{\textbf{Var}}$
$\newcommand{\Std}{\textbf{Std}}$
$\newcommand{\Freq}{\textbf{Freq}}$
$\newcommand{\RelFreq}{\textbf{RelFreq}}$
$\newcommand{\DMeans}{\textbf{DMeans}}$
$\newcommand{\Prop}{\textbf{Prop}}$
$\newcommand{\DProps}{\textbf{DProps}}$

$$
\newcommand{\CI}[1]{\textbf{CI}_{#1}}
\newcommand{\CIL}[1]{\textbf{L}_{#1}}
\newcommand{\CIU}[1]{\textbf{U}_{#1}}
\newcommand{\ci}[1]{\textbf{ci}_{#1}}
\newcommand{\cil}[1]{\textbf{l}_{#1}}
\newcommand{\ciu}[1]{\textbf{u}_{#1}}
$$


(this cell contains the macro definitions like $\stderr{\overline{\mathbf{x}}}$, $\stderrhat{}$, $\Mean$, ...)

## Definitions

## Linear model

In [None]:
from scipy.stats import norm

# Define the linear model function
def linear_model(x):
    return 30 + 5 * x

# Standard deviation shared by all the normal distributions drawn on the line
sigma = 8

with sns.axes_style("ticks"):
    fig, ax = plt.subplots(figsize=(5, 3))

    # Mean line of the model over the whole x-range
    xs = np.linspace(0, 8, 200)
    ys = linear_model(xs)
    sns.lineplot(
        x=xs,
        y=ys,
        ax=ax,
        label=r"$\mu_Y(x) = \beta_0 + \beta_1x$",
        linewidth=2,
    )

    # At each integer x: a sideways normal density centred on the line,
    # plus a vertical segment of length sigma with its label
    x_positions = range(1,8)
    for x_pos in x_positions:
        y_pos = linear_model(x_pos)
        ys = np.linspace(y_pos-3.5*sigma, y_pos+3.5*sigma, 100)
        rv = norm(loc=y_pos, scale=sigma)
        pdf = 0.7*rv.pdf(ys)
        # The density is drawn to the left of x_pos, scaled by sigma
        left_edge = x_pos - pdf * sigma
        ax.fill_betweenx(ys, left_edge, x_pos, color="grey", alpha=0.5)
        # Sigma segment goes below the line; its label sits to the right,
        # on the opposite side from the density shape
        ax.plot([x_pos, x_pos], [y_pos, y_pos - sigma], "k", lw=1)
        ax.text(
            x_pos + 0.06,
            y_pos - sigma / 2,
            r"$\sigma$",
            fontsize=12,
            va="center",
        )

    # Label the y-intercept just left of the axis
    ax.text(0 - 0.15, 30, r"$\beta_0$", fontsize=10, va="center", ha="right")

    # x-axis: fixed range, labelled, tick labels hidden
    ax.set_xlim([0, 8])
    ax.set_xlabel("$x$")
    ax.set_xticklabels([])

    # y-axis: fixed range, ticks every 10 units, tick labels hidden
    ax.set_ylim([0, 100])
    ax.set_ylabel("$y$")
    ax.set_yticks(range(0,110,10))
    ax.set_yticklabels([])

    ax.legend(loc="upper left")

filename = os.path.join(DESTDIR, "linear_model_xy_with_gaussians.pdf")
savefigure(fig, filename)

## Example: students' scores as a function of effort

In [None]:
students = pd.read_csv("../datasets/students.csv")
students.head()

In [None]:
# The two columns used throughout the rest of the notebook
efforts, scores = students["effort"], students["score"]
sns.scatterplot(x=efforts, y=scores)

# Save the current figure under DESTDIR
filename = os.path.join(DESTDIR, "students_scores_vs_effort_scatterplot.pdf")
savefigure(plt.gcf(), filename)

#### Compute the correlation

In [None]:
np.corrcoef(efforts, scores)[0,1]
# ALT. students[["effort","score"]].corr()
# np.corrcoef

## Parameter estimation using least squares

In [None]:
# Sample means of the predictor and of the response
meaneffort, meanscore = efforts.mean(), scores.mean()
# Deviations of each observation from its mean
effort_devs = efforts - meaneffort
score_devs = scores - meanscore
# Slope estimate: sum of cross-deviations over sum of squared effort deviations
num = np.sum( effort_devs*score_devs )
denom = np.sum( effort_devs**2 )
b1 = num / denom
b1

In [None]:
b0 = meanscore - b1*meaneffort
b0

In [None]:
# Evaluate the fitted line on an evenly spaced grid of effort values
es = np.linspace(5, 12, num=50)
shats = b1*es + b0
# Line first, then the observations on the same axes
sns.lineplot(x=es, y=shats)
sns.scatterplot(x=efforts, y=scores)

filename = os.path.join(DESTDIR, "students_scores_vs_effort_with_line.pdf")
savefigure(plt.gcf(), filename)

In [None]:
# # ALT.
# sns.regplot(x=efforts, y=scores, ci=None);

### Least squares optimization for the parameters

How do we find the parameter estimates of the model?

In [None]:
# Points of the fitted line, computed up front
es = np.linspace(5, 12.2, num=50)
shats = b1*es + b0
# Drawing order: residuals, then the observations, then the fitted line
plot_residuals(efforts, scores, b0, b1)
sns.scatterplot(x=efforts, y=scores)
sns.lineplot(x=es, y=shats, color="C4");

filename = os.path.join(DESTDIR, "students_scores_with_residuals.pdf")
savefigure(plt.gcf(), filename)

In [None]:
# Points of the fitted line, computed up front
es = np.linspace(5, 12.2, num=50)
shats = b1*es + b0
# Observations and line get explicit zorder values; the output of
# plot_residuals2 is drawn on the same axes
ax = sns.scatterplot(x=efforts, y=scores, zorder=4)
sns.lineplot(x=es, y=shats, color="C4", zorder=5)
plot_residuals2(efforts, scores, b0, b1, ax=ax);

filename = os.path.join(DESTDIR, "students_scores_with_residuals_squared.pdf")
savefigure(plt.gcf(), filename)

### Estimating the standard deviation parameter

In [None]:
# Model predictions for the observed efforts, and what is left over
scorehats = b1*efforts + b0
residuals = scores - scorehats
# Show the first four residuals
residuals[:4]

In [None]:
# Sum of squared residuals
SSR = np.sum( residuals**2 )
# Number of observations (rows of the data frame)
n = len(students)
# Residual degrees of freedom used as the divisor
dof = n - 2
sigmahat = np.sqrt( SSR / dof )
sigmahat

## Model diagnostics

### Residuals plots

In [None]:
scorehats = b0 + b1*efforts
residuals = scores - scorehats

In [None]:
ax = sns.scatterplot(x=efforts, y=residuals, color="red")
ax.set_ylabel("residuals ($r_i = s_i - \\hat{s}_i$)")
ax.axhline(y=0, color="b", linestyle="dashed");

filename = os.path.join(DESTDIR, "residuals_plot_vs_effort.pdf")
savefigure(plt.gcf(), filename)

In [None]:
# # ALT.
# sns.residplot(data=students, x="effort", y="score", lowess=True, color="g");

In [None]:
from statsmodels.graphics.api import qqplot
with plt.rc_context({"figure.figsize":(4,3)}):
    qqplot(residuals, line="s")

plt.xlabel("theoretical quantiles (normal)")
plt.ylabel("residuals quantiles")
filename = os.path.join(DESTDIR, "residuals_plot_vs_effort_qqplot.pdf")
savefigure(plt.gcf(), filename)

### Sum of squares quantities

#### Sum of squared residuals

In [None]:
SSR = np.sum( residuals**2 )
SSR

#### Explained sum of squares

In [None]:
meanscore = scores.mean()
ESS = np.sum( (scorehats-meanscore)**2 ) 
ESS

#### Total sum of squares

In [None]:
TSS = np.sum( (scores - meanscore)**2 )
TSS

In [None]:
SSR + ESS  # == TSS

### Coefficient of determination $R^2$

In [None]:
R2 = ESS / TSS
R2

#### Related to the Pearson correlation coefficient

In [None]:
efforts = students["effort"]
scores = students["score"]
pearson_r = efforts.corr(scores)
pearson_r**2

In [None]:
# ALT.
from scipy.stats import pearsonr
r = pearsonr(efforts, scores)[0]
r**2

## Using linear models to make predictions

In [None]:
def predict(x, b0, b1):
    """Return the linear-model prediction b0 + b1*x.

    `b0` is the intercept and `b1` the slope; `x` may be a scalar or
    anything that supports elementwise arithmetic.
    """
    return b0 + b1*x

### Confidence interval for the mean

TODO: add formulas

### Confidence interval for observations

TODO: add formulas

### Example: predicting students' scores

Predict the `score` of a new student who invests 9 hours of `effort` per week.

In [None]:
# Effort value (hours per week) at which the prediction is made
neweffort = 9
# NOTE(review): intercept and slope are passed as literals here rather than
# the fitted `b0` and `b1` -- presumably rounded estimates; confirm.
scorehat = predict(x=neweffort, b0=32.5, b1=4.5)
scorehat

#### Confidence interval for the mean score

In [None]:
# Standard error of the estimated mean score at `neweffort`
effort_mean = efforts.mean()
# Squared distance of the new effort from the mean effort
newdev = (neweffort - effort_mean)**2
# Sum of squared deviations of the observed efforts
sum_dev2 = np.sum((efforts - effort_mean)**2)
se_meanscore = sigmahat*np.sqrt(1/n + newdev/sum_dev2)
se_meanscore

In [None]:
from scipy.stats import t as tdist
alpha = 0.1
# Lower and upper quantiles of Student's t with n-2 degrees of freedom
tail_probs = [alpha/2, 1-alpha/2]
t_l, t_u = tdist.ppf(tail_probs, df=n-2)
# Interval endpoints: point prediction plus quantile times standard error
[scorehat + t_l*se_meanscore, scorehat + t_u*se_meanscore]

In [None]:
from ministats import plot_lm_simple

plot_lm_simple(efforts, scores, ci_mean=True)

filename = os.path.join(DESTDIR, "prediction_mean_score_vs_effort.pdf")
savefigure(plt.gcf(), filename)

#### Confidence interval for predicted scores

In [None]:
se_score = sigmahat*np.sqrt(1 + 1/n + newdev/sum_dev2)
se_score

In [None]:
alpha = 0.1
# Lower and upper quantiles of Student's t with n-2 degrees of freedom
tail_probs = [alpha/2, 1-alpha/2]
t_l, t_u = tdist.ppf(tail_probs, df=n-2)
# Interval endpoints: point prediction plus quantile times standard error
[scorehat + t_l*se_score, scorehat + t_u*se_score]

In [None]:
plot_lm_simple(efforts, scores, ci_obs=True)

filename = os.path.join(DESTDIR, "prediction_values_score_vs_effort.pdf")
savefigure(plt.gcf(), filename)

### Prediction caveats

In [None]:
efforts.min(), efforts.max()

It's not OK to extrapolate the validity of the model outside of the range of values where we have observed data.

For example, there is no reason to believe in the model's predictions about an `effort` of `20` hours per week:

In [None]:
predict(20, b0=32.5, b1=4.5)

Indeed, the model predicts the grade will be above 100\% which is impossible.

## Explanations

### Strategies for fitting linear models

- **Calculus**
  We can obtain the analytical formulas ...
- **Numerical optimization**
- **Linear algebra**

### Software for fitting linear models

- `scipy`
- `statsmodels`
- `scikit-learn`

### Fitting linear models with `statsmodels`

In [None]:
import statsmodels.formula.api as smf

lm1 = smf.ols("score ~ 1 + effort", data=students).fit()

In [None]:
type(lm1)

#### Estimated parameters for the model

In [None]:
lm1.params

In [None]:
type(lm1.params)

We often want to extract the intercept and slope parameters
for use in subsequent calculations.

In [None]:
# Look up the fitted coefficients by label
# (equivalently positions 0 and 1 of lm1.params)
fitted = lm1.params
b0, b1 = fitted["Intercept"], fitted["effort"]
b0, b1

The estimate $\widehat{\sigma}$ is obtained by taking the square root of the `.scale` attribute.

In [None]:
sigmahat = np.sqrt(lm1.scale)
sigmahat

#### Model fitted values

In [None]:
lm1.fittedvalues  # == scorehats

#### Residuals

In [None]:
lm1.resid  # == scores - scorehats

#### Sum of squares quantities

In [None]:
# SSR     # ESS     # TSS              # R2
lm1.ssr,  lm1.ess,  lm1.centered_tss,  lm1.rsquared

#### Predictions

Predict the `score` of a new student who invests 9 hours of `effort` per week.

In [None]:
lm1.predict({"effort":9})

In [None]:
pred = lm1.get_prediction({"effort":9})
pred.se_mean, pred.conf_int(alpha=0.1)

In [None]:
pred.se_obs, pred.conf_int(obs=True, alpha=0.1)

#### Model summary table

In [None]:
lm1.summary()

## Alternative methods for fitting linear models (optional)

### Numerical optimization

In [None]:
from scipy.optimize import minimize

def ssr(betas, xdata, ydata):
    """Sum of squared residuals of the line betas[0] + betas[1]*x.

    `betas` holds the intercept and the slope, in that order; `xdata` and
    `ydata` are the observed predictor and response values.
    """
    intercept, slope = betas[0], betas[1]
    return np.sum((ydata - (intercept + slope*xdata))**2)

optres = minimize(ssr, x0=[0,0], args=(efforts,scores))
beta0, beta1 = optres.x
beta0, beta1

### Linear algebra

linear algebra solution using `numpy`


In [None]:
import numpy as np

# Prepare the design matrix: a column of ones (for the intercept) next to
# the column of efforts. Building it in one step with `np.column_stack`
# avoids the low-level `np.ndarray` constructor, which hands back
# uninitialized memory that must then be filled in by hand.
X = np.column_stack([np.ones(n), efforts])
X

We obtain the least squares solution using the Moore–Penrose inverse formula:

$$
    \vec{\beta} = (X^{\sf T} X)^{-1}X^{\sf T}\; \mathbf{y}
$$

In [None]:
# (X^T X)^{-1} X^T, built step by step in the same order as the formula
gram = X.T.dot(X)
pseudo_inv = np.linalg.inv(gram).dot(X.T)
# Apply it to the observed scores to get the two coefficients
lares = pseudo_inv.dot(scores)
beta0, beta1 = lares
beta0, beta1

### Fitting linear models using `scipy`

The helper function `scipy.stats.linregress` ...

In [None]:
from scipy.stats import linregress

scipyres = linregress(efforts, scores)
scipyres.intercept, scipyres.slope

### Fitting linear models using `scikit-learn`

The class `sklearn.linear_model.LinearRegression` ...

In [None]:
from sklearn.linear_model import LinearRegression

# Reshape efforts into a single-column 2-D array before passing it to `fit`
effort_column = efforts.values.reshape(-1, 1)
sklmodel = LinearRegression()
sklmodel.fit(effort_column, scores)
sklmodel.intercept_, sklmodel.coef_

### Using the low-level `statsmodels` API



In [None]:
import statsmodels.api as sm

# Predictors with a constant column added, and the response
X, y = sm.add_constant(efforts), scores
# Fit ordinary least squares and read the two coefficients by label
smres = sm.OLS(y, X).fit()
fitted_params = smres.params
fitted_params["const"], fitted_params["effort"]

## Discussion

#### Examples of non-linear relationships

Here are some examples of the different possible relationships between the `effort` and `score` variables.

![nonlinear relationships](./attachments/lm/ELV_as_function_of_stats_hours.png)

## Exercises

## Links

### (bonus) Formula for standard error of coefficients

In [None]:
lm1.bse

Formula using summations

$$
    se(\beta_0) = \hat{\sigma} \cdot \sqrt{ \frac{1}{n} + \frac{\overline{x}^2}{\sum (x_i - \overline{x})^2}  }
    \qquad
    se(\beta_1) = \hat{\sigma} \cdot \sqrt{\frac{1}{\sum (x_i - \overline{x})^2}}
$$

TODO: show derivation why these formulas are equiv. to matrix formulas below when p=1

In [None]:
# Mean effort and sum of squared deviations around it
xbar = efforts.mean()
sum_dev2 = np.sum((efforts - xbar)**2)
# Standard errors of the intercept and of the slope (summation formulas)
se_Intercept = sigmahat * np.sqrt(1/n + xbar**2/sum_dev2)
se_b_effort = sigmahat/np.sqrt(sum_dev2)
se_Intercept, se_b_effort

Alternative formula using design matrix

$$
    [se(\beta_0), se(\beta_1)]
    =
    \hat{\sigma} \cdot \sqrt{ \text{diag}\left( (X^T X)^{-1} \right) }
$$

where $X$ is the design matrix.

In [None]:
# Design matrix for the model: the `effort` column with a constant added
X = sm.add_constant(students[["effort"]])
# Diagonal entries of (X^T X)^{-1}
xtx_inv = np.linalg.inv(X.T.dot(X))
inv_covs = np.diag(xtx_inv)
# Standard errors: square root of sigmahat^2 times those diagonal entries
np.sqrt(sigmahat**2 * inv_covs)