# Data Mining in Physics - Presentation 3. - DAGUR 5.3: Standard errors and confidence intervals

In [None]:
library(microbenchmark)
library(graphics)
library(ggplot2)

In [None]:
# Set the `jupyter.plot_scale` option to 1.4 to enlarge inline figures
# (presumably read by the notebook's R kernel when rendering plots -- confirm)
options(jupyter.plot_scale = 1.4)

## 1. Load and fit the `DAAG::roller` dataset again

In [None]:
library(lattice)
library(DAAG)

In [None]:
# Load the `roller` dataset that ships with the DAAG package
df <- DAAG::roller
# Pull the predictor (weight) and the response (depression) out as vectors
x <- df[["weight"]]
y <- df[["depression"]]

In [None]:
# Simple linear regression of depression on weight, i.e. depression(weight)
roller.lm <- lm(formula = depression ~ weight, data = df)
# Display the model summary (includes the coefficient table with the
# estimates and their standard errors used further below)
summary(roller.lm)

## 2. Calculate confidence interval

For a coefficient $b$ of a linear regression, the $95\%$ confidence interval is:
\begin{equation}
L_{\mathrm{conf}}
=
b \pm t_{97.5\%} \sigma_{b}
\end{equation}
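- $b$ : estimated regression coefficient (in the code below, the slope, i.e. the coefficient of `weight`)
- $t_{97.5\%}$ : $97.5\%$ quantile of the t-distribution with the model's residual degrees of freedom ($n - 2 = 8$ here)
- $\sigma_{b}$ : standard error of $b$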

In [None]:
# 95% confidence interval for the slope (the `weight` coefficient):
#   b +/- t_{97.5%} * SE(b)
# Standard error of the slope, looked up by name in the coefficient table
# rather than by position, so it cannot silently pick the wrong cell
SEb <- summary(roller.lm)$coefficients["weight", "Std. Error"]
# Take the t quantiles at the model's own residual degrees of freedom
# (the value previously hard-coded as 8), so the interval stays correct
# if the model is refitted on data with a different number of rows
coef(roller.lm)["weight"] +
  qt(c(0.025, 0.975), df.residual(roller.lm)) * SEb

In [None]:
# Fitted values at the observed weights together with their standard errors.
# With se.fit = TRUE, predict() returns a list; its `se.fit` and
# `residual.scale` components are used in the following cells (SE, SE.OBS).
fit.with.se <- predict(object = roller.lm, se.fit = TRUE)

In [None]:
# SE: standard error of each fitted value
fit.with.se[["se.fit"]]

In [None]:
# SE.OBS: precision of predicting a single observation --
# the fitted-value SE and the residual scale combined in quadrature
with(fit.with.se, sqrt(se.fit^2 + residual.scale^2))

In [None]:
## Plot depression vs weight, with 95% pointwise confidence bounds for the
## fitted line. Uses `df`, the same data frame the model was fitted on.
plot(depression ~ weight, data = df,
     xlab = "Weight of Roller [t]",
     ylab = "Depression in Lawn [mm]", pch = 16)

# Draw the fitted line straight from the model object; the previous
# `roller.lm$coef` only worked through partial matching of `$coefficients`
abline(reg = roller.lm, lty = 1)

# Evaluate the confidence band on a grid of "pretty" weight values
# covering the observed range
xy <- data.frame(weight = pretty(df$weight, 20))
yhat <- predict(roller.lm, newdata = xy,
                interval = "confidence", level = 0.95)
ci <- data.frame(lower = yhat[, "lwr"], upper = yhat[, "upr"])

# Lower and upper bounds as dashed grey curves
lines(xy$weight, ci$lower, lty = 2, lwd = 2, col = "grey")
lines(xy$weight, ci$upper, lty = 2, lwd = 2, col = "grey")