In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm

from ISLP import load_data
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer

# Linear Regression

## Simple Linear Regression

In [None]:
# Load the "Boston" data set through ISLP's load_data helper and list its columns.
boston = load_data("Boston")
boston.columns

In [None]:
# Simple linear regression of medv on lstat.
# The design matrix carries an explicit column of ones because sm.OLS
# is given exactly the columns it should fit.
X = pd.DataFrame(
    {"intercept": np.ones(len(boston)), "lstat": boston["lstat"].to_numpy()}
)
y = boston["medv"].to_numpy()

model = sm.OLS(endog=y, exog=X)
results = model.fit()

In [None]:
# Display the statsmodels summary of the fitted OLS model.
results.summary()

In [None]:
# Build a design matrix for three new lstat values (5, 10, 15), with the
# same column layout as the training matrix, and request predictions.
X_test = pd.DataFrame({"lstat": [5.0, 10.0, 15.0]})
X_test.insert(loc=0, column="intercept", value=1.0)
y_hat = results.get_prediction(X_test)

In [None]:
# Point predictions, one per row of X_test.
y_hat.predicted_mean

In [None]:
# Confidence intervals for the predictions at level 1 - alpha = 95%.
y_hat.conf_int(alpha=0.05)

In [None]:
# Prediction intervals at level 1 - alpha = 95%; obs=True switches conf_int
# from the interval for the mean to the interval for a new observation.
y_hat.conf_int(obs=True, alpha=0.05)

In [None]:
# Plot

def abline(ax, b, m, *args, **kwargs):
    """Draw the straight line y = m * x + b on ax.

    The line is drawn between the two ends of the axes' current x-range,
    as reported by ax.get_xlim().

    Parameters
    ----------
    ax : axes-like object providing get_xlim() and plot()
    b : intercept of the line
    m : slope of the line
    *args, **kwargs : forwarded unchanged to ax.plot (format string, lw, ...)
    """
    x_lo, x_hi = ax.get_xlim()
    y_ends = [m * x_end + b for x_end in (x_lo, x_hi)]
    ax.plot((x_lo, x_hi), y_ends, *args, **kwargs)


# Scatter of the data with the fitted regression line on top.
ax = boston.plot.scatter(x="lstat", y="medv")
# results.params is a Series labelled by the design-matrix column names
# ("intercept", "lstat"). Look the coefficients up by label: integer keys
# on a label-indexed Series are deprecated in pandas and will stop being
# treated as positions.
abline(
    ax,
    results.params["intercept"],
    results.params["lstat"],
    "r--",
    lw=3
)

In [None]:
# Residuals against fitted values, with a dashed zero line for reference.
fig, ax = plt.subplots()
ax.scatter(results.fittedvalues, results.resid)
ax.set_xlabel("Fitted value")
ax.set_ylabel("Residual")
ax.axhline(0, c="k", ls="--");

In [None]:
# Leverage of each observation: the diagonal of the hat matrix, plotted
# against the observation's row position.
infl = results.get_influence()
fig, ax = plt.subplots()
ax.scatter(np.arange(X.shape[0]), infl.hat_matrix_diag)
ax.set(xlabel="Index", ylabel="Leverage")

# Row position of the observation with the largest leverage.
np.argmax(infl.hat_matrix_diag)

## Multiple Linear Regression

In [None]:
# Regress medv on two predictors, lstat and age, plus an explicit intercept.
X = pd.DataFrame(
    {
        "intercept": np.ones(len(boston)),
        "lstat": boston["lstat"].to_numpy(),
        "age": boston["age"].to_numpy(),
    }
)
y = boston["medv"].to_numpy()

model = sm.OLS(endog=y, exog=X)
results = model.fit()
results.summary()

In [None]:
# Backward selection based on p-values: refit on every predictor except
# indus and age (medv is the response, so it is removed from X as well).
X = boston.drop(["medv", "indus", "age"], axis="columns")
X.insert(0, "intercept", np.ones(len(X)))
y = boston["medv"].to_numpy()
model = sm.OLS(endog=y, exog=X)
results = model.fit()
results.summary()

In [None]:
# Collinearity check: variance inflation factor of every column of X
# except the first one (the intercept sits at position 0).
vals = [VIF(X, pos) for pos, _ in enumerate(X.columns) if pos > 0]
vif = pd.Series(vals, index=X.columns[1:], name="vif").to_frame()
vif

## Non-Linear and Interaction Transforms

In [None]:
# Baseline model: every predictor except indus, linear terms only.
X = boston.drop(columns=["medv", "indus"])
X.insert(loc=0, column="intercept", value=np.ones(X.shape[0]))
y = np.array(boston["medv"])
model = sm.OLS(endog=y, exog=X)
results1 = model.fit()
# Summarize the fit produced by this cell. The bare name `results` still
# points at a model fitted in an earlier cell, so it must not be used here.
results1.summary()

In [None]:
# Same predictors as the baseline model (everything except indus) ...
X = boston.drop(columns=["medv", "indus"])
X.insert(loc=0, column="intercept", value=np.ones(X.shape[0]))
y = np.array(boston["medv"])

# ... but lstat and age are expanded into all degree-2 terms (squares and
# their interaction). include_bias=False because X already has an intercept.
poly_features = ["lstat", "age"]
poly_transformer = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
# remainder="passthrough" keeps every other column of X unchanged.
col_transformer = ColumnTransformer(
    transformers=[("poly", poly_transformer, poly_features),],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

# Rebuild a labelled DataFrame from the transformer output so the summary
# table shows feature names.
X = col_transformer.fit_transform(X, y)
X = pd.DataFrame(X, columns=col_transformer.get_feature_names_out())

model = sm.OLS(endog=y, exog=X)
results2 = model.fit()
# Summarize the fit produced by this cell. The bare name `results` still
# points at a model fitted in an earlier cell, so it must not be used here.
results2.summary()

In [None]:
# ANOVA table comparing the two fitted models results1 and results2.
anova_lm(results1, results2)

In [None]:
# Residuals against fitted values for results2, with a dashed zero line.
fig, ax = plt.subplots()
ax.scatter(results2.fittedvalues, results2.resid)
ax.set_xlabel("Fitted value")
ax.set_ylabel("Residual")
ax.axhline(0, c="k", ls="--");

## Confidence vs. Prediction Intervals

### Confidence Intervals for Population Estimates

- The range of values that is expected to contain the true value of a population parameter, such as the population mean, with some specified level of confidence.
- Used to quantify uncertainty in the parameter estimates of a population, e.g. the population mean.
- Confidence intervals are computed using the standard error, the amount by which the sample estimate is expected to deviate from the true population parameter.
- For the sample mean this is given by:

$$
SE(\hat\mu) = \frac{\sigma}{\sqrt{n}}
$$

- where $\sigma$ is the **sample** standard deviation and $n$ is the sample size.
- Confidence intervals on the sample mean are then computed using the standard error and the critical t-values for the required confidence interval.

$$
CI = \bar{X} \pm t_{\alpha/2} \cdot SE(\hat\mu)
$$

- For example, the critical t-value for a 95% confidence interval ($\alpha$ = 0.05) with a sample size of $n = 30$ (29 degrees of freedom) is $\approx$ 2.05.

### Confidence Intervals in Regression Analysis

- Instead of estimating a population parameter by sampling, we might have performed a regression analysis.
- For example, to estimate the height of orange trees as a function of their age we might have measured tree heights for 30, 50, 60, and 80 days old trees. We now want to estimate the average height of a 100-day old tree without going out and sampling heights for 100-day old trees.
- Instead we can use the regression analysis to estimate the population mean height of 100-day old orange trees.
- The confidence interval on that mean is given by

$$
CI = \hat{Y} \pm t_{\alpha/2, n-2} \cdot s \cdot \sqrt{\frac{1}{n} + \frac{(X_0 - \bar{X})^2}{\sum (X_i - \bar{X})^2}}
$$

- where $s$ is the residual standard error of the estimate, given by
$$
s = \sqrt{\frac{\sum (Y_i - \hat{Y}_i)^2}{n - 2}}
$$

- Note that $(X_0 - \bar{X})$ is the difference between the point $X_0$ for which we are making the estimate and the mean value of $X$ used in the regression analysis. Therefore the confidence interval widens as the target point moves further from the sample mean. Also note that $\sum (X_i - \bar{X})^2$ captures the spread of $x$-values used in the regression analysis. More spread gives us more leverage to estimate the parameters.

- It is worth reiterating that confidence intervals in regression analysis capture the uncertainty in the population estimate, where that estimate is derived from a regression analysis. Returning to the example of orange tree heights, the confidence interval captures the uncertainty in the average height of 100-day old orange trees, not the uncertainty in the height estimate for an individual tree.

### Prediction Intervals

- Prediction intervals are used to capture the uncertainty in individual predictions. Prediction intervals for linear regression are given by:

$$
PI = \hat{Y} \pm t_{\alpha/2, n-2} \cdot s \cdot \sqrt{1 + \frac{1}{n} + \frac{(X_0 - \bar{X})^2}{\sum (X_i - \bar{X})^2}}
$$

- Comparing this against the formula for the confidence interval shows that prediction intervals are typically wider than confidence intervals, to capture the additional uncertainty in making a prediction for a single value vs. a population estimate.

### Summary

| **Confidence Interval** | **Prediction Interval** |
|------------------------|------------------------|
| Used in determining population parameters based on sample statistics. | Not used in determining population parameters based on samples. |
| Used to predict the mean response (average value of the dependent variable for a given independent variable) based on regressions. | Used to predict the future value (of an individual data point for a given independent variable) based on regressions. |
| Usually narrower for a given analysis. | Usually wider for a given analysis. |


### Resources
- [Confidence vs Prediction Intervals](https://www.datacamp.com/blog/confidence-intervals-vs-prediction-intervals?dc_referrer=https%3A%2F%2Fwww.google.com%2F)