In [None]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

## Data generating process

In [None]:
class DataGeneration():
    """
    Synthetic data source: noisy observations of a randomly drawn polynomial.

    On construction the object draws `Nd` abscissas uniformly in
    [xmin, xmax] (stored sorted, as a column), draws `poldeg+1` polynomial
    coefficients uniformly in [-5, 5], and produces a first noisy data set.

    Attributes
    ----------
    rng : numpy Generator seeded with `seed`; shared by all random draws.
    xdata : array of shape (Nd, 1), sorted along axis 0.
    true_params : array of shape (poldeg+1,), coefficients in increasing order
        of degree (the convention of numpy's `polyval`).
    sigma_error : standard deviation of the Gaussian measurement noise.
    ydata : array of shape (Nd, 1), the first noisy measurement.
    """

    def __init__(self, Nd=8, poldeg=2, xmin=0.0, xmax=0.5, error=0.1, seed=None):
        self.rng = np.random.default_rng(seed=seed)
        # The order of the random draws (x values, then coefficients, then
        # noise) fixes what a given seed reproduces.
        x_unsorted = self.rng.uniform(low=xmin, high=xmax, size=(Nd, 1))
        self.xdata = np.sort(x_unsorted, axis=0)
        n_coeffs = poldeg + 1
        self.true_params = self.rng.uniform(low=-5.0, high=5, size=n_coeffs)
        self.sigma_error = error
        self.ydata = self.measurement()

    def true_model(self, params, xdata):
        """Evaluate the noise-free polynomial with coefficients `params` at `xdata`."""
        return np.polynomial.polynomial.polyval(xdata, params)

    def measurement(self):
        """
        Return a fresh noisy data set of shape (Nd, 1) at the stored x values.

        Every call consumes new random numbers from `self.rng`.
        """
        n_points = len(self.xdata)
        noise = self.rng.normal(0, self.sigma_error, n_points)
        noiseless = self.true_model(self.true_params, self.xdata)
        return noiseless + noise.reshape(-1, 1)

In [None]:
# Experiment configuration: sampling interval, number of data points,
# degree of the true polynomial, and the RNG seed that makes the run reproducible.
xmin, xmax = 0.0, 0.5
Nd = 8
poldeg = 2
seed = 42

# The measurement-noise level is left at the class default.
process = DataGeneration(seed=seed, xmin=xmin, xmax=xmax, poldeg=poldeg, Nd=Nd)

In [None]:
# Noisy observations (with error bars of size sigma_error) on top of the
# noise-free polynomial evaluated on a dense grid.
fig, ax = plt.subplots(1, 1)
x_plot = np.linspace(xmin, xmax, 100)
y_plot = process.true_model(process.true_params, x_plot)
ax.errorbar(
    process.xdata.flatten(),
    process.ydata.flatten(),
    yerr=process.sigma_error,
    fmt='o',
    label=r'$\mathcal{D}$',
)
ax.plot(x_plot, y_plot, 'r-', label='True model')
ax.set(xlabel=r'$x$', ylabel=r'$y$')
ax.legend(loc='best');

## Linear Regression class

In [None]:
class LinearRegression:
    r"""
    Ordinary least-squares fit of a polynomial model y = X theta.

    The fit is performed on construction by solving the normal equation.

    Parameters
    ----------
    xdata, ydata : array-like
        Independent and dependent data; both are flattened to columns and
        must contain the same number of values.
    poldeg : int
        Degree of the polynomial; the model has poldeg+1 parameters.

    Attributes
    ----------
    Nd, Np : number of data points and of parameters.
    design_matrix : array of shape (Nd, Np) with columns x**0 ... x**poldeg.
    theta_opt : array of shape (Np, 1), least-squares parameters.
    XTXinv : array of shape (Np, Np), inverse of X^T X.
    residuals_opt : array of shape (Nd, 1), ydata minus the fitted values.
    variance_opt : float, unbiased estimate of the noise variance.
    """

    def __init__(self, xdata, ydata, poldeg):
        self.xdata = np.array(xdata).reshape(-1, 1)
        self.ydata = np.array(ydata).reshape(-1, 1)
        self.poldeg = poldeg
        self.Np = poldeg+1
        # Count samples on the reshaped column, so that e.g. a (1, N) input
        # is seen as N samples rather than 1.
        self.Nd = self.xdata.shape[0]
        if self.ydata.shape[0] != self.Nd:
            raise ValueError(
                f"xdata and ydata must have the same number of values "
                f"(got {self.Nd} and {self.ydata.shape[0]})"
            )
        assert self.Nd > self.Np, "Must have more data than parameters. Aborting"
        self.design_matrix = self.create_polynomial_design_matrix(self.xdata)
        self.theta_opt, self.XTXinv = self.solve_normal_equation()
        self.residuals_opt = self.ydata - self.predict(self.theta_opt, self.xdata)
        self.variance_opt = self.estimate_variance_opt()

    def create_polynomial_design_matrix(self, xdata):
        """
        Create a design matrix for a polynomial model, and return it.

        The result has one row per value in `xdata` and columns
        x**0, x**1, ..., x**poldeg.
        """
        xdata = np.array(xdata).reshape(-1,1)
        return np.hstack([xdata**deg for deg in range(self.poldeg+1)])

    def predict(self, theta, xpred):
        r"""
        Perform a prediction, y_pred = X_pred \theta.

        `xpred` is array-like and is flattened to a column; the return value
        is the matrix product of its design matrix with `theta`.
        """
        Xpred = self.create_polynomial_design_matrix(xpred)
        ypred = np.matmul(Xpred, theta)
        return ypred

    def solve_normal_equation(self):
        """
        Solve the normal equation X^T X theta = X^T y.

        Returns
        -------
        theta : array of shape (Np, 1)
        XTXinv : array of shape (Np, Np), the inverse of X^T X
        """
        X = self.design_matrix
        # matrix-matrix and matrix-vector operations
        XTX = np.matmul(X.T, X)
        XTy = np.matmul(X.T, self.ydata)
        # Explicit inverse of X^T X; it is kept because its diagonal sets the
        # parameter variances used by the confidence intervals.
        XTXinv = np.linalg.inv(XTX)
        # solution of the normal equation
        theta = np.matmul(XTXinv,XTy)
        return theta, XTXinv

    def estimate_variance_opt(self):
        """
        Unbiased estimator of the variance

        s^2 = (sum of squared residuals) / (Nd - Np); dividing by the degrees
        of freedom rather than Nd removes the bias from having fitted Np
        parameters. Returns a float.
        """
        dof = self.Nd - self.Np
        return float(np.sum(self.residuals_opt**2) / dof)

    @staticmethod
    def _check_alpha(alpha):
        """Raise ValueError unless 0 < alpha < 1."""
        if not 0.0 < alpha < 1.0:
            raise ValueError(f"alpha must lie strictly between 0 and 1, got {alpha}")

    def _interval(self, critical_value, sigma):
        """Return (lo, hi) = theta_opt -/+ critical_value * sigma * sqrt(diag(XTXinv)) as 1-D arrays."""
        std_err = sigma * np.sqrt(np.diag(self.XTXinv))
        half_width = critical_value * std_err
        # Flatten so the bounds compare element-wise with a 1-D parameter vector.
        theta = self.theta_opt.flatten()
        return theta - half_width, theta + half_width

    def confidence_interval_estimate_sigma(self,alpha):
        """
        1-alpha confidence interval when sigma must be estimated from data

        Uses the Student-t quantile with Nd-Np degrees of freedom together
        with sqrt(variance_opt) as the noise level.

        Returns
        -------
        (lo, hi) : two arrays of shape (Np,), one bound per parameter.

        Raises
        ------
        ValueError : if alpha is not strictly between 0 and 1.
        """
        self._check_alpha(alpha)
        dof = self.Nd - self.Np
        t_crit = stats.t.ppf(1.0 - alpha/2.0, dof)
        return self._interval(t_crit, np.sqrt(self.variance_opt))

    def confidence_interval_known_sigma(self,alpha, sigma):
        """
        1-alpha confidence interval when sigma is known

        Uses the standard-normal quantile and the supplied noise standard
        deviation `sigma`.

        Returns
        -------
        (lo, hi) : two arrays of shape (Np,), one bound per parameter.

        Raises
        ------
        ValueError : if alpha is not strictly between 0 and 1.
        """
        self._check_alpha(alpha)
        z_crit = stats.norm.ppf(1.0 - alpha/2.0)
        return self._interval(z_crit, sigma)

### Optimal parameters, estimated variance, confidence interval

### Resample data

## Overfit model

## Confidence intervals

In [None]:
# Fit a polynomial of the true degree to the generated data set.
# NOTE(review): no visible earlier cell assigns `LinearModel`, so it is created
# here to keep this and the following cells working on Restart & Run All.
LinearModel = LinearRegression(process.xdata, process.ydata, poldeg)
LinearModel.theta_opt

In [None]:
# Confidence interval at alpha = 0.05, with sigma estimated from the data.
LinearModel.confidence_interval_estimate_sigma(0.05)

In [None]:
# Confidence interval at alpha = 0.05, using the noise level of the data generator as the known sigma.
LinearModel.confidence_interval_known_sigma(0.05,process.sigma_error)

In [None]:
# Coefficients used to generate the data, for comparison with the intervals.
process.true_params

### Test confidence intervals

In [None]:
# Coverage check: repeatedly draw a new noisy data set at the same x values,
# refit, and record per parameter whether the true value falls strictly inside
# each of the two confidence intervals.
alpha = 0.05
N_future_data = 10000
N_pars = len(process.true_params)
in_CI = np.zeros((N_future_data, N_pars))
in_CI_known_sigma = np.zeros((N_future_data, N_pars))
theta_true = process.true_params
for idata in range(N_future_data):
    LinearModel_i = LinearRegression(process.xdata, process.measurement(), N_pars-1)

    # interval with sigma estimated from the resampled data
    lo, hi = LinearModel_i.confidence_interval_estimate_sigma(alpha)
    in_CI[idata, :] = (theta_true > lo) & (theta_true < hi)

    # interval with the generator's noise level taken as known
    lo, hi = LinearModel_i.confidence_interval_known_sigma(alpha, process.sigma_error)
    in_CI_known_sigma[idata, :] = (theta_true > lo) & (theta_true < hi)

In [None]:
# Fraction of resampled data sets whose interval (estimated sigma) contained each true parameter.
in_CI.sum(axis=0) / N_future_data

In [None]:
# Fraction of resampled data sets whose interval (known sigma) contained each true parameter.
in_CI_known_sigma.sum(axis=0) / N_future_data

### Overfit model: Confidence interval

In [None]:
# Fit a degree-5 polynomial (6 parameters) to the same data set and show the
# resulting least-squares parameters.
poldeg_overfit = 5
LinearModel_overfit = LinearRegression(process.xdata, process.ydata, poldeg_overfit)
LinearModel_overfit.theta_opt

In [None]:
# Print each parameter of the overfit model next to its confidence interval
# (sigma estimated from the data; `alpha` comes from the coverage-test cell).
theta_overfit = LinearModel_overfit.theta_opt.ravel()
lo, hi = LinearModel_overfit.confidence_interval_estimate_sigma(alpha)
for ipar in range(len(theta_overfit)):
    par = theta_overfit[ipar]
    print(f'par {ipar}: {par:7.2f} {100*(1-alpha):.0f}% CI = [{lo[ipar]:.2f}, {hi[ipar]:.2f}]')