|<h2>Substack post:</h2>|<h1><a href="https://thepalindrome.org/p/the-anatomy-of-the-least-squares" target="_blank">Least squares part 1: theory, math, code</a></h1>|
|-|:-:|
|<h2>Teacher:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the post may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

In [None]:
### Run this cell only if you're using "dark mode"

# render inline figures as SVG (vector output, higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# the two colors of the dark theme: one background, one foreground
darkBackground  = '#383838'
lightForeground = '#DDE2F4'

# collect all style overrides in one dictionary...
darkModeStyle = {
    # figure and axes backgrounds
    **dict.fromkeys(('figure.facecolor','figure.edgecolor','axes.facecolor'), darkBackground),
    # spines, axis labels, tick marks, and all other text
    **dict.fromkeys(('axes.edgecolor','axes.labelcolor','xtick.color','ytick.color','text.color'), lightForeground),
    # hide the right and top spines
    'axes.spines.right': False,
    'axes.spines.top':   False,
    # bold titles and axis labels
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
}

# ...and apply them to the global matplotlib settings
plt.rcParams.update(darkModeStyle)

# Regression picture and terms

In [None]:
# some data: a line with intercept 1 and slope 1, plus Gaussian noise
# (no random seed is set in this cell, so the data and the figure differ on each run)
N = 9
x = np.linspace(-1,4,N)
y = 1 + x + np.random.randn(N)

# design matrix: a column of ones (intercept) and the regressor
X = np.vstack((np.ones(N),x)).T

# fit the model ([0] selects the coefficients from the lstsq outputs)
betas = np.linalg.lstsq(X,y,rcond=None)[0]

# get predictions
yHat = X@betas

# plt
_,axs = plt.subplots(1,1,figsize=(8,6))

# plot the data
axs.plot(x,y,'ko',markerfacecolor=[.7,.7,.9],markersize=18,alpha=.5,label='Observations ($y$)')
axs.set(xlabel='X (regressor)',ylabel='Y (DV)')

# plot the regression line
axs.plot(x,yHat,linewidth=2,label='Best-fit line')

# plot the intercept (the model prediction at x=0); high zorder keeps it on top
axs.plot(0,betas[0],'kv',markersize=20,markerfacecolor=[.3,.7,.3],label='Intercept',zorder=10)


# data-point-specific projection lines (unlabeled, so they stay out of the legend)
for i in range(N):
  axs.plot([x[i],x[i]],[y[i],yHat[i]],'--.',color='gray')

# predicted values: drawn in a single call so they create exactly one legend entry
# ('rs' specifies a marker without a line style, so the squares are not connected)
axs.plot(x,yHat,'rs',markersize=10,markerfacecolor=[.7,.3,.3],label=r'Predicted ($\hat{y}$)')


# final adjustments
axs.legend(fontsize=16) # every labeled artist is unique, so no handle filtering is needed
axs.grid(linewidth=.1)

plt.tight_layout()
plt.savefig('fig1.svg')
plt.show()

# A fun example

In [None]:
# the data: number of concerts attended (regressor) and happiness ratings (DV)
HungarianPunk = np.array([ 1, 2, 4, 5, 7 ])
happiness = np.array([ 5, 6.5, 6, 8, 9 ])
n = HungarianPunk.size # number of observations

# the plot: one figure with a single axis showing the raw observations
fig,ax = plt.subplots(figsize=(6,5))
ax.plot(HungarianPunk,happiness,'wo',markerfacecolor=[.9,.7,.7],markersize=15)
ax.set(xlabel='Hungarian punk band concerts',
       ylabel='Overall life happiness (1-10)',
       yticks=range(4,10) )

# tidy up, export, and display
fig.tight_layout()
fig.savefig('fig2.svg')
plt.show()

In [None]:
# design matrix: an intercept column of ones next to the concert counts
interceptColumn = np.ones(n)
X = np.vstack( (interceptColumn,HungarianPunk) ).T

# dependent variable
y = happiness

# least-squares solution via the left-inverse: beta = (X'X)^(-1) X'y
XtX_inv = np.linalg.inv(X.T@X)
beta = XtX_inv @ X.T @ y
print(beta)

In [None]:
# the design matrix: one row per observation, with a column of ones
# (intercept) and a column of concert counts, as built in the previous cell
X

In [None]:
# the model predictions are the design matrix times the beta coefficients
predictions = X@beta

# visualization
plt.figure(figsize=(6,5))
plt.plot(HungarianPunk,happiness,'wo',markerfacecolor=[.9,.7,.7],markersize=15,label='Observed data')
plt.plot(HungarianPunk,predictions,'ws-',markerfacecolor=[.3,.3,.7],markersize=10,label='Model-predicted values')

# residuals: a dashed vertical line from each observation to its prediction
# (negative zorder keeps the lines behind the markers). Only the first line is
# labeled, so the legend gets a single 'Residuals' entry taken from a real line.
for i in range(n):
  plt.plot([HungarianPunk[i],HungarianPunk[i]],[happiness[i],predictions[i]],
           '--',color='gray',zorder=-4,label='Residuals' if i==0 else None)

plt.legend()
plt.gca().set(xlabel='Hungarian punk band concerts',ylabel='Overall life happiness (1-10)',
              yticks=range(4,10) )

plt.tight_layout()
plt.savefig('fig3.svg')
plt.show()

In [None]:
# residual vector: observed values minus model-predicted values
residuals = np.subtract(y,predictions)

# the sum of the residuals is zero (within tolerance of computer precision)
np.sum(residuals)