|<h2>Substack post:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/correlation-vs-cosine-similarity" target="_blank">Correlation vs. best-fit line (regression)</a></h1>|
|-|:-:|
|<h2>Teacher:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the post may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import scipy.stats as stats

In [None]:
### Run this cell only if you're using "dark mode"

# switch inline figures to svg output (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# the theme uses just two colors: a dark panel and a light "ink"
# ('#171717' is the alternative that matches the substack background)
darkPanel = '#282a2c'
lightInk  = '#DDE2F4'

darkTheme = {}

# figure/axes backgrounds
for key in ('figure.facecolor','figure.edgecolor','axes.facecolor'):
  darkTheme[key] = darkPanel

# axis lines, labels, ticks, and all other text
for key in ('axes.edgecolor','axes.labelcolor','xtick.color','ytick.color','text.color'):
  darkTheme[key] = lightInk

# hide the right/top spines
darkTheme['axes.spines.right'] = False
darkTheme['axes.spines.top']   = False

# bold titles and axis labels
darkTheme['axes.titleweight'] = 'bold'
darkTheme['axes.labelweight'] = 'bold'

# apply everything in one go
plt.rcParams.update(darkTheme)

# Demo 1: Intuition for correlation

In [None]:
# 1) population correlations (one per panel) and the sample size
rs = [ 1,.7,.2,-.2,-.7,-1 ]
N = 188 # sample size

# 2) marker shapes and RGBA face colors; both are cycled every three panels
shapes = 'hso'
colors = [ [.9,.7,.7,.7],[.7,.9,.7,.7],[.7,.7,.9,.7] ]

# 2x3 grid: one scatter plot per population correlation
fig,axs = plt.subplots(2,3,figsize=(10,6))
for i,(rPop,ax) in enumerate(zip(rs,axs.flatten())):

  # 3) y mixes x with independent noise; the sqrt(1-r^2) weight keeps
  #    the population correlation between x and y equal to rPop
  x = np.random.randn(N)
  noise = np.random.randn(N)
  y = rPop*x + np.sqrt(1-rPop**2)*noise

  # 4) correlation actually observed in this sample
  r = np.corrcoef(x,y)[0,1]

  # 5) draw the panel, titled with the sample correlation
  k = i % 3
  ax.plot(x,y,'w'+shapes[k],markerfacecolor=colors[k],markersize=6)
  ax.set_xticks([])
  ax.set_yticks([])
  ax.set_title(f'r = {r:.2f}')


plt.tight_layout()
plt.show()

# Demo 2: Intuition for simple regression

In [None]:
# 1) ground-truth intercept/slope, plus Gaussian noise (mean 0, sd 1.5)
#    N is the sample size set in the Demo 1 cell
b0 = -3
b1 = 1.6
epsilon = np.random.normal(0,1.5,N)

# 2) draw x, then build y from the linear model
x = np.random.normal(0,1,N)
y = b0 + b1*x + epsilon

# 3) fit the line with scipy.stats and compute the predicted values
fit = stats.linregress(x,y)
slope,intercept = fit.slope,fit.intercept
yHat = intercept + slope*x


# 4) scatter of the data with the fitted line drawn underneath (zorder=-1);
#    the legend shows the estimated equation, the title shows the true one
fig,ax = plt.subplots(figsize=(9,6))
ax.plot(x,y,'wh',markerfacecolor=[.7,.9,.7,.5],label='Data')
ax.plot(x,yHat,'w',linewidth=2,zorder=-1,label=f'y = {intercept:.2f} + {slope:.2f}x')

ax.set_title(f'y = {b0} + {b1}x')
ax.legend()
plt.show()

In [None]:
# true vs. estimated parameters from the Demo 2 cell, 4 decimals each;
# the trailing \n leaves a blank line after each pair
report = [
  f'b0 (truth): {b0:.4f}',
  f'b0 (esti.): {intercept:.4f}\n',

  f'b1 (truth): {b1:.4f}',
  f'b1 (esti.): {slope:.4f}\n',
]

for line in report:
  print(line)

# Demo 3: Correlation ≠ regression

In [None]:
# One underlying sample is shifted and scaled four different ways; each panel
# shows the resulting regression slope and correlation side by side.
# NOTE: N, shapes, and colors are defined in the Demo 1 cell -- run it first.

# 1) generate the original data
popR = .7 # population correlation
x_og = np.random.randn(N)
y_og = x_og*popR + np.random.randn(N)*np.sqrt(1-popR**2)

# optional: uncomment to force the original sample to have mean 0 and std 1
# # force z-score
# x_og = (x_og-x_og.mean()) / x_og.std()
# y_og = (y_og-y_og.mean()) / y_og.std()


# 2) specify [x,y] means and stds for each simulation (one tuple per panel)
means = [ (0,0),(0,1),(4,0),(-3,-3) ]
stds  = [ (1,1),(1,2),(4,1),(33,33) ]

# 3) start the plotting!
_,axs = plt.subplots(2,2,figsize=(10,6))
for i,ax in enumerate(axs.flatten()):

  # 4) create the sample data: scale by the panel's std, then add its mean
  x = x_og*stds[i][0] + means[i][0]
  y = y_og*stds[i][1] + means[i][1]

  # 5) empirical correlation and regression params
  r = np.corrcoef(x,y)[0,1]
  slope,intercept,_,_,_ = stats.linregress(x,y)
  yHat = intercept + slope*x

  # 6) create and stylize the plot
  ax.plot(x,y,'w'+shapes[i%3],markerfacecolor=colors[i%3],markersize=6)
  ax.plot(x,yHat,'w',linewidth=3)

  # The panel letter is looked up outside the f-string: reusing the enclosing
  # quote character inside an f-string expression is a SyntaxError before
  # Python 3.12, so f'{'ABCD'[i]}...' would stop the cell from running there.
  panel = 'ABCD'[i]

  # the only tick on each axis marks that variable's sample mean
  ax.set(xticks=[x.mean()],yticks=[y.mean()],
         title=f'{panel})  $\\beta_1$={slope:.3f}, $r$={r:.3f}')
  ax.grid(linestyle='--',zorder=-10,color=[.3,.3,.3])



plt.tight_layout()
plt.show()

# Demo 4: Numerical simulations of the equations

In [None]:
# 1) draw a correlated sample (this cell sets its own sample size)
popR = .7 # population correlation
N = 123
x = np.random.randn(N)
y = x*popR + np.random.randn(N)*np.sqrt(1-popR**2)

# 2) the exact sample mean and std each variable should end up with
x_mean,x_std = 0.4,1.5
y_mean,y_std = 1,1.9

# 3) z-score each variable first (sample mean 0, std 1) ...
xz = (x-x.mean()) / x.std()
yz = (y-y.mean()) / y.std()

#    ... then stretch and shift to the requested std and mean
x = xz*x_std + x_mean
y = yz*y_std + y_mean

# 4) print a small table to confirm the sample matches the targets
table = [
  '       Mean  |   Std',
  '-------------+---------',
  f'x  |  {x.mean():.3f}  |  {x.std():.3f}',
  f'y  |  {y.mean():.3f}  |  {y.std():.3f}',
]
print('\n'.join(table))

In [None]:
# Compares library results against the textbook formulas for the correlation
# and the simple-regression parameters, using x and y from the previous cell.

# 1) sample means and mean-centered data (shared by all the formulas below)
xmean = x.mean()
ymean = y.mean()
dx = x - xmean
dy = y - ymean

# 2) correlation from numpy
r_numpy = np.corrcoef(x,y)[0,1]

# 3) manual correlation: sum(dx*dy) / ( sqrt(sum(dx^2)) * sqrt(sum(dy^2)) )
#    np.sum is used instead of Python's builtin sum(), which would iterate
#    over the array one element at a time
num = np.sum( dx*dy )
den = np.sqrt( np.sum(dx**2) ) * np.sqrt( np.sum(dy**2) )
r_manu = num/den

# 4) regression parameters from scipy.stats (slope is returned first, then intercept)
b1_sp,b0_sp,_,_,_ = stats.linregress(x,y)

# 5) manual regression parameters: slope has the same numerator as r,
#    divided by the sum of squares of x only; the intercept follows from the means
b1_man = num / np.sum(dx**2)
b0_man = ymean - b1_man*xmean

# 6) beta1 from r: rescale the correlation by the ratio of standard deviations
#    (both stds use the same normalization, so it cancels in the ratio)
b1_r = r_manu * (y.std()/x.std())

# 7) print the results
print(f'Correlation (manual): {r_manu:.5f}')
print(f' Correlation (numpy): {r_numpy:.5f}\n')

print(f'     beta-1 (manual): {b1_man:.5f}')
print(f'      beta-1 (stats): {b1_sp:.5f}')
print(f'       beta-1 from r: {b1_r:.5f}\n')

print(f'  Intercept (manual): {b0_man:.5f}')
print(f'   Intercept (stats): {b0_sp:.5f}')