<a href="https://colab.research.google.com/github/mikexcohen/Substack/blob/main/Correlation_vs_cosineSimilarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

|<h2>Substack post:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/correlation-vs-cosine-similarity?r=6bsj8n&utm_campaign=post&utm_medium=web&showWelcomeOnShare=true" target="_blank">Correlation vs. cosine similarity</a></h1>|
|-|:-:|
|<h2>Teacher:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the text may lead to confusion or errors.</i>

In [None]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt

# import cosine similarity function
from scipy import spatial

# Illustration of data simulation


In [None]:
# target (population-level) correlations, one per panel
popCors = [-.7,0,.7]

# number of observations in each simulated dataset
N = 90

# a single row holding one panel per target correlation
_,axs = plt.subplots(1,3,figsize=(12,4))

# simulate one dataset per target correlation and draw it in its own panel
for ax,popr in zip(axs,popCors):

  # x is standard normal; y blends x with independent standard-normal noise,
  # weighted so the data are sampled with population correlation popr
  x = np.random.randn(N)
  noise = np.random.randn(N)
  y = popr*x + np.sqrt(1-popr**2)*noise

  # the title reports the correlation measured in this particular sample
  sampleR = np.corrcoef(x,y)[0,1]

  ax.plot(x,y,'ko',markersize=10,markerfacecolor=(.2,.6,.2),alpha=.5)
  ax.set_xlabel('Data "x"')
  ax.set_ylabel('Data "y"')
  ax.set_title(f'Correlation = {sampleR:.2f}')

plt.tight_layout()
plt.show()

# Systematic comparison of correlation and cosine similarity

In [None]:
# population correlation coefficients to sweep over
popRs = np.linspace(-1,1,100)

# number of observations per simulated dataset
N = 500

# results: one row per population correlation;
# column 0 = Pearson correlation, column 1 = cosine similarity
corrs = np.zeros((len(popRs),2))


# one simulated dataset per population correlation
for row,popr in enumerate(popRs):

  ### draw x, then build y as a weighted mix of x and independent noise
  x = np.random.randn(N)
  noise = np.random.randn(N)
  y = popr*x + np.sqrt(1-popr**2)*noise

  ## shift x away from zero (mean offset applied to one variable only)
  x = x-10


  ### Pearson correlation: normalized dot product of the mean-centered data
  # (the same quantity as np.corrcoef(x,y)[0,1])
  xc = x-x.mean()
  yc = y-y.mean()
  ssx = np.sum( xc**2 )
  ssy = np.sum( yc**2 )
  corrs[row,0] = np.sum( xc*yc ) / np.sqrt(ssx*ssy)


  ### cosine similarity: the same formula but WITHOUT mean-centering
  # (the same quantity as 1-spatial.distance.cosine(x,y))
  ssx = np.sum( x**2 )
  ssy = np.sum( y**2 )
  corrs[row,1] = np.sum( x*y ) / np.sqrt(ssx*ssy)




## plot both measures against the population correlation, and against each other
_,axs = plt.subplots(1,2,figsize=(10,4.5))

# marker settings shared by every scatter in this figure
markerOpts = dict(markersize=10,alpha=.5)

# left panel: each measure as a function of the population correlation
axs[0].plot(popRs,corrs[:,0],'rs',markerfacecolor=(.9,.3,.3),label='Correlation',**markerOpts)
axs[0].plot(popRs,corrs[:,1],'bo',markerfacecolor=(.3,.3,.9),label='Cosine sim.',**markerOpts)

# right panel: cosine similarity directly against correlation
axs[1].plot(corrs[:,0],corrs[:,1],'ks',markerfacecolor=(.2,.6,.2),**markerOpts)

# dashed gray reference lines through zero in both panels
for ax in axs:
  ax.axhline(y=0,color='gray',linestyle='--')
  ax.axvline(x=0,color='gray',linestyle='--')

# left panel annotations
axs[0].legend()
axs[0].set_xlabel('Population correlation')
axs[0].set_ylabel(r'Measured (sample) $r$ or $S_C$')
axs[0].set_title(r'Correlation and cosine sim.')

# right panel annotations; the title reports how strongly the two measures
# correlate with each other across the simulated datasets
measuresR = np.corrcoef(corrs.T)[1,0]
axs[1].set(xlim=[-1.1,1.1],ylim=[-1.1,1.1],xlabel='Correlation',ylabel='Cosine similarity')
axs[1].set_title(f'r={measuresR:.2f}')


plt.tight_layout()
plt.show()