<a href="https://colab.research.google.com/github/mikexcohen/Substack/blob/main/MLonLLMs/effectiveDimensionality_GPT2xl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

|<h2>Substack post:</h2>|<h1><a href="https://mikexcohen.substack.com/p/effective-dimensionality-analysis" target="_blank">Effective dimensionality analysis of LLM transformers</a></h1>|
|-|:-:|
|<h2>Teacher:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the post may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import gridspec
import requests

# pytorch libraries
import torch
import torch.nn.functional as F

# huggingface libraries for GPT2
from transformers import AutoModelForCausalLM, GPT2Tokenizer

In [None]:
### Optional cell: run it only when the notebook is displayed with a dark theme

# render inline figures as SVG (sharper than the default raster output)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# the theme uses one background color and one foreground color
darkBG  = '#171717' # figure and axes background
lightFG = '#DDE2F4' # axes lines, labels, ticks, and text

# assemble the rcParams overrides group by group
darkTheme = {}
darkTheme.update({ key:darkBG  for key in ('figure.facecolor','figure.edgecolor','axes.facecolor') })
darkTheme.update({ key:lightFG for key in ('axes.edgecolor','axes.labelcolor','xtick.color','ytick.color','text.color') })
darkTheme.update({ 'axes.spines.right':False, 'axes.spines.top':False })    # hide right/top spines
darkTheme.update({ 'axes.titleweight':'bold', 'axes.labelweight':'bold' })  # bold titles and labels

plt.rcParams.update(darkTheme)

# Concept of effective dimensionality

In [None]:
# simulate data: x is uniform on [0,5]; y is small Gaussian noise (one value per x)
x = np.random.uniform(low=0,high=5,size=50)
y = np.random.normal(loc=0,scale=.07,size=len(x))

# left panel: points exactly on the diagonal; right panel: the same points plus noise
_,axs = plt.subplots(1,2,figsize=(9,3.5))
panels = [ (x,  [.9,.7,.7,.5]),
           (x+y,[.7,.9,.9,.5]) ]

for a,(yvals,faceColor) in zip(axs,panels):
  a.plot(x,yvals,'wo',markerfacecolor=faceColor,markersize=10)
  a.axis('square')
  a.set(xlabel='x',ylabel='y')
  a.plot([0,5],[0,5],color='gray',zorder=-2) # reference diagonal, drawn behind the markers

plt.tight_layout()
plt.show()

# Dimensionality analysis demo

In [None]:
# 1) random tall matrix with m rows and r columns (r is also its rank)
m,r = 10,2
v = np.random.randn(m,r)

# 2) multiplying by the transpose gives an m-by-m matrix that still has rank r
M = np.matmul(v,v.T)

# 3) singular values of M (the second output of the SVD) for the dimensionality analysis
s = np.linalg.svd(M)[1]
s

In [None]:
# numerical rank of M as computed by numpy; it should match r,
# the number of columns used to build M
np.linalg.matrix_rank(M)

In [None]:
# percent of total variance carried by each component, and its running total
pctExplained = 100 * s**2 / np.sum(s**2)
cumu_var = np.cumsum(pctExplained)

# index of the first component at which the running total exceeds 95%
effective_dim = np.flatnonzero(cumu_var>95)[0]


# one row of three panels: the matrix, the scree plot, the cumulative variance
_,axs = plt.subplots(1,3,figsize=(12,3.5))
axMat,axScree,axCumu = axs

# panel 1: image of the matrix
axMat.imshow(M,vmin=-3,vmax=3)
axMat.set(title=f'Rank-{np.linalg.matrix_rank(M)} {m}$\\times${m} matrix',
          xticks=range(m),yticks=range(m))

# panel 2: singular values
axScree.plot(s,'wh',markersize=10,markerfacecolor=[.7,.9,.7])
axScree.set(xlabel='Principal component',ylabel='Singular value',xticks=range(m),
            title='Scree plot')

# panel 3: cumulative variance, with dashed guides meeting at the threshold-crossing component
varAtCrossing = cumu_var[effective_dim]
axCumu.plot([effective_dim,effective_dim],[0,varAtCrossing],'--',color='gray')
axCumu.plot([-1,effective_dim],[varAtCrossing,varAtCrossing],'--',color='gray')
axCumu.plot(cumu_var,'ws',markersize=10,markerfacecolor=[.9,.7,.7])

# effective_dim is a 0-based index, so the title reports effective_dim+1 components
axCumu.set(xlabel='Principal component',ylabel='% total variance explained',xticks=range(m),
           xlim=[-.5,m+.5],ylim=[cumu_var[0]-3,103],title=f'Effective dimensionality = {effective_dim+1}')

plt.tight_layout()
plt.show()

# Import GPT2 and tokenizer

In [None]:
# pretrained model and its tokenizer, both loaded from the same checkpoint name
modelName = 'gpt2-xl'
model = AutoModelForCausalLM.from_pretrained(modelName)
tokenizer = GPT2Tokenizer.from_pretrained(modelName)

# use the first GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

# move the model to that device and switch it to evaluation mode
model.to(device)
model.eval()

In [None]:
# Alice's Adventures in Wonderland (Project Gutenberg ebook #11)
url = 'https://www.gutenberg.org/cache/epub/11/pg11.txt'
# url = 'https://pigeonsarentreal.co.uk/' # a funny website, used in Demo 2

# import text; stop on an HTTP error (or a hung connection) instead of
# silently tokenizing an error page
response = requests.get(url,timeout=60)
response.raise_for_status()
text = response.text

# tokenize the whole text, then keep model.config.n_ctx tokens starting at token 10000
allTokens = tokenizer.encode(text,return_tensors='pt')
tokens = allTokens[:,10000:10000+model.config.n_ctx]

# preview the first 100 selected tokens as text
print(tokenizer.decode(tokens[0][:100]))

In [None]:
# the same tokens in a random order: draw one random permutation of the positions
randidx = torch.randperm(tokens.shape[1])

# reorder the single sequence and restore the leading batch dimension
tokensShuffle = tokens[0][randidx].unsqueeze(0)

# preview the first 100 shuffled tokens as text
print(tokenizer.decode(tokensShuffle[0,:100]))

In [None]:
# forward pass for the real and the shuffled sequence, requesting the hidden states
# (~3 mins with gpt2-xl on CPU, or <1s on GPU, lol)
with torch.no_grad():
  outputs_real,outputs_shuf = [ model(toks.to(device),output_hidden_states=True)
                                for toks in (tokens,tokensShuffle) ]

# shape of the first hidden-state tensor
outputs_real.hidden_states[0].shape

In [None]:
# number of hidden-state tensors returned by the model
# (presumably the embeddings output plus one per transformer block — confirm in the transformers docs)
numHidden = len(outputs_real.hidden_states)
numHidden

# Effective dimensionality analysis

In [None]:
## (~1 min)

# percent of total variance that the leading components must exceed
# for a dimension to count as "occupied"
threshold = 95

# the two conditions run through the same pipeline; the position in this tuple is
# the index into the last axis of cumu_var and the column of effective_dim
# (0 = real tokens, 1 = shuffled tokens)
conditions = (outputs_real,outputs_shuf)

# initialize:
#   cumu_var      is layers x components x conditions (cumulative % variance explained);
#                 its middle size is the second dimension of a hidden-state tensor and
#                 must equal the number of singular values returned below
#   effective_dim is layers x conditions (number of components needed to exceed threshold)
cumu_var = np.zeros((numHidden,outputs_real.hidden_states[4].shape[1],len(conditions)))
effective_dim = np.zeros((numHidden,len(conditions)),dtype=int)


# loop over layers
for layeri in range(numHidden):

  # identical analysis for real and shuffled tokens
  for condi,outputs in enumerate(conditions):

    # 1) extract all the activations from this layer (assuming no batches!)
    acts = outputs.hidden_states[layeri].squeeze().cpu().numpy()

    # 2) mean-center the activations
    #    (out-of-place on purpose: .numpy() shares memory with a CPU tensor, so "-="
    #    would overwrite the stored hidden states when the model runs on the CPU)
    acts = acts - acts.mean(axis=0,keepdims=True)

    # 3) get singular values (compute_uv=False skips the unused U and Vh matrices)
    s = np.linalg.svd(acts,compute_uv=False)

    # 4) percent explained (cumulative)
    pctExplained = 100 * s**2 / np.sum(s**2)
    cumu_var[layeri,:,condi] = np.cumsum(pctExplained)

    # 5) count the components until the variance threshold is exceeded:
    #    np.where gives the 0-based index of the first crossing, so the count is index+1
    #    (applied to both conditions so that they are directly comparable)
    effective_dim[layeri,condi] = np.where(cumu_var[layeri,:,condi]>threshold)[0][0] + 1



In [None]:
_,axs = plt.subplots(1,2,figsize=(9,3))

# activations from one example layer (index 5) for the real tokens (assuming no batches)
acts = outputs_real.hidden_states[5].squeeze().cpu().numpy()

# mean-center out-of-place: .numpy() shares memory with a CPU tensor, so "-=" would
# overwrite outputs_real.hidden_states[5] when the model runs on the CPU
acts = acts - acts.mean(axis=0,keepdims=True)

s = np.linalg.svd(acts,compute_uv=False) # get singular values (U and Vh are not needed)
pctExplained = 100 * s**2 / np.sum(s**2) # percent explained

# left panel: percent variance explained by each of the first 100 components
axs[0].plot(pctExplained,'wh-',linewidth=.3,markersize=8,markerfacecolor=[.7,.9,.7,.7])
axs[0].set(xlim=[-1,100],xlabel='Component number',ylabel='Percent variance explained')

# right panel: cumulative percent variance explained
axs[1].plot(np.cumsum(pctExplained),'ws',linewidth=.2,markersize=8,markerfacecolor=[.7,.7,.9,.7])

# NOTE(review): the guide lines are hard-coded; presumably component 53 is where this
# curve crosses 80% for this text and layer — confirm if the text, model, or layer changes
axs[1].axhline(80,linestyle='--',color='gray')
axs[1].axvline(53,linestyle='--',color='gray')
axs[1].set(xlim=[-1,100],xlabel='Component number',ylabel='Cumulative % var. explained')

plt.tight_layout()
plt.show()

# Visualization

In [None]:
# one panel per condition: real tokens on the left, shuffled tokens on the right
fig,axs = plt.subplots(1,2,figsize=(10,3.4))

# map the layer index onto a color, and wrap the mapping so it can feed a colorbar
norm = mpl.colors.Normalize(vmin=0,vmax=numHidden)
sm = mpl.cm.ScalarMappable(cmap=mpl.cm.rainbow,norm=norm)

# panel titles, in the same order as the last axis of cumu_var
panelTitles = ['(Real) variance explained','(Shuffled) variance explained']

for condi,(ax,panelTitle) in enumerate(zip(axs,panelTitles)):

  # one cumulative-variance curve per layer, colored by layer index
  for layeri in range(numHidden):
    ax.plot(cumu_var[layeri,:,condi],color=mpl.cm.rainbow(norm(layeri)))

  # dashed line at the variance threshold
  ax.axhline(threshold,linestyle='--',color='gray')

  # colorbar relating curve color to layer index
  cbar = plt.colorbar(sm,ax=ax)
  cbar.set_label(r'Hidden layer')

  # labels and limits (zoomed in on the first 500 components and the top half of the range)
  ax.set(xlabel='Component number',ylabel='% explained (cumulative)',ylim=[50,100.5],xlim=[-2,500],title=panelTitle)

plt.tight_layout()
plt.show()

In [None]:
## plot the "effective subspace dimensionality" of each layer
plt.figure(figsize=(10,3.5))

# column 1 of effective_dim holds the shuffled tokens, column 0 the real tokens
plt.plot(effective_dim[:,1],'ws',linewidth=.1,markerfacecolor=[.9,.7,.7,.7],markersize=8,label='Shuffled tokens')
plt.plot(effective_dim[:,0],'wo',linewidth=.1,markerfacecolor=[.7,.9,.7,.7],markersize=8,label='Real tokens')
plt.legend()

# NOTE(review): s is whatever the most recently executed SVD left behind; the title counts
# its strictly positive singular values — confirm that is the intended reference
plt.gca().set(xlabel='Hidden layer',ylabel='Number of dimensions',xlim=[-1,numHidden],
              title=f'"Effective dimensionality" ({threshold}% variance)\n(max possible = {sum(s>0)})' )

plt.show()

In [None]:
# express each effective dimensionality as a percentage of the number of singular values in s
effective_dimP = 100*effective_dim / len(s)

# new figure with a single set of axes
ax = plt.figure(figsize=(10,3.5)).gca()

# (column of effective_dimP, marker style, marker face color, legend label); shuffled is drawn first
series = [ (1,'ws',[.9,.7,.7,.7],'Shuffled tokens'),
           (0,'wo',[.7,.9,.7,.7],'Real tokens') ]
for condi,markerStyle,faceColor,legendLabel in series:
  ax.plot(effective_dimP[:,condi],markerStyle,linewidth=.1,markerfacecolor=faceColor,markersize=8,label=legendLabel)

ax.legend()
ax.set(xlabel='Hidden layer',ylabel='Percent total dimensionality',
       title=f'"Effective dimensionality" ({threshold}% variance)\n(max possible = {sum(s>0)})',
       xlim=[-1,numHidden])

plt.show()