In [None]:
import pickle
import os
import numpy as np

from transformers import AutoModel, AutoModelForCausalLM, AutoModelForMaskedLM, AutoModelForSeq2SeqLM
from transformers import BertModel, AlbertModel, DistilBertModel, RobertaModel, OpenAIGPTModel, GPT2Model

In [None]:
def rank_layers(d,h,dh,Wq,Wk):
    """
    Rank of the query-key product, for the whole layer and for every attention head.

    - d (int): embedding dimension (kept for interface compatibility; the heads are
      enumerated from `h` and `dh`, not from `d`)
    - h (int): number of attention heads
    - dh (int): head dimension
    - Wq (array): query weights whose COLUMNS are grouped into heads of width `dh`
    - Wk (array): key weights whose ROWS are grouped into heads of height `dh`

    Returns (RankFull, RankHeads): the rank of `Wq @ Wk` and a float array of
    length `h` holding the rank of each per-head product.
    """
    RankFull = np.linalg.matrix_rank(Wq @ Wk)
    RankHeads = np.zeros(h)

    # Enumerate heads by index so the loop always matches the size of RankHeads.
    # Striding over range(0, d, dh) visits d // dh heads, which differs from h
    # whenever the projection width h * dh is not equal to d: trailing heads were
    # then left at zero (h * dh > d) or an IndexError was raised (h * dh < d).
    for j in range(h):
        head = j * dh
        M = Wq[:, head: head + dh] @ Wk[head : head + dh,:]
        RankHeads[j] = np.linalg.matrix_rank(M)

    return  RankFull, RankHeads

In [None]:
def getscoresBERT(d,l,h,dh,model):
    """
    Per-layer ranks of the query-key product for a model exposing
    `model.encoder.layer[i].attention.self.{query,key}`.

    - d, h, dh: forwarded unchanged to `rank_layers`
    - l (int): number of layers to read, indexed 0 .. l-1
    - model: object providing the attribute path above

    Returns (full_ranks, head_ranks): arrays of shape (l,) and (l, h).
    """
    full_ranks = np.zeros(l)
    head_ranks = np.zeros((l,h))

    for idx in range(l):
        self_attn = model.encoder.layer[idx].attention.self
        query_w = self_attn.query.weight.detach().numpy()
        key_w = self_attn.key.weight.detach().numpy()

        # The query matrix is handed over transposed, the key matrix as stored.
        # Assumes the weights use the (out, in) layout of torch.nn.Linear - TODO confirm.
        full_ranks[idx], head_ranks[idx,:] = rank_layers(d,h,dh,query_w.T,key_w)

    return full_ranks, head_ranks

def getscoresDistillBERT(d,l,h,dh,model):
    """
    Per-layer ranks of the query-key product for a model exposing
    `model.transformer.layer[i].attention.{q_lin,k_lin}`.

    - d, h, dh: forwarded unchanged to `rank_layers`
    - l (int): number of layers to read, indexed 0 .. l-1
    - model: object providing the attribute path above

    Returns (layer_ranks, per_head_ranks): arrays of shape (l,) and (l, h).
    """
    layer_ranks = np.zeros(l)
    per_head_ranks = np.zeros((l,h))

    def _as_numpy(linear):
        # Detach the parameter from autograd before converting it.
        return linear.weight.detach().numpy()

    for n in range(l):
        q_weights = _as_numpy(model.transformer.layer[n].attention.q_lin)
        k_weights = _as_numpy(model.transformer.layer[n].attention.k_lin)

        # Assumes (out, in) storage as in torch.nn.Linear - TODO confirm.
        rank_all, rank_each = rank_layers(d,h,dh,q_weights.T,k_weights)
        layer_ranks[n] = rank_all
        per_head_ranks[n,:] = rank_each

    return layer_ranks, per_head_ranks

def getscoresALBERT(d,l,h,dh,model):
    """
    Per-layer ranks of the query-key product for a model exposing
    `model.encoder.albert_layer_groups[g].albert_layers[0].attention.{query,key}`.

    ALBERT shares parameters between layers: the encoder stores one module per
    layer GROUP, and there can be fewer groups than layers. Indexing the groups
    directly with the layer number therefore raises IndexError as soon as
    `l` exceeds the number of groups. Each layer is mapped to the group that
    owns its weights instead, so shared layers report identical ranks.

    - d, h, dh: forwarded unchanged to `rank_layers`
    - l (int): number of layers to report
    - model: object providing the attribute path above

    Returns (RankFullList, RankHeadsList): arrays of shape (l,) and (l, h).
    """
    RankFullList = np.zeros(l)
    RankHeadsList = np.zeros((l,h))

    groups = model.encoder.albert_layer_groups
    n_groups = len(groups)

    for layer in range(l):

        # One group per layer (or more groups than layers): index directly, as before.
        # Otherwise consecutive layers share a group: layer i uses group floor(i * G / l).
        group = layer if l <= n_groups else (layer * n_groups) // l

        # NOTE(review): only the first inner layer of the group is inspected, as in
        # the original code - confirm the checkpoints have a single inner layer.
        attention = groups[group].albert_layers[0].attention
        Wq = attention.query.weight.detach().numpy()
        Wk = attention.key.weight.detach().numpy()

        RankFull, RankHeads = rank_layers(d,h,dh,Wq.T,Wk)
        RankFullList[layer] = RankFull
        RankHeadsList[layer,:] = RankHeads

    return  RankFullList, RankHeadsList

def getscoresRoBERTa(d,l,h,dh,model):
    """
    Per-layer ranks of the query-key product for a model exposing
    `model.roberta.encoder.layer[i].attention.self.{query,key}`.

    - d, h, dh: forwarded unchanged to `rank_layers`
    - l (int): number of layers to read, indexed 0 .. l-1
    - model: object providing the attribute path above

    Returns (RankFullList, RankHeadsList): arrays of shape (l,) and (l, h).
    """
    RankFullList = np.zeros(l)
    RankHeadsList = np.zeros((l,h))
    for layer in range(l):

        Wq = model.roberta.encoder.layer[layer].attention.self.query.weight.detach().numpy()
        Wk = model.roberta.encoder.layer[layer].attention.self.key.weight.detach().numpy()

        RankFull, RankHeads = rank_layers(d,h,dh,Wq.T,Wk)
        RankFullList[layer] = RankFull
        RankHeadsList[layer,:] = RankHeads

    return  RankFullList, RankHeadsList

def getscoresALBERT(d,l,h,dh,model):
    """
    Backward-compatible entry point for the name `getscoresALBERT`.

    This second definition used to contain the RoBERTa extraction under the
    ALBERT name, silently shadowing the ALBERT function defined just above, so
    ALBERT models could not be scored at all. Models that expose a `roberta`
    attribute keep the previous behaviour (delegated to `getscoresRoBERTa`);
    every other model is read through the ALBERT layer groups.

    - d, h, dh: forwarded unchanged to `rank_layers`
    - l (int): number of layers to report
    - model: either a RoBERTa wrapper (has `.roberta`) or an ALBERT model

    Returns (RankFullList, RankHeadsList): arrays of shape (l,) and (l, h).
    """
    if hasattr(model, 'roberta'):
        return getscoresRoBERTa(d,l,h,dh,model)

    RankFullList = np.zeros(l)
    RankHeadsList = np.zeros((l,h))

    groups = model.encoder.albert_layer_groups
    n_groups = len(groups)

    for layer in range(l):

        # ALBERT shares weights between layers: map the layer to the group that owns them.
        group = layer if l <= n_groups else (layer * n_groups) // l

        # NOTE(review): only the first inner layer of each group is inspected - confirm
        # the checkpoints have a single inner layer.
        attention = groups[group].albert_layers[0].attention
        Wq = attention.query.weight.detach().numpy()
        Wk = attention.key.weight.detach().numpy()

        RankFull, RankHeads = rank_layers(d,h,dh,Wq.T,Wk)
        RankFullList[layer] = RankFull
        RankHeadsList[layer,:] = RankHeads

    return  RankFullList, RankHeadsList

def getscoresGPT(d,l,h,dh,model):
    """
    Per-layer ranks of the query-key product for a model exposing the fused
    projection `model.h[i].attn.c_attn`.

    `c_attn` is a Hugging Face `Conv1D`, whose weight is stored as
    (in_features, out_features) - the transpose of `torch.nn.Linear`. The query
    block `weight[:, :d]` is therefore already in the (in, out) orientation that
    `rank_layers` expects for its first matrix, and it is the KEY block that has
    to be transposed. Passing `Wq.T, Wk` (the nn.Linear recipe) made the per-head
    slices cut input features instead of head outputs.

    - d (int): embedding dimension; also the width of the query and key blocks
    - l (int): number of layers to read, indexed 0 .. l-1
    - h, dh: forwarded unchanged to `rank_layers`
    - model: object providing the attribute path above

    Returns (RankFullList, RankHeadsList): arrays of shape (l,) and (l, h).
    """
    RankFullList = np.zeros(l)
    RankHeadsList = np.zeros((l,h))
    for layer in range(l):

        c_attn = model.h[layer].attn.c_attn.weight.detach().numpy()
        Wq = c_attn[:,:d]        # query block, columns = head outputs
        Wk = c_attn[:,d:2*d]     # key block, columns = head outputs

        RankFull, RankHeads = rank_layers(d,h,dh,Wq,Wk.T)
        RankFullList[layer] = RankFull
        RankHeadsList[layer,:] = RankHeads

    return  RankFullList, RankHeadsList

def getscoresGPTneo(d,l,h,dh,model):
    """
    Per-layer ranks of the query-key product for a model exposing
    `model.transformer.h[i].attn.attention.{q_proj,k_proj}`.

    - d, h, dh: forwarded unchanged to `rank_layers`
    - l (int): number of layers to read, indexed 0 .. l-1
    - model: object providing the attribute path above

    Returns (ranks_full, ranks_heads): arrays of shape (l,) and (l, h).
    """
    ranks_full, ranks_heads = np.zeros(l), np.zeros((l,h))

    for depth in range(l):
        inner_attention = model.transformer.h[depth].attn.attention
        q_mat = inner_attention.q_proj.weight.detach().numpy()
        k_mat = inner_attention.k_proj.weight.detach().numpy()

        # Assumes (out, in) storage as in torch.nn.Linear - TODO confirm.
        result = rank_layers(d,h,dh,q_mat.T,k_mat)
        ranks_full[depth] = result[0]
        ranks_heads[depth,:] = result[1]

    return ranks_full, ranks_heads

def getscoresGPTj(d,l,h,dh,model):
    """
    Per-layer ranks of the query-key product for a model exposing
    `model.transformer.h[i].attn.{q_proj,k_proj}`.

    - d, h, dh: forwarded unchanged to `rank_layers`
    - l (int): number of layers to read, indexed 0 .. l-1
    - model: object providing the attribute path above

    Returns (full_by_layer, heads_by_layer): arrays of shape (l,) and (l, h).
    """
    full_by_layer = np.zeros(l)
    heads_by_layer = np.zeros((l,h))

    for i in range(l):
        attn = model.transformer.h[i].attn
        weights_q = attn.q_proj.weight.detach().numpy()
        weights_k = attn.k_proj.weight.detach().numpy()

        # Assumes (out, in) storage as in torch.nn.Linear - TODO confirm.
        full_by_layer[i], heads_by_layer[i,:] = rank_layers(d,h,dh,weights_q.T,weights_k)

    return full_by_layer, heads_by_layer

def getscoresDistillGPT(d,l,h,dh,model):
    """
    Per-layer ranks of the query-key product for a model exposing the fused
    projection `model.transformer.h[i].attn.c_attn`.

    `c_attn` is a Hugging Face `Conv1D`, whose weight is stored as
    (in_features, out_features) - the transpose of `torch.nn.Linear`. The query
    block `weight[:, :d]` is therefore already in the (in, out) orientation that
    `rank_layers` expects for its first matrix, and it is the KEY block that has
    to be transposed. Passing `Wq.T, Wk` (the nn.Linear recipe) made the per-head
    slices cut input features instead of head outputs.

    - d (int): embedding dimension; also the width of the query and key blocks
    - l (int): number of layers to read, indexed 0 .. l-1
    - h, dh: forwarded unchanged to `rank_layers`
    - model: object providing the attribute path above

    Returns (RankFullList, RankHeadsList): arrays of shape (l,) and (l, h).
    """
    RankFullList = np.zeros(l)
    RankHeadsList = np.zeros((l,h))
    for layer in range(l):

        c_attn = model.transformer.h[layer].attn.c_attn.weight.detach().numpy()
        Wq = c_attn[:,:d]        # query block, columns = head outputs
        Wk = c_attn[:,d:2*d]     # key block, columns = head outputs

        RankFull, RankHeads = rank_layers(d,h,dh,Wq,Wk.T)
        RankFullList[layer] = RankFull
        RankHeadsList[layer,:] = RankHeads

    return  RankFullList, RankHeadsList

def getscoresOPT(d,l,h,dh,model):
    """
    Per-layer ranks of the query-key product for a model exposing
    `model.model.decoder.layers[i].self_attn.{q_proj,k_proj}`.

    - d, h, dh: forwarded unchanged to `rank_layers`
    - l (int): number of layers to read, indexed 0 .. l-1
    - model: object providing the attribute path above

    Returns (rank_full_per_layer, rank_heads_per_layer): arrays of shape (l,) and (l, h).
    """
    rank_full_per_layer = np.zeros(l)
    rank_heads_per_layer = np.zeros((l,h))

    layer_id = 0
    while layer_id < l:
        attention = model.model.decoder.layers[layer_id].self_attn
        w_query = attention.q_proj.weight.detach().numpy()
        w_key = attention.k_proj.weight.detach().numpy()

        # Assumes (out, in) storage as in torch.nn.Linear - TODO confirm.
        whole, split = rank_layers(d,h,dh,w_query.T,w_key)
        rank_full_per_layer[layer_id] = whole
        rank_heads_per_layer[layer_id,:] = split
        layer_id += 1

    return rank_full_per_layer, rank_heads_per_layer

def getscoresT5(d,l,h,dh,model):
    """
    Per-layer ranks of the query-key product for a model exposing
    `model.encoder.block[i].layer[0].SelfAttention.{q,k}`.

    Only the encoder stack is read.

    - d, h, dh: forwarded unchanged to `rank_layers`
    - l (int): number of encoder blocks to read, indexed 0 .. l-1
    - model: object providing the attribute path above

    Returns (encoder_full, encoder_heads): arrays of shape (l,) and (l, h).

    NOTE(review): nothing here checks that the width of the q/k projections
    equals d - confirm for the checkpoints this is used with.
    """
    encoder_full = np.zeros(l)
    encoder_heads = np.zeros((l,h))

    for block_idx in range(l):
        self_attention = model.encoder.block[block_idx].layer[0].SelfAttention
        q_weight, k_weight = (
            proj.weight.detach().numpy()
            for proj in (self_attention.q, self_attention.k)
        )

        # Assumes (out, in) storage as in torch.nn.Linear - TODO confirm.
        block_full, block_heads = rank_layers(d,h,dh,q_weight.T,k_weight)
        encoder_full[block_idx] = block_full
        encoder_heads[block_idx,:] = block_heads

    return encoder_full, encoder_heads

In [None]:
# `models` caches the rank analysis of every checkpoint processed so far.
#  - KEY (str): model name
#  - VALUE (list), as written by the cells below:
#      [layers (int), embedding dim (int), heads (int), head dim (int),
#       rank of the full query-key product per layer (array of shape (layers,)),
#       rank of the query-key product per head (array of shape (layers, heads))]

# Resume from a previous run when a cache exists; otherwise start empty.
if os.path.isfile('../data/fig_ranks/models.pkl'):
    # SECURITY: unpickling executes arbitrary code stored in the file. Only load
    # a cache that this notebook (or another trusted source) produced.
    with open('../data/fig_ranks/models.pkl', 'rb') as file:
        models = pickle.load(file)
else:
    models = {}

In [None]:
# BERT models
#  - MODEL: Bidirectional, Encoder-only Transformer
#  - DATASETS: BookCorpus & English Wikipedia
#  - OBJECTIVES: Masked Language Modeling (MLM), Next Sentence Prediction (NSP)
#  - METRICS: perplexity, cross-entropy
#
#  MLM: randomly mask some words in the sentence, predict the masked words with
#  cross-entropy over the vocabulary
#  NSP: given a pair of sentences, predict whether the second one follows the
#  first one in the original text
#
# The idea is that these models have a better understanding of context, where each
# word is represented as a linear combination of all the other words in the
# sentence, bi-directionally.
dh = 64

# One row per checkpoint:
# (key in `models`, loader class, Hugging Face checkpoint, layers l, embedding dim d, extraction function)
# The number of heads is always derived as h = d // dh.
bert_checkpoints = [
    # BERT tiny (l = 2, d = 128, h = 2 ; 4.40M parameters)
    ('BERTtiny', AutoModel, "google/bert_uncased_L-2_H-128_A-2", 2, 128, getscoresBERT),
    # BERT mini (l = 4, d = 256, h = 4 ; 11.3M parameters)
    ('BERTmini', AutoModel, "google/bert_uncased_L-4_H-256_A-4", 4, 256, getscoresBERT),
    # BERT small (l = 4, d = 512, h = 8 ; 29.1M parameters)
    ('BERTsmall', AutoModel, "google/bert_uncased_L-4_H-512_A-8", 4, 512, getscoresBERT),
    # BERT medium (l = 8, d = 512, h = 8 ; 41.7M parameters)
    ('BERTmedium', AutoModel, "google/bert_uncased_L-8_H-512_A-8", 8, 512, getscoresBERT),
    # BERT base (l = 12, d = 768, h = 12 ; 110M parameters)
    ('BERTbase', BertModel, "bert-base-uncased", 12, 768, getscoresBERT),
    # BERT large (l = 24, d = 1024, h = 16 ; 340M parameters)
    ('BERTlarge', BertModel, "bert-large-uncased", 24, 1024, getscoresBERT),
    # BERT large (masking) (l = 24, d = 1024, h = 16 ; 340M parameters)
    ('BERTlarge_mask', BertModel, "bert-large-uncased-whole-word-masking", 24, 1024, getscoresBERT),
    # DistillBERT base model (l = 6, d = 768, h = 12 ; tot num parameters 66M)
    ('DistillBERT', DistilBertModel, "distilbert-base-uncased", 6, 768, getscoresDistillBERT),
]

# l, d, h, model, full and heads keep their original names so that, after the
# loop, they hold the values of the last checkpoint exactly as before.
for model_key, loader, checkpoint, l, d, getscores in bert_checkpoints:
    h = d // dh
    model = loader.from_pretrained(checkpoint)
    full, heads = getscores(d,l,h,dh,model)
    models[model_key] = [l,d,h,dh,full,heads]

# save
# Create the output directory first: without it, open() raises FileNotFoundError
# and every rank computed above is lost.
os.makedirs('../data/fig_ranks', exist_ok=True)
with open('../data/fig_ranks/models.pkl', 'wb') as file:
    pickle.dump(models, file)