In [32]:
import pickle
import os
import numpy as np

from transformers import AutoModel, AutoModelForCausalLM, AutoModelForMaskedLM, AutoModelForSeq2SeqLM
from transformers import BertModel, AlbertModel, DistilBertModel, RobertaModel, OpenAIGPTModel, GPT2Model

In [42]:
def scores(A):
    """
    Measure how symmetric and how skew-symmetric a square matrix is.

    The matrix is split as A = 1/2 (A + A^T) + 1/2 (A - A^T); each score is the
    Frobenius norm of one part divided by the Frobenius norm of A itself.

    Parameters
    ----------
    A : numpy.ndarray
        square numpy matrix.

    Returns
    -------
    tuple :
        (S, N), the symmetric score followed by the skew-symmetric score.
    """
    A_t = A.T
    symmetric_part = .5 * (A + A_t)
    skew_part = .5 * (A - A_t)

    # both scores are normalised by the same quantity
    total_norm = np.linalg.norm(A, 'fro')

    S = np.linalg.norm(symmetric_part, 'fro') / total_norm
    N = np.linalg.norm(skew_part, 'fro') / total_norm

    return S, N

def scores_layer(d,h,dh,Wq,Wk):
    """
    Compute the symmetric and skew-symmetric scores of every attention head of one layer.

    Head j uses columns [j*dh, (j+1)*dh) of Wq and the matching rows of Wk, so the
    per-head matrix is M_j = Wq[:, head_j] @ Wk[head_j, :].

    Parameters
    ----------
    d : int
        embedding dimension. Kept for backward compatibility; the head offsets are
        derived from `h` and `dh`, so the projection width h*dh may differ from d.
    h : int
        number of attention heads.
    dh : int
        head dimension.
    Wq : numpy.ndarray
        query matrix whose columns are grouped by head.
    Wk : numpy.ndarray
        key matrix whose rows are grouped by head.

    Returns
    -------
    tuple :
        two arrays of length h with the symmetric (S) and skew-symmetric (N) scores.
    """

    Sheads = np.zeros(h)
    Nheads = np.zeros(h)

    # Walk over exactly h heads: the outputs are sized by h, so driving the loop by
    # range(0, d, dh) would overrun them or leave trailing zeros whenever d != h*dh.
    for j in range(h):
        start = j * dh
        stop = start + dh
        M = Wq[:, start:stop] @ Wk[start:stop, :]
        Sheads[j], Nheads[j] = scores(M)

    return  Sheads, Nheads

In [43]:
"""

Let the queries and keys be
Q = X @ W_q ; K = X @ W_k

It follows that the dot product between queries and keys is
Q @ K^T = X @ (W_q @ W_k^T) @ X^T = X @ M @ X^T

where M = W_q @ W_k^T is a square matrix in R^{d x d} that can be decomposed into
its symmetric and skew-symmetric parts, S and N respectively:
M = 1/2 * (M + M^T) + 1/2 * (M - M^T) = S + N

Important: nn.Linear.weight returns the learnable weights of the module
with shape (out_features, in_features), i.e. the transpose of W_q or W_k.
Writing Wq = query.weight and Wk = key.weight, we therefore get
M = Wq.T @ Wk in this case. For modules that store the weight as
(in_features, out_features) instead, the roles are swapped and M = Wq @ Wk.T.

"""

def getscoresBERT(d,l,h,dh,model):
    """
    Collect the per-head symmetric and skew-symmetric scores of a BERT-style encoder.

    For each of the first `l` entries of `model.encoder.layer`, the query and key
    weights of `attention.self` are read and handed to `scores_layer`.

    Parameters
    ----------
    d : int
        embedding dimension, forwarded to `scores_layer`.
    l : int
        number of layers to read.
    h : int
        number of attention heads.
    dh : int
        head dimension.
    model :
        model exposing `encoder.layer[i].attention.self.query` / `.key`.

    Returns
    -------
    tuple :
        two (l, h) arrays with the symmetric (S) and skew-symmetric (N) scores.
    """
    sym_scores = np.zeros((l, h))
    skew_scores = np.zeros((l, h))

    for layer_idx in range(l):
        self_attn = model.encoder.layer[layer_idx].attention.self
        query_w = self_attn.query.weight.detach().numpy()
        key_w = self_attn.key.weight.detach().numpy()
        # the query weight is passed transposed, the key weight as stored
        layer_scores = scores_layer(d, h, dh, query_w.T, key_w)
        sym_scores[layer_idx, :], skew_scores[layer_idx, :] = layer_scores

    return sym_scores, skew_scores

def getscoresDistillBERT(d,l,h,dh,model):
    """
    Collect the per-head scores of a DistilBERT-style model.

    Reads `q_lin` and `k_lin` from `model.transformer.layer[i].attention` for the
    first `l` layers and scores them with `scores_layer`.

    Parameters
    ----------
    d, l, h, dh : int
        embedding dimension, number of layers, number of heads and head dimension.
    model :
        model exposing `transformer.layer[i].attention.q_lin` / `.k_lin`.

    Returns
    -------
    tuple :
        two (l, h) arrays with the symmetric (S) and skew-symmetric (N) scores.
    """
    sym_scores = np.zeros((l, h))
    skew_scores = np.zeros((l, h))

    for layer_idx in range(l):
        attention = model.transformer.layer[layer_idx].attention
        query_w = attention.q_lin.weight.detach().numpy()
        key_w = attention.k_lin.weight.detach().numpy()
        layer_scores = scores_layer(d, h, dh, query_w.T, key_w)
        sym_scores[layer_idx, :], skew_scores[layer_idx, :] = layer_scores

    return sym_scores, skew_scores

def getscoresALBERT(d,l,h,dh,model):
    """
    Collect the per-head scores of an ALBERT-style model.

    Index i runs over `model.encoder.albert_layer_groups`; within each group only
    the first entry of `albert_layers` is read.

    Parameters
    ----------
    d, l, h, dh : int
        embedding dimension, number of layer groups to read, number of heads and
        head dimension.
    model :
        model exposing `encoder.albert_layer_groups[i].albert_layers[0].attention`.

    Returns
    -------
    tuple :
        two (l, h) arrays with the symmetric (S) and skew-symmetric (N) scores.
    """
    sym_scores = np.zeros((l, h))
    skew_scores = np.zeros((l, h))

    for group_idx in range(l):
        group = model.encoder.albert_layer_groups[group_idx]
        attention = group.albert_layers[0].attention
        query_w = attention.query.weight.detach().numpy()
        key_w = attention.key.weight.detach().numpy()
        layer_scores = scores_layer(d, h, dh, query_w.T, key_w)
        sym_scores[group_idx, :], skew_scores[group_idx, :] = layer_scores

    return sym_scores, skew_scores

def getscoresDistillROBERTA(d,l,h,dh,model):
    """
    Collect the per-head scores of a RoBERTa encoder wrapped under `model.roberta`.

    Reads the query and key weights of `model.roberta.encoder.layer[i].attention.self`
    for the first `l` layers and scores them with `scores_layer`.

    Parameters
    ----------
    d, l, h, dh : int
        embedding dimension, number of layers, number of heads and head dimension.
    model :
        model exposing `roberta.encoder.layer[i].attention.self.query` / `.key`.

    Returns
    -------
    tuple :
        two (l, h) arrays with the symmetric (S) and skew-symmetric (N) scores.
    """
    sym_scores = np.zeros((l, h))
    skew_scores = np.zeros((l, h))

    for layer_idx in range(l):
        self_attn = model.roberta.encoder.layer[layer_idx].attention.self
        query_w = self_attn.query.weight.detach().numpy()
        key_w = self_attn.key.weight.detach().numpy()
        layer_scores = scores_layer(d, h, dh, query_w.T, key_w)
        sym_scores[layer_idx, :], skew_scores[layer_idx, :] = layer_scores

    return sym_scores, skew_scores

def getscoresGPT(d,l,h,dh,model):
    """
    Collect the per-head scores of a GPT-style model with a fused `c_attn` projection.

    For each of the first `l` blocks in `model.h`, columns [0, d) of
    `attn.c_attn.weight` are taken as the query matrix and columns [d, 2d) as the
    key matrix. The query block is passed to `scores_layer` as is and the key
    block transposed.

    Parameters
    ----------
    d, l, h, dh : int
        embedding dimension, number of layers, number of heads and head dimension.
    model :
        model exposing `h[i].attn.c_attn.weight`.

    Returns
    -------
    tuple :
        two (l, h) arrays with the symmetric (S) and skew-symmetric (N) scores.
    """
    sym_scores = np.zeros((l, h))
    skew_scores = np.zeros((l, h))

    for layer_idx in range(l):
        fused_w = model.h[layer_idx].attn.c_attn.weight
        query_w = fused_w[:, :d].detach().numpy()
        key_w = fused_w[:, d:2*d].detach().numpy()
        layer_scores = scores_layer(d, h, dh, query_w, key_w.T)
        sym_scores[layer_idx, :], skew_scores[layer_idx, :] = layer_scores

    return sym_scores, skew_scores

def getscoresGPTneo(d,l,h,dh,model):
    """
    Collect the per-head scores of a GPT-Neo-style model.

    Reads `q_proj` and `k_proj` from `model.transformer.h[i].attn.attention` for
    the first `l` blocks and scores them with `scores_layer`.

    Parameters
    ----------
    d, l, h, dh : int
        embedding dimension, number of layers, number of heads and head dimension.
    model :
        model exposing `transformer.h[i].attn.attention.q_proj` / `.k_proj`.

    Returns
    -------
    tuple :
        two (l, h) arrays with the symmetric (S) and skew-symmetric (N) scores.
    """
    sym_scores = np.zeros((l, h))
    skew_scores = np.zeros((l, h))

    for layer_idx in range(l):
        attention = model.transformer.h[layer_idx].attn.attention
        query_w = attention.q_proj.weight.detach().numpy()
        key_w = attention.k_proj.weight.detach().numpy()
        layer_scores = scores_layer(d, h, dh, query_w.T, key_w)
        sym_scores[layer_idx, :], skew_scores[layer_idx, :] = layer_scores

    return sym_scores, skew_scores

def getscoresGPTneox(d,l,h,dh,model):
    """
    Collect the per-head scores of a GPT-NeoX-style model.

    GPT-NeoX fuses the query, key and value projections into a single
    `query_key_value` layer. In the Hugging Face implementation the output
    features are grouped head by head as [q_head, k_head, v_head] (the forward
    pass views the output as (..., heads, 3 * head_size)), so the query rows are
    NOT the first d rows of the weight. The weight is therefore de-interleaved
    per head before scoring.

    Parameters
    ----------
    d : int
        embedding dimension, forwarded to `scores_layer`.
    l : int
        number of layers to read.
    h : int
        number of attention heads.
    dh : int
        head dimension.
    model :
        model exposing `gpt_neox.layers[i].attention.query_key_value`.

    Returns
    -------
    tuple :
        two (l, h) arrays with the symmetric (S) and skew-symmetric (N) scores.
    """

    S = np.zeros((l,h))
    N = np.zeros((l,h))
    for i in range(l):
        W = model.gpt_neox.layers[i].attention.query_key_value.weight.detach().numpy()
        # (3*h*dh, in) -> (head, {q,k,v}, dh, in): row index = head*3*dh + part*dh + r
        W = W.reshape(h, 3, dh, -1)
        # stack the per-head blocks back into (h*dh, in) matrices ordered by head
        Wq = W[:, 0].reshape(h * dh, -1)
        Wk = W[:, 1].reshape(h * dh, -1)
        S[i, :], N[i, :] = scores_layer(d,h,dh,Wq.T,Wk)

    return  S, N

def getscoresGPTj(d,l,h,dh,model):
    """
    Collect the per-head scores of a GPT-J-style model.

    Reads `q_proj` and `k_proj` from `model.transformer.h[i].attn` for the first
    `l` blocks and scores them with `scores_layer`.

    Parameters
    ----------
    d, l, h, dh : int
        embedding dimension, number of layers, number of heads and head dimension.
    model :
        model exposing `transformer.h[i].attn.q_proj` / `.k_proj`.

    Returns
    -------
    tuple :
        two (l, h) arrays with the symmetric (S) and skew-symmetric (N) scores.
    """
    sym_scores = np.zeros((l, h))
    skew_scores = np.zeros((l, h))

    for layer_idx in range(l):
        attention = model.transformer.h[layer_idx].attn
        query_w = attention.q_proj.weight.detach().numpy()
        key_w = attention.k_proj.weight.detach().numpy()
        layer_scores = scores_layer(d, h, dh, query_w.T, key_w)
        sym_scores[layer_idx, :], skew_scores[layer_idx, :] = layer_scores

    return sym_scores, skew_scores

def getscoresDistillGPT(d,l,h,dh,model):
    """
    Collect the per-head scores of a DistilGPT2-style model (GPT-2 blocks under
    `model.transformer`).

    The fused `c_attn` weight is sliced along its column axis, i.e. it is stored
    as (in_features, 3 * out_features) as in GPT-2's Conv1D layer. The query
    block is therefore already in "X @ W_q" orientation and must be passed as is,
    while the key block is transposed -- the same convention as `getscoresGPT`.
    Passing `Wq.T, Wk` (the nn.Linear convention) would split the heads along the
    input axis and give wrong per-head scores.

    Parameters
    ----------
    d : int
        embedding dimension; also the width of the query and key column blocks.
    l : int
        number of layers to read.
    h : int
        number of attention heads.
    dh : int
        head dimension.
    model :
        model exposing `transformer.h[i].attn.c_attn.weight`.

    Returns
    -------
    tuple :
        two (l, h) arrays with the symmetric (S) and skew-symmetric (N) scores.
    """

    S = np.zeros((l,h))
    N = np.zeros((l,h))
    for i in range(l):
        W = model.transformer.h[i].attn.c_attn.weight
        # columns [0, d) hold the query projection, [d, 2d) the key projection
        Wq = W[:,:d].detach().numpy()
        Wk = W[:,d:2*d].detach().numpy()
        S[i, :], N[i, :] = scores_layer(d,h,dh,Wq,Wk.T)

    return  S, N

def getscoresOPT(d,l,h,dh,model):
    """
    Collect the per-head scores of an OPT-style decoder.

    Reads `q_proj` and `k_proj` from `model.model.decoder.layers[i].self_attn` for
    the first `l` layers and scores them with `scores_layer`.

    Parameters
    ----------
    d, l, h, dh : int
        embedding dimension, number of layers, number of heads and head dimension.
    model :
        model exposing `model.decoder.layers[i].self_attn.q_proj` / `.k_proj`.

    Returns
    -------
    tuple :
        two (l, h) arrays with the symmetric (S) and skew-symmetric (N) scores.
    """
    sym_scores = np.zeros((l, h))
    skew_scores = np.zeros((l, h))

    for layer_idx in range(l):
        self_attn = model.model.decoder.layers[layer_idx].self_attn
        query_w = self_attn.q_proj.weight.detach().numpy()
        key_w = self_attn.k_proj.weight.detach().numpy()
        layer_scores = scores_layer(d, h, dh, query_w.T, key_w)
        sym_scores[layer_idx, :], skew_scores[layer_idx, :] = layer_scores

    return sym_scores, skew_scores

def getscoresT5(d,l,h,dh,model):
    """
    Collect the per-head scores of the encoder self-attention of a T5-style model.

    Reads `SelfAttention.q` and `SelfAttention.k` from
    `model.encoder.block[i].layer[0]` for the first `l` blocks and scores them
    with `scores_layer`. Only the encoder stack is read.

    Parameters
    ----------
    d, l, h, dh : int
        dimension forwarded to `scores_layer`, number of encoder blocks, number of
        heads and head dimension.
    model :
        model exposing `encoder.block[i].layer[0].SelfAttention.q` / `.k`.

    Returns
    -------
    tuple :
        two (l, h) arrays with the symmetric (S) and skew-symmetric (N) scores.
    """
    sym_scores = np.zeros((l, h))
    skew_scores = np.zeros((l, h))

    for block_idx in range(l):
        self_attn = model.encoder.block[block_idx].layer[0].SelfAttention
        query_w = self_attn.q.weight.detach().numpy()
        key_w = self_attn.k.weight.detach().numpy()
        layer_scores = scores_layer(d, h, dh, query_w.T, key_w)
        sym_scores[block_idx, :], skew_scores[block_idx, :] = layer_scores

    return sym_scores, skew_scores

In [44]:
"""
- KEY (str): model name
- VALUES (list): [layers (int), embedding dim (int), heads (int), head dim (int), S scores, N scores]
"""

# Start from an empty registry and overwrite it with the cached results when a
# previous run has already written them to disk.
models = {}
if os.path.isfile('../data/fig_scores/models.pkl'):
    with open('../data/fig_scores/models.pkl', 'rb') as file:
        models = pickle.load(file)

In [45]:
""" 
BERT models 
 - MODEL: Bidirectional, Encoder-only Transformer
 - DATASETS: BookCorpus & English Wikipedia
 - OBJECTIVES: Masked Language Modeling (MLM), Next Sentence Prediction (NSP)
 - METRICS: perplexity, cross-entropy

 MLM: randomly mask some words in the sentence and predict the masked words with a
 cross-entropy loss over the vocabulary.
 NSP: given a pair of sentences, predict whether the second one follows the first
 in the original text.

The idea is that these models have a better understanding of context, since each word is represented as a 
linear combination of all the other words in the sentence, bi-directionally.
"""
dh = 64

# One row per checkpoint:
# (description, key in `models`, layers l, embedding dim d, loader class, checkpoint, score extractor)
bert_specs = [
    ('BERT tiny (l = 2, d = 128, h = 2 ; 4.40M parameters)',
     'BERTtiny', 2, 128, AutoModel, "google/bert_uncased_L-2_H-128_A-2", getscoresBERT),
    ('BERT mini (l = 4, d = 256, h = 4 ; 11.3M parameters)',
     'BERTmini', 4, 256, AutoModel, "google/bert_uncased_L-4_H-256_A-4", getscoresBERT),
    ('BERT small (l = 4, d = 512, h = 8 ; 29.1M parameters)',
     'BERTsmall', 4, 512, AutoModel, "google/bert_uncased_L-4_H-512_A-8", getscoresBERT),
    ('BERT medium (l = 8, d = 512, h = 8 ; 41.7M parameters)',
     'BERTmedium', 8, 512, AutoModel, "google/bert_uncased_L-8_H-512_A-8", getscoresBERT),
    ('BERT base (l = 12, d = 768, h = 12 ; 110M parameters)',
     'BERTbase', 12, 768, BertModel, "bert-base-uncased", getscoresBERT),
    ('BERT large (l = 24, d = 1024, h = 16 ; 340M parameters)',
     'BERTlarge', 24, 1024, BertModel, "bert-large-uncased", getscoresBERT),
    ('BERT large (masking) (l = 24, d = 1024, h = 16 ; 340M parameters)',
     'BERTlarge_mask', 24, 1024, BertModel, "bert-large-uncased-whole-word-masking", getscoresBERT),
    ('DistillBERT base model (l = 6, d = 768, h = 12 ; tot num parameters 66M)',
     'DistillBERT', 6, 768, DistilBertModel, "distilbert-base-uncased", getscoresDistillBERT),
]

# Same recipe for every checkpoint: derive the head count, load the weights,
# score every layer and register the result under its key.
for _, key, l, d, loader, checkpoint, extract in bert_specs:
    h = d // dh
    model = loader.from_pretrained(checkpoint)
    S, N = extract(d,l,h,dh,model)
    models[key] = [l,d,h,dh,S,N]

'save'
with open('../data/fig_scores/models.pkl', 'wb') as file:
    pickle.dump(models, file)

In [46]:
""" 
RoBERTa models 
 - MODEL: Bidirectional, Encoder-only Transformer
 - DATASETS: BookCorpus & English Wikipedia, plus CC-News, OpenWebText and Stories
 - OBJECTIVES: Masked Language Modeling (MLM) only
 - METRICS: perplexity, cross-entropy

 MLM: randomly mask some words in the sentence and predict the masked words with a
 cross-entropy loss over the vocabulary.
 NSP: not used; RoBERTa drops BERT's Next Sentence Prediction objective.

The idea is that these models have a better understanding of context, since each word is represented as a 
linear combination of all the other words in the sentence, bi-directionally.
"""
with open('../data/fig_scores/models.pkl', 'rb') as file:
    models = pickle.load(file)

dh = 64

# One row per checkpoint:
# (key in `models`, layers l, embedding dim d, loader class, checkpoint, score extractor)
roberta_specs = [
    # RoBERTa base (l = 12, d = 768, h = 12 ; 125M parameters)
    ('ROBERTAbase', 12, 768, RobertaModel, 'roberta-base', getscoresBERT),
    # RoBERTa large (l = 24, d = 1024, h = 16 ; 355M parameters)
    ('ROBERTAlarge', 24, 1024, RobertaModel, 'roberta-large', getscoresBERT),
    # DistilRoBERTa base (l = 6, d = 768, h = 12 ; 82M parameters).
    # Loaded with the masked-LM head, so the encoder sits under `model.roberta`
    # and needs its own extractor.
    ('DistillROBERTA', 6, 768, AutoModelForMaskedLM, "distilbert/distilroberta-base", getscoresDistillROBERTA),
]

for key, l, d, loader, checkpoint, extract in roberta_specs:
    h = d // dh
    model = loader.from_pretrained(checkpoint)
    S, N = extract(d,l,h,dh,model)
    models[key] = [l,d,h,dh,S,N]

# save
with open('../data/fig_scores/models.pkl', 'wb') as file:
    pickle.dump(models, file)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model

In [47]:
""" 
ALBERT models 
 - MODEL: Bidirectional, Encoder-only Transformer with parameters shared across layers
 - DATASETS: BookCorpus & English Wikipedia
 - OBJECTIVES: Masked Language Modeling (MLM), Sentence Order Prediction (SOP)
 - METRICS: perplexity, cross-entropy

 MLM: randomly mask some words in the sentence and predict the masked words with a
 cross-entropy loss over the vocabulary.
 SOP: given two consecutive segments, predict whether they appear in their original
 order or have been swapped (ALBERT's replacement for BERT's NSP).

The idea is that these models have a better understanding of context, since each word is represented as a 
linear combination of all the other words in the sentence, bi-directionally.
"""
with open('../data/fig_scores/models.pkl', 'rb') as file:
    models = pickle.load(file)

# Only one layer group is scored per model (l = 1); the depths quoted in the
# comments below are the nominal model depths.
l = 1

# One row per checkpoint: (key in `models`, embedding dim d, head dim dh, checkpoint)
albert_specs = [
    # ALBERT base (12 layers, d = 768, h = 12 ; 11M parameters)
    ('ALBERTbase', 768, 64, "albert-base-v2"),
    # ALBERT large (24 layers, d = 1024, h = 16 ; 17M parameters)
    ('ALBERTlarge', 1024, 64, "albert-large-v2"),
    # ALBERT xlarge (24 layers, d = 2048, h = 16 ; 58M parameters).
    # 16 heads over d = 2048 means a head dimension of 128, not 64.
    ('ALBERTxlarge', 2048, 128, "albert-xlarge-v2"),
    # ALBERT xxlarge (12 layers, d = 4096, h = 64 ; 223M parameters)
    ('ALBERTxxlarge', 4096, 64, "albert-xxlarge-v2"),
]

for key, d, dh, checkpoint in albert_specs:
    h = d // dh
    model = AlbertModel.from_pretrained(checkpoint)
    S, N = getscoresALBERT(d,l,h,dh,model)
    models[key] = [l,d,h,dh,S,N]

# save
with open('../data/fig_scores/models.pkl', 'wb') as file:
    pickle.dump(models, file)

In [48]:
""" 
Generative Pre-trained Transformers (GPT) models 
 - MODEL: Unidirectional (causal), Decoder-only Transformer
 - DATASETS: BookCorpus (GPT), WebText (GPT-2)
 - OBJECTIVES: causal language modeling (predict the next token given the previous ones)

The idea is that each word is represented as a linear combination of the words
that precede it in the sentence, since attention to later positions is masked.
"""
with open('../data/fig_scores/models.pkl', 'rb') as file:
    models = pickle.load(file)

dh = 64

# One row per checkpoint:
# (key in `models`, layers l, embedding dim d, loader class, checkpoint, score extractor)
gpt_specs = [
    # GPT 1 (l = 12, d = 768, h = 12 ; 110M parameters)
    ('GPT', 12, 768, OpenAIGPTModel, "openai-gpt", getscoresGPT),
    # GPT2 (l = 12, d = 768, h = 12 ; 117M parameters)
    ('GPT2', 12, 768, GPT2Model, 'gpt2', getscoresGPT),
    # GPT2 medium (l = 24, d = 1024, h = 16 ; 345M parameters)
    ('GPT2medium', 24, 1024, GPT2Model, 'gpt2-medium', getscoresGPT),
    # GPT2 large (l = 36, d = 1280, h = 20 ; 774M parameters)
    ('GPT2large', 36, 1280, GPT2Model, 'gpt2-large', getscoresGPT),
    # GPT2 xl (l = 48, d = 1600, h = 25 ; 1558M parameters)
    ('GPT2xl', 48, 1600, GPT2Model, 'gpt2-xl', getscoresGPT),
    # DistilGPT2 (l = 6, d = 768, h = 12 ; 82M parameters).
    # Loaded with the LM head, so the blocks sit under `model.transformer`
    # and need their own extractor.
    ('DistillGPT2', 6, 768, AutoModelForCausalLM, "distilbert/distilgpt2", getscoresDistillGPT),
]

for key, l, d, loader, checkpoint, extract in gpt_specs:
    h = d // dh
    model = loader.from_pretrained(checkpoint)
    S, N = extract(d,l,h,dh,model)
    models[key] = [l,d,h,dh,S,N]

# save
with open('../data/fig_scores/models.pkl', 'wb') as file:
    pickle.dump(models, file)

In [15]:
""" 
GPT Neo / GPT-NeoX / GPT-J models (EleutherAI)
 - MODEL: Unidirectional (causal), Decoder-only Transformer
 - DATASETS: The Pile
 - OBJECTIVES: causal language modeling (predict the next token given the previous ones)

The idea is that each word is represented as a linear combination of the words
that precede it in the sentence, since attention to later positions is masked.
"""
with open('../data/fig_scores/models.pkl', 'rb') as file:
    models = pickle.load(file)

# One row per checkpoint:
# (key in `models`, layers l, embedding dim d, head dim dh, checkpoint, score extractor)
eleuther_specs = [
    # GPT neo 125m (l = 12, d = 768, h = 12)
    ('GPTneo-125m', 12, 768, 64, "EleutherAI/gpt-neo-125m", getscoresGPTneo),
    # GPT neo 1.3b (l = 24, d = 2048, h = 16)
    ('GPTneo-1.3b', 24, 2048, 128, "EleutherAI/gpt-neo-1.3B", getscoresGPTneo),
    # GPT neo 2.7b (l = 32, d = 2560, h = 20)
    ('GPTneo-2.7b', 32, 2560, 128, "EleutherAI/gpt-neo-2.7B", getscoresGPTneo),
    # GPT neox 20b (l = 44, d = 6144, h = 64) -- disabled
    # ('GPTneox-20b', 44, 6144, 96, "EleutherAI/gpt-neox-20b", getscoresGPTneox),
    # GPT-j 6b (l = 28, d = 4096, h = 16)
    ('GPTj-6b', 28, 4096, 256, "EleutherAI/gpt-j-6b", getscoresGPTj),
]

for key, l, d, dh, checkpoint, extract in eleuther_specs:
    h = d // dh
    model = AutoModelForCausalLM.from_pretrained(checkpoint)
    S, N = extract(d,l,h,dh,model)
    models[key] = [l,d,h,dh,S,N]

    # save after every model so the finished ones survive a failure on a later,
    # larger checkpoint
    with open('../data/fig_scores/models.pkl', 'wb') as file:
        pickle.dump(models, file)

In [None]:
""" 
Open Pre-trained Transformers (OPT) models 
 - MODEL: Unidirectional (causal), Decoder-only Transformer
 - DATASETS: a mixture of the RoBERTa corpora, a subset of The Pile and PushShift.io Reddit
 - OBJECTIVES: causal language modeling (predict the next token given the previous ones)

The idea is that each word is represented as a linear combination of the words
that precede it in the sentence, since attention to later positions is masked.
"""
with open('../data/fig_scores/models.pkl', 'rb') as file:
    models = pickle.load(file)

# Every OPT size has its own depth, width and head count, so each row spells them
# out and the head dimension follows as d // h (64 up to 1.3b, 80 for 2.7b,
# 128 from 6.7b upwards). Reusing the 1.3b values for the larger checkpoints
# would skip layers and cut the heads at the wrong boundaries.
# (key in `models`, layers l, embedding dim d, heads h, checkpoint)
opt_specs = [
    ('OPT-125m', 12,  768, 12, "facebook/opt-125m"),
    ('OPT-350m', 24, 1024, 16, "facebook/opt-350m"),
    ('OPT-1.3b', 24, 2048, 32, "facebook/opt-1.3b"),
    ('OPT-2.7b', 32, 2560, 32, "facebook/opt-2.7b"),
    ('OPT-6.7b', 32, 4096, 32, "facebook/opt-6.7b"),
    ('OPT-13b',  40, 5120, 40, "facebook/opt-13b"),
    # disabled: too large to load here
    # ('OPT-30b', 48, 7168, 56, "facebook/opt-30b"),
    # ('OPT-66b', 64, 9216, 72, "facebook/opt-66b"),
]

for key, l, d, h, checkpoint in opt_specs:
    dh = d // h
    model = AutoModelForCausalLM.from_pretrained(checkpoint)
    S, N = getscoresOPT(d,l,h,dh,model)
    models[key] = [l,d,h,dh,S,N]

    # save after every model so an interrupted run keeps the finished ones
    with open('../data/fig_scores/models.pkl', 'wb') as file:
        pickle.dump(models, file)

KeyboardInterrupt: 

In [49]:
""" 
T5 models 
 - MODEL: Encoder-decoder (text-to-text) Transformer
 - DATASETS: Colossal Clean Crawled Corpus (C4)
 - OBJECTIVES: span-corruption denoising (reconstruct masked spans of the input)

Only the self-attention of the encoder stack is analysed here.
"""
with open('../data/fig_scores/models.pkl', 'rb') as file:
    models = pickle.load(file)

# In T5 the head count and head dimension are independent of the embedding
# dimension: the q/k projections map d -> h * dh, which equals d for small, base
# and large but is 4096 for the 3B model (d = 1024, 32 heads of 128).
# (key in `models`, encoder layers l, embedding dim d, heads h, head dim dh, checkpoint)
t5_specs = [
    # T5 small (l = 6, d = 512, h = 8 ; 60M parameters)
    ('T5small', 6, 512, 8, 64, "google-t5/t5-small"),
    # T5 base (l = 12, d = 768, h = 12 ; 220M parameters)
    ('T5base', 12, 768, 12, 64, "google-t5/t5-base"),
    # T5 large (l = 24, d = 1024, h = 16 ; 770M parameters)
    ('T5large', 24, 1024, 16, 64, "google-t5/t5-large"),
    # T5 3B (l = 24, d = 1024, h = 32, dh = 128 ; 3B parameters)
    ('T53b', 24, 1024, 32, 128, "google-t5/t5-3B"),
]

for key, l, d, h, dh, checkpoint in t5_specs:
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    # The first argument is the width over which the heads are laid out, i.e. the
    # projection width h * dh rather than the embedding dimension d.
    S, N = getscoresT5(h * dh,l,h,dh,model)
    models[key] = [l,d,h,dh,S,N]

# save
with open('../data/fig_scores/models.pkl', 'wb') as file:
    pickle.dump(models, file)