In [1]:
import numpy as np

In [123]:
## BERT
## models with bias terms
##
## Catalogue of BERT-family checkpoints. In each label, l / d / h mirror the
## L- / H- / A- fields of the Google checkpoint names (layers, hidden width,
## attention heads).
## NOTE: every assignment below rebinds `model`, so once the cell has finished
## only the last checkpoint loaded is still bound. Comment out the entries you
## do not need to avoid loading them.

from transformers import AutoModel, BertModel, DistilBertModel

# BERT tiny (l = 2, d = 128, h = 2 ; 4.40M parameters)
model = AutoModel.from_pretrained("google/bert_uncased_L-2_H-128_A-2")

# BERT mini (l = 4, d = 256, h = 4 ; 11.3M parameters)
model = AutoModel.from_pretrained("google/bert_uncased_L-4_H-256_A-4")

# BERT small (l = 4, d = 512, h = 8 ; 29.1M parameters)
model = AutoModel.from_pretrained("google/bert_uncased_L-4_H-512_A-8")

# BERT medium (l = 8, d = 512, h = 8 ; 41.7M parameters)
model = AutoModel.from_pretrained("google/bert_uncased_L-8_H-512_A-8")

# BERT base (l = 12, d = 768, h = 12 ; 110M parameters)
model = BertModel.from_pretrained("bert-base-uncased")

# BERT large (l = 24, d = 1024, h = 16 ; 340M parameters)
model = BertModel.from_pretrained("bert-large-uncased")

# BERT large (masking) (l = 24, d = 1024, h = 16 ; 340M parameters)
model = BertModel.from_pretrained("bert-large-uncased-whole-word-masking")

# DistilBERT base model (l = 6, d = 768, h = 12 ; tot num parameters 66M)
model = DistilBertModel.from_pretrained("distilbert-base-uncased")


In [134]:
## ROBERTA
## models with bias terms
##
## NOTE: every assignment below rebinds `model`; only the last checkpoint
## loaded is still bound when the cell finishes.

from transformers import RobertaModel, AutoModelForMaskedLM

# ROBERTA base (l = 12, d = 768, h = 12 ; 125M parameters)
model = RobertaModel.from_pretrained('roberta-base')

# ROBERTA large (l = 24, d = 1024, h = 16 ; 355M parameters)
model = RobertaModel.from_pretrained('roberta-large')

# DistilROBERTA base (l = 6, d = 768, h = 12 ; 82.2M parameters)
# Loaded through the masked-LM class because the comparison below reads
# `model.lm_head.decoder`.
model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
# Indices at which the input embedding matrix and the LM decoder matrix differ;
# two empty index arrays mean the matrices are element-wise equal.
np.where(model.roberta.embeddings.word_embeddings.weight.detach().numpy() != model.lm_head.decoder.weight.detach().numpy())


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [138]:
## ALBERT models
## models with bias terms

from transformers import AlbertModel

# (label, checkpoint id) pairs, listed in the order they are loaded.
albert_variants = (
    ('ALBERT base model (l = 12, d = 768, h = 12 ; tot num parameters 11M)', "albert-base-v2"),
    ('ALBERT large model (l = 24, d = 1024, h = 16 ; tot num parameters 17M)', "albert-large-v2"),
    ('ALBERT xlarge model (l = 24, d = 2048, h = 16 ; tot num parameters 58M)', "albert-xlarge-v2"),
    ('ALBERT xxlarge model (l = 12, d = 4096, h = 64 ; tot num parameters 223M)', "albert-xxlarge-v2"),
)

# Load each checkpoint in turn. `model` is rebound on every pass, so only the
# last entry of the table is still bound once the loop ends.
for _albert_label, _albert_checkpoint in albert_variants:
    model = AlbertModel.from_pretrained(_albert_checkpoint)

In [144]:
## GPT models
## models with bias terms
##
## NOTE: every assignment below rebinds `model`; only the last checkpoint
## loaded is still bound when the cell finishes.

from transformers import OpenAIGPTModel, GPT2Model, AutoModelForCausalLM

# GPT 1 (l = 12, d = 768, h = 12 ; 110M parameters)
model = OpenAIGPTModel.from_pretrained("openai-gpt")

# GPT2 (l = 12, d = 768, h = 12 ; 117M parameters)
model = GPT2Model.from_pretrained('gpt2')

# GPT2 medium (l = 24, d = 1024, h = 16 ; 345M parameters)
model = GPT2Model.from_pretrained('gpt2-medium')

# GPT2 large (l = 36, d = 1280, h = 20 ; 774M parameters)
model = GPT2Model.from_pretrained('gpt2-large')

# GPT2 xl (l = 48, d = 1600, h = 25 ; 1558M parameters)
model = GPT2Model.from_pretrained('gpt2-xl')

# DistilGPT2 base model (l = 6, d = 768, h = 12 ; tot num parameters 82M)
# Loaded through the causal-LM class because the comparison below reads
# `model.lm_head`.
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
# Indices at which the token embedding matrix and the LM head matrix differ;
# two empty index arrays mean the matrices are element-wise equal.
np.where(model.transformer.wte.weight.detach().numpy() != model.lm_head.weight.detach().numpy())

In [151]:
## GEMMA models
## models WITHOUT bias

from transformers import AutoModelForCausalLM

# Gemma model 2b (l = 18, d = 2048, h = 8, h_kv = 1 ; tot num parameters 2B)
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")
# NOTE(review): this comparison is not the last expression of the cell, so its
# result is computed and then dropped without being displayed. Move it to its
# own cell (or print it) if the 2b result is needed.
np.where(model.model.embed_tokens.weight.detach().numpy() != model.lm_head.weight.detach().numpy())

# Gemma model 7b (l = 28, d = 3072, h = 16, h_kv = 16 ; tot num parameters 7B)
model = AutoModelForCausalLM.from_pretrained("google/gemma-7b")
# Indices at which the token embedding matrix and the LM head matrix differ;
# two empty index arrays mean the matrices are element-wise equal.
np.where(model.model.embed_tokens.weight.detach().numpy() != model.lm_head.weight.detach().numpy())

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.92s/it]


(array([], dtype=int64), array([], dtype=int64))

In [3]:
## MISTRAL models (NO weight tying)
## models WITHOUT bias

from transformers import AutoModelForCausalLM

'Mistral model 7b (l = 32, d = 4096, h = 32, h_kv = 8 ; tot num parameters 7B)'
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")

# View both weight matrices as NumPy arrays, then list every index at which
# the token embeddings and the LM head disagree.
input_embeddings = model.model.embed_tokens.weight.detach().numpy()
output_projection = model.lm_head.weight.detach().numpy()
np.where(np.not_equal(input_embeddings, output_projection))

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:20<00:00, 10.43s/it]


(array([    0,     0,     0, ..., 31999, 31999, 31999]),
 array([   0,    1,    2, ..., 4093, 4094, 4095]))

In [5]:
## LLAMA2 models (NO weight tying)
## models WITHOUT bias

# Imported here as well so this cell does not depend on an earlier cell
# having been run first.
from transformers import AutoModelForCausalLM

# LLAMA 2 7b (l = 32, d = 4096, h = 32 ; tot num parameters 7B)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
# Indices at which the token embedding matrix and the LM head matrix differ;
# two empty index arrays would mean the matrices are element-wise equal.
np.where(model.model.embed_tokens.weight.detach().numpy() != model.lm_head.weight.detach().numpy())

Loading checkpoint shards: 100%|██████████| 2/2 [00:27<00:00, 13.92s/it]


(array([    0,     0,     0, ..., 31999, 31999, 31999]),
 array([   0,    1,    2, ..., 4093, 4094, 4095]))

In [7]:
# Largest element-wise gap (token embeddings minus LM head) for whichever
# `model` a previously executed cell left bound.
# NOTE: this is the maximum of the signed difference, not of its absolute value.
embedding_matrix = model.model.embed_tokens.weight.detach().numpy()
lm_head_matrix = model.lm_head.weight.detach().numpy()
(embedding_matrix - lm_head_matrix).max()

0.30423927