In [5]:
from transformers import AutoConfig, AutoModelForCausalLM
import torch

In [2]:
# Fetch only the configuration of the base checkpoint to use as a template;
# the bare `config` expression on the last line renders it as the cell output.
config = AutoConfig.from_pretrained(
    pretrained_model_name_or_path='mistralai/Mistral-7B-v0.1',
)
config

MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-v0.1",
  "architectures": [
    "MistralForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.35.0",
  "use_cache": true,
  "vocab_size": 32000
}

In [3]:
# Shrink the base architecture. Only these three fields are overridden; every
# other setting keeps the value loaded from the base config.
config.update(
    {
        'hidden_size': 3200,
        'num_hidden_layers': 26,
        'intermediate_size': 8640,
    }
)

In [None]:
# Seed torch's global RNG before building the model: `from_config` creates
# freshly initialised weights instead of loading a checkpoint, so without a
# fixed seed every Restart & Run All would produce (and later upload) different
# weights. The seed value itself is arbitrary.
torch.manual_seed(0)

# Instantiate the architecture described by `config`, requesting bfloat16
# parameters.
model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)

[2023-11-24 05:09:53,968] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [7]:
# Render the module tree to check that the overridden sizes propagated into
# the embedding, attention and MLP layers.
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 3200)
    (layers): ModuleList(
      (0-25): 26 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (k_proj): Linear(in_features=3200, out_features=800, bias=False)
          (v_proj): Linear(in_features=3200, out_features=800, bias=False)
          (o_proj): Linear(in_features=3200, out_features=3200, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (up_proj): Linear(in_features=3200, out_features=8640, bias=False)
          (down_proj): Linear(in_features=8640, out_features=3200, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm

In [8]:
# Show the dtype the model reports; it should match the `torch_dtype`
# requested when the model was built.
model.dtype

torch.bfloat16

In [9]:
# Total parameter count: add up the element count of every parameter tensor.
sum(map(torch.numel, model.parameters()))

3027113600

In [10]:
# Upload the model to the Hugging Face Hub repository named below. The call's
# return value is left as the cell's last expression so it is displayed.
model.push_to_hub(repo_id='huseinzol05/dummy-mistral-3b')

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/huseinzol05/dummy-mistral-3b/commit/5e87e2c7fb3c8ebdb16ba8916bb30ac64b61b8a1', commit_message='Upload MistralForCausalLM', commit_description='', oid='5e87e2c7fb3c8ebdb16ba8916bb30ac64b61b8a1', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
# Upload the edited config to the same Hub repository as the model weights.
# NOTE(review): `model.push_to_hub` may already upload the model's config, in
# which case this extra commit is redundant -- confirm before removing.
config.push_to_hub(repo_id='huseinzol05/dummy-mistral-3b')

CommitInfo(commit_url='https://huggingface.co/huseinzol05/dummy-mistral-3b/commit/f46a184e628451934de503a1fc6289374b4f9274', commit_message='Upload config', commit_description='', oid='f46a184e628451934de503a1fc6289374b4f9274', pr_url=None, pr_revision=None, pr_num=None)