In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, MistralConfig, GPT2Config, AutoConfig

In [2]:
# Load the pretrained tokenizer identified by 'malaysia-ai/bpe-tokenizer'.
# Its length and special-token ids are used below to size the model config.
tokenizer = AutoTokenizer.from_pretrained('malaysia-ai/bpe-tokenizer')

In [3]:
# Start from the published Mistral-7B config and shrink it to a small model.
config = AutoConfig.from_pretrained('mesolitica/mistral-7b-4096-fpf')

# Architecture overrides; anything not listed here (e.g. num_key_value_heads,
# sliding_window) keeps the value loaded from the base config.
config.num_hidden_layers = 18
config.hidden_size = 1024
config.intermediate_size = 4096
config.num_attention_heads = 16
config.max_position_embeddings = 32768

# Vocabulary size and special-token ids are taken from the tokenizer.
config.vocab_size = len(tokenizer)
config.bos_token_id = tokenizer.bos_token_id
config.eos_token_id = tokenizer.eos_token_id
config.pad_token_id = tokenizer.pad_token_id

config  # display the resulting config

MistralConfig {
  "_name_or_path": "mesolitica/mistral-7b-4096-fpf",
  "architectures": [
    "MistralForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 16,
  "num_hidden_layers": 18,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.34.1",
  "use_cache": true,
  "vocab_size": 32000
}

In [4]:
# Seed the RNG first: from_config() builds the model with randomly initialised
# weights, so without a fixed seed every run would save a different checkpoint.
torch.manual_seed(0)

# Instantiate the architecture from the config alone (no pretrained weights are
# loaded), creating the parameters in half precision.
model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float16)
# Record the dtype on the config as the plain string shown in the config dump.
model.config.torch_dtype = 'float16'
model.config

MistralConfig {
  "_name_or_path": "mesolitica/mistral-7b-4096-fpf",
  "architectures": [
    "MistralForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 16,
  "num_hidden_layers": 18,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.34.1",
  "use_cache": true,
  "vocab_size": 32000
}

In [5]:
# Display the model's config again; nothing is modified by this cell.
model.config

MistralConfig {
  "_name_or_path": "mesolitica/mistral-7b-4096-fpf",
  "architectures": [
    "MistralForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 16,
  "num_hidden_layers": 18,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.34.1",
  "use_cache": true,
  "vocab_size": 32000
}

In [6]:
# Total number of scalar parameters: add up the element count of every
# parameter tensor, then report it in millions with one decimal place.
model_size = sum(map(torch.numel, model.parameters()))
print(f"Mistral size: {model_size / 1_000_000:.1f}M parameters")

Mistral size: 348.7M parameters


In [7]:
# Write the freshly initialised model (config + weights) to the local directory
# 'mistral-349M'; the name reflects the ~348.7M parameter count printed above.
model.save_pretrained('mistral-349M')